## Setup


In [40]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import (
    PyPDFLoader,
    UnstructuredPDFLoader,
    PyPDFium2Loader,
)
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

# Dataset name; it is reused as the sub-folder name for both input and output.
data_dir = "faccionados"
out_dir = data_dir

# Source documents are read from this folder ...
inputdirectory = Path("./data_input") / data_dir
# ... and the generated CSV files are written to this one.
outputdirectory = Path("./data_output") / out_dir

## Load Documents


In [41]:
## Alternative loaders, kept for reference:
## Dir PDF Loader
# loader = PyPDFDirectoryLoader(inputdirectory)
## File Loader
# loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")

## Load the documents found under the input directory.
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## Split the documents into overlapping chunks; chunk sizes are measured with len().
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))

## Preview one chunk as a sanity check. The index is clamped so that inputs
## producing fewer than four chunks do not raise an IndexError, and nothing is
## printed when there are no chunks at all.
## NOTE(review): this prints raw document text; clear the cell output before
## sharing the notebook if the input data is sensitive.
if pages:
    print(pages[min(3, len(pages) - 1)].page_content)

100%|██████████| 1/1 [00:00<00:00,  5.21it/s]

Number of chunks =  15
MARIA ESANIKESSIA PEREIRA DE SOUZA
CARMINHA
NI
CHEFE/LÍDER
NI
NI
NI
VILLAGE CAMPESTRE


CLAUDEMIR LUCAS PEIXOTO VENÂNCIO
CHACAL
NI
SOLDADO DO TRÁFICO
NI
NI
NI
POR TODO CLIMA BOM


RENATO EMANUEL DE MELO
PADRE
NI
GERENTE
NI
NI
NI
VILLAGE CAMPESTRE


ALEX BRUNO SOARES DE MENDONCA
LELECO
NI
CHEFE/LÍDER
NI
NI
NI
CONJUNTO VILLAGE CAMPESTRE II


JOSE FERNANDO DE OLIVEIRA LEITE
MANCHINHA
NI
SOLDADO DO TRÁFICO
NI
NI
NI
JARDINEIRA- JACINTINHO


EVERALDO ALEXANDRE DA SILVA FILHO
CHINA
NI
GERENTE
NI
NI
NI
JARDINEIRA- JACINTINHO


CLEITON TIAGO SÉRGIO DA SILVA
NI
PCC
PAIOL/GUARDA-ROUPAS
NI
NI
NI
JARDINEIRA- JACINTINHO


GEOVANE BARBOSA SILVA
NI
PCC
SOLDADO DO TRÁFICO
NI
NI
NI
JARDINEIRA- JACINTINHO


ALISSON DA SILVA RAMOS
GIBI
PCC
GERENTE
NI
NI
NI
REGINALDO/FEITOSA


CAIUO GABRIEL DOS SANTOS SILVA
GORDINHO
PCC
SOLDADO DO TRÁFICO
NI
NI
NI
FEITOSA


ANDERSON RODRIGO DA SILVA SOUZA
GRINGO
PCC
SOLDADO DO TRÁFICO
NI
NI
NI
FEITOSA, CRUZEIRO DO SUL, ROSANE COLLOR, CLIMA BOM, BOM P




## Create a dataframe of all the chunks


In [42]:
from helpers.df_helpers import documents2Dataframe

# Convert the list of chunks into a dataframe (helper imported from helpers.df_helpers).
df = documents2Dataframe(pages)
# Sanity check: print the dimensions, then display the first rows.
print(df.shape)
df.head()

(15, 3)


Unnamed: 0,text,source,chunk_id
0,KAYO NASCIMENTO DE MAGALHÃES\nCABRA/ 99\nNI\nF...,data_input/faccionados/faccionados_analise_vin...,ecb2f8384b4d429b83cee2fbdc73fc24
1,MATHEUS SILVA CÂNDIDO DE MELO\nTETEU/TARTARUGA...,data_input/faccionados/faccionados_analise_vin...,41493bc9ab764848898b3f9634291ff6
2,MAXSUEL DA SILVA RODRIGUES SALVADOR\nNINHO DA ...,data_input/faccionados/faccionados_analise_vin...,768aea03e02f431b80b0f11b7bbfa91c
3,MARIA ESANIKESSIA PEREIRA DE SOUZA\nCARMINHA\n...,data_input/faccionados/faccionados_analise_vin...,6817d3a8b83b471e9c3637bd559799c1
4,CLEITON PRIMEIRO DOS SANTOS\nTUITA\nPCC\nSOLDA...,data_input/faccionados/faccionados_analise_vin...,0ed88d10ddf84859918a91c450fd8d8b


## Extract Concepts


In [43]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If `regenerate` is set to `True`, the dataframes are regenerated and both of them are written to the output directory in CSV format, so we don't have to calculate them again.

        dfg1 = dataframe of edges

        df = dataframe of chunks

Otherwise, the edge dataframe is read back from the output directory.


In [44]:
## To regenerate the graph with LLM, set this to True
regenerate = False

if regenerate:
    concepts_list = df2Graph(df, model="zephyr:latest")
    dfg1 = graph2Df(concepts_list)
    ## Create the output directory (and any missing parents); a no-op if it exists.
    outputdirectory.mkdir(parents=True, exist_ok=True)

    ## Persist both dataframes so the LLM extraction does not have to be repeated.
    dfg1.to_csv(outputdirectory / "graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory / "chunks.csv", sep="|", index=False)
else:
    ## Reuse the edges written by a previous run.
    dfg1 = pd.read_csv(outputdirectory / "graph.csv", sep="|")

## Treat empty strings as missing and drop edges lacking a node or a label.
## Built as a new frame (no in-place mutation) so the cell can be re-run safely.
## Increasing the weight of the relation to 4.
## We will assign the weight of 1 when later the contextual proximity will be calculated.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(count=4)
)
print(dfg1.shape)
dfg1.head()

(249, 6)


Unnamed: 0,node_1,node_2,edge,chunk_id,node_3,count
0,kayo nascimento de magalhães,maceió,located in,da00b0926d144bc284b3f8e259b368c6,,4
1,josé mariano dos santos,rato,has the role of,da00b0926d144bc284b3f8e259b368c6,,4
2,givaldo barbosa de frança,quinzinho,has the role of,da00b0926d144bc284b3f8e259b368c6,,4
3,josé erivaldo da paz,índio,is a part of,da00b0926d144bc284b3f8e259b368c6,,4
4,lucas manoel da silva,projeto,is involved in,da00b0926d144bc284b3f8e259b368c6,,4


## Calculating contextual proximity


In [45]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Build "contextual proximity" edges between nodes sharing a text chunk.

    Every node listed in ``node_1`` or ``node_2`` is paired with every other
    node listed for the same ``chunk_id``. Pairs are directional, so both
    (a, b) and (b, a) are produced. Pairs that co-occur only once are dropped.

    Args:
        df: Edge dataframe with at least the columns ``chunk_id``, ``node_1``
            and ``node_2``. It is not modified.

    Returns:
        A dataframe with the columns ``node_1``, ``node_2``, ``chunk_id``
        (comma-joined chunk ids, one per co-occurrence), ``count`` (number of
        co-occurrences, always greater than 1) and ``edge`` (the constant
        string "contextual proximity"). An empty dataframe with these columns
        is returned when no pair of distinct nodes exists.
    """
    result_columns = ["node_1", "node_2", "chunk_id", "count", "edge"]

    ## Melt the dataframe into a long list of (chunk_id, node) rows.
    dfg_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])
    # Self join with chunk id as the key will create a link between terms occurring in the same text chunk.
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))
    # Drop self loops, and rows with a missing node (groupby would discard those anyway).
    pairs = (
        dfg_wide[dfg_wide["node_1"] != dfg_wide["node_2"]]
        .dropna(subset=["node_1", "node_2"])
        .reset_index(drop=True)
    )
    # Nothing to aggregate: return an empty frame that still has the expected columns.
    if pairs.empty:
        return pd.DataFrame(columns=result_columns)

    ## Group and count edges.
    grouped = (
        pairs.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    grouped.columns = ["node_1", "node_2", "chunk_id", "count"]
    grouped = grouped.replace("", np.nan).dropna(subset=["node_1", "node_2"])
    # Drop edges with 1 count. The explicit copy lets the new column be assigned
    # without pandas warning about writing to a slice of another frame.
    proximity = grouped[grouped["count"] != 1].copy()
    proximity["edge"] = "contextual proximity"
    return proximity


# Derive the contextual-proximity edges from the edge dataframe dfg1.
dfg2 = contextual_proximity(dfg1)
# Display the last rows as a quick check.
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
7929,x-x-x-x,quilombo,"aa5e84c2f97e4aeeabaf3888bf6fbd97,aa5e84c2f97e4...",2,contextual proximity
7930,x-x-x-x,romário junio dos santos alves,"aa5e84c2f97e4aeeabaf3888bf6fbd97,aa5e84c2f97e4...",3,contextual proximity
7932,x-x-x-x,thairon maxuel ferreira da silva,"aa5e84c2f97e4aeeabaf3888bf6fbd97,aa5e84c2f97e4...",2,contextual proximity
7949,índio,maceió,"da00b0926d144bc284b3f8e259b368c6,da00b0926d144...",2,contextual proximity
7952,índio,piabas- jacintinho,"da00b0926d144bc284b3f8e259b368c6,da00b0926d144...",2,contextual proximity


### Merge both the dataframes


In [46]:
## Stack the edges in dfg1 on top of the contextual-proximity edges in dfg2.
stacked_edges = pd.concat([dfg1, dfg2], axis=0)

## Collapse duplicate (node_1, node_2) pairs: their chunk ids and edge labels
## are comma-joined and their counts are added up.
pair_aggregations = {"chunk_id": ",".join, "edge": ",".join, "count": "sum"}
dfg = stacked_edges.groupby(["node_1", "node_2"]).agg(pair_aggregations).reset_index()
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,alex bruno soares de mendonça,conjunto village campeestre ii,56a85917a69b4803a17ab04eef02c1ae,resident of Conjunto Village Campeestre II,4
1,alex bruno soares de mendonça,feitosa,"56a85917a69b4803a17ab04eef02c1ae,56a85917a69b4...",contextual proximity,3
2,alex bruno soares de mendonça,jardineira- jacintinho,"56a85917a69b4803a17ab04eef02c1ae,56a85917a69b4...",contextual proximity,4
3,alex bruno soares de mendonça,village campeestre,"56a85917a69b4803a17ab04eef02c1ae,56a85917a69b4...",contextual proximity,2
4,alexandre luiz santos costa júnior,douglas vasconcelos dos santos,"82e359e6b9604bffbcc2b73fe7656a8b,82e359e6b9604...",contextual proximity,2
...,...,...,...,...,...
2660,x-x-x-x,quilombo,"aa5e84c2f97e4aeeabaf3888bf6fbd97,aa5e84c2f97e4...",contextual proximity,2
2661,x-x-x-x,romário junio dos santos alves,"aa5e84c2f97e4aeeabaf3888bf6fbd97,aa5e84c2f97e4...",contextual proximity,3
2662,x-x-x-x,thairon maxuel ferreira da silva,"aa5e84c2f97e4aeeabaf3888bf6fbd97,aa5e84c2f97e4...",contextual proximity,2
2663,índio,maceió,"da00b0926d144bc284b3f8e259b368c6,da00b0926d144...",contextual proximity,2


## Calculate the NetworkX Graph


In [47]:
# Collect every distinct value that appears in either node column of dfg.
nodes = pd.concat([dfg["node_1"], dfg["node_2"]], axis=0).unique()
# Display how many distinct nodes there are.
nodes.shape

(279,)

In [48]:
import networkx as nx

G = nx.Graph()

## Register every node under its string form.
G.add_nodes_from(str(node_name) for node_name in nodes)

## Add one edge per dataframe row: the edge label becomes the edge title and
## the weight is the count divided by 4.
G.add_edges_from(
    (str(source), str(target), {"title": label, "weight": occurrences / 4})
    for source, target, label, occurrences in zip(
        dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"]
    )
)

### Calculate communities for coloring the nodes


In [49]:
## girvan_newman returns a generator; each next() yields the next partition of
## the graph into communities.
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
## Use the second partition when there is one; if the generator is already
## exhausted (graph too small to split again), fall back to the first partition
## instead of raising StopIteration.
next_level_communities = next(communities_generator, top_level_communities)
## Sort the members of each community, then the communities themselves.
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  16
[['alex bruno soares de mendonça', 'alexandre verçosa dos santos júnior', 'alisson da silva ramos', 'breno gomes de oliveira', 'caiuó gabriel dos santos silva', 'chacal', 'claudemir lucas peixoto venâncio', 'cleiton tiago sérgio da silva', 'conjunto village campeestre ii', 'eduardo henrique gomes da silva', 'everaldo alexandre da silva filho', 'feitosa', 'geovane barbosa silva', 'gordinho', 'jardineira- jacintinho', 'jose fernando de oliveira leite', 'jose maciel silva de oliveira', 'jose ruano gabriel de oliveira silva', 'maria esanikessía pereira de souza', 'ninho do peixoto', 'reginaldo/ feitosa', 'reginaldo/ feitosa, cruzeiro do sul, rosané collor, clima bom, bom parte', 'renato emanuel de melo', 'valle do reginaldo e feitosa', 'village campeestre'], ['alexandre luiz santos costa júnior', 'alexsandro dos santos', 'alto da barra', 'alto de são marcos', 'alyx tarciano conceição pereira', 'anderson rodrigo da silva souza', 'arabidalas rodrigues da silva', '

### Create a dataframe for community colors


In [50]:
import seaborn as sns

palette = "hls"


## Now add these colors to communities and make another dataframe
def colors2Community(communities, seed=42) -> pd.DataFrame:
    """Assign one color and one group number to every community.

    Args:
        communities: Iterable of communities, each an iterable of node names.
        seed: Seed for shuffling the palette, so that the community colors are
            the same on every run. Pass ``None`` for a different shuffle each
            time.

    Returns:
        A dataframe with one row per node and the columns ``node``, ``color``
        (hex string) and ``group`` (1-based position of the node's community).
        The columns are present even when ``communities`` is empty.
    """
    community_list = list(communities)
    ## One palette color per community, in shuffled order.
    p = sns.color_palette(palette, len(community_list)).as_hex()
    # A dedicated generator keeps the shuffle reproducible without touching
    # the global random state.
    random.Random(seed).shuffle(p)
    rows = []
    for group, community in enumerate(community_list, start=1):
        color = p.pop()
        rows.extend(
            {"node": node, "color": color, "group": group} for node in community
        )
    return pd.DataFrame(rows, columns=["node", "color", "group"])


colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,alex bruno soares de mendonça,#a157db,1
1,alexandre verçosa dos santos júnior,#a157db,1
2,alisson da silva ramos,#a157db,1
3,breno gomes de oliveira,#a157db,1
4,caiuó gabriel dos santos silva,#a157db,1
...,...,...,...
274,randinho,#db5780,14
275,joze andré dos santos,#c3db57,15
276,pipinha,#c3db57,15
277,luán,#57a2db,16


### Add colors to the graph


In [51]:
## Copy each node's community group and color onto the graph node, and set the
## node size to the node's degree.
for row_label, entry in colors.iterrows():
    node_name = entry["node"]
    G.nodes[node_name].update(
        group=entry["group"],
        color=entry["color"],
        size=G.degree[node_name],
    )

In [52]:
from pyvis.network import Network

## Despite its name, this is the path of the HTML file that will be written.
graph_output_directory = "./docs/index.html"
## Make sure the target folder exists, otherwise writing the file fails on a
## fresh checkout.
Path(graph_output_directory).parent.mkdir(parents=True, exist_ok=True)

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

## Import the nodes and edges (with their attributes) from the NetworkX graph.
net.from_nx(G)
## Layout: force_atlas_2based is active; the commented calls are alternatives.
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
net.show_buttons(filter_=["physics"])

# net.show(graph_output_directory, notebook=False)
net.show(graph_output_directory)