## Setup


In [1]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import (
    PyPDFLoader,
    UnstructuredPDFLoader,
    PyPDFium2Loader,
)
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Name of the dataset folder; it selects both the input and the output location
data_dir = "faccionados"
out_dir = data_dir
## Source documents are read from ./data_input/<data_dir>
inputdirectory = Path("./data_input") / data_dir
## The generated csv files are written to ./data_output/<out_dir>
outputdirectory = Path("./data_output") / out_dir

## Load Documents


In [2]:
## Load every document found under the input directory.
## Alternatives that were tried here: PyPDFDirectoryLoader(inputdirectory) for a
## folder, or PyPDFLoader("<path to one file>.pdf") for a single file.
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## Split the documents into chunks of at most 1500 characters, with 150
## characters of overlap between consecutive chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))
## Preview one chunk. The index is clamped so that a corpus producing fewer
## than four chunks (or none at all) does not raise an IndexError.
if pages:
    print(pages[min(3, len(pages) - 1)].page_content)

100%|██████████| 1/1 [00:12<00:00, 12.92s/it]

Number of chunks =  16
CONJUNTO VILLAGE CAMPESTRE II


JOSE FERNANDO DE OLIVEIRA LEITE
MANCHINHA
NI
SOLDADO DO TRÁFICO



JARDINEIRA- JACINTINHO


EVERALDO ALEXANDRE DA SILVA FILHO
CHINA
NI
GERENTE



JARDINEIRA- JACINTINHO


CLEITON TIAGO SÉRGIO DA SILVA

PCC
PAIOL/GUARDA-ROUPAS



JARDINEIRA- JACINTINHO


GEOVANE BARBOSA SILVA

PCC
SOLDADO DO TRÁFICO



JARDINEIRA- JACINTINHO


ALISSON DA SILVA RAMOS
GIBI
PCC
GERENTE



REGINALDO/FEITOSA


CAIUO GABRIEL DOS SANTOS SILVA
GORDINHO
PCC
SOLDADO DO TRÁFICO



FEITOSA


ANDERSON RODRIGO DA SILVA SOUZA
GRINGO
PCC
SOLDADO DO TRÁFICO



FEITOSA, CRUZEIRO DO SUL, ROSANE COLLOR, CLIMA BOM, BOM PARTO


ALEXANDRE VERÇOSA DOS SANTOS JUNIOR
PANELA
PCC
GERENTE



FEITOSA


BRENO GOMES DE OLIVEIRA
LIMA
PCC
SOLDADO DO TRÁFICO



VALE DO REGINALDO E FEITOSA


EDUARDO HENRIQUE GOMES DA SILVA

PCC
SOLDADO DO TRÁFICO



FEITOSA


JOSE MACIEL SILVA DE OLIVEIRA
IEL
PCC
SOLDADO DO TRÁFICO



FEITOSA


JOSE RUAN GABRIEL DE OLIVEIRA SILVA
NINHO DO PEIXOTO
PCC





## Create a dataframe of all the chunks


In [3]:
from helpers.df_helpers import documents2Dataframe

## Turn the list of chunks into a dataframe using the helper from helpers/df_helpers
df = documents2Dataframe(pages)
## Sanity check: (rows, columns) of the chunk dataframe
print(df.shape)
## Show the first rows as the cell output
df.head()

(16, 3)


Unnamed: 0,text,source,chunk_id
0,nome_completo\nvulgo_alcunha\nfaccao\nfaccao_f...,data_input/faccionados/faccionados_analise_vin...,96a8b2d34152432ea000b5ce4a25b38c
1,TATIANE DOS SANTOS FAUSTINO\n\nNI\nCHEFE/LÍDER...,data_input/faccionados/faccionados_analise_vin...,9c5b85e7cc6c442ba19d19f61ad61e0d
2,JEFFERSON THYAGO VIANA LEITE\nJUCA\nNI\nGERENT...,data_input/faccionados/faccionados_analise_vin...,4bfae368143f4de99121ad71f70c2b1c
3,CONJUNTO VILLAGE CAMPESTRE II\n\n\nJOSE FERNAN...,data_input/faccionados/faccionados_analise_vin...,03042d2ee7b245f8ba18759b719a24d0
4,LUCAS ANTÔNIO FARIAS LIMA\nLUKINHAS\nNI\nGEREN...,data_input/faccionados/faccionados_analise_vin...,58dd84ba065e4a669a033e6568b76c26


## Extract Concepts


In [4]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If `regenerate` is set to True, the dataframes are regenerated and both of them are written to csv files so that they do not have to be computed again.

        dfg1 = dataframe of edges

        df = dataframe of chunks

Otherwise, the dataframe of edges is read back from the output directory.


In [5]:
## To regenerate the graph with LLM, set this to True
regenerate = False

if regenerate:
    ## Extract the relations from every chunk and cache both dataframes as csv
    concepts_list = df2Graph(df, model="zephyr:latest")
    dfg1 = graph2Df(concepts_list)
    ## exist_ok=True replaces the separate "exists?" check, so there is no
    ## window between checking for the folder and creating it.
    outputdirectory.mkdir(parents=True, exist_ok=True)

    dfg1.to_csv(outputdirectory / "graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory / "chunks.csv", sep="|", index=False)
else:
    ## Reuse the edges cached by a previous run
    dfg1 = pd.read_csv(outputdirectory / "graph.csv", sep="|")

## Treat empty strings as missing, drop incomplete relations, and give every
## extracted relation a weight of 4. Contextual-proximity edges are computed
## later and counted by co-occurrence instead. The chain builds a new frame
## rather than mutating in place, so re-running the cell is safe.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(count=4)
)
print(dfg1.shape)
dfg1.head()

[
  {
    "node_1": "nome_completo",
    "node_2": "uf_atual",
    "edge": "extrato de contexto"
  },
  {
    "node_1": "José Mariano dos Santos",
    "node_2": "Rato",
    "edge": "extrato de contexto"
  },
  {
    "node_1": "Givaldo Barbosa de França",
    "node_2": "Quinhão",
    "edge": "extrato de contexto"
  },
  {
    "node_1": "José Eriovaldo da Paz",
    "node_2": "Índio",
    "edge": "extrato de contexto"
  },
  {
    "node_1": "Lucas Manuel da Silva",
    "node_2": "Projeto",
    "edge": "extrato de contexto"
  },
  {
    "node_1": "Victor Emanuel Oliveira dos Santos",
    "node_2": "Vulgo da Cruz",
    "edge": "extrato de contexto"
  },
  {
    "node_1": "Gilvano Marcos Oliveira Santos",
    "node_2": "Gil",
    "edge": "extrato de contexto"
  },
  {
    "node_1": "Derivaldo da Silva Santos",
    "node_2": "Gerente",
    "edge": "extrato de contexto"
  },
  {
    "node_1": "Felipe Vital dos Santos",
    "node_2": "CV",
    "edge": "extrato de contexto"
  },
  {
    "node_1"

Unnamed: 0,node_1,node_2,edge,chunk_id,count
0,nome_completo,uf_atual,extrato de contexto,96a8b2d34152432ea000b5ce4a25b38c,4
1,josé mariano dos santos,rato,extrato de contexto,96a8b2d34152432ea000b5ce4a25b38c,4
2,givaldo barbosa de frança,quinhão,extrato de contexto,96a8b2d34152432ea000b5ce4a25b38c,4
3,josé eriovaldo da paz,índio,extrato de contexto,96a8b2d34152432ea000b5ce4a25b38c,4
4,lucas manuel da silva,projeto,extrato de contexto,96a8b2d34152432ea000b5ce4a25b38c,4


## Calculating contextual proximity


In [6]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Link nodes that occur in the same text chunk.

    Args:
        df: Edge dataframe with at least the columns ``node_1``, ``node_2``
            and ``chunk_id``. It is not modified.

    Returns:
        A dataframe with the columns ``node_1``, ``node_2``, ``chunk_id``
        (comma-joined ids of the chunks in which the pair co-occurs),
        ``count`` (number of co-occurrences) and ``edge`` (always
        ``"contextual proximity"``). Pairs that co-occur only once and
        self loops are left out. The index is the one produced by the
        grouping step, so it may have gaps where rows were filtered out.
    """
    ## Melt the dataframe into a long list of (chunk_id, node) occurrences
    node_occurrences = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])
    # A self join on chunk_id pairs up every two terms occurring in the same text chunk.
    pairs = pd.merge(
        node_occurrences, node_occurrences, on="chunk_id", suffixes=("_1", "_2")
    )
    # Drop self loops
    pairs = pairs[pairs["node_1"] != pairs["node_2"]].reset_index(drop=True)
    ## Group and count edges.
    pair_counts = (
        pairs.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    # Flatten the two-level column header created by the list of aggregations
    pair_counts.columns = ["node_1", "node_2", "chunk_id", "count"]
    # Empty node names are treated as missing and removed
    pair_counts = pair_counts.replace("", np.nan).dropna(subset=["node_1", "node_2"])
    # Drop edges with 1 count. `assign` returns a new frame, so the label is
    # never written into a filtered slice (which would raise
    # SettingWithCopyWarning).
    return pair_counts[pair_counts["count"] != 1].assign(edge="contextual proximity")


## Derive the co-occurrence edges from the extracted relations
dfg2 = contextual_proximity(dfg1)
## Show the last rows as the cell output
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
8878,índio,cv,"96a8b2d34152432ea000b5ce4a25b38c,96a8b2d341524...",2,contextual proximity
8888,índio,josé emerson da silva,"96a8b2d34152432ea000b5ce4a25b38c,96a8b2d341524...",2,contextual proximity
8893,índio,lucilá toledo,"96a8b2d34152432ea000b5ce4a25b38c,96a8b2d341524...",2,contextual proximity
8897,índio,nem catenga / jovem,"96a8b2d34152432ea000b5ce4a25b38c,96a8b2d341524...",2,contextual proximity
8898,índio,ni,"96a8b2d34152432ea000b5ce4a25b38c,96a8b2d341524...",3,contextual proximity


### Merge both the dataframes


In [7]:
## Stack the extracted relations on top of the contextual-proximity edges
edge_frames = [dfg1, dfg2]
stacked_edges = pd.concat(edge_frames, axis=0)
## Collapse duplicate (node_1, node_2) pairs: chunk ids and edge labels are
## comma-joined, counts are summed.
pair_aggregations = {"chunk_id": ",".join, "edge": ",".join, "count": "sum"}
dfg = stacked_edges.groupby(["node_1", "node_2"]).agg(pair_aggregations).reset_index()
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,agnaldo josé da silva júnior,juninho ou simpatia,b5c3b284b53740aaa4e28c0356fbe412,Unknown relationship,4
1,alamedas da pajuçara,benedito bentes,"60635986bc76454da662e575417fce34,60635986bc764...",contextual proximity,3
2,alamedas da pajuçara,conjunto vale bentes 2,"60635986bc76454da662e575417fce34,60635986bc764...",contextual proximity,3
3,alamedas da pajuçara,josé wellington de souza,"60635986bc76454da662e575417fce34,60635986bc764...",contextual proximity,2
4,alamedas da pajuçara,lucas ronaldo vicente ferreira dos santos,"60635986bc76454da662e575417fce34,60635986bc764...",contextual proximity,7
...,...,...,...,...,...
3054,índio,cv,"96a8b2d34152432ea000b5ce4a25b38c,96a8b2d341524...",contextual proximity,2
3055,índio,josé emerson da silva,"96a8b2d34152432ea000b5ce4a25b38c,96a8b2d341524...",contextual proximity,2
3056,índio,lucilá toledo,"96a8b2d34152432ea000b5ce4a25b38c,96a8b2d341524...",contextual proximity,2
3057,índio,nem catenga / jovem,"96a8b2d34152432ea000b5ce4a25b38c,96a8b2d341524...",contextual proximity,2


## Calculate the NetworkX Graph


In [8]:
## Every distinct name that appears at either end of an edge
endpoint_columns = [dfg["node_1"], dfg["node_2"]]
all_endpoints = pd.concat(endpoint_columns, axis=0)
nodes = all_endpoints.unique()
nodes.shape

(262,)

In [9]:
import networkx as nx

G = nx.Graph()

## Register every node up front, keyed by its string form
G.add_nodes_from(str(node) for node in nodes)

## Add one edge per row: the edge label becomes the title, and the weight is
## the count divided by 4.
edge_columns = zip(dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"])
for source, target, relation, occurrences in edge_columns:
    G.add_edge(str(source), str(target), title=relation, weight=occurrences / 4)

### Calculate communities for coloring the nodes


In [10]:
## The generator is advanced twice; the second partition it yields is the one
## used for coloring the nodes.
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
## Sort the members of each community, then the communities themselves
communities = sorted([sorted(members) for members in next_level_communities])
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  21
[['agnaldo josé da silva júnior', 'juninho ou simpatia'], ['alamedas da pajuçara', 'alexandre verçosa dos santos junior', 'alisson da silva ramos', 'alisson victor felix dos santos', 'alyx tarciano conceição pereira', 'anderson rodrigo da silva souza', 'arapiraca', 'baba', 'bairros manoel teles, senador nilo coelho e cacimbas', 'barrio do catete (planalto)', 'benedito bentes', 'benfica', 'bmw, henrique viadou', 'bozenga', 'brasil novo', 'breno gomes de oliveira', 'cabral/l7/l9', 'caiuo gabriel dos santos silva', 'capiatã (endereço comercial)', 'carlinho', 'carlos daniel de farías lima', 'carlos messias dos santos', 'chapa', 'chefe/líder', 'china', 'cleiton primeiro dos santos', 'cleiton tiago sérgio da silva', 'conjunto tavares granja', 'conjunto vale bentes 2', 'conjunto village campestre ii', 'coroa/aquila/negão/neno', 'cv', 'daniilo malta silva', 'das trevas', 'derivaldo da silva santos', 'eduardo daniel teixeira de moraes', 'eduardo henrique gomes da sil

### Create a dataframe for community colors


In [11]:
import seaborn as sns

## Name of the seaborn palette the community colors are drawn from
palette = "hls"


## Now add these colors to communities and make another dataframe
def colors2Community(communities, seed=None) -> pd.DataFrame:
    """Assign one color and one group number to every community.

    Args:
        communities: Sized iterable of communities, each an iterable of node
            names.
        seed: Optional seed for the color shuffle. With the default ``None``
            the module-level random generator is used, so colors differ
            between runs. Pass a value to get the same assignment every time.

    Returns:
        A dataframe with one row per node and the columns ``node``,
        ``color`` (hex string) and ``group`` (1-based position of the
        community). For empty input the frame is empty but still has these
        columns.
    """
    ## One hex color per community, in random order
    p = sns.color_palette(palette, len(communities)).as_hex()
    if seed is None:
        random.shuffle(p)
    else:
        # A private generator keeps the global random state untouched
        random.Random(seed).shuffle(p)
    rows = []
    for group, community in enumerate(communities, start=1):
        color = p.pop()
        rows.extend(
            {"node": node, "color": color, "group": group} for node in community
        )
    return pd.DataFrame(rows, columns=["node", "color", "group"])


## Build the node -> (color, group) lookup table for the detected communities
colors = colors2Community(communities)
## Show the table as the cell output
colors

Unnamed: 0,node,color,group
0,agnaldo josé da silva júnior,#579bdb,1
1,juninho ou simpatia,#579bdb,1
2,alamedas da pajuçara,#5775db,2
3,alexandre verçosa dos santos junior,#5775db,2
4,alisson da silva ramos,#5775db,2
...,...,...,...
257,uga ou huga,#9bdb57,19
258,melk ou china,#57db5f,20
259,melquisedeck estevão dos santos,#57db5f,20
260,olivaval rozenendo da silva filho,#d057db,21


### Add colors to the graph


In [12]:
## Copy the community group and color onto each graph node, and size each node
## by its degree.
for entry in colors.itertuples(index=False):
    attributes = G.nodes[entry.node]
    attributes["group"] = entry.group
    attributes["color"] = entry.color
    attributes["size"] = G.degree[entry.node]

In [13]:
from pyvis.network import Network

## Path of the HTML file the interactive graph is written to
graph_output_directory = "./docs/index.html"
## Make sure the target folder exists before the HTML file is written into it
Path(graph_output_directory).parent.mkdir(parents=True, exist_ok=True)

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

## Import nodes, edges and their attributes from the networkx graph
net.from_nx(G)
## Layout: force-atlas is active. Other settings that were tried:
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
## Show only the physics controls in the rendered page
net.show_buttons(filter_=["physics"])

## NOTE(review): some pyvis versions need net.show(..., notebook=False) when the
## Network was created with notebook=False. Confirm against the installed version.
# net.show(graph_output_directory, notebook=False)
net.show(graph_output_directory)