## Setup


In [14]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import (
    PyPDFLoader,
    UnstructuredPDFLoader,
    PyPDFium2Loader,
)
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Dataset name; it is also the sub-folder name used under both data roots
data_dir = "faccionados"
out_dir = data_dir

## Folder the input documents are read from
inputdirectory = Path("./data_input") / data_dir
## Folder the output csv files are written to
outputdirectory = Path("./data_output") / out_dir

## Load Documents


In [15]:
## Load every file found under the input directory.
## Alternative loaders, kept for reference:
## Dir PDF Loader
# loader = PyPDFDirectoryLoader(inputdirectory)
## File Loader
# loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## Split the documents into overlapping chunks; sizes are measured with len()
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))

## Preview one chunk as a sanity check. The index is clamped so that small
## inputs (fewer than four chunks) no longer raise IndexError, and nothing is
## printed when there are no chunks at all.
## NOTE(review): the printed chunk can contain personal data from the input
## files - clear this cell's output before sharing the notebook.
if pages:
    print(pages[min(3, len(pages) - 1)].page_content)

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  1.85it/s]

Number of chunks =  16
VILLAGE CAMPESTRE


49.0
CLAUDEMIR LUCAS PEIXOTO VENÂNCIO
CHACAL
NI
SOLDADO DO TRÁFICO



POR TODO CLIMA BOM


50.0
RENATO EMANUEL DE MELO
PADRE
NI
GERENTE



VILLAGE CAMPESTRE


51.0
ALEX BRUNO SOARES DE MENDONCA
LELECO
NI
CHEFE/LÍDER



CONJUNTO VILLAGE CAMPESTRE II


52.0
JOSE FERNANDO DE OLIVEIRA LEITE
MANCHINHA
NI
SOLDADO DO TRÁFICO



JARDINEIRA- JACINTINHO


53.0
EVERALDO ALEXANDRE DA SILVA FILHO
CHINA
NI
GERENTE



JARDINEIRA- JACINTINHO


54.0
CLEITON TIAGO SÉRGIO DA SILVA

PCC
PAIOL/GUARDA-ROUPAS



JARDINEIRA- JACINTINHO


55.0
GEOVANE BARBOSA SILVA

PCC
SOLDADO DO TRÁFICO



JARDINEIRA- JACINTINHO


56.0
ALISSON DA SILVA RAMOS
GIBI
PCC
GERENTE



REGINALDO/FEITOSA


57.0
CAIUO GABRIEL DOS SANTOS SILVA
GORDINHO
PCC
SOLDADO DO TRÁFICO



FEITOSA


58.0
ANDERSON RODRIGO DA SILVA SOUZA
GRINGO
PCC
SOLDADO DO TRÁFICO



FEITOSA, CRUZEIRO DO SUL, ROSANE COLLOR, CLIMA BOM, BOM PARTO


59.0
ALEXANDRE VERÇOSA DOS SANTOS JUNIOR
PANELA
PCC
GERENTE



FEITOSA


60




## Create a dataframe of all the chunks


In [16]:
from helpers.df_helpers import documents2Dataframe

## Convert the list of chunks into a dataframe using the project helper.
## In the recorded run this produced one row per chunk with the columns
## text, source and chunk_id.
df = documents2Dataframe(pages)
## Quick sanity check of the size, then show the first rows
print(df.shape)
df.head()

(16, 3)


Unnamed: 0,text,source,chunk_id
0,nome_completo\nvulgo_alcunha\nfaccao\nfaccao_f...,data_input/faccionados/analises_vinculos.csv,23b93fb8db11465d9f0b2af07004bec1
1,LUCILA TOLEDO\n\n\n18.0\nMATHEUS SILVA CÂNDIDO...,data_input/faccionados/analises_vinculos.csv,52055b8646704e1d8099669c587262fc
2,31.0\nMAXSUEL DA SILVA RODRIGUES SALVADOR\nNIN...,data_input/faccionados/analises_vinculos.csv,858f7917671b46acb0501a207c1a395d
3,VILLAGE CAMPESTRE\n\n\n49.0\nCLAUDEMIR LUCAS P...,data_input/faccionados/analises_vinculos.csv,50ddd276885e4d81a8a480312ea58306
4,FEITOSA\n\n\n65.0\nALYX TARCIANO CONCEIÇÃO PER...,data_input/faccionados/analises_vinculos.csv,6c24571324e144d38c6f69a4cb082fcc


## Extract Concepts


In [17]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If `regenerate` is set to True, the graph is regenerated with the LLM and both dataframes are written to the output directory in CSV format, so we don't have to calculate them again.

        dfg1 = dataframe of edges (graph.csv)

        df = dataframe of chunks (chunks.csv)

Otherwise the edge dataframe is read back from graph.csv in the output directory.


In [18]:
## To regenerate the graph with LLM, set this to True
regenerate = True

if regenerate:
    ## Extract the concept pairs from every chunk and turn them into an edge dataframe
    concepts_list = df2Graph(df, model="zephyr:latest")
    dfg1 = graph2Df(concepts_list)
    ## exist_ok=True makes this idempotent and avoids the check-then-create race
    os.makedirs(outputdirectory, exist_ok=True)

    ## Cache both dataframes so the LLM step can be skipped next time
    dfg1.to_csv(outputdirectory / "graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory / "chunks.csv", sep="|", index=False)
else:
    dfg1 = pd.read_csv(outputdirectory / "graph.csv", sep="|")

## Treat empty strings as missing, drop incomplete edges and set the weight.
## The frame is rebound instead of mutated in place, so the object returned by
## graph2Df / read_csv is left untouched and no chained-assignment warning can fire.
## Increasing the weight of the relation to 4.
## We will assign the weight of 1 when later the contextual proximity will be calculated.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(count=4)
)
print(dfg1.shape)
dfg1.head()

[
   {
       "node_1": "nome_completo",
       "node_2": "cidade_atual",
       "edge": "cidadão ou residente de"
   },
   {
       "node_1": "nome_completo",
       "node_2": "uf_atual",
       "edge": "estado onde mora ou atua o indivíduo"
   },
   {
       "node_1": "nome_completo",
       "node_2": "bairro_atual",
       "edge": "bairro onde mora ou atua o indivíduo"
   },
   {
       "node_1": "nome_completo",
       "node_2": "area_atuacao",
       "edge": "atua ou se dedica a na área especificada"
   },
   {
       "node_1": "KAYO NASCIMENTO DE MAGALHÃES",
       "node_2": "nome_completo",
       "edge": "pessoa com este nome de batismo ou apelido"
   },
   {
       "node_1": "vulgo_alcunha",
       "node_2": "nome_completo",
       "edge": "outro nome com o qual a pessoa é conhecida"
   },
   {
       "node_1": "faccao_funcao",
       "node_2": "nome_completo",
       "edge": "função ou cargo que exerceu ou está exercendo a pessoa"
   },
   {
       "node_1": "vulgo_alcunha",


Unnamed: 0,node_1,node_2,edge,chunk_id,node_3,count
0,nome_completo,cidade_atual,cidadão ou residente de,23b93fb8db11465d9f0b2af07004bec1,,4
1,nome_completo,uf_atual,estado onde mora ou atua o indivíduo,23b93fb8db11465d9f0b2af07004bec1,,4
2,nome_completo,bairro_atual,bairro onde mora ou atua o indivíduo,23b93fb8db11465d9f0b2af07004bec1,,4
3,nome_completo,area_atuacao,atua ou se dedica a na área especificada,23b93fb8db11465d9f0b2af07004bec1,,4
4,kayo nascimento de magalhães,nome_completo,pessoa com este nome de batismo ou apelido,23b93fb8db11465d9f0b2af07004bec1,,4


## Calculating contextual proximity


In [19]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Link every pair of distinct nodes that occur in the same text chunk.

    Parameters
    ----------
    df : pd.DataFrame
        Edge table with at least the columns ``chunk_id``, ``node_1`` and
        ``node_2``. ``chunk_id`` values must be strings because they are
        concatenated with ``","``. The frame is not modified.

    Returns
    -------
    pd.DataFrame
        Columns ``node_1``, ``node_2``, ``chunk_id`` (comma-joined ids of the
        chunks where the pair co-occurs, one entry per co-occurrence),
        ``count`` (number of co-occurrences) and ``edge`` (always
        ``"contextual proximity"``). Both directions of a pair are present as
        separate rows, and pairs with a count of exactly 1 are dropped.
    """
    ## Melt the dataframe into a list of nodes
    dfg_long = pd.melt(
        df,
        id_vars=["chunk_id"],
        value_vars=["node_1", "node_2"],
        value_name="node",
    ).drop(columns=["variable"])
    # Self join with chunk id as the key will create a link between terms occurring in the same text chunk.
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))
    # drop self loops
    dfg2 = dfg_wide[dfg_wide["node_1"] != dfg_wide["node_2"]].reset_index(drop=True)
    ## Group and count edges.
    dfg2 = (
        dfg2.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    dfg2.columns = ["node_1", "node_2", "chunk_id", "count"]
    dfg2 = dfg2.replace("", np.nan).dropna(subset=["node_1", "node_2"])
    # Drop edges with 1 count. The explicit copy makes the result an
    # independent frame, so adding the "edge" column below does not raise
    # pandas' SettingWithCopyWarning.
    dfg2 = dfg2[dfg2["count"] != 1].copy()
    dfg2["edge"] = "contextual proximity"
    return dfg2


## Derive the co-occurrence edges from the extracted edges, then peek at the last rows
dfg2 = contextual_proximity(dfg1)
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
6354,zói,josé igor da conceição,"90be50b627504d7695beaeaa83b0374a,90be50b627504...",2,contextual proximity
6357,zói,leanderson de araujo nunes,"90be50b627504d7695beaeaa83b0374a,90be50b627504...",2,contextual proximity
6358,zói,luizinho,"90be50b627504d7695beaeaa83b0374a,90be50b627504...",2,contextual proximity
6360,zói,ni,"90be50b627504d7695beaeaa83b0374a,90be50b627504...",6,contextual proximity
6362,zói,rua estiva,"90be50b627504d7695beaeaa83b0374a,90be50b627504...",3,contextual proximity


### Merge both the dataframes


In [20]:
## Stack the extracted relations and the contextual-proximity edges, then
## collapse rows that share the same (node_1, node_2) pair: chunk ids and edge
## labels are comma-joined and the counts are added up.
dfg = (
    pd.concat([dfg1, dfg2], axis=0)
    .groupby(["node_1", "node_2"])
    .agg(
        chunk_id=("chunk_id", ",".join),
        edge=("edge", ",".join),
        count=("count", "sum"),
    )
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,abacaxi,alexsandro dos santos,"90be50b627504d7695beaeaa83b0374a,90be50b627504...",contextual proximity,2
1,abacaxi,carrapicho,"c074b3bf1cc64d27878ecf9ab7c636b5,c074b3bf1cc64...",contextual proximity,2
2,abacaxi,"conj joaquim leão, vergel do lago","90be50b627504d7695beaeaa83b0374a,c074b3bf1cc64...",contextual proximity,2
3,abacaxi,gabriel gomes da silva,"90be50b627504d7695beaeaa83b0374a,90be50b627504...",contextual proximity,2
4,abacaxi,jacaré,"90be50b627504d7695beaeaa83b0374a,90be50b627504...",contextual proximity,2
...,...,...,...,...,...
1455,zói,josé igor da conceição,"90be50b627504d7695beaeaa83b0374a,90be50b627504...",contextual proximity,2
1456,zói,leanderson de araujo nunes,"90be50b627504d7695beaeaa83b0374a,90be50b627504...",contextual proximity,2
1457,zói,luizinho,"90be50b627504d7695beaeaa83b0374a,90be50b627504...",contextual proximity,2
1458,zói,ni,"90be50b627504d7695beaeaa83b0374a,90be50b627504...",contextual proximity,6


## Calculate the NetworkX Graph


In [21]:
## Unique node labels over both edge endpoints (node_1 values first, then node_2)
nodes = pd.concat([dfg["node_1"], dfg["node_2"]], axis=0).unique()
## Number of distinct nodes
nodes.shape

(243,)

In [22]:
import networkx as nx

G = nx.Graph()

## Register every node under its string form
G.add_nodes_from(str(node) for node in nodes)

## Connect the nodes: the edge label becomes the title and the summed count,
## scaled down by 4, becomes the weight
for source, target, relation, hits in zip(
    dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"]
):
    G.add_edge(str(source), str(target), title=relation, weight=hits / 4)

### Calculate communities for coloring the nodes


In [23]:
## Take the first two partitions produced by the Girvan-Newman generator and
## keep the second one for coloring
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)

## Sort the members of each community, then the communities themselves,
## so the listing is stable
communities = sorted(sorted(members) for members in next_level_communities)
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  3
[['abacaxi', 'agnaldo angelo da silva', 'alan ângelo da silva', 'alexsandro dos santos', 'ana thalita da silva', 'anderson luan silva dos santos', 'andson barbosa de lima', 'arabidalas rodrigues da silva', 'baba rala', 'baixinho', 'barra de são miguel - alto da barra', 'barra nova', 'bidala', 'cabeleiro', 'cabelo', 'carlos alexandre araujo cirilo', 'carlos alexandre araújo cirilo', 'carlos daniel felix da silva', 'carlos eliwelton dos santos portela', 'carrapicho', 'charles miller galvão cavaleiro', 'cidade universitária', 'cledson ferreira dos santos', 'complexo do alemão, rio de janeiro, rj', 'conj erick ferraz', 'conj joaquim leão, vergel do lago', 'conjunto barnabé toledo - marechal deodoro', 'daniel cabral da silva', 'denison alexandre matos', 'denisson amorim', 'deyvid nilton coelho vargas', 'douglas santos da silva', 'efraim silva de oliveira', 'erick ferraz', 'erick ferraz - marechal deodoro', 'flexeiras, rio largo e messias.', 'fubinha', 'fusca', 'ga

### Create a dataframe for community colors


In [24]:
import seaborn as sns

palette = "hls"


## Now add these colors to communities and make another dataframe
def colors2Community(communities, seed=None) -> pd.DataFrame:
    """Give every community one palette color and a 1-based group number.

    Parameters
    ----------
    communities : sized iterable of iterables
        One inner iterable of node labels per community.
    seed : optional
        When given, the palette is shuffled with a private
        ``random.Random(seed)`` so the color assignment is reproducible.
        ``None`` (the default) shuffles with the global ``random`` state,
        as before.

    Returns
    -------
    pd.DataFrame
        One row per node with the columns ``node``, ``color`` (hex string) and
        ``group``. The columns are present even when ``communities`` is empty.
    """
    ## Define a color palette with one color per community (uses the
    ## module-level `palette` name)
    p = sns.color_palette(palette, len(communities)).as_hex()
    if seed is None:
        random.shuffle(p)
    else:
        random.Random(seed).shuffle(p)
    rows = []
    ## Colors are consumed from the end of the shuffled list, matching the
    ## previous pop()-based assignment
    for group, (community, color) in enumerate(zip(communities, reversed(p)), start=1):
        rows.extend(
            {"node": node, "color": color, "group": group} for node in community
        )
    df_colors = pd.DataFrame(rows, columns=["node", "color", "group"])
    return df_colors


## Build the node -> color/group lookup table for the detected communities
colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,abacaxi,#57db5f,1
1,agnaldo angelo da silva,#57db5f,1
2,alan ângelo da silva,#57db5f,1
3,alexsandro dos santos,#57db5f,1
4,ana thalita da silva,#57db5f,1
...,...,...,...
238,romarinho,#db5f57,3
239,romário,#db5f57,3
240,romário junio dos santos alves,#db5f57,3
241,vendedor,#db5f57,3


### Add colors to the graph


In [25]:
## Copy group and color onto each graph node and size the node by its degree
for node_name, group_id, hex_color in zip(
    colors["node"], colors["group"], colors["color"]
):
    attributes = G.nodes[node_name]
    attributes["group"] = group_id
    attributes["color"] = hex_color
    attributes["size"] = G.degree[node_name]

In [26]:
from pyvis.network import Network

## Path of the generated HTML file (despite the name, this is a file path, not a directory)
graph_output_directory = "./docs/index.html"
## Create the target folder up front so writing the HTML does not fail on a fresh checkout
Path(graph_output_directory).parent.mkdir(parents=True, exist_ok=True)

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

## Import nodes, edges and their attributes from the NetworkX graph
net.from_nx(G)
## Layout: force_atlas_2based is the active choice; the other options are kept for tuning
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
## Expose only the physics controls in the rendered page
net.show_buttons(filter_=["physics"])

# net.show(graph_output_directory, notebook=False)
net.show(graph_output_directory)