## Setup


In [1]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import (
    PyPDFLoader,
    UnstructuredPDFLoader,
    PyPDFium2Loader,
)
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Name of the dataset folder; it is reused for both the input and the output tree
data_dir = "faccionados"
## Source documents are read from this folder
inputdirectory = Path("./data_input") / data_dir
## The output csv files are written to a folder of the same name
out_dir = data_dir
outputdirectory = Path("./data_output") / out_dir

## Load Documents


In [2]:
## Alternative loaders, kept for reference:
## Dir PDF Loader
# loader = PyPDFDirectoryLoader(inputdirectory)
## File Loader
# loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## Split the loaded documents into overlapping chunks, measured in characters (len).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))

## Preview one chunk as a sanity check. The index is clamped so that inputs
## producing fewer than four chunks no longer raise an IndexError.
preview_index = 3
if pages:
    print(pages[min(preview_index, len(pages) - 1)].page_content)
else:
    print("No chunks were produced; check the contents of", inputdirectory)

100%|██████████| 1/1 [00:01<00:00,  1.83s/it]

Number of chunks =  16
VILLAGE CAMPESTRE


49.0
CLAUDEMIR LUCAS PEIXOTO VENÂNCIO
CHACAL
NI
SOLDADO DO TRÁFICO



POR TODO CLIMA BOM


50.0
RENATO EMANUEL DE MELO
PADRE
NI
GERENTE



VILLAGE CAMPESTRE


51.0
ALEX BRUNO SOARES DE MENDONCA
LELECO
NI
CHEFE/LÍDER



CONJUNTO VILLAGE CAMPESTRE II


52.0
JOSE FERNANDO DE OLIVEIRA LEITE
MANCHINHA
NI
SOLDADO DO TRÁFICO



JARDINEIRA- JACINTINHO


53.0
EVERALDO ALEXANDRE DA SILVA FILHO
CHINA
NI
GERENTE



JARDINEIRA- JACINTINHO


54.0
CLEITON TIAGO SÉRGIO DA SILVA

PCC
PAIOL/GUARDA-ROUPAS



JARDINEIRA- JACINTINHO


55.0
GEOVANE BARBOSA SILVA

PCC
SOLDADO DO TRÁFICO



JARDINEIRA- JACINTINHO


56.0
ALISSON DA SILVA RAMOS
GIBI
PCC
GERENTE



REGINALDO/FEITOSA


57.0
CAIUO GABRIEL DOS SANTOS SILVA
GORDINHO
PCC
SOLDADO DO TRÁFICO



FEITOSA


58.0
ANDERSON RODRIGO DA SILVA SOUZA
GRINGO
PCC
SOLDADO DO TRÁFICO



FEITOSA, CRUZEIRO DO SUL, ROSANE COLLOR, CLIMA BOM, BOM PARTO


59.0
ALEXANDRE VERÇOSA DOS SANTOS JUNIOR
PANELA
PCC
GERENTE



FEITOSA


60




## Create a dataframe of all the chunks


In [3]:
from helpers.df_helpers import documents2Dataframe

## Turn the list of chunks into a dataframe via the project helper.
## The output below shows one row per chunk with text, source and chunk_id columns.
df = documents2Dataframe(pages)
print(df.shape)
## Preview the first rows
df.head()

(16, 3)


Unnamed: 0,text,source,chunk_id
0,nome_completo\nvulgo_alcunha\nfaccao\nfaccao_f...,data_input/faccionados/analises_vinculos.csv,a72ecaf44dc34a2ea6484148f17be246
1,LUCILA TOLEDO\n\n\n18.0\nMATHEUS SILVA CÂNDIDO...,data_input/faccionados/analises_vinculos.csv,9126cb0a6edf4023b88a191ada785201
2,31.0\nMAXSUEL DA SILVA RODRIGUES SALVADOR\nNIN...,data_input/faccionados/analises_vinculos.csv,56634a06c1f4473c8765e9aded8a76d9
3,VILLAGE CAMPESTRE\n\n\n49.0\nCLAUDEMIR LUCAS P...,data_input/faccionados/analises_vinculos.csv,0caccc0dd15a4620967b4d292ebc11da
4,FEITOSA\n\n\n65.0\nALYX TARCIANO CONCEIÇÃO PER...,data_input/faccionados/analises_vinculos.csv,be251b2f0e7242bf95af31592e711dfe


## Extract Concepts


In [4]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If `regenerate` is set to True, the dataframes are regenerated and both are written to the output directory in CSV format so that they don't have to be computed again.

        dfg1 = dataframe of edges

        df = dataframe of chunks

Otherwise, the edge dataframe (`dfg1`) is read back from the output directory.


In [5]:
## Flip this to True to rebuild the graph with the LLM; False reuses the cached csv
regenerate = False

if not regenerate:
    dfg1 = pd.read_csv(outputdirectory / "graph.csv", sep="|")
else:
    concepts_list = df2Graph(df, model="zephyr:latest")
    dfg1 = graph2Df(concepts_list)
    if not outputdirectory.exists():
        outputdirectory.mkdir(parents=True)

    dfg1.to_csv(outputdirectory / "graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory / "chunks.csv", sep="|", index=False)

## Blank strings are treated as missing; rows without both nodes and an edge label are discarded.
## Every remaining relation gets a weight (count) of 4.
## We will assign the weight of 1 when later the contextual proximity will be calculated.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(count=4)
)
print(dfg1.shape)
dfg1.head()

[
  {
    "node_1": "nome_completo",
    "node_2": "vulgo_alcunha",
    "edge": "individuo conhecido por vários nomes ou apelidos"
  },
  {
    "node_1": "nome_completo",
    "node_2": "faccao",
    "edge": "cargo, função ou papel exercido por uma pessoa"
  },
  {
    "node_1": "nome_completo",
    "node_2": "faccao_funcao",
    "edge": "cargo ou função específico associado a uma organização ou empresa"
  },
  {
    "node_1": "nome_completo",
    "node_2": "bairro_atual",
    "edge": "localização atual do indivíduo"
  },
  {
    "node_1": "nome_completo",
    "node_2": "cidade_atual",
    "edge": "cidade ou local onde o indivíduo está residindo atualmente"
  },
  {
    "node_1": "nome_completo",
    "node_2": "uf_atual",
    "edge": "estado ou província no qual o indivíduo está residindo atualmente"
  },
  {
    "node_1": "nome_completo",
    "node_2": "area_atuacao",
    "edge": "área de atuação ou foco de interesse do indivíduo"
  },
  {
    "node_1": "KAYO NASCIMENTO DE MAGALHÃES",


Unnamed: 0,node_1,node_2,edge,chunk_id,count
0,nome_completo,vulgo_alcunha,individuo conhecido por vários nomes ou apelidos,a72ecaf44dc34a2ea6484148f17be246,4
1,nome_completo,faccao,"cargo, função ou papel exercido por uma pessoa",a72ecaf44dc34a2ea6484148f17be246,4
2,nome_completo,faccao_funcao,cargo ou função específico associado a uma org...,a72ecaf44dc34a2ea6484148f17be246,4
3,nome_completo,bairro_atual,localização atual do indivíduo,a72ecaf44dc34a2ea6484148f17be246,4
4,nome_completo,cidade_atual,cidade ou local onde o indivíduo está residind...,a72ecaf44dc34a2ea6484148f17be246,4


## Calculating contextual proximity


In [6]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Link every pair of distinct nodes that occur in the same text chunk.

    Args:
        df: Edge dataframe with at least the columns ``chunk_id``, ``node_1``
            and ``node_2``. It is not modified.

    Returns:
        A dataframe with the columns ``node_1``, ``node_2``, ``chunk_id``
        (comma-joined ids, one per co-occurrence), ``count`` (number of
        co-occurrences) and ``edge`` (always ``"contextual proximity"``).
        Each pair appears in both directions, and pairs that co-occur only
        once are left out.
    """
    ## Melt the dataframe into a long list of (chunk_id, node) rows
    dfg_long = pd.melt(
        df,
        id_vars=["chunk_id"],
        value_vars=["node_1", "node_2"],
        value_name="node",
    ).drop(columns=["variable"])
    ## Discard missing and blank nodes up front, so they never take part in the
    ## quadratic self join (they used to be removed only after grouping).
    has_node = dfg_long["node"].notna() & (dfg_long["node"] != "")
    dfg_long = dfg_long[has_node]
    ## Self join with chunk id as the key will create a link between terms
    ## occurring in the same text chunk.
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))
    ## Drop self loops
    dfg_pairs = dfg_wide[dfg_wide["node_1"] != dfg_wide["node_2"]]
    ## Group and count edges
    dfg2 = (
        dfg_pairs.groupby(["node_1", "node_2"])
        .agg(chunk_id=("chunk_id", ",".join), count=("chunk_id", "count"))
        .reset_index()
    )
    ## Drop edges with 1 count; assign() returns a new frame, so nothing is
    ## written onto a filtered slice.
    return dfg2[dfg2["count"] != 1].assign(edge="contextual proximity")


## Build the contextual-proximity edges from the edge dataframe dfg1.
dfg2 = contextual_proximity(dfg1)
## Show the last few rows as a quick check.
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
5081,williams josé da silva,soldado do tráfico,"de9cdd99683f45399c146dc5f3862206,de9cdd99683f4...",4,contextual proximity
5103,williamys dos santos almeida,soldado do tráfico,"de9cdd99683f45399c146dc5f3862206,de9cdd99683f4...",4,contextual proximity
5112,zaroio,cv,"82afcdd8cd1949dc974b1f298bf08783,82afcdd8cd194...",5,contextual proximity
5149,índio,nome_completo,"a72ecaf44dc34a2ea6484148f17be246,a72ecaf44dc34...",9,contextual proximity
5154,índio,teteu,"a72ecaf44dc34a2ea6484148f17be246,a72ecaf44dc34...",2,contextual proximity


### Merge both the dataframes


In [7]:
## Stack both edge tables, then collapse duplicate (node_1, node_2) pairs:
## chunk ids and edge labels are comma-joined, counts are added up.
dfg = (
    pd.concat([dfg1, dfg2], axis=0)
    .groupby(["node_1", "node_2"])
    .agg(
        chunk_id=("chunk_id", ",".join),
        edge=("edge", ",".join),
        count=("count", "sum"),
    )
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,abacaxi,alexsandro dos santos,"520b350fd51d441daedd9a14b4bd7ea5,520b350fd51d4...",contextual proximity,3
1,abacaxi,ni,"520b350fd51d441daedd9a14b4bd7ea5,520b350fd51d4...",contextual proximity,2
2,abacaxi,soldado do tráfico,"520b350fd51d441daedd9a14b4bd7ea5,520b350fd51d4...",contextual proximity,3
3,al,alisson victor felix dos santos,"8fad39f5e9e64637ab6ad3b29a12ab37,8fad39f5e9e64...",contextual proximity,2
4,al,arapiraca,"8fad39f5e9e64637ab6ad3b29a12ab37,8fad39f5e9e64...",contextual proximity,2
...,...,...,...,...,...
2118,williams josé da silva,surugi,de9cdd99683f45399c146dc5f3862206,is,4
2119,williamys dos santos almeida,soldado do tráfico,"de9cdd99683f45399c146dc5f3862206,de9cdd99683f4...","is,contextual proximity",8
2120,zaroio,cv,"82afcdd8cd1949dc974b1f298bf08783,82afcdd8cd194...",contextual proximity,5
2121,índio,nome_completo,"a72ecaf44dc34a2ea6484148f17be246,a72ecaf44dc34...",contextual proximity,9


## Calculate the NetworkX Graph


In [8]:
## Collect the distinct node names found in either endpoint column of the merged edge dataframe.
nodes = pd.concat([dfg["node_1"], dfg["node_2"]], axis=0).unique()
## Number of distinct nodes
nodes.shape

(215,)

In [9]:
import networkx as nx

G = nx.Graph()

## Register every node up front, keyed by its string form
G.add_nodes_from(str(name) for name in nodes)

## One graph edge per dataframe row: the edge label becomes the title and
## the count, divided by 4, becomes the weight
for source, target, label, occurrences in zip(
    dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"]
):
    G.add_edge(str(source), str(target), title=label, weight=occurrences / 4)

### Calculate communities for coloring the nodes


In [10]:
## Community detection on G. The generator is advanced twice and only the second
## result is used below.
## NOTE(review): presumably each step splits the graph into more communities;
## confirm against the networkx girvan_newman documentation.
communities_generator = nx.community.girvan_newman(G)
## First result: bound to a name here but not referenced again in this section.
top_level_communities = next(communities_generator)
## Second result: the partition actually used for coloring.
next_level_communities = next(communities_generator)
## Sort the members inside each community, then sort the communities themselves,
## yielding a list of lists.
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  4
[['abacaxi', 'alex bruno soares de mendonca', 'alexandre dos santos', 'alexandre luiz santos costa junior', 'alexsandro dos santos', 'alisson da silva ramos', 'alisson dos santos', 'alto do são miguel, rio largo', 'alyx tarciano conceição pereira', 'anderson lucas dos santos silva', 'anderson rodrigo da silva souza', 'atalaia;  maceió (cidade universitária)', 'baba rala', 'brasil novo', 'cajueiro', 'carlos alexandre araujo cirilo', 'carlos henrique vasconcelos batista', 'carlos tertulino soares da silva', 'carminha', 'casarima', 'chacal', 'chefe/líder', 'chupeta', 'claudemir lucas peixoto venâncio', 'cledson ferreira dos santos', 'cleiton primeiro dos santos', 'cleiton tiago sérgio da silva', 'clima bom', 'conj joaquim leão, vergel do lago', 'conjunto village campestre ii', 'coqueiro seco', 'coqueiro seco e conjunto imburi', 'cv', 'dan', 'danguinha', 'daniel cabral da silva', 'david dos santos pereira', 'dennison alexandre matos', 'douglas vasconcelos dos san

### Create a dataframe for community colors


In [11]:
import seaborn as sns

## Name of the seaborn palette the community colors are drawn from
palette = "hls"


## Now add these colors to communities and make another dataframe
def colors2Community(communities, seed=None) -> pd.DataFrame:
    """Give every community its own color and a 1-based group number.

    Args:
        communities: Sized sequence of communities, each an iterable of node
            names.
        seed: Optional seed for the color shuffle. When given, the
            color-to-community assignment is reproducible across runs; when
            omitted, the global ``random`` state is used, exactly as before.

    Returns:
        A dataframe with one row per node and the columns ``node``, ``color``
        (hex string) and ``group``.
    """
    ## One palette color per community, as hex strings
    hex_colors = sns.color_palette(palette, len(communities)).as_hex()
    if seed is None:
        random.shuffle(hex_colors)
    else:
        ## A private generator keeps the global random state untouched
        random.Random(seed).shuffle(hex_colors)
    ## Colors are handed out from the end of the shuffled list, one per community
    rows = [
        {"node": node, "color": color, "group": group}
        for group, (community, color) in enumerate(
            zip(communities, reversed(hex_colors)), start=1
        )
        for node in community
    ]
    return pd.DataFrame(rows)


## Build the node -> color/group table for the detected communities.
colors = colors2Community(communities)
## Display the resulting table.
colors

Unnamed: 0,node,color,group
0,abacaxi,#91db57,1
1,alex bruno soares de mendonca,#91db57,1
2,alexandre dos santos,#91db57,1
3,alexandre luiz santos costa junior,#91db57,1
4,alexsandro dos santos,#91db57,1
...,...,...,...
210,ratão ou chiná,#a157db,4
211,renandsom reynaldo santos queiroz,#a157db,4
212,taiane regina da silva,#a157db,4
213,tapireraguá,#a157db,4


### Add colors to the graph


In [12]:
## Copy group and color onto each graph node and size the node by its degree
for entry in colors.itertuples(index=False):
    node_attrs = G.nodes[entry.node]
    node_attrs["group"] = entry.group
    node_attrs["color"] = entry.color
    node_attrs["size"] = G.degree[entry.node]

In [13]:
from pyvis.network import Network

## Path of the HTML file the interactive graph is written to
graph_output_directory = "./docs/index.html"
## Make sure the target folder exists so that writing the HTML file does not
## fail on a fresh checkout.
Path(graph_output_directory).parent.mkdir(parents=True, exist_ok=True)

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

## Import nodes, edges and their attributes from the networkx graph
net.from_nx(G)
## Layout: force_atlas_2based is active; the other physics settings are kept
## as commented-out alternatives.
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
net.show_buttons(filter_=["physics"])

# net.show(graph_output_directory, notebook=False)
net.show(graph_output_directory)