## Setup


In [14]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import (
    PyPDFLoader,
    UnstructuredPDFLoader,
    PyPDFium2Loader,
)
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Name of the dataset folder; the same name is used for input and output
data_dir = "faccionados"
out_dir = data_dir

## Source documents are read from here
inputdirectory = Path("./data_input") / data_dir
## The output csv files (graph.csv, chunks.csv) are written here
outputdirectory = Path("./data_output") / out_dir

## Load Documents


In [15]:
## Load every file found under the input directory.
## (PyPDFDirectoryLoader / PyPDFLoader, imported above, are alternatives for PDF-only input.)
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## chunk_size / chunk_overlap are measured with len(), i.e. in characters
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))

## Show one sample chunk. Small inputs may produce fewer than four chunks, so
## fall back to the last available chunk instead of raising IndexError.
if pages:
    print(pages[min(3, len(pages) - 1)].page_content)

100%|██████████| 1/1 [00:00<00:00,  9.40it/s]

Number of chunks =  15
VILLAGE CAMPESTRE


51
ALEX BRUNO SOARES DE MENDONCA
LELECO
NI
CHEFE/LÍDER



CONJUNTO VILLAGE CAMPESTRE II


52
JOSE FERNANDO DE OLIVEIRA LEITE
MANCHINHA
NI
SOLDADO DO TRÁFICO



JARDINEIRA- JACINTINHO


53
EVERALDO ALEXANDRE DA SILVA FILHO
CHINA
NI
GERENTE



JARDINEIRA- JACINTINHO


54
CLEITON TIAGO SÉRGIO DA SILVA

PCC
PAIOL/GUARDA-ROUPAS



JARDINEIRA- JACINTINHO


55
GEOVANE BARBOSA SILVA

PCC
SOLDADO DO TRÁFICO



JARDINEIRA- JACINTINHO


56
ALISSON DA SILVA RAMOS
GIBI
PCC
GERENTE



REGINALDO/FEITOSA


57
CAIUO GABRIEL DOS SANTOS SILVA
GORDINHO
PCC
SOLDADO DO TRÁFICO



FEITOSA


58
ANDERSON RODRIGO DA SILVA SOUZA
GRINGO
PCC
SOLDADO DO TRÁFICO



FEITOSA, CRUZEIRO DO SUL, ROSANE COLLOR, CLIMA BOM, BOM PARTO


59
ALEXANDRE VERÇOSA DOS SANTOS JUNIOR
PANELA
PCC
GERENTE



FEITOSA


60
BRENO GOMES DE OLIVEIRA
LIMA
PCC
SOLDADO DO TRÁFICO



VALE DO REGINALDO E FEITOSA


61
EDUARDO HENRIQUE GOMES DA SILVA

PCC
SOLDADO DO TRÁFICO



FEITOSA


62
JOSE MACIEL SILV




## Create a dataframe of all the chunks


In [16]:
from helpers.df_helpers import documents2Dataframe

## Convert the list of chunks into a dataframe.
## The output shown below has one row per chunk and the columns text, source and chunk_id.
df = documents2Dataframe(pages)
print(df.shape)
df.head()

(15, 3)


Unnamed: 0,text,source,chunk_id
0,1\nKAYO NASCIMENTO DE MAGALHÃES\nCABRA/ 99\nNI...,data_input/faccionados/faccionados_analise_vin...,18f9dc7573d5473ca9719ffc430ac9f5
1,NI\nCHEFE/LÍDER\n\n\n\nMACEIÓ I\n\n\n20\nJOSÉ ...,data_input/faccionados/faccionados_analise_vin...,a167528bb20246aeb7aa4d24796f3ee2
2,ESCADINHA (VIRGEM DOS POBRES I)\n\n\n33\nJEFFE...,data_input/faccionados/faccionados_analise_vin...,24da9f5333c9439299d1dab86147ebd1
3,VILLAGE CAMPESTRE\n\n\n51\nALEX BRUNO SOARES D...,data_input/faccionados/faccionados_analise_vin...,1f25540ca7db49fe941358dce11ab0c1
4,67\nANDERSON RODRIGO DA SILVA SOUZA\nDE BOA E ...,data_input/faccionados/faccionados_analise_vin...,1db70cc186394a46a0d97571d7d61560


## Extract Concepts


In [17]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If `regenerate` is set to True, the dataframes are regenerated and both are written to the output directory in CSV format, so we don't have to calculate them again.

        dfg1 = dataframe of edges (written to graph.csv)

        df = dataframe of chunks (written to chunks.csv)

Otherwise, the edge dataframe is read back from graph.csv in the output directory.


In [18]:
## To regenerate the graph with LLM, set this to True
regenerate = True

if regenerate:
    concepts_list = df2Graph(df, model="zephyr:latest")
    dfg1 = graph2Df(concepts_list)
    ## exist_ok avoids the check-then-create race of os.path.exists + os.makedirs
    outputdirectory.mkdir(parents=True, exist_ok=True)

    dfg1.to_csv(outputdirectory / "graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory / "chunks.csv", sep="|", index=False)
else:
    dfg1 = pd.read_csv(outputdirectory / "graph.csv", sep="|")

## Treat empty strings as missing and keep only rows that have both nodes and an edge.
## Increasing the weight of the relation to 4.
## We will assign the weight of 1 when later the contextual proximity will be calculated.
## Done as one non-mutating chain so that re-running the cell gives the same result.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(count=4)
)
print(dfg1.shape)
dfg1.head()

[
   {
       "node_1": "KAYO NASCIMENTO DE MAGALHÃES",
       "node_2": "CLIMA BOM/FRONTEIRA",
       "edge": "located in or near KAYO NASCIMENTO DE MAGALHÃES with a good climate"
   },
   {
       "node_1": "KAYO NASCIMENTO DE MAGALHÃES",
       "node_2": "CABRA",
       "edge": "located in or near KAYO NASCIMENTO DE MAGALHÃES with a place named CABRA"
   },
   {
       "node_1": "JOSÉ MARIANO DOS SANTOS",
       "node_2": "BRAÇO DIREITO",
       "edge": "has the role of BRAÇO DIREITO as JOSÉ MARIANO DOS SANTOS"
   },
   {
       "node_1": "GIVALDO BARBOSA DE FRANÇA",
       "node_2": "QUINZINHO",
       "edge": "is the leader or chief of QUINZINHO as GIVALDO BARBOSA DE FRANÇA"
   },
   {
       "node_1": "JOSÉ ERIVALDO DA PAZ",
       "node_2": "ÍNDIO",
       "edge": "is the leader or chief of ÍNDIO as JOSÉ ERIVALDO DA PAZ"
   },
   {
       "node_1": "LUCAS MANOEL DA SILVA",
       "node_2": "PROJETO",
       "edge": "is involved in PROJETO as LUCAS MANOEL DA SILVA"
   },
   {
   

## Calculating contextual proximity


In [None]:
def contextual_proximity(df: pd.DataFrame, min_count: int = 2) -> pd.DataFrame:
    """Link nodes that occur in the same text chunk.

    Args:
        df: Edge dataframe with at least the columns ``chunk_id``, ``node_1``
            and ``node_2``. It is not modified.
        min_count: Minimum number of co-occurrences a pair needs to be kept.
            The default of 2 reproduces the previous hard-coded behaviour of
            dropping pairs that were counted only once.

    Returns:
        A dataframe with the columns ``node_1``, ``node_2``, ``chunk_id``
        (comma-joined ids of the chunks the pair was found in), ``count`` and
        ``edge`` (always ``"contextual proximity"``). Each pair appears in both
        directions, and the index is the one left over from the grouping step
        (it is not reset after filtering).
    """
    ## Melt the dataframe into a long list of (chunk_id, node) rows
    dfg_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])
    # Self join with chunk id as the key will create a link between terms occurring in the same text chunk.
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))
    # Drop self loops
    self_loops = dfg_wide[dfg_wide["node_1"] == dfg_wide["node_2"]].index
    pairs = dfg_wide.drop(index=self_loops).reset_index(drop=True)
    ## Group and count edges.
    grouped = (
        pairs.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    grouped.columns = ["node_1", "node_2", "chunk_id", "count"]
    grouped = grouped.replace("", np.nan).dropna(subset=["node_1", "node_2"])
    # Keep frequent pairs only; .assign builds a new frame instead of writing into a filtered one.
    return grouped[grouped["count"] >= min_count].assign(edge="contextual proximity")


## Build the co-occurrence edges from the LLM-extracted edge dataframe
dfg2 = contextual_proximity(dfg1)
## Peek at the last few rows only
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
5343,wilson luis dos santos da silva,barra grande - maragogi,"18348570fc7d403e9cf4c5e62693d7b3,18348570fc7d4...",2,contextual proximity
5354,wilson luis dos santos da silva,marechal de odoró,"18348570fc7d403e9cf4c5e62693d7b3,18348570fc7d4...",2,contextual proximity
5357,wilson luis dos santos da silva,ni,"18348570fc7d403e9cf4c5e62693d7b3,18348570fc7d4...",3,contextual proximity
5407,índio,nem catenga / jovem,"d151b934ab1f47ad928d70f924612e9b,d151b934ab1f4...",2,contextual proximity
5408,índio,piabas- jacintinho,"d151b934ab1f47ad928d70f924612e9b,d151b934ab1f4...",2,contextual proximity


### Merge both the dataframes


In [None]:
## Stack the extracted edges on top of the contextual-proximity edges, then
## collapse duplicate (node_1, node_2) pairs: chunk ids and edge labels are
## joined with commas and the counts are added up.
dfg = (
    pd.concat([dfg1, dfg2], axis=0)
    .groupby(["node_1", "node_2"])
    .agg(
        chunk_id=("chunk_id", ",".join),
        edge=("edge", ",".join),
        count=("count", "sum"),
    )
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,11º bpm,al,"c9a591cee08442888b5eb897f5f96eda,c9a591cee0844...",contextual proximity,2
1,11º bpm,manoel teles,"c9a591cee08442888b5eb897f5f96eda,c9a591cee0844...",contextual proximity,3
2,ailton da silva filho,favela dandara,"5c25cf2f9d26439d990bfd004c0c1ef5,5c25cf2f9d264...","resides in or associated with,contextual proxi...",6
3,ailton da silva filho,marechal deodoro,"5c25cf2f9d26439d990bfd004c0c1ef5,5c25cf2f9d264...",contextual proximity,4
4,ailton da silva filho,"vergel do lago, levida e ponta grossa","5c25cf2f9d26439d990bfd004c0c1ef5,5c25cf2f9d264...",contextual proximity,3
...,...,...,...,...,...
1249,wilson luis dos santos da silva,barra grande - maragogi,"18348570fc7d403e9cf4c5e62693d7b3,18348570fc7d4...",contextual proximity,2
1250,wilson luis dos santos da silva,marechal de odoró,"18348570fc7d403e9cf4c5e62693d7b3,18348570fc7d4...",contextual proximity,2
1251,wilson luis dos santos da silva,ni,"18348570fc7d403e9cf4c5e62693d7b3,18348570fc7d4...",contextual proximity,3
1252,índio,nem catenga / jovem,"d151b934ab1f47ad928d70f924612e9b,d151b934ab1f4...",contextual proximity,2


## Calculate the NetworkX Graph


In [None]:
## Every distinct name that appears on either end of an edge
nodes = pd.concat([dfg[col] for col in ("node_1", "node_2")], axis=0).unique()
nodes.shape

(204,)

In [None]:
import networkx as nx

G = nx.Graph()

## Add nodes to the graph, keyed by their string form
G.add_nodes_from(str(name) for name in nodes)

## Add edges to the graph: one undirected edge per row of dfg. The edge label is
## stored as the `title` attribute and the weight is the count divided by 4.
G.add_edges_from(
    (str(src), str(dst), {"title": label, "weight": cnt / 4})
    for src, dst, label, cnt in zip(
        dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"]
    )
)

### Calculate communities for coloring the nodes


In [None]:
## Iterator over the community partitions computed by Girvan-Newman
communities_generator = nx.community.girvan_newman(G)
## First partition: consumed here, but not used elsewhere in the visible notebook
top_level_communities = next(communities_generator)
## Second partition: this is the one used for coloring
next_level_communities = next(communities_generator)
## Sort the members of each community, then sort the communities themselves
communities = sorted([sorted(members) for members in next_level_communities])
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  14
[['11º bpm', 'ailton da silva filho', 'al', 'alan diego da conceição', 'alan ângelo da silva', 'alex bruno soares de mendonca', 'alexandro de farias barros santos filho', 'alexsandro dos santos', 'alisson dos santos', 'alisson victor felix dos santos', 'ana thalita da silva', 'anderson luan silva dos santos', 'andre lucas dos santos silva', 'arabidalas rodrigues da silva', 'baba rala', 'babidi', 'baiano', 'barra grande - maragogi', 'barra nova', 'baxinho', 'belo matador', 'belô matador', 'carlos alexandre araújo cirilo', 'carlos andré dos santos', 'carlos messias dos santos', 'chacal', 'charles miller galvão cavaleiro', 'chefe/líder', 'cidade sorriso i', 'claudemir lucas peixoto venâncio', 'clima bom', 'clima bom (principalmente rosane collor e colibri)', 'conj joaquim leão, vergel do lago', 'conjunto barnabé toledo - marechal de odoró', 'conjunto barnabé toledo - marechal deodoro', 'conjunto são caetano', 'cv', 'daiane franciele da silva', 'daniel augusto g

### Create a dataframe for community colors


In [None]:
import seaborn as sns

## Name of the seaborn palette the community colors are drawn from
palette = "hls"


## Now add these colors to communities and make another dataframe
def colors2Community(communities, seed=None) -> pd.DataFrame:
    """Assign one color and one group number to every community.

    Args:
        communities: Sized collection of communities, each an iterable of node
            names.
        seed: Optional seed for the palette shuffle. With the default ``None``
            the module-level ``random`` state is used, exactly as before;
            passing a value makes the color assignment repeatable across runs.

    Returns:
        A dataframe with one row per node and the columns ``node``, ``color``
        (hex string) and ``group`` (1-based position of the node's community).
    """
    ## One color per community, taken from the module-level `palette`
    p = sns.color_palette(palette, len(communities)).as_hex()
    if seed is None:
        random.shuffle(p)
    else:
        ## A private generator keeps the global random state untouched
        random.Random(seed).shuffle(p)
    rows = []
    for group, community in enumerate(communities, start=1):
        color = p.pop()
        rows.extend(
            {"node": node, "color": color, "group": group} for node in community
        )
    return pd.DataFrame(rows)


## Fixed seed so that a Restart & Run All reproduces the same colors
colors = colors2Community(communities, seed=42)
colors

Unnamed: 0,node,color,group
0,11º bpm,#db5788,1
1,ailton da silva filho,#db5788,1
2,al,#db5788,1
3,alan diego da conceição,#db5788,1
4,alan ângelo da silva,#db5788,1
...,...,...,...
199,vaqueiro,#db57c0,12
200,silvestre santos da silva,#db5f57,13
201,tubarão/tubarão da vm/ tubarão da vm 03,#db5f57,13
202,taiane regina da silva,#db9757,14


### Add colors to the graph


In [None]:
## Copy group and color onto each graph node; the node size is its degree in G
for node, group, color in zip(colors["node"], colors["group"], colors["color"]):
    G.nodes[node].update(group=group, color=color, size=G.degree[node])

In [None]:
from pyvis.network import Network

## Despite its name, this is the path of the HTML file that gets written
graph_output_directory = "./docs/index.html"
## Create the target folder if needed so the export does not fail on a fresh checkout
Path(graph_output_directory).parent.mkdir(parents=True, exist_ok=True)

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

net.from_nx(G)
## Physics layout; the commented lines are alternative layouts that were tried
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
net.show_buttons(filter_=["physics"])

## NOTE(review): depending on the installed pyvis version, show() may need an
## explicit notebook=False (see the commented line) - confirm against your version.
# net.show(graph_output_directory, notebook=False)
net.show(graph_output_directory)