In [86]:
import pandas as pd
import re
import sqlalchemy as sa
import nltk
from nltk.corpus import stopwords
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize


In [43]:
df_proposicoes = pd.read_csv("../scripts/df_proposicoes_treino.csv")
df_proposicoes = df_proposicoes.drop(columns=["Unnamed: 0"])

In [44]:
nltk.download('stopwords')
stop_words = set(stopwords.words('portuguese'))
def limpar_keywords(keywords):
    new_keywords = [re.sub(r"\(.*\)", "", x) for x in keywords]
    new_keywords = [" ".join([word for word in x.split() if word not in stop_words]) for x in new_keywords]
    return new_keywords

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/arthurs/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [45]:
connection = sa.create_engine("postgresql://docker:docker@localhost/tcc")
df_keywords_por_proposicao = pd.read_sql(sa.text("""
select p.id, array_agg(pk.keyword) as keywords
from tcc.proposicoes p
join tcc.proposicoes_keywords pk on pk.proposicao_id = p.id
group by p.id;
"""),connection)

In [46]:
def extrair_keywords(row):
    id = row["id_proposicao"]
    res = df_keywords_por_proposicao[df_keywords_por_proposicao["id"] == id].keywords
    if (len(res) == 0):
        return []
    keywords = res.iloc[0]
    return limpar_keywords(keywords)

df_proposicoes["keywords"] = df_proposicoes.apply(extrair_keywords, axis=1)

In [47]:
with open("./clusters_final.json", "r") as f:
    clusters = json.load(f)
    f.close()

In [48]:
clusters_inverse = {}
for key, value in clusters.items():
    for word in value:
        clusters_inverse[word] = key

In [49]:
def encontrar_clusters(row):
    current_clusters = set()
    for keyword in row["keywords"]:
        if keyword in clusters_inverse:
            current_clusters.add(clusters_inverse[keyword])
    return list(current_clusters)

In [50]:
def first_10_keywords(keywords):
    if len(keywords) <= 10:
        return keywords
    else:
        return keywords[:10]

## pegando 10 primeiras keywords de cada proposicao
df_proposicoes = df_proposicoes.assign(keywords = df_proposicoes["keywords"].apply(first_10_keywords))
df_proposicoes = df_proposicoes.assign(proposicoes_clusters = df_proposicoes.apply(encontrar_clusters, axis=1))

## Multi label classification

In [108]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.ensemble import RandomForestClassifier

In [51]:
df_proposicoes_com_keywords = df_proposicoes[df_proposicoes["proposicoes_clusters"].apply(lambda x: len(x) > 0)]

In [90]:
X = df_proposicoes_com_keywords["ementa_do_pdf_1pag_limpo"]
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X)

y = df_proposicoes_com_keywords["proposicoes_clusters"]

In [91]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Transformação das tags para o formato MultiLabelBinarizer
mlb = MultiLabelBinarizer()
y_train_encoded = mlb.fit_transform(y_train)

# Escolha e treinamento do modelo
model = MultiOutputClassifier(RandomForestClassifier())  # modelo de floresta aleatória para classificação multi-rótulo
model.fit(X_train, y_train_encoded)

In [100]:
y_pred_encoded = model.predict(X_test)

y_pred = mlb.inverse_transform(y_pred_encoded)

In [107]:
for i in range(len(y_pred)):
    print(f"Ementa: {X.iloc[i][:200]}")
    print()
    print(f"Tags: {y_pred[i]}")
    break

Ementa: O objetivo presente Projeto Lei revogar inciso l art º DecretoLei nº setembro autoriza Ministério Fazenda dispor sobre isenção imposto importação âmbito regime tributação simplificada bens contidos re

Tags: ('administracao publica, politicas governamentais e direitos dos cidadaos.', 'alteracoes, regulamentacoes, procedimentos e autorizacoes legais.', 'problemas sociais, seguranca, educacao e saude publica.', 'servicos publicos, legislacao e instituicoes governamentais.')


## Tirando métricas do modelo:

In [110]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
y_test_encoded = mlb.fit_transform(y_test)

In [111]:
precision = precision_score(y_test_encoded, y_pred_encoded, average='micro')

recall = recall_score(y_test_encoded, y_pred_encoded, average='micro')

f1 = f1_score(y_test_encoded, y_pred_encoded, average='micro')

conf_matrix = confusion_matrix(y_test_encoded.argmax(axis=1), y_pred_encoded.argmax(axis=1))

print("Precisão:", precision)
print("Revocação:", recall)
print("F1-Score:", f1)
print("Matriz de Confusão:")
print(conf_matrix)

Precisão: 0.7525366403607666
Revocação: 0.6877897990726429
F1-Score: 0.7187079407806192
Matriz de Confusão:
[[318   0   0   0]
 [ 40   0   0   0]
 [  1   0   0   0]
 [  1   0   0   0]]
