In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

In [2]:
path = os.getcwd() + "/atribuna/milarquivos"
content_file_array = []
classifier_array = []

## Construção do DataFrame

In [3]:
def read_text(path, file_Name):
    with open(path, encoding="latin-1") as file:
        content_file = file.read()
        Build_DataFrame(content_file, file_Name)

In [4]:
def GetClassifier(file_Name):
    name_split =  file_Name.split(".")
    first_name = name_split[0];
    return first_name[(len(first_name) - 3) : len(first_name)]

In [5]:
def Build_DataFrame(content_file, file_Name):
    classifier =  GetClassifier(file_Name) 
    if(len(classifier) >= 3):        
        classifier_array.append(classifier)
        content_file_array.append(content_file)  

In [6]:
for file_Name in tqdm(os.listdir(path), desc="Carregar arquivos"):    
    if file_Name.endswith(".txt"):
        file_path = f"{path}/{file_Name}"
      
        # call read text file function
        read_text(file_path, file_Name)

Carregar arquivos: 100%|██████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 11050.58it/s]


In [7]:
textDataDrame =  pd.DataFrame({"Text":content_file_array, "Class":classifier_array})

In [8]:
textDataDrame.head(10)

Unnamed: 0,Text,Class
0,Testes nas famílias\n<b> A pedido de A Tribuna...,at2
1,"Deu borboleta na moda\n<b> A borboleta, que fo...",at2
2,Roupas de malha vão à festa\n<b> Se antes as p...,at2
3,Chega de dor-de-cotovelo\n<b> Com muito humor ...,at2
4,Tom Cruise volta com tudo\n<b> Ação sem fim e ...,at2
5,Temporada de arraiás\n<b> Sejam as tradicionai...,at2
6,"Bregas ontem, chiques hoje\n<b> Ídolos no pass...",at2
7,Picape boa de briga\n<b> Testada por Sobre Rod...,sro
8,Rita Cadilac vai se aposentar\n<b> A ex-chacre...,at2
9,"Cada balada, um visual\n<b> A turma das raves ...",at2


# StopWords removing

In [9]:
#!conda install unidecode

import nltk
import unidecode
from bs4 import BeautifulSoup

# nltk.download()
#- averaged_perceptron_tagger
# - floresta
# - mac_morpho
# - machado
# - punkt
# - stopwords
# - wordnet
# - words

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [10]:
def addStopWordsToList(texto):
    example_sent = texto
    word_tokens = word_tokenize(example_sent)

    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]

    filtered_sentence = ""

    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence += w + " "
    return filtered_sentence.strip()

In [11]:
stop_words = set(stopwords.words('portuguese'))
def returnDataFrameStopWords(dataFrame):
    dataReturn = pd.DataFrame()
    for row in tqdm(dataFrame.itertuples(index=True, name='Pandas'), desc="Limpeza de texto"):
        Text = row.Text
        Class = row.Class

        Text = BeautifulSoup(Text, "lxml").text
        Text = unidecode.unidecode(Text)
        Text = addStopWordsToList(Text)
        Text = Text.replace(" . "," ")
        Text = Text.replace(" , "," ")
        Text = Text.replace(" - "," ")
        Text = Text.replace(" : "," ")
        Text = Text.lower()

        linha = pd.Series([Text, Class])
        linha = pd.DataFrame([linha])
        dataReturn = pd.concat([linha, dataReturn], ignore_index=True)

    return dataReturn

In [12]:
df_text_clean = returnDataFrameStopWords(textDataDrame)

Limpeza de texto: 200it [00:01, 121.63it/s]


## Fatores de Ponderação

In [13]:
df_text_clean = df_text_clean.rename(columns = {0: 'Text', 1: 'Class'}, inplace = False)
df_text_clean.head()

Unnamed: 0,Text,Class
0,um cidadao mundo em 30 anos correspondente int...,at2
1,televisao sbt exibe desenhos marcaram 25 anos ...,at2
2,clovis rossi o brasil nao oferece educacao sau...,at2
3,testes familias a pedido a tribuna terapeutas ...,at2
4,desafio brasil pobreza diz rice secretaria est...,at2


## Importando libs do scikit-learn

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer # tf-idf
from sklearn.feature_extraction.text import CountVectorizer # tf

## Criando a lista de classes

In [15]:
 classes =  ["at2","sro","inf","imo","ept","esp","fam","tvt","cid","mul","pot","tav","mic","opi","poc","reg","eco","bro","cit","con","int"]

## TF

In [16]:
vectorizer = CountVectorizer()
TF = vectorizer.fit_transform(df_text_clean.Text)

## TF-IDF

In [17]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_text_clean.Text)
df_tfidf = X.toarray()

# Lista de palavras

In [18]:
lista_palavras = []
for row in tqdm(df_text_clean.itertuples(index=True, name='Pandas')):
    palavras = row.Text.split(" ")
    for palavra in palavras:
        if(not len(palavra) <= 2):
            if(palavra not in lista_palavras):
                lista_palavras.append(palavra)

200it [00:03, 52.84it/s]


---

# frequencia palavra por classe

In [19]:
dic_doc_class = {}

for classe in tqdm(classes):
    lista_doc = []
    for row in df_text_clean.itertuples(index=True, name='Pandas'):  
        if (str(classe) == row.Class):
            lista_doc.append(row.Text)
    dic_doc_class[f'{classe}'] = lista_doc

100%|████████████████████████████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 1582.90it/s]


In [20]:
from collections import Counter

In [21]:
df_relacao_class = pd.DataFrame()
df_relacao_class['Index'] = lista_palavras
for key in tqdm(dic_doc_class,"classes"):
    lista_docs =  dic_doc_class[key]
    docs = ""
    for doc in lista_docs:
        docs += f' {doc}'
        
    lista_quant =[]
    count = Counter(docs.split(" "))
    for termo in lista_palavras:
        quant = count[termo]
        lista_quant.append(quant)
        
    df_relacao_class[f'{key}'] = lista_quant
    
    
df_relacao_class = df_relacao_class.set_index('Index')

classes: 100%|████████████████████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 113.67it/s]


In [22]:
df_relacao_class.head()

Unnamed: 0_level_0,at2,sro,inf,imo,ept,esp,fam,tvt,cid,mul,...,tav,mic,opi,poc,reg,eco,bro,cit,con,int
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cidadao,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mundo,99,2,3,1,0,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
anos,382,9,0,0,3,2,0,0,0,0,...,0,0,3,1,0,0,0,0,0,0
correspondente,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
internacional,26,1,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# ni - o numero  de documentos em que um determinado termo ki aparece

In [23]:
dic_ni = {}

for termo in tqdm(lista_palavras):
    quant = 0 
    for row in df_text_clean.itertuples(index=True, name='Pandas'):
        doc = row.Text
        if(termo in doc):
            quant += 1
    dic_ni[f'{termo}'] = quant

100%|███████████████████████████████████████████████████████████████████████████| 17057/17057 [00:17<00:00, 965.98it/s]


# a - quantidade de documentos que pertence a uma classe e posuem o termo ki

#  c - o numero  de documentos que pertencem a Cp, mas não possuem o termo ki 

In [24]:
df_a = pd.DataFrame()
df_a['Index'] = lista_palavras

df_c = pd.DataFrame()
df_c['Index'] = lista_palavras

for key in tqdm(dic_doc_class):
    lista_docs =  dic_doc_class[key]
    lista_quant_a =[]
    lista_quant_c =[]

    for termo in lista_palavras:
        quant_a = 0
        quant_c = 0
        
        for doc in lista_docs:
            if(termo in doc):
                quant_a += 1
            else:
                quant_c += 1
                
        lista_quant_a.append(quant_a)
        lista_quant_c.append(quant_c)
        
    df_a[f'{key}'] = lista_quant_a
    df_c[f'{key}'] = lista_quant_c
    
    
df_a = df_a.set_index('Index')
df_c = df_c.set_index('Index')

100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [00:08<00:00,  2.53it/s]


# df_a

In [25]:
df_a.head()

Unnamed: 0_level_0,at2,sro,inf,imo,ept,esp,fam,tvt,cid,mul,...,tav,mic,opi,poc,reg,eco,bro,cit,con,int
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cidadao,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mundo,60,2,2,1,0,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
anos,128,10,0,1,1,2,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
correspondente,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
internacional,16,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# df_c

In [26]:
df_c.head()

Unnamed: 0_level_0,at2,sro,inf,imo,ept,esp,fam,tvt,cid,mul,...,tav,mic,opi,poc,reg,eco,bro,cit,con,int
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cidadao,159,20,3,2,3,2,0,0,0,0,...,0,0,1,2,0,2,0,0,0,0
mundo,104,18,1,1,3,1,0,0,0,0,...,0,0,0,2,0,1,0,0,0,0
anos,36,10,3,1,2,0,0,0,0,0,...,0,0,0,1,0,2,0,0,0,0
correspondente,163,20,3,2,3,2,0,0,0,0,...,0,0,1,2,0,2,0,0,0,0
internacional,148,19,3,1,3,2,0,0,0,0,...,0,0,1,2,0,2,0,0,0,0


# b - a quantidade de documentos que possuem o termo ki, mas não pertence a Cp

# d o numero de documentos que não possuem ki e não pertencem a Cp.


In [27]:
df_b = pd.DataFrame()
df_b['Index'] = lista_palavras

df_d = pd.DataFrame()
df_d['Index'] = lista_palavras

for key in tqdm(dic_doc_class):
    lista_quant_b =[]
    lista_quant_d =[]
    
    for termo in lista_palavras:
        quant_b = 0
        quant_d = 0
        for classe in classes:
            if(classe != key):
                lista_docs =  dic_doc_class[classe]
                
                for doc in lista_docs:
                    if(termo in doc):
                        quant_b += 1
                    else:
                        quant_d += 1
                        
        lista_quant_b.append(quant_b)
        lista_quant_d.append(quant_d)
        
    df_b[f'{key}'] = lista_quant_b
    df_d[f'{key}'] = lista_quant_d
    
df_b = df_b.set_index('Index')
df_d = df_d.set_index('Index')

100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [02:41<00:00,  7.69s/it]


# df_b

In [28]:
df_b.head()

Unnamed: 0_level_0,at2,sro,inf,imo,ept,esp,fam,tvt,cid,mul,...,tav,mic,opi,poc,reg,eco,bro,cit,con,int
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cidadao,0,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,5
mundo,8,66,66,67,68,67,68,68,68,68,...,68,68,67,68,68,67,68,68,68,68
anos,16,134,144,143,143,142,144,144,144,144,...,144,144,143,143,144,144,144,144,144,144
correspondente,0,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
internacional,2,17,18,17,18,18,18,18,18,18,...,18,18,18,18,18,18,18,18,18,18


# df_d

In [29]:
df_d.head()

Unnamed: 0_level_0,at2,sro,inf,imo,ept,esp,fam,tvt,cid,mul,...,tav,mic,opi,poc,reg,eco,bro,cit,con,int
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cidadao,36,175,192,193,192,193,195,195,195,195,...,195,195,194,193,195,193,195,195,195,195
mundo,28,114,131,131,129,131,132,132,132,132,...,132,132,132,130,132,131,132,132,132,132
anos,20,46,53,55,54,56,56,56,56,56,...,56,56,56,55,56,54,56,56,56,56
correspondente,36,179,196,197,196,197,199,199,199,199,...,199,199,198,197,199,197,199,199,199,199
internacional,34,163,179,181,179,180,182,182,182,182,...,182,182,181,180,182,180,182,182,182,182


---

## Conta quantas classes o termo ki aparece

In [30]:
def get_row_class(row, classe):
    dic_classe = {
        'at2':row.at2,
        'sro':row.sro,
        'inf':row.inf,
        'imo':row.imo,
        'ept':row.ept,
        'esp':row.esp,
        'fam':row.fam,
        'tvt':row.tvt,
        'cid':row.cid,
        'mul':row.mul,
        'pot':row.pot,
        'tav':row.tav,
        'mic':row.mic,
        'opi':row.opi,
        'poc':row.poc,
        'reg':row.reg,
        'eco':row.eco,
        'bro':row.bro,
        'cit':row.cit,
        'con':row.con,
        'int':row.int
    }
    return dic_classe[f'{classe}']
    

In [31]:
def num_ki_em_categoria():

    dic_num_ki_em_categoria ={}
    for row in tqdm(df_relacao_class.itertuples(index=True, name='Pandas')):
        ci = 0
        for classe in classes:
            if get_row_class(row,classe)!= 0:
                ci += 1
        dic_num_ki_em_categoria[f'{row.Index}'] = ci
        
    return dic_num_ki_em_categoria

# Bag of Word

In [32]:
bag_of_word = {}
bag_of_word['a'] = df_a
bag_of_word['b'] = df_b
bag_of_word['c'] = df_c
bag_of_word['d'] = df_d
bag_of_word['freq_pal_class'] = df_relacao_class
bag_of_word['ci'] = num_ki_em_categoria()
bag_of_word['ni'] = dic_ni

17057it [00:00, 39167.79it/s]


## ICF

In [33]:
import math 

In [34]:
def calculeteICF():
    dic_num_ki_em_categoria = bag_of_word['ci']
    icfs = {}
    C = len(classes)
    for key in dic_num_ki_em_categoria:
        Ci = dic_num_ki_em_categoria[key]
        valor = math.log((1 + (C/Ci)))
        icfs[f'{key}'] = valor
    return icfs

In [35]:
icfs = calculeteICF() 

In [36]:
df_icf = pd.DataFrame()
df_icf['Index'] = df_text_clean.index

for termo in tqdm(lista_palavras):
    
    list_icfs = []
    
    for row in df_text_clean.itertuples(index=True, name='Pandas'):
        doc = row.Text
        if termo in doc:
            list_icfs.append(icfs[termo])
        else:
            list_icfs.append(0)
    
    df_icf[f'{termo}'] = list_icfs
    
df_icf = df_icf.set_index('Index')

100%|███████████████████████████████████████████████████████████████████████████| 17057/17057 [00:35<00:00, 484.53it/s]


In [37]:
df_icf.head()

Unnamed: 0_level_0,cidadao,mundo,anos,correspondente,internacional,silio,boccanera,presenciou,revolucoes,guerra,...,mansinho,borboletas,libera,betty,situar,discretamente,tonam,unanimidade,opina,01/02/2003
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3.091042,1.386294,1.504077,3.091042,2.079442,3.091042,3.091042,3.091042,3.091042,3.091042,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.504077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.504077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.504077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.091042,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.091042,0.0,0.0,0.0,2.079442,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


---

## Calculando IDF

In [38]:
idfDict = {}
N = len(df_text_clean)

for termo, value in tqdm(bag_of_word['ni'].items()):
    idfDict[termo] = math.log10(N / (float(value)))

100%|███████████████████████████████████████████████████████████████████████| 17057/17057 [00:00<00:00, 1661261.89it/s]


# Calculo qui, rf, icf based, rf*idf

In [39]:
df_qui_quadrado = pd.DataFrame()
df_qui_quadrado['Index'] = lista_palavras

df_rf = pd.DataFrame()
df_rf['Index'] = lista_palavras

df_icf_based = pd.DataFrame()
df_icf_based['Index'] = lista_palavras

C = len(classes)
dic_num_ki_em_categoria = bag_of_word['ci']

df_rfIdf = pd.DataFrame()
df_rfIdf['Index'] = lista_palavras
    

for classe in tqdm(classes):
    
    lista_valores_qui = []
    lista_valores_rf = []
    lista_valores_icf_based = []
    lista_valores_rfidf = []
    
    for termo in lista_palavras:
        a = bag_of_word['a'].loc[termo, classe]
        b = bag_of_word['b'].loc[termo, classe]
        c = bag_of_word['c'].loc[termo, classe]
        d = bag_of_word['d'].loc[termo, classe]
        
        # qui
        numerador = (len(df_text_clean) * math.pow(((a * d) - (c * d)), 2))
        denominador = ((a + c) * (b + d) * (a + b) * (c + d))

        if (numerador == 0):
            value_qui = 0
        else:
            value_qui = numerador/denominador
            
        # RF
        value_rf =  math.log(2 + (a / (max(1,b))))
        
        #ICF-based
        Ci = dic_num_ki_em_categoria[termo]
        valor_icf_based = math.log((2 + (a/max(1,b)) * (C/Ci)))
        
        #RF*IDF
        valueIdf =  idfDict[termo]
        value_rfidf =  value_rf * valueIdf
        
        #adicionando a listas
        lista_valores_qui.append(value_qui)
        lista_valores_rf.append(value_rf)
        lista_valores_icf_based.append(valor_icf_based)
        lista_valores_rfidf.append(value_rfidf)

    df_qui_quadrado[f'{classe}'] = lista_valores_qui
    df_rf[f'{classe}'] = lista_valores_rf
    df_icf_based[f'{classe}'] = lista_valores_icf_based
    df_rfIdf[f'{classe}'] = lista_valores_rfidf

df_qui_quadrado = df_qui_quadrado.set_index('Index')
df_rf = df_rf.set_index('Index')
df_icf_based = df_icf_based.set_index('Index')
df_rfIdf = df_rfIdf.set_index('Index')

100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [00:11<00:00,  1.85it/s]


## Construção dataframe documentos x termos 

In [None]:
df_doc_rf= pd.DataFrame()
df_doc_rf['Index'] = df_text_clean.index

df_doc_qui= pd.DataFrame()
df_doc_qui['Index'] = df_text_clean.index

df_doc_icf_based= pd.DataFrame()
df_doc_icf_based['Index'] = df_text_clean.index

df_doc_rfIdf= pd.DataFrame()
df_doc_rfIdf['Index'] = df_text_clean.index

for termo in tqdm(lista_palavras):
    
    list_rf = []
    list_qui = []
    list_icf_based = []
    list_rfIdf = []
    
    
    for row in df_text_clean.itertuples(index=True, name='Pandas'):
        doc = row.Text
        classe = row.Class
        
        if termo in doc:
            value_rf = df_rf.loc[termo,classe]
            value_qui = df_qui_quadrado.loc[termo,classe]
            value_icf_based = df_icf_based.loc[termo,classe]
            value_rfIdf = df_rfIdf.loc[termo,classe]
            
            list_rf.append(value_rf)
            list_qui.append(value_qui)
            list_icf_based.append(value_icf_based)
            list_rfIdf.append(value_rfIdf)
        else:
            list_rf.append(0)
            list_qui.append(0)
            list_icf_based.append(0)
            list_rfIdf.append(0)
    
    df_doc_rf[f'{termo}'] = list_rf
    df_doc_qui[f'{termo}'] = list_qui
    df_doc_icf_based[f'{termo}'] = list_icf_based
    df_doc_rfIdf[f'{termo}'] = list_rfIdf
    
    
df_doc_rf = df_doc_rf.set_index('Index')
df_doc_qui= df_doc_qui.set_index('Index')
df_doc_icf_based = df_doc_icf_based.set_index('Index')
df_doc_rfIdf = df_doc_rfIdf.set_index('Index')

 60%|█████████████████████████████████████████████▏                             | 10265/17057 [00:40<00:33, 199.97it/s]

---

# Machine Learning

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import sklearn.metrics as metrics
from sklearn.metrics import roc_auc_score, roc_curve,classification_report,r2_score
from sklearn import model_selection as ms
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.neighbors import NearestCentroid

In [None]:
def metricas(acuracia, precision,recal, micro_f1, macro_f1):
    print(f'Acurácia: %.2f' % (acuracia*100))
    print(f'precision: %.2f' % (precision*100))
    print(f'recal: %.2f' % (recal*100))
    print(f'micro-f1: %.2f' % (micro_f1*100))
    print(f'macro-f1: %.2f' % (macro_f1*100))

In [None]:
dic_dfs = {}
dic_dfs["TF"] = TF
dic_dfs["TF-IDF"] = df_tfidf
dic_dfs["ICF"] = df_icf
dic_dfs["X²"] = df_doc_qui
dic_dfs["ICF-based"] = df_doc_icf_based
dic_dfs["RF"] = df_doc_rf
dic_dfs["rfXidf"] = df_doc_rfIdf

# KNN

### data frame de resultados

In [None]:
data_results_knn = pd.DataFrame()

In [None]:
data_results_knn['Index'] = dic_dfs.keys()
lista_acuracia =[]
lista_precision =[]
lista_recal =[]
lista_mif1 =[]
lista_maf1 =[]

for key in dic_dfs:
    # features
    X = dic_dfs[key]

    #response
    Y = df_text_clean.Class
    
    #treino e teste
    x_train, x_test, y_train, y_test = ms.train_test_split(X,Y,test_size=0.33, random_state=5)
    
    #model description
    model_knn = KNeighborsClassifier(n_neighbors=1)
    
    #train
    model_knn.fit(x_train, y_train)
    
    #pred
    pred = model_knn.predict(x_test)
    
    
    acuracia = metrics.accuracy_score(y_test, pred)
    lista_acuracia.append(acuracia)
    
    precision = precision_score(y_test, pred, average='weighted')
    lista_precision.append(precision)
    
    recal = recall_score(y_test, pred, average='weighted')
    lista_recal.append(recal)
    
    micro_f1 = f1_score(y_test, pred, average='micro')
    lista_mif1.append(micro_f1)
    
    macro_f1 = f1_score(y_test, pred, average='macro')
    lista_maf1.append(macro_f1)
    
    print(f"********** PONDERAÇÂO {key}")
    metricas(acuracia, precision, recal, micro_f1, macro_f1)
    
data_results_knn['acuracia'] = lista_acuracia
data_results_knn['precision'] = lista_precision
data_results_knn['recal'] = lista_recal
data_results_knn['micro-f1'] = lista_mif1
data_results_knn['macro-f1'] = lista_maf1
    
data_results_knn = data_results_knn.set_index('Index')

# Metricas KNN para cada ponderação

In [None]:
data_results_knn

# Centroid

### data frame de resultados

In [None]:
data_results_centroid = pd.DataFrame()

In [None]:
data_results_centroid['Index'] = dic_dfs.keys()
lista_acuracia =[]
lista_precision =[]
lista_recal =[]
lista_mif1 =[]
lista_maf1 =[]

for key in dic_dfs:
    # features
    X = dic_dfs[key]

    #response
    Y = df_text_clean.Class
    
    #treino e teste
    x_train, x_test, y_train, y_test = ms.train_test_split(X,Y,test_size=0.33, random_state=5)
    
    #model description
    model_centroid = NearestCentroid()
    
    #train
    model_centroid.fit(x_train, y_train)
    
    #pred
    pred = model_centroid.predict(x_test)
    
    acuracia = metrics.accuracy_score(y_test, pred)
    lista_acuracia.append(acuracia)
    
    precision = precision_score(y_test, pred, average='weighted')
    lista_precision.append(precision)
    
    recal = recall_score(y_test, pred, average='weighted')
    lista_recal.append(recal)
    
    micro_f1 = f1_score(y_test, pred, average='micro')
    lista_mif1.append(micro_f1)
    
    macro_f1 = f1_score(y_test, pred, average='macro')
    lista_maf1.append(macro_f1)
    
    print(f"********** PONDERAÇÂO {key}")
    metricas(acuracia, precision, recal, micro_f1, macro_f1)
    
data_results_centroid['acuracia'] = lista_acuracia
data_results_centroid['precision'] = lista_precision
data_results_centroid['recal'] = lista_recal
data_results_centroid['micro-f1'] = lista_mif1
data_results_centroid['macro-f1'] = lista_maf1
    
data_results_centroid = data_results_centroid.set_index('Index')

# Metricas Centroid para cada ponderação

In [None]:
data_results_centroid