In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

In [2]:
path = os.getcwd() + "/atribuna/milarquivos"
content_file_array = []
classifier_array = []

In [3]:
def read_text(path, file_Name):
    with open(path, encoding="latin-1") as file:
        content_file = file.read()
        Build_DataFrame(content_file, file_Name)

In [4]:
def Build_DataFrame(content_file, file_Name):
    classifier =  GetClassifier(file_Name) 
    if(len(classifier) >= 3):        
        classifier_array.append(classifier)
        content_file_array.append(content_file)  

In [5]:
def GetClassifier(file_Name):
    name_split =  file_Name.split(".")
    first_name = name_split[0];
    return first_name[(len(first_name) - 3) : len(first_name)]

In [6]:
for file_Name in tqdm(os.listdir(path), desc="Carregar arquivos"):    
    if file_Name.endswith(".txt"):
        file_path = f"{path}/{file_Name}"
      
        # call read text file function
        read_text(file_path, file_Name)

Carregar arquivos: 100%|██████████████████████████████████████████████████████████████| 36/36 [00:00<00:00, 189.27it/s]


In [7]:
textDataDrame =  pd.DataFrame({"Text":content_file_array, "Class":classifier_array})


In [8]:
textDataDrame.head(10)

Unnamed: 0,Text,Class
0,Testes nas famílias\n<b> A pedido de A Tribuna...,at2
1,A casa da sogra não é aqui\n<b> </b><br>\nELI...,at2
2,PAULO OCTAVIO\n<b> </b><br>\n <br>\nMais uma ...,at2
3,Festa de pescador no mar\n<b> A chuva não desa...,at2
4,PLANETA POP - McCartney diz ouvir a voz de Len...,at2
5,Saque de biquíni dá caldo na areia\n<b> Três d...,ept
6,Clonagem de cão vai ajudar a curar doenças\n<b...,esp
7,Maurício Prates\n<b> Uma corrida com gosto de ...,at2
8,Mesada de 10 mil para ex-servidora\n<b> Testem...,at2
9,Dirceu defende reeleição\n<b> </b><br>\n <br>...,at2


# StopWords removing

In [9]:
#!conda install unidecode

import nltk
import unidecode
from bs4 import BeautifulSoup

# nltk.download()
#- averaged_perceptron_tagger
# - floresta
# - mac_morpho
# - machado
# - punkt
# - stopwords
# - wordnet
# - words

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [10]:
def addStopWordsToList(texto):
    example_sent = texto
    word_tokens = word_tokenize(example_sent)

    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]

    filtered_sentence = ""

    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence += w + " "
    return filtered_sentence.strip()

In [11]:
stop_words = set(stopwords.words('portuguese'))
def returnDataFrameStopWords(dataFrame):
    dataReturn = pd.DataFrame()
    for row in tqdm(dataFrame.itertuples(index=True, name='Pandas'), desc="Limpeza de texto"):
        Text = row.Text
        Class = row.Class

        Text = BeautifulSoup(Text, "lxml").text
        Text = unidecode.unidecode(Text)
        Text = addStopWordsToList(Text)
        Text = Text.replace(" . "," ")
        Text = Text.replace(" , "," ")
        Text = Text.replace(" - "," ")
        Text = Text.replace(" : "," ")
        Text = Text.lower()

        linha = pd.Series([Text, Class])
        linha = pd.DataFrame([linha])
        dataReturn = pd.concat([linha, dataReturn], ignore_index=True)

    return dataReturn

In [12]:
df_text_clean = returnDataFrameStopWords(textDataDrame)

Limpeza de texto: 36it [00:00, 129.01it/s]


## Fatores de Ponderação

In [13]:
df_text_clean = df_text_clean.rename(columns = {0: 'Text', 1: 'Class'}, inplace = False)
df_text_clean.head()

Unnamed: 0,Text,Class
0,sem atendimento caixa o tecnico inspecao sider...,bro
1,craque escalado diz martinelli o secretario se...,poc
2,casa advogada saqueada serra gangue abordou ad...,poc
3,vagas trabalhar salao profissionais qualificad...,eco
4,bando faz dois assaltos predio depois assaltar...,poc


In [14]:
from sklearn.feature_extraction.text import TfidfTransformer # tf-idf
from sklearn.feature_extraction.text import CountVectorizer # tf

In [15]:
classes = []
for row in df_text_clean.itertuples(index=True, name='Pandas'):  
    if row.Class not in classes:
        classes.append(row.Class)
classes

['bro', 'poc', 'eco', 'int', 'con', 'pot', 'inf', 'opi', 'ept', 'at2', 'esp']

In [16]:
 classes =  ["at2","sro","inf","imo","ept","esp","fam","tvt","cid","mul","pot","tav","mic","opi","poc","reg","eco","bro","cit","con","int"]

## TF

In [17]:
vectorizer = CountVectorizer()
TF = vectorizer.fit_transform(df_text_clean.Text)
features_names =  vectorizer.get_feature_names()

## TF-IDF

In [18]:
from sklearn.pipeline import Pipeline

In [19]:
pipe = Pipeline([('count', CountVectorizer(vocabulary=features_names)),
                 ('tfid', TfidfTransformer())]).fit(df_text_clean.Text)

In [20]:
lista_palavras = []
for row in tqdm(df_text_clean.itertuples(index=True, name='Pandas')):
    palavras = row.Text.split(" ")
    for palavra in palavras:
        if(not len(palavra) <= 2):
            if(palavra not in lista_palavras):
                lista_palavras.append(palavra)

36it [00:00, 118.04it/s]


---

# frequencia da palavra por documento

In [21]:
df_relacao_doc = pd.DataFrame()
df_relacao_doc['Index'] = lista_palavras

for i in tqdm(df_text_clean.index):
    doc = df_text_clean.iloc[i].Text
    
    lista_quant =[]
    
    for termo in lista_palavras:
        quant = doc.split(" ").count(termo)
        lista_quant.append(quant)
        
    df_relacao_doc[f'{i}'] = lista_quant
    
    
df_relacao_doc = df_relacao_doc.set_index('Index')
df_relacao_doc.head()

100%|██████████████████████████████████████████████████████████████████████████████████| 36/36 [00:07<00:00,  5.06it/s]


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sem,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
atendimento,6,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
caixa,4,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
tecnico,3,0,0,0,0,0,10,0,0,0,...,0,0,0,0,0,0,0,0,0,2
inspecao,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# frequencia palavra por classe

In [22]:
dic_doc_class = {}

for classe in tqdm(classes):
    lista_doc = []
    for row in df_text_clean.itertuples(index=True, name='Pandas'):  
        if (str(classe) == row.Class):
            lista_doc.append(row.Text)
    dic_doc_class[f'{classe}'] = lista_doc

100%|████████████████████████████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 4201.11it/s]


In [23]:
df_relacao_class = pd.DataFrame()
df_relacao_class['Index'] = lista_palavras
for key in tqdm(dic_doc_class):
    lista_docs =  dic_doc_class[key]
    docs = ""
    for doc in lista_docs:
        docs += f' {doc}'
        
    doc = docs
    
    lista_quant =[]
    
    for termo in lista_palavras:
        quant = doc.split(" ").count(termo)
        lista_quant.append(quant)
        
    df_relacao_class[f'{key}'] = lista_quant
    
    
df_relacao_class = df_relacao_class.set_index('Index')

100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [00:04<00:00,  4.78it/s]


In [24]:
df_relacao_class.head()

Unnamed: 0_level_0,at2,sro,inf,imo,ept,esp,fam,tvt,cid,mul,...,tav,mic,opi,poc,reg,eco,bro,cit,con,int
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sem,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
atendimento,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,2,13,0,0,0
caixa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,2,4,0,0,0
tecnico,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,10,5,0,0,0
inspecao,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


# ni - o numero  de documentos em que um determinado termo ki aparece

In [25]:
dic_ni = {}

for termo in tqdm(lista_palavras):
    quant = 0 
    for row in df_text_clean.itertuples(index=True, name='Pandas'):
        doc = row.Text
        if(termo in doc):
            quant += 1
    dic_ni[f'{termo}'] = quant

100%|████████████████████████████████████████████████████████████████████████████| 5185/5185 [00:01<00:00, 3113.97it/s]


In [26]:
for i in sorted(dic_ni, key = dic_ni.get, reverse=True):
    print(f'key=>{i}, maior=>{dic_ni[i]}')
    break

key=>publica, maior=>36


# a - quantidade de documentos que pertence a uma classe e posuem o termo ki

#  c - o numero  de documentos que pertencem a Cp, mas não possuem o termo ki 

In [27]:
df_a = pd.DataFrame()
df_a['Index'] = lista_palavras

df_c = pd.DataFrame()
df_c['Index'] = lista_palavras

for key in tqdm(dic_doc_class):
    lista_docs =  dic_doc_class[key]
    lista_quant_a =[]
    lista_quant_c =[]

    for termo in lista_palavras:
        quant_a = 0
        quant_c = 0
        
        for doc in lista_docs:
            if(termo in doc):
                quant_a += 1
            else:
                quant_c += 1
                
        lista_quant_a.append(quant_a)
        lista_quant_c.append(quant_c)
        
    df_a[f'{key}'] = lista_quant_a
    df_c[f'{key}'] = lista_quant_c
    
    
df_a = df_a.set_index('Index')
df_c = df_c.set_index('Index')

100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 35.65it/s]


# df_a

In [28]:
df_a.head()

Unnamed: 0_level_0,at2,sro,inf,imo,ept,esp,fam,tvt,cid,mul,...,tav,mic,opi,poc,reg,eco,bro,cit,con,int
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sem,5,0,1,0,1,0,0,0,0,0,...,0,0,1,5,0,7,2,0,1,1
atendimento,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,2,0,0,0
caixa,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,2,1,0,0,0
tecnico,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,2,0,0,0
inspecao,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


# df_c

In [29]:
df_c.head()

Unnamed: 0_level_0,at2,sro,inf,imo,ept,esp,fam,tvt,cid,mul,...,tav,mic,opi,poc,reg,eco,bro,cit,con,int
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sem,4,0,0,0,1,1,0,0,0,0,...,0,0,0,2,0,3,0,0,0,0
atendimento,9,0,1,0,2,1,0,0,0,0,...,0,0,1,6,0,9,0,0,1,1
caixa,8,0,1,0,2,1,0,0,0,0,...,0,0,1,6,0,8,1,0,1,1
tecnico,8,0,1,0,2,1,0,0,0,0,...,0,0,1,7,0,9,0,0,1,1
inspecao,9,0,1,0,2,1,0,0,0,0,...,0,0,1,7,0,10,1,0,1,1


# b - a quantidade de documentos que possuem o termo ki, mas não pertence a Cp

In [30]:
df_b = pd.DataFrame()
df_b['Index'] = lista_palavras

df_d = pd.DataFrame()
df_d['Index'] = lista_palavras

for key in tqdm(dic_doc_class):
    lista_quant_b =[]
    lista_quant_d =[]
    
    for termo in lista_palavras:
        quant_b = 0
        quant_d = 0
        for classe in classes:
            if(classe != key):
                lista_docs =  dic_doc_class[classe]
                
                for doc in lista_docs:
                    if(termo in doc):
                        quant_b += 1
                    else:
                        quant_d += 1
                        
        lista_quant_b.append(quant_b)
        lista_quant_d.append(quant_d)
        
    df_b[f'{key}'] = lista_quant_b
    df_d[f'{key}'] = lista_quant_d
    
df_b = df_b.set_index('Index')
df_d = df_d.set_index('Index')

100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [00:09<00:00,  2.16it/s]


# df_b

In [31]:
df_b.head()

Unnamed: 0_level_0,at2,sro,inf,imo,ept,esp,fam,tvt,cid,mul,...,tav,mic,opi,poc,reg,eco,bro,cit,con,int
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sem,20,25,24,25,24,25,25,25,25,25,...,25,25,24,20,25,18,23,25,24,24
atendimento,4,4,4,4,4,4,4,4,4,4,...,4,4,4,3,4,3,2,4,4,4
caixa,5,6,6,6,6,6,6,6,6,6,...,6,6,6,5,6,4,5,6,6,6
tecnico,3,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,3,2,4,4,4
inspecao,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,0,1,1,1


# df_d

In [32]:
df_d.head()

Unnamed: 0_level_0,at2,sro,inf,imo,ept,esp,fam,tvt,cid,mul,...,tav,mic,opi,poc,reg,eco,bro,cit,con,int
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sem,7,11,11,11,10,10,11,11,11,11,...,11,11,11,9,11,8,11,11,11,11
atendimento,23,32,31,32,30,31,32,32,32,32,...,32,32,31,26,32,23,32,32,31,31
caixa,22,30,29,30,28,29,30,30,30,30,...,30,30,29,24,30,22,29,30,29,29
tecnico,24,32,31,32,30,31,32,32,32,32,...,32,32,31,25,32,23,32,32,31,31
inspecao,26,35,34,35,33,34,35,35,35,35,...,35,35,34,28,35,25,34,35,34,34


---

## ICF

In [36]:
import math 

In [37]:
def get_row_class(row, classe):
    dic_classe = {
        'at2':row.at2,
        'sro':row.sro,
        'inf':row.inf,
        'imo':row.imo,
        'ept':row.ept,
        'esp':row.esp,
        'fam':row.fam,
        'tvt':row.tvt,
        'cid':row.cid,
        'mul':row.mul,
        'pot':row.pot,
        'tav':row.tav,
        'mic':row.mic,
        'opi':row.opi,
        'poc':row.poc,
        'reg':row.reg,
        'eco':row.eco,
        'bro':row.bro,
        'cit':row.cit,
        'con':row.con,
        'int':row.int
    }
    return dic_classe[f'{classe}']
    

In [38]:
# conta quantas classes o termo ki aparece
def num_ki_em_categoria():

    dic_num_ki_em_categoria ={}
    for row in tqdm(df_relacao_class.itertuples(index=True, name='Pandas')):
        ci = 0
        for classe in classes:
            if get_row_class(row,classe)!= 0:
                ci += 1
        dic_num_ki_em_categoria[f'{row.Index}'] = ci
        
    return dic_num_ki_em_categoria

# Bag of Word

In [39]:
bag_of_word = {}
bag_of_word['a'] = df_a
bag_of_word['b'] = df_b
bag_of_word['c'] = df_c
bag_of_word['d'] = df_d
bag_of_word['freq_pal_class'] = df_relacao_class
bag_of_word['freq_pal_doc'] = df_relacao_doc
bag_of_word['ci'] = num_ki_em_categoria()
bag_of_word['ni'] = dic_ni

5185it [00:00, 32512.14it/s]


In [40]:
def calculeteICF():
    dic_num_ki_em_categoria = bag_of_word['ci']
    icfs = {}
    C = len(classes)
    for key in dic_num_ki_em_categoria:
        Ci = dic_num_ki_em_categoria[key]
        valor = math.log((1 + (C/Ci)))
        icfs[f'{key}'] = valor
    return icfs

In [41]:
icfs = calculeteICF() 
icfs

{'sem': 1.8325814637483102,
 'atendimento': 2.0794415416798357,
 'caixa': 1.8325814637483102,
 'tecnico': 2.0794415416798357,
 'inspecao': 3.091042453358316,
 'siderurgico': 3.091042453358316,
 'max': 3.091042453358316,
 'sander': 3.091042453358316,
 'soneghett': 3.091042453358316,
 'bairro': 1.5040773967762742,
 'colina': 3.091042453358316,
 'laranjeiras': 2.4423470353692043,
 'serra': 2.0794415416798357,
 'quer': 1.5040773967762742,
 'saber': 1.8325814637483102,
 'porque': 1.5040773967762742,
 'centro': 1.5040773967762742,
 'comercial': 2.4423470353692043,
 'peso': 3.091042453358316,
 'ainda': 1.3862943611198906,
 'nao': 1.2039728043259361,
 'contemplado': 3.091042453358316,
 'instalacao': 3.091042453358316,
 'agencia': 1.5040773967762742,
 'economica': 2.4423470353692043,
 'federal': 1.6486586255873816,
 'temos': 2.4423470353692043,
 'hoje': 1.1314021114911006,
 'avenida': 1.6486586255873816,
 'principal': 2.4423470353692043,
 'ruas': 1.6486586255873816,
 'adjacentes': 3.09104245335

In [42]:
df_icf = pd.DataFrame()
df_icf['Index'] = df_text_clean.index

for termo in tqdm(lista_palavras):
    
    list_icfs = []
    
    for i in df_text_clean.index:
        doc = df_text_clean.iloc[i].Text
        if termo in doc.split(" "):
            list_icfs.append(icfs[termo])
        else:
            list_icfs.append(0)
    
    df_icf[f'{termo}'] = list_icfs
    
df_icf = df_icf.set_index('Index')

100%|█████████████████████████████████████████████████████████████████████████████| 5185/5185 [00:28<00:00, 181.56it/s]


In [43]:
df_icf

Unnamed: 0_level_0,sem,atendimento,caixa,tecnico,inspecao,siderurgico,max,sander,soneghett,bairro,...,acerto,interage,dialoga,impoe,limites,tratando,desafiador,espelho,avaliou,04/06/2006
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.832581,2.079442,1.832581,2.079442,3.091042,3.091042,3.091042,3.091042,3.091042,1.504077,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.504077,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.832581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,2.079442,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.504077,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,2.079442,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.832581,0.0,1.832581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


---

## X²

In [44]:
qui_quadrado = pd.DataFrame()
qui_quadrado['Index'] = lista_palavras

for classe in tqdm(classes):
    lista_valores = []
    for termo in lista_palavras:
        a = bag_of_word['a'].loc[termo, classe]
        b = bag_of_word['b'].loc[termo, classe]
        c = bag_of_word['c'].loc[termo, classe]
        d = bag_of_word['d'].loc[termo, classe]
        
        numerador = (len(df_text_clean) * math.pow(((a * d) - (c * d)), 2))
        denominador = ((a + c) * (b + d) * (a + b) * (c + d))

        if (numerador == 0):
            value = 0
        else:
            value = numerador/denominador

        lista_valores.append(value)

    qui_quadrado[f'{classe}'] = lista_valores

qui_quadrado = qui_quadrado.set_index('Index')

100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [00:03<00:00,  5.35it/s]


In [45]:
qui_quadrado

Unnamed: 0_level_0,at2,sro,inf,imo,ept,esp,fam,tvt,cid,mul,...,tav,mic,opi,poc,reg,eco,bro,cit,con,int
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sem,0.026397,0,0.452571,0,0.000000,0.374026,0,0,0,0,...,0,0,0.452571,0.470112,0,0.515580,0.931765,0,0.452571,0.452571
atendimento,49.593750,0,7.722321,0,14.889706,7.722321,0,0,0,0,...,0,0,7.722321,23.414409,0,36.623077,16.941176,0,7.722321,7.722321
caixa,19.519342,0,4.805714,0,9.223529,4.805714,0,0,0,0,...,0,0,4.805714,14.187192,0,13.403077,0.000000,0,4.805714,4.805714
tecnico,32.666667,0,7.722321,0,14.889706,7.722321,0,0,0,0,...,0,0,7.722321,42.429957,0,36.623077,16.941176,0,7.722321,7.722321
inspecao,231.771429,0,33.972245,0,65.889076,33.972245,0,0,0,0,...,0,0,33.972245,194.648276,0,247.252747,0.000000,0,33.972245,33.972245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tratando,72.165577,0,16.472269,0,31.889273,16.472269,0,0,0,0,...,0,0,16.472269,93.158215,0,81.447964,31.889273,0,16.472269,16.472269
desafiador,151.200000,0,33.972245,0,65.889076,33.972245,0,0,0,0,...,0,0,33.972245,194.648276,0,247.252747,65.889076,0,33.972245,33.972245
espelho,151.200000,0,33.972245,0,65.889076,33.972245,0,0,0,0,...,0,0,33.972245,194.648276,0,247.252747,65.889076,0,33.972245,33.972245
avaliou,151.200000,0,33.972245,0,65.889076,33.972245,0,0,0,0,...,0,0,33.972245,194.648276,0,247.252747,65.889076,0,33.972245,33.972245


---

## Relevance Frequency (RF)

In [54]:
def CalculateRF():
    df_rf = pd.DataFrame()
    df_rf['Index'] = lista_palavras
    for termo in lista_palavras:
        for classe in classes:            
            a =  bag_of_word['a'].loc[termo,classe]
            b =  bag_of_word['b'].loc[termo,classe]
            value =  math.log(2 + (a / (max(1,b))))
            df_rf[f'{classe}'] = value
            
    df_rf =  df_rf.set_index('Index')
    return (df_rf)

In [56]:
CalculateRF()

Unnamed: 0_level_0,at2,sro,inf,imo,ept,esp,fam,tvt,cid,mul,...,tav,mic,opi,poc,reg,eco,bro,cit,con,int
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sem,1.098612,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,...,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147
atendimento,1.098612,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,...,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147
caixa,1.098612,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,...,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147
tecnico,1.098612,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,...,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147
inspecao,1.098612,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,...,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tratando,1.098612,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,...,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147
desafiador,1.098612,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,...,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147
espelho,1.098612,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,...,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147
avaliou,1.098612,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,...,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147


---

## ICF-Based

In [46]:
def Calculate_ICF_BASED():
       
    icf_based = pd.DataFrame()
    icf_based['Index'] = lista_palavras
    
    C = len(classes)
    dic_num_ki_em_categoria = bag_of_word['ci']
    
    for classe in tqdm(classes):
        valores = []
        
        for termo in lista_palavras:
            a = bag_of_word['a'].loc[termo, classe]
            b = bag_of_word['b'].loc[termo, classe]
            Ci = dic_num_ki_em_categoria[termo]

            valor = math.log((2 + (a/max(1,b)) * (C/Ci)))
            valores.append(valor)
            
        icf_based[f'{classe}'] = valores
    
    icf_based = icf_based.set_index('Index')
    
    return icf_based

In [47]:
Calculate_ICF_BASED()

100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [00:02<00:00, 10.40it/s]


Unnamed: 0_level_0,at2,sro,inf,imo,ept,esp,fam,tvt,cid,mul,...,tav,mic,opi,poc,reg,eco,bro,cit,con,int
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sem,1.197703,0.693147,0.796944,0.693147,0.796944,0.693147,0.693147,0.693147,0.693147,0.693147,...,0.693147,0.693147,0.796944,1.197703,0.693147,1.396657,0.898746,0.693147,0.796944,0.796944
atendimento,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,...,0.693147,0.693147,0.693147,1.466337,0.693147,1.466337,2.197225,0.693147,0.693147,0.693147
caixa,1.115142,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,...,0.693147,0.693147,0.693147,1.115142,0.693147,1.531476,1.115142,0.693147,0.693147,0.693147
tecnico,1.466337,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,...,0.693147,0.693147,0.693147,0.693147,0.693147,1.466337,2.197225,0.693147,0.693147,0.693147
inspecao,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,...,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,3.135494,0.693147,0.693147,0.693147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tratando,3.135494,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,...,0.693147,0.693147,0.693147,0.693147,0.693147,3.135494,0.693147,0.693147,0.693147,0.693147
desafiador,3.135494,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,...,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147
espelho,3.135494,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,...,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147
avaliou,3.135494,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,...,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147


---

## RF*IDF

In [57]:
def CalculateIDF():
    idfDict = {}
    N = len(df_text_clean)

    for termo, value in tqdm(bag_of_word['ni'].items()):
        idfDict[termo] = math.log10(N / (float(value)))
        
    return(idfDict)

In [58]:
def CalculateRF_IDF():
    df_rfIdf = pd.DataFrame()
    df_rfIdf['Index'] = df_rf.index
    
    for index in df_rf.index:
        for classe in classes:            
            valueRF =  df_rf.loc[index,classe]
            valueIdf =  idfDict[index]
            value =  valueRF * valueIdf
            df_rfIdf[f'{classe}'] = value
            
    df_rfIdf =  df_rfIdf.set_index('Index')
    return (df_rfIdf)

In [59]:
CalculateRF_IDF()

Unnamed: 0_level_0,at2,sro,inf,imo,ept,esp,fam,tvt,cid,mul,...,tav,mic,opi,poc,reg,eco,bro,cit,con,int
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sem,1.709773,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,...,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747
atendimento,1.709773,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,...,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747
caixa,1.709773,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,...,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747
tecnico,1.709773,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,...,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747
inspecao,1.709773,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,...,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tratando,1.709773,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,...,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747
desafiador,1.709773,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,...,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747
espelho,1.709773,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,...,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747
avaliou,1.709773,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,...,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747,1.078747


---

## Machine Learning

### KNN

In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve,classification_report,r2_score
from sklearn import model_selection as ms
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [2]:
le = LabelEncoder()

In [3]:
def toNumeric(nome_coluna, coluna):
    df_text_clean[nome_coluna] = le.fit_transform (coluna)

In [4]:
toNumeric('numeric_class',df_text_clean.Class)

NameError: name 'df_text_clean' is not defined

In [5]:
# features
X = TF

NameError: name 'TF' is not defined

In [6]:
#response
Y = df_text_clean['numeric_class'].copy()

NameError: name 'df_text_clean' is not defined

In [7]:
x_train, x_test, y_train, y_test = ms.train_test_split(X,Y,test_size=0.33, random_state=5)

NameError: name 'X' is not defined

In [8]:
model_knn = KNeighborsClassifier(n_neighbors=1)

In [9]:
model_knn.fit(x_train, y_train)

NameError: name 'x_train' is not defined

In [10]:
pred = model_knn.predict_proba(x_test)

NameError: name 'x_test' is not defined

In [11]:
plot_roc_curve(np.array(y_test), pred[:,1])

NameError: name 'plot_roc_curve' is not defined

# Imports do EDA

In [12]:
# print da curva ROC
def plot_roc_curve(y_true, y_score, figsize=(10,6)):
    fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=2)
    plt.figure(figsize=figsize)
    auc_value = roc_auc_score(y_true, y_score)
    plt.plot(fpr, tpr, color='orange', label='ROC curve (area = %0.2f)' % auc_value)
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('Taxa de falso positivo')
    plt.ylabel('Taxa de verdadeiro positivo')
    plt.title('Curva ROC')
    plt.legend()
    lt.show()

In [13]:
#precision    recall  f1-score   support
print(classification_report(y_test, pred, labels=[1, 2, 3]))

NameError: name 'y_test' is not defined

In [14]:
# AUC
print('AUC: %0.2f' % roc_auc_score(y_test, pred))

NameError: name 'y_test' is not defined

## Número de amostras de cade classe


In [15]:
import seaborn as sns

In [16]:
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.countplot(x = 'Class',data = textDataDrame)

NameError: name 'textDataDrame' is not defined