In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

In [2]:
path = os.getcwd() + "/atribuna/milarquivos"
content_file_array = []
classifier_array = []

In [3]:
def read_text(path, file_Name):
    with open(path, encoding="latin-1") as file:
        content_file = file.read()
        Build_DataFrame(content_file, file_Name)

In [4]:
def Build_DataFrame(content_file, file_Name):
    classifier =  GetClassifier(file_Name) 
    if(len(classifier) >= 3):        
        classifier_array.append(classifier)
        content_file_array.append(content_file)  

In [5]:
def GetClassifier(file_Name):
    name_split =  file_Name.split(".")
    first_name = name_split[0];
    return first_name[(len(first_name) - 3) : len(first_name)]

In [6]:
for file_Name in tqdm(os.listdir(path), desc="Carregar arquivos"):    
    if file_Name.endswith(".txt"):
        file_path = f"{path}\{file_Name}"
      
        # call read text file function
        read_text(file_path, file_Name)

Carregar arquivos: 100%|███████████████████████████████████████████████████████████| 299/299 [00:00<00:00, 1223.68it/s]


In [7]:
textDataDrame =  pd.DataFrame({"Text":content_file_array, "Class":classifier_array})


In [8]:
textDataDrame.head(10)

Unnamed: 0,Text,Class
0,Nasce um novo amiguinho virtual\n<b> Coelhinho...,inf
1,Brigas na saída de boates\n<b> A denúncia é de...,poc
2,Queijo fica perto do livro dos recordes\n<b> O...,poc
3,Queijo fica perto do livro dos recordes\n<b> O...,reg
4,PLENÁRIO\n<b> </b><br>\n <br>\nBolo social do...,pot
5,Quadrilha assalta lanchonete\n<b> Cinco bandid...,poc
6,CARTAS\n<b> </b><br>\n <br>\nParabéns\nÀ Rede...,opi
7,Mineiro é baleado em assalto \n<b> O gerente c...,poc
8,CARTAS\n<b> </b><br>\n <br>\nParabéns\nVenho ...,opi
9,Aluno é morto com um soco\n<b> Adolescente de ...,poc


# StopWords removing

In [9]:
#!conda install unidecode

import nltk
import unidecode
from bs4 import BeautifulSoup

# nltk.download()
#- averaged_perceptron_tagger
# - floresta
# - mac_morpho
# - machado
# - punkt
# - stopwords
# - wordnet
# - words

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [10]:
def addStopWordsToList(texto):
    example_sent = texto
    word_tokens = word_tokenize(example_sent)

    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]

    filtered_sentence = ""

    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence += w + " "
    return filtered_sentence.strip()

In [11]:
stop_words = set(stopwords.words('portuguese'))
def returnDataFrameStopWords(dataFrame):
    dataReturn = pd.DataFrame()
    for row in tqdm(dataFrame.itertuples(index=True, name='Pandas'), desc="Limpeza de texto"):
        Text = row.Text
        Class = row.Class

        Text = BeautifulSoup(Text, "lxml").text
        Text = unidecode.unidecode(Text)
        Text = addStopWordsToList(Text)
        Text = Text.replace(" . "," ")
        Text = Text.replace(" , "," ")
        Text = Text.replace(" - "," ")
        Text = Text.replace(" : "," ")
        Text = Text.lower()

        linha = pd.Series([Text, Class])
        linha = pd.DataFrame([linha])
        dataReturn = pd.concat([linha, dataReturn], ignore_index=True)

    return dataReturn

In [12]:
df_text_clean = returnDataFrameStopWords(textDataDrame)

Limpeza de texto: 297it [00:01, 231.84it/s]


## Fatores de Ponderação

In [13]:
df_text_clean = df_text_clean.rename(columns = {0: 'Text', 1: 'Class'}, inplace = False)
df_text_clean.head()

Unnamed: 0,Text,Class
0,concursos vagas nivel superior o salario passa...,eco
1,o caminho diminuir criminalidade ... so existe...,opi
2,estagiario rouba bolsas policiais um adolescen...,poc
3,planeta pop robin williams adota filho super-h...,at2
4,concursos vagas nivel superior o salario passa...,eco


In [14]:
from sklearn.feature_extraction.text import TfidfTransformer # tf-idf
from sklearn.feature_extraction.text import CountVectorizer # tf

In [15]:
classes = []
for row in df_text_clean.itertuples(index=True, name='Pandas'):  
    if row.Class not in classes:
        classes.append(row.Class)
classes

['eco',
 'opi',
 'poc',
 'at2',
 'int',
 'ept',
 'bro',
 'reg',
 'pot',
 'inf',
 'cid',
 'cit',
 'sro']

In [16]:
 classes =  ["at2","sro","inf","imo","ept","esp","fam","tvt","cid","mul","pot","tav","mic","opi","poc","reg","eco","bro","cit","con","int"]

## TF

In [17]:
vectorizer = CountVectorizer()
TF = vectorizer.fit_transform(df_text_clean.Text)
features_names =  vectorizer.get_feature_names()

## TF-IDF

In [18]:
from sklearn.pipeline import Pipeline

In [19]:
pipe = Pipeline([('count', CountVectorizer(vocabulary=features_names)),
                 ('tfid', TfidfTransformer())]).fit(df_text_clean.Text)

In [20]:
lista_palavras = []
for row in tqdm(df_text_clean.itertuples(index=True, name='Pandas')):
    palavras = row.Text.split(" ")
    for palavra in palavras:
        if(not len(palavra) <= 2):
            if(palavra not in lista_palavras):
                lista_palavras.append(palavra)

297it [00:01, 171.36it/s]


# frequencia da palavra por documento

In [21]:
df_relacao_doc = pd.DataFrame()
df_relacao_doc['Index'] = lista_palavras

for i in tqdm(df_text_clean.index):
    doc = df_text_clean.iloc[i].Text
    
    lista_quant =[]
    
    for termo in lista_palavras:
        quant = doc.split(" ").count(termo)
        lista_quant.append(quant)
        
    df_relacao_doc[f'{i}'] = lista_quant
    
    
df_relacao_doc = df_relacao_doc.set_index('Index')

100%|████████████████████████████████████████████████████████████████████████████████| 297/297 [00:35<00:00,  8.44it/s]


In [22]:
df_text_clean.head(25)

Unnamed: 0,Text,Class
0,concursos vagas nivel superior o salario passa...,eco
1,o caminho diminuir criminalidade ... so existe...,opi
2,estagiario rouba bolsas policiais um adolescen...,poc
3,planeta pop robin williams adota filho super-h...,at2
4,concursos vagas nivel superior o salario passa...,eco
5,brasil quita toda divida fmi brasilia o govern...,eco
6,cartas evite errar como todo eleitor valoriza ...,opi
7,acidente sete carros vitoria o mau tempo falta...,poc
8,"dupla invade casa rouba r $ 1,2 mil dois homen...",poc
9,planeta pop robin williams adota filho super-h...,at2


In [23]:
df_relacao_doc.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,287,288,289,290,291,292,293,294,295,296
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
concursos,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
vagas,3,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
nivel,5,1,0,0,5,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
superior,5,0,0,0,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
salario,2,0,0,0,2,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0


# frequencia palavra por classe

In [24]:
dic_doc_class = {}

for classe in tqdm(classes):
    lista_doc = []
    for row in df_text_clean.itertuples(index=True, name='Pandas'):  
        if (str(classe) == row.Class):
            lista_doc.append(row.Text)
    dic_doc_class[f'{classe}'] = lista_doc

100%|████████████████████████████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 1588.67it/s]


In [25]:
df_relacao_class = pd.DataFrame()
df_relacao_class['Index'] = lista_palavras
for key in tqdm(dic_doc_class):
    lista_docs =  dic_doc_class[key]
    docs = ""
    for doc in lista_docs:
        docs += f' {doc}'
        
    doc = docs
    
    lista_quant =[]
    
    for termo in lista_palavras:
        quant = doc.split(" ").count(termo)
        lista_quant.append(quant)
        
    df_relacao_class[f'{key}'] = lista_quant
    
    
df_relacao_class = df_relacao_class.set_index('Index')

100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [00:36<00:00,  1.75s/it]


In [26]:
df_relacao_class.head()

Unnamed: 0_level_0,at2,sro,inf,imo,ept,esp,fam,tvt,cid,mul,...,tav,mic,opi,poc,reg,eco,bro,cit,con,int
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
concursos,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,7,1,0,0,0
vagas,0,0,0,0,0,0,0,0,2,0,...,0,0,0,5,3,26,0,0,0,0
nivel,0,0,0,0,1,0,0,0,1,0,...,0,0,3,0,0,36,0,0,0,2
superior,0,1,0,0,2,0,0,0,1,0,...,0,0,0,4,0,42,0,0,0,1
salario,0,0,0,0,1,0,0,0,0,0,...,0,0,2,1,0,18,0,0,0,0


## ICF

In [27]:
import math 

In [28]:
def get_row_class(row, classe):
    dic_classe = {
        'at2':row.at2,
        'sro':row.sro,
        'inf':row.inf,
        'imo':row.imo,
        'ept':row.ept,
        'esp':row.esp,
        'fam':row.fam,
        'tvt':row.tvt,
        'cid':row.cid,
        'mul':row.mul,
        'pot':row.pot,
        'tav':row.tav,
        'mic':row.mic,
        'opi':row.opi,
        'poc':row.poc,
        'reg':row.reg,
        'eco':row.eco,
        'bro':row.bro,
        'cit':row.cit,
        'con':row.con,
        'int':row.int
    }
    return dic_classe[f'{classe}']
    

In [29]:
# conta quantas classes o termo ki aparece
def num_ki_em_categoria():

    dic_num_ki_em_categoria ={}
    for row in tqdm(df_relacao_class.itertuples(index=True, name='Pandas')):
        ci = 0
        for classe in classes:
            if get_row_class(row,classe)!= 0:
                ci += 1
        dic_num_ki_em_categoria[f'{row.Index}'] = ci
        
    return dic_num_ki_em_categoria

In [30]:
def calculeteICF():
    dic_num_ki_em_categoria = num_ki_em_categoria()
    icfs = {}
    C = len(classes)
    for key in dic_num_ki_em_categoria:
        Ci = dic_num_ki_em_categoria[key]
        valor = math.log((1 + (C/Ci)))
        icfs[f'{key}'] = valor
    return icfs

In [31]:
icfs = calculeteICF() 
icfs

12569it [00:00, 40688.02it/s]


{'concursos': 2.0794415416798357,
 'vagas': 1.8325814637483102,
 'nivel': 1.6486586255873816,
 'superior': 1.3862943611198906,
 'salario': 1.6486586255873816,
 'passar': 1.2878542883066382,
 'concurso': 1.6486586255873816,
 'ministerio': 1.5040773967762742,
 'relacoes': 2.0794415416798357,
 'exteriores': 2.0794415416798357,
 '2,63': 3.091042453358316,
 'mil': 1.1314021114911006,
 'segundo': 1.2878542883066382,
 'edital': 3.091042453358316,
 'candidatos': 1.8325814637483102,
 'qualquer': 1.2039728043259361,
 'area': 1.2878542883066382,
 'formacao': 1.8325814637483102,
 'busca': 1.6486586255873816,
 'vaga': 1.6486586255873816,
 'servico': 1.1314021114911006,
 'publico': 1.2039728043259361,
 'federal': 1.3862943611198906,
 'podem': 1.2878542883066382,
 'inscrever': 3.091042453358316,
 'partir': 1.067840630001356,
 'hoje': 1.067840630001356,
 'selecao': 1.5040773967762742,
 'visa': 3.091042453358316,
 'preenchimento': 2.4423470353692043,
 'oficial': 1.5040773967762742,
 'chancelaria': 3.09

In [32]:
df_icf = pd.DataFrame()
df_icf['Index'] = df_text_clean.index

for termo in tqdm(lista_palavras):
    
    list_icfs = []
    
    for i in df_text_clean.index:
        doc = df_text_clean.iloc[i].Text
        if termo in doc.split(" "):
            list_icfs.append(icfs[termo])
        else:
            list_icfs.append(0)
    
    df_icf[f'{termo}'] = list_icfs
    
df_icf = df_icf.set_index('Index')

100%|████████████████████████████████████████████████████████████████████████████| 12569/12569 [06:01<00:00, 34.75it/s]


In [33]:
df_icf

Unnamed: 0_level_0,concursos,vagas,nivel,superior,salario,passar,concurso,ministerio,relacoes,exteriores,...,estaveis,boletins,leitura,chamadas,telefonicas,oferecemos,sofisticados,inteligentes,violet,14/08/2006
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2.079442,1.832581,1.648659,1.386294,1.648659,1.287854,1.648659,1.504077,2.079442,2.079442,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,1.648659,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,2.079442,1.832581,1.648659,1.386294,1.648659,1.287854,1.648659,1.504077,2.079442,2.079442,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,0.000000,0.000000,0.000000,0.000000,0.000000,1.287854,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [34]:
for i in df_text_clean.index:
    doc = df_text_clean.iloc[i].Text
    if 'proximo' in doc.split(" "):
        print('*********************** \r\n')
        print(doc)
        print(i)
        print('*********************** \r\n')

*********************** 

concursos vagas nivel superior o salario passar concurso ministerio relacoes exteriores r $ 2,63 mil segundo edital candidatos nivel superior qualquer area formacao busca vaga servico publico federal podem inscrever partir hoje concurso ministerio relacoes exteriores a selecao visa preenchimento 66 vagas oficial chancelaria quais quatro destinadas candidatos portadores deficiencia a remuneracao oferecida r $ 2.633 41 os interessados podem inscrever via internet endereco www.cespe.unb.br/concursos/mre2006 10 horas hoje 23h59 proximo dia 30 para efetuar inscricao necessario numero cpf candidato o valor taxa r $ 65,00 pagamento deve ser feito meio debito conta corrente ( apenas correntistas banco brasil ) documento arrecadacao ( pagavel qualquer loterica ) boleto bancario ( pagavel toda rede bancaria ) o documento arrecadacao boleto bancario estarao disponiveis endereco inscricoes deverao ser impressos pagamento taxa apos conclusao preenchimento ficha solicitaca

In [35]:
teste = pd.DataFrame.from_dict(icfs, orient='index')

In [36]:
teste.head()

Unnamed: 0,0
concursos,2.079442
vagas,1.832581
nivel,1.648659
superior,1.386294
salario,1.648659


## Machine Learning

### KNN

In [37]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve,classification_report,r2_score
from sklearn import model_selection as ms
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [38]:
le = LabelEncoder()

In [39]:
def toNumeric(nome_coluna, coluna):
    df_text_clean[nome_coluna] = le.fit_transform (coluna)

In [40]:
toNumeric('numeric_class',df_text_clean.Class)

In [41]:
# features
X = TF

In [42]:
#response
Y = df_text_clean['numeric_class'].copy()

In [43]:
x_train, x_test, y_train, y_test = ms.train_test_split(X,Y,test_size=0.33, random_state=5)

In [44]:
model_knn = KNeighborsClassifier(n_neighbors=1)

In [45]:
model_knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

In [46]:
pred = model_knn.predict_proba(x_test)

In [47]:
plot_roc_curve(np.array(y_test), pred[:,1])

NameError: name 'plot_roc_curve' is not defined

# Imports do EDA

In [None]:
# print da curva ROC
def plot_roc_curve(y_true, y_score, figsize=(10,6)):
    fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=2)
    plt.figure(figsize=figsize)
    auc_value = roc_auc_score(y_true, y_score)
    plt.plot(fpr, tpr, color='orange', label='ROC curve (area = %0.2f)' % auc_value)
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('Taxa de falso positivo')
    plt.ylabel('Taxa de verdadeiro positivo')
    plt.title('Curva ROC')
    plt.legend()
    lt.show()

In [None]:
#precision    recall  f1-score   support
print(classification_report(y_test, pred, labels=[1, 2, 3]))

In [None]:
# AUC
print('AUC: %0.2f' % roc_auc_score(y_test, pred))

## Número de amostras de cade classe


In [None]:
import seaborn as sns

In [None]:
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.countplot(x = 'Class',data = textDataDrame)