## Experimentos de classificação com a planilha de oportunidades em texto

https://github.com/mcti-sefip/mcti-sefip-ppfcd2020/blob/jairoalves-task_18/Processamento%20Planilha%20Categoriza%C3%A7%C3%A3o.ipynb

In [99]:
import pandas as pd
import re
import numpy as np

In [100]:
arq = r"C:\Users\bolin\Desktop\codigos\mcti-sefip-ppfcd2020\oportunidades_classificacao_3.xlsx"

In [101]:
df = pd.read_excel(arq)

In [102]:
df.head(4)

Unnamed: 0,opo_titulo,link,opo_brazil,opo_deadline,codigo,opo_texto,opo_texto_ele,opo_tipo,atualizacao,clas,comentario
0,Knowledge Product to Strengthen Women's Voices,https://www.cepf.net/grants/open-calls-for-pro...,N,30 April 2021,cepf_210429_01_000,gender-cfp-2021.jpg Caption: Interviewing a c...,gender-cfp-2021.jpg Caption: Interviewing a c...,other,210429,N,
1,Bolsas de PD em Política Científica e Tecnológica,http://fapesp.br/oportunidades/Control/../inov...,Y,30/04/2021,fapesp_210429_1_000,Unicamp's Department of Scientific and Technol...,Unicamp's Department of Scientific and Technol...,scholarship,210429,Y,
2,Bolsa de TT-II em Ciência de Dados,http://fapesp.br/oportunidades/Control/../fish...,Y,30/04/2021,fapesp_210429_1_001,The vacancy is for graduates of a Technical Co...,The vacancy is for graduates of a Technical Co...,scholarship,210429,Y,
3,Bolsa de PD em História da Filosofia Moderna,http://fapesp.br/oportunidades/Control/../pode...,Y,30/04/2021,fapesp_210429_1_002,The scholarship lasts for two years and can be...,The scholarship lasts for two years and can be...,scholarship,210429,Y,


### Variáveis de interesse

In [103]:
# Coleta das variáveis de interesse
X = df[['opo_texto']].copy()
y = df[['clas']].copy()
# y = [re.sub(" ","",i) for i in str(y)]

### Pré-processamento

In [104]:
class Sentenca():

    def __init__(self, sentenca):
        self.sent_bruta = sentenca
        self.preproc()
    
    def remove_caracteres_nao_alfanumericos(self):
        # padroes para trechos nao alfanumericos
        ptn_nao_alfanum = r"[\W+]"
        self.sent_preproc = re.sub(ptn_nao_alfanum, ' ', self.sent_bruta)

    def remove_a_chapeu(self):
        # padroes para trechos nao alfanumericos
        ptn_nao_chap = r"(Â|â|œ)"
        self.sent_preproc = re.sub(ptn_nao_chap, ' ', self.sent_preproc)
    
    def remove_espacos_multiplos(self):
        ptn_espacos_mult = r"\s+"  
        self.sent_preproc = re.sub(ptn_espacos_mult, ' ', self.sent_preproc)
        self.sent_preproc = self.sent_preproc.strip()
    
    def remove_b_inicial(self):
        if self.sent_preproc.startswith('b '):
            self.sent_preproc = self.sent_preproc[2:]
    
    def separa_palavras_coladas(self):
        """Separa com espaço palavras coladas, aqui definido quando uma letra 
        minúscula está colada com uma maiúscula imediatalmente posterior"""
        ptn_ltr_minusc_colada_maiuscula = r'([a-z])([A-Z])'
        ptn_algarismo_colado_maiuscula = r'([0-9])([A-Z])'
        
        self.sent_preproc = re.sub(ptn_ltr_minusc_colada_maiuscula, r'\1 \2', self.sent_preproc)
        self.sent_preproc = re.sub(ptn_algarismo_colado_maiuscula, r'\1 \2', self.sent_preproc)
    
    def preproc(self):
        self.sent_preproc = ''
        self.remove_caracteres_nao_alfanumericos()
        self.remove_a_chapeu()
        self.remove_espacos_multiplos()
        self.remove_b_inicial()
        self.separa_palavras_coladas()
        self.sent_preproc = self.sent_preproc.lower()
        
        return self.sent_preproc
    
    def __getitem__(self, indices):
        return ''.join(self.sent_preproc[indices])
    
    def __str__(self):
        return str(self.sent_preproc)
    
    def __repr__(self):
        return self.sent_preproc

In [105]:
X['opo_texto_preproc'] = X['opo_texto'].apply(Sentenca)

In [106]:
X['opo_texto_preproc'].iloc[0][:306]

'gender cfp 2021 jpg caption interviewing a community member in lao pdr credit fishbio call for proposals development of an innovative knowledge product pertaining to the strengthening of women s voices in conservation opening date 22 march 2021 closing date 30 april 2021 questions due date 15 april 2021 s'

In [107]:
X.head(4)

Unnamed: 0,opo_texto,opo_texto_preproc
0,gender-cfp-2021.jpg Caption: Interviewing a c...,gender cfp 2021 jpg caption interviewing a com...
1,Unicamp's Department of Scientific and Technol...,unicamp s department of scientific and technol...
2,The vacancy is for graduates of a Technical Co...,the vacancy is for graduates of a technical co...
3,The scholarship lasts for two years and can be...,the scholarship lasts for two years and can be...


In [108]:
### Stopwords

In [109]:
lista_texto = X['opo_texto_preproc'].tolist()
lista_texto = [str(i) for i in lista_texto]
# print(type((lista_texto[0])))
values = ','.join(str(v) for v in lista_texto)
unique_words = set(values.split(' '))
unique_words = sorted(unique_words)

import enchant # pip install pyenchant
def frase_dicionario(frase):
    d = enchant.Dict("en_GB")
    jus = frase.split(' ')
    lista_frase_dic = [i for i in jus if d.check(i)]
    frase_final = ' '.join(lista_frase_dic)
    return(frase_final)

opo_texto_dicio=[]
for i in lista_texto: # não funcionou com listcompreension
    opo_texto_dicio.append(frase_dicionario(i))

X['opo_texto_dicio'] = opo_texto_dicio
X.head(4)


Unnamed: 0,opo_texto,opo_texto_preproc,opo_texto_dicio
0,gender-cfp-2021.jpg Caption: Interviewing a c...,gender cfp 2021 jpg caption interviewing a com...,gender 2021 caption interviewing a community m...
1,Unicamp's Department of Scientific and Technol...,unicamp s department of scientific and technol...,s department of scientific and technological p...
2,The vacancy is for graduates of a Technical Co...,the vacancy is for graduates of a technical co...,the vacancy is for graduates of a technical co...
3,The scholarship lasts for two years and can be...,the scholarship lasts for two years and can be...,the scholarship lasts for two years and can be...


In [110]:
import nltk
#nltk.download('stopwords')

In [111]:
from nltk.corpus import stopwords

In [112]:
stop_ingles = stopwords.words('english')

In [113]:
def remove_stopwords(sentenca):
    tokens = str(sentenca).split(' ')
    tokens_sem_stops = [token for token in tokens if token not in stop_ingles]
    return ' '.join(tokens_sem_stops)

In [114]:
# Troquei aqui para o dicio 
X['opo_texto_sem_stop'] = X['opo_texto_dicio'].apply(remove_stopwords)

In [115]:
X.head(4)

Unnamed: 0,opo_texto,opo_texto_preproc,opo_texto_dicio,opo_texto_sem_stop
0,gender-cfp-2021.jpg Caption: Interviewing a c...,gender cfp 2021 jpg caption interviewing a com...,gender 2021 caption interviewing a community m...,gender 2021 caption interviewing community mem...
1,Unicamp's Department of Scientific and Technol...,unicamp s department of scientific and technol...,s department of scientific and technological p...,department scientific technological policy por...
2,The vacancy is for graduates of a Technical Co...,the vacancy is for graduates of a technical co...,the vacancy is for graduates of a technical co...,vacancy graduates technical course student las...
3,The scholarship lasts for two years and can be...,the scholarship lasts for two years and can be...,the scholarship lasts for two years and can be...,scholarship lasts two years renewed another ye...


In [116]:
print(X['opo_texto_sem_stop'][0][:302], '\n...')

gender 2021 caption interviewing community member credit call proposals development innovative knowledge product pertaining strengthening women voices conservation opening date 22 march 2021 closing date 30 2021 questions due date 15 2021 submissions applications sent net closing date overview intends 
...


### Tokenização

In [117]:
#import nltk
#nltk.download('punkt')

In [118]:
from nltk.tokenize import word_tokenize

In [119]:
X['opo_texto_tokens'] = X['opo_texto_sem_stop'].apply(word_tokenize)

In [120]:
print(X['opo_texto_tokens'].iloc[0][:50], '\n...')

['gender', '2021', 'caption', 'interviewing', 'community', 'member', 'credit', 'call', 'proposals', 'development', 'innovative', 'knowledge', 'product', 'pertaining', 'strengthening', 'women', 'voices', 'conservation', 'opening', 'date', '22', 'march', '2021', 'closing', 'date', '30', '2021', 'questions', 'due', 'date', '15', '2021', 'submissions', 'applications', 'sent', 'net', 'closing', 'date', 'overview', 'intends', 'engage', 'consultant', 'develop', 'knowledge', 'product', 'provide', 'guidance', 'strengthen', 'women', 'voices'] 
...


### Lematização

In [121]:
#import nltk
#nltk.download('wordnet')

In [122]:
from nltk.stem import WordNetLemmatizer

In [123]:
wordnet = WordNetLemmatizer()

In [124]:
def lematiza_tokens(tokens):
    return [wordnet.lemmatize(token) for token in tokens]

In [125]:
# lematização dos tokens
X['opo_texto_tokens_lem'] = X['opo_texto_tokens'].apply(lematiza_tokens)
X['opo_texto_sem_stop_lem'] = X['opo_texto_tokens_lem'].apply(lambda l: ' '.join(l))

Tokenização e Lematização:

In [126]:
X[['opo_texto_tokens', 'opo_texto_tokens_lem', 'opo_texto_sem_stop_lem']].head(10)

Unnamed: 0,opo_texto_tokens,opo_texto_tokens_lem,opo_texto_sem_stop_lem
0,"[gender, 2021, caption, interviewing, communit...","[gender, 2021, caption, interviewing, communit...",gender 2021 caption interviewing community mem...
1,"[department, scientific, technological, policy...","[department, scientific, technological, policy...",department scientific technological policy por...
2,"[vacancy, graduates, technical, course, studen...","[vacancy, graduate, technical, course, student...",vacancy graduate technical course student last...
3,"[scholarship, lasts, two, years, renewed, anot...","[scholarship, last, two, year, renewed, anothe...",scholarship last two year renewed another year...
4,"[registration, open, selection, one, 01, schol...","[registration, open, selection, one, 01, schol...",registration open selection one 01 scholarship...
5,"[technical, training, scholarship, level, iv, ...","[technical, training, scholarship, level, iv, ...",technical training scholarship level iv iv ava...
6,"[candidate, desired, skills, experience, molec...","[candidate, desired, skill, experience, molecu...",candidate desired skill experience molecular b...
7,"[prerequisites, graduation, biology, zootechni...","[prerequisite, graduation, biology, zootechnic...",prerequisite graduation biology zootechnics fi...
8,"[consultancy, specialized, application, develo...","[consultancy, specialized, application, develo...",consultancy specialized application developmen...
9,"[collaborative, project, within, department, b...","[collaborative, project, within, department, b...",collaborative project within department bioche...


### Bag of Words

In [127]:
from collections import Counter

In [128]:
X['opo_texto_bow'] = X['opo_texto_tokens'].apply(Counter)
X['opo_texto_bow_lem'] = X['opo_texto_tokens_lem'].apply(Counter)

Bag of words com e sem lematização

In [129]:
X[['opo_texto_tokens', 'opo_texto_bow', 'opo_texto_bow_lem']].head(10)

Unnamed: 0,opo_texto_tokens,opo_texto_bow,opo_texto_bow_lem
0,"[gender, 2021, caption, interviewing, communit...","{'gender': 1, '2021': 4, 'caption': 1, 'interv...","{'gender': 1, '2021': 4, 'caption': 1, 'interv..."
1,"[department, scientific, technological, policy...","{'department': 1, 'scientific': 1, 'technologi...","{'department': 1, 'scientific': 1, 'technologi..."
2,"[vacancy, graduates, technical, course, studen...","{'vacancy': 1, 'graduates': 1, 'technical': 1,...","{'vacancy': 1, 'graduate': 1, 'technical': 1, ..."
3,"[scholarship, lasts, two, years, renewed, anot...","{'scholarship': 4, 'lasts': 1, 'two': 1, 'year...","{'scholarship': 4, 'last': 1, 'two': 1, 'year'..."
4,"[registration, open, selection, one, 01, schol...","{'registration': 2, 'open': 1, 'selection': 1,...","{'registration': 3, 'open': 1, 'selection': 1,..."
5,"[technical, training, scholarship, level, iv, ...","{'technical': 1, 'training': 1, 'scholarship':...","{'technical': 1, 'training': 1, 'scholarship':..."
6,"[candidate, desired, skills, experience, molec...","{'candidate': 1, 'desired': 2, 'skills': 1, 'e...","{'candidate': 1, 'desired': 2, 'skill': 1, 'ex..."
7,"[prerequisites, graduation, biology, zootechni...","{'prerequisites': 1, 'graduation': 1, 'biology...","{'prerequisite': 1, 'graduation': 1, 'biology'..."
8,"[consultancy, specialized, application, develo...","{'consultancy': 1, 'specialized': 1, 'applicat...","{'consultancy': 1, 'specialized': 1, 'applicat..."
9,"[collaborative, project, within, department, b...","{'collaborative': 1, 'project': 1, 'within': 1...","{'collaborative': 1, 'project': 1, 'within': 1..."


### Mapeamento do Corpus em Dicionário

Vamos passar a usar números para representar cada token, por meio da criação de um `dicionario_corpus`.

In [130]:
from gensim.corpora.dictionary import Dictionary

In [131]:
dicionario_corpus = Dictionary(X['opo_texto_tokens'].tolist() + X['opo_texto_tokens_lem'].tolist())

Resultado do mapeamento:

In [132]:
print('Dicionario do corpus:\n\n',
      {k: v for i, (k, v) in enumerate(dicionario_corpus.token2id.items()) if i < 80}, '\n...', sep='')

Dicionario do corpus:

{'15': 0, '2021': 1, '22': 2, '238': 3, '30': 4, 'applications': 5, 'call': 6, 'caption': 7, 'closing': 8, 'community': 9, 'conservation': 10, 'consultant': 11, 'credit': 12, 'date': 13, 'develop': 14, 'development': 15, 'due': 16, 'engage': 17, 'english': 18, 'gender': 19, 'guidance': 20, 'information': 21, 'innovative': 22, 'intends': 23, 'interviewing': 24, 'kb': 25, 'knowledge': 26, 'march': 27, 'member': 28, 'net': 29, 'opening': 30, 'overview': 31, 'pertaining': 32, 'product': 33, 'proposals': 34, 'provide': 35, 'questions': 36, 'sent': 37, 'strengthen': 38, 'strengthening': 39, 'submissions': 40, 'voices': 41, 'women': 42, '1': 43, '10': 44, '2': 45, '202': 46, '3': 47, '373': 48, '378': 49, '4': 50, '5': 51, '553': 52, '6230': 53, '7': 54, 'activity': 55, 'amount': 56, 'announces': 57, 'annual': 58, 'application': 59, 'available': 60, 'based': 61, 'big': 62, 'br': 63, 'calls': 64, 'catching': 65, 'chair': 66, 'communicate': 67, 'contact': 68, 'convergence

Exemplo de consulta ao dicionário:

In [133]:
dicionario_corpus.token2id['grant']

1013

In [134]:
dicionario_corpus.get(90)

'governance'

### Bag of Words com Dicionário

Vamos criar duas novas colunas fazendo `bag of words` de pares de inteiros para o texto normal e para o lematizado.
O primeiro elemento deste par é o `id` do token no `dicionario_corpus` e o segundo elemento é a contagem de ocorrências deste token no documento.

Estamos convencionando chamar as colunas inteiras de `'opo_int_...'`

In [135]:
# Criação dos bag of words para o texto normal e lematizado
X['opo_int_bow'] = X['opo_texto_tokens'].apply(dicionario_corpus.doc2bow)
X['opo_int_bow_lem'] = X['opo_texto_tokens_lem'].apply(dicionario_corpus.doc2bow)

Resultado dos bag of words após mapeamento em dicionário

In [136]:
X[['opo_texto_tokens', 'opo_int_bow', 'opo_int_bow_lem']].head(15)

Unnamed: 0,opo_texto_tokens,opo_int_bow,opo_int_bow_lem
0,"[gender, 2021, caption, interviewing, communit...","[(0, 1), (1, 4), (2, 1), (3, 1), (4, 1), (5, 1...","[(0, 1), (1, 4), (2, 1), (3, 1), (4, 1), (6, 2..."
1,"[department, scientific, technological, policy...","[(1, 1), (6, 1), (21, 1), (43, 2), (44, 2), (4...","[(1, 1), (6, 2), (21, 1), (43, 2), (44, 2), (4..."
2,"[vacancy, graduates, technical, course, studen...","[(15, 1), (21, 2), (61, 3), (63, 1), (70, 2), ...","[(15, 1), (21, 2), (55, 1), (61, 3), (63, 1), ..."
3,"[scholarship, lasts, two, years, renewed, anot...","[(44, 3), (48, 1), (54, 1), (55, 1), (56, 1), ...","[(44, 3), (48, 1), (54, 1), (55, 3), (56, 1), ..."
4,"[registration, open, selection, one, 01, schol...","[(26, 1), (37, 2), (43, 2), (45, 2), (47, 3), ...","[(26, 1), (37, 2), (43, 2), (45, 2), (47, 3), ..."
5,"[technical, training, scholarship, level, iv, ...","[(15, 3), (60, 1), (63, 1), (124, 2), (132, 1)...","[(15, 3), (60, 1), (63, 1), (124, 2), (132, 1)..."
6,"[candidate, desired, skills, experience, molec...","[(18, 1), (26, 1), (44, 2), (47, 1), (48, 1), ...","[(18, 1), (26, 1), (44, 2), (47, 1), (48, 1), ..."
7,"[prerequisites, graduation, biology, zootechni...","[(26, 1), (63, 1), (70, 1), (124, 2), (140, 1)...","[(26, 1), (63, 1), (70, 1), (124, 2), (140, 1)..."
8,"[consultancy, specialized, application, develo...","[(15, 1), (20, 1), (44, 1), (59, 1), (61, 1), ...","[(15, 1), (20, 1), (44, 1), (55, 1), (59, 1), ..."
9,"[collaborative, project, within, department, b...","[(15, 1), (63, 2), (71, 1), (77, 1), (93, 1), ...","[(15, 1), (55, 1), (63, 2), (71, 1), (77, 1), ..."


### TF-IDF

Term Frequency - Inverse Document Frequency

In [137]:
from gensim.models.tfidfmodel import TfidfModel

In [138]:
def tfdif_palavras_mais_representativas(col_tfidf, dicionario, top=5):
    palavras_mais_repr = []
    for idx, tfidf_doc in enumerate(col_tfidf):
        palavras =[]
        # lista ordenada pelo peso tfidf do termo
        tfidf_desc = sorted(tfidf_doc, key=lambda termo: termo[1], reverse=True)

        # lista no tamanho especificado
        tfidf_desc_tam = tfidf_desc[:top]

        # conversão dos tokenids para palavras
        palavras = [(dicionario.get(tokenid), peso) for tokenid, peso in tfidf_desc_tam]
        palavras_mais_repr.append({f'Palavra_Rank_{rank + 1}': palavras[rank] for rank in range(len(palavras))})

    return pd.DataFrame(palavras_mais_repr)

In [139]:
def gera_tfidf_mais_representativos(serie_int_bow, tam=10):
    """Gera um dataframe com os dados de tfidf
    para os `tam` tokens mais representativos de cada documento"""
    
    corpus = serie_int_bow.to_list()
    tfidf = TfidfModel(corpus=corpus)
    
    tfidf_docs = []
    for idx, doc in enumerate(corpus):
        tfidf_doc = tfidf[doc]
    
        # lista ordenada pelo peso tfidf do termo
        tfidf_desc = sorted(tfidf_doc, key=lambda termo: termo[1], reverse=True)

        # lista no tamanho especificado
        tfidf_desc_tam = tfidf_desc[:tam]

        tfidf_docs.append({f'tdidf_desc_tam_{tam}': tfidf_desc_tam})
        
    return pd.DataFrame(tfidf_docs)

In [140]:
# Gera colunas com os tfidfs para cada documento
X['opo_int_tfidf'] = gera_tfidf_mais_representativos(X['opo_int_bow'], tam=30)
X['opo_int_tfidf_lem'] = gera_tfidf_mais_representativos(X['opo_int_bow_lem'], tam=30)

Resultado do TF-IDF para o corpus normal e o lematizado

In [141]:
X[['opo_int_tfidf', 'opo_int_tfidf_lem']].head(4)

Unnamed: 0,opo_int_tfidf,opo_int_tfidf_lem
0,"[(41, 0.36756587588835876), (33, 0.33520330695...","[(5064, 0.38837366192956346), (33, 0.311098952..."
1,"[(128, 0.34398110623892564), (92, 0.3060948373...","[(128, 0.3581655100099124), (92, 0.31421016951..."
2,"[(164, 0.45424157947496496), (198, 0.287162733...","[(164, 0.4642261881533202), (198, 0.2934748096..."
3,"[(259, 0.25464027448556886), (124, 0.191327449...","[(259, 0.26564610890151963), (226, 0.179891246..."


Checando as palavras mais importantes por documento, segundo seu TF-IDF

In [142]:
tfdif_palavras_mais_representativas(X['opo_int_tfidf_lem'], dicionario_corpus, top=8).head(6)

Unnamed: 0,Palavra_Rank_1,Palavra_Rank_2,Palavra_Rank_3,Palavra_Rank_4,Palavra_Rank_5,Palavra_Rank_6,Palavra_Rank_7,Palavra_Rank_8
0,"(voice, 0.38837366192956346)","(product, 0.3110989527314427)","(conservation, 0.2827228125679771)","(238, 0.22341486140644695)","(caption, 0.22341486140644695)","(interviewing, 0.22341486140644695)","(credit, 0.19418683096478173)","(engage, 0.19418683096478173)"
1,"(spec, 0.3581655100099124)","(innovation, 0.3142101695125779)","(technological, 0.249367733120263)","(policy, 0.17954866829290167)","(202, 0.1790827550049562)","(378, 0.1790827550049562)","(553, 0.1790827550049562)","(6230, 0.1790827550049562)"
2,"(fish, 0.4642261881533202)","(slaughterhouse, 0.2934748096072516)","(trader, 0.2934748096072516)","(mathematical, 0.1659341820794618)","(industry, 0.15583535627011538)","(agrarian, 0.1467374048036258)","(codification, 0.1467374048036258)","(debate, 0.1467374048036258)"
3,"(philosophy, 0.26564610890151963)","(chapter, 0.17989124683007748)","(specifying, 0.17989124683007748)","(studied, 0.17989124683007748)","(table, 0.17989124683007748)","(translated, 0.17989124683007748)","(variation, 0.17989124683007748)","(required, 0.1775346207759335)"
4,"(computing, 0.41388045847819366)","(30982, 0.19815141229263444)","(updating, 0.19815141229263444)","(scholarship, 0.1904302283248926)","(br, 0.17245963182909174)","(automation, 0.1722284478394696)","(diploma, 0.1722284478394696)","(script, 0.1722284478394696)"
5,"(iv, 0.3744760568493732)","(back, 0.2907742650502568)","(proficient, 0.21326815311297048)","(prototype, 0.21326815311297048)","(refactoring, 0.21326815311297048)","(scheduling, 0.21326815311297048)","(press, 0.18536755584659836)","(rest, 0.18536755584659836)"


### Conjuntos de Treinamento e de Teste

In [143]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [144]:
X_train, X_test, y_train, y_test = train_test_split(X['opo_texto_sem_stop'], y, test_size=0.33)

In [145]:
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize a TfidfVectorizer object: tfidf_vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_df=0.7)

# Transform the training data: tfidf_train 
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data: tfidf_test 
tfidf_test = tfidf_vectorizer.transform(X_test)

# Print the first 10 features
print(tfidf_vectorizer.get_feature_names()[:50])

# Print the first 5 vectors of the tfidf training data
print(tfidf_train.A[:15])

# Create the CountVectorizer DataFrame: count_df
#count_df = pd.DataFrame(count_train.A, columns=count_vectorizer.get_feature_names())

# Create the TfidfVectorizer DataFrame: tfidf_df
tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vectorizer.get_feature_names())
tfidf_df

['00', '000', '001', '01', '012', '018', '019', '02', '025', '03', '04', '040', '05', '057', '06', '0663', '07', '08', '09', '093', '10', '100', '104', '11', '110', '113', '12', '120', '12120', '123', '125', '12549', '12689', '129', '13', '131', '14', '140', '148', '15', '150', '151', '153', '16', '165', '17', '17th', '18', '180', '189']
[[0.         0.02316647 0.         ... 0.         0.         0.        ]
 [0.         0.00924536 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.08391568 0.01646339 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


Unnamed: 0,00,000,001,01,012,018,019,02,025,03,...,yearly,years,yen,yields,york,young,youth,yr,zone,zootechnics
0,0.000000,0.023166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.015444,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.000000,0.009245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.026855,0.018491,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.018653,0.027752,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.011894,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,0.000000,0.015230,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.030460,0.0,0.0,0.0,0.027722,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129,0.000000,0.039211,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
130,0.018948,0.036245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
131,0.000000,0.056008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.037339,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
132,0.000000,0.014802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.044405,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


### Métricas

In [146]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

In [147]:
def avalia_resultado(y_test, y_pred):
    print(f' Acurácia:\t{100 * accuracy_score(y_test, y_pred):.2f} %')
    print(" Matriz de Confusão:\n", confusion_matrix(y_test, y_pred, labels=['N', 'Y']))
    print(" Relatório de classificação:\n", classification_report(y_test, y_pred, labels=['N', 'Y']))

### Divisão dos dados

In [148]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [149]:
from sklearn.naive_bayes import MultinomialNB

In [150]:
#X_train, X_test, y_train, y_test = train_test_split(X['opo_texto_sem_stop'], y, test_size=0.33)
X_train, X_test, y_train, y_test = train_test_split(
    X['opo_texto_sem_stop_lem'], y, stratify=y, test_size=0.25) #

In [151]:
print('Distribuição de classes')
dist_classes = pd.DataFrame({'Treino': dict(Counter(y_train)), 'Teste': dict(Counter(y_test))})
dist_classes['Treino%'] = dist_classes['Treino'].div(dist_classes['Treino'].sum()).mul(100)
dist_classes['Teste%'] = dist_classes['Teste'].div(dist_classes['Teste'].sum()).mul(100)
# dist_classes

Distribuição de classes


Unnamed: 0,Treino,Teste,Treino%,Teste%
clas,1,1,100.0,100.0


### Classificação com Naive Bayes

#### Naive Bayes com Bag of Words

In [152]:
count_vectorizer = CountVectorizer(stop_words="english")
X_bow_train = count_vectorizer.fit_transform(X_train)
X_bow_test = count_vectorizer.transform(X_test)

In [153]:
def classifica_NB_bow_alpha(alpha=1):
    """Classifica TF-DF para diferentes valores de alpha"""
    print(f'\n{"-"*45}')
    print(f'Naive Bayes - BoW')
    print(f'{"-"*45}')
    classificador_bow = MultinomialNB(alpha=alpha)
    classificador_bow.fit(X_bow_train, y_train.values.ravel())
    y_pred_bow = classificador_bow.predict(X_bow_test)
    avalia_resultado(y_test, y_pred_bow)

In [178]:
# Varia o parâmetro alpha para checar qual o melhor
alphas = np.arange(0.01, 1, 0.2)
for alpha in alphas:
    classifica_NB_bow_alpha(alpha=alpha)
    # 0.01, 0.21, 0.41, 0.61, 0.81

[0.01 0.21 0.41 0.61 0.81]

---------------------------------------------
Naive Bayes - BoW
---------------------------------------------
 Acurácia:	82.00 %
 Matriz de Confusão:
 [[15  3]
 [ 6 26]]
 Relatório de classificação:
               precision    recall  f1-score   support

           N       0.71      0.83      0.77        18
           Y       0.90      0.81      0.85        32

    accuracy                           0.82        50
   macro avg       0.81      0.82      0.81        50
weighted avg       0.83      0.82      0.82        50


---------------------------------------------
Naive Bayes - BoW
---------------------------------------------
 Acurácia:	78.00 %
 Matriz de Confusão:
 [[17  1]
 [10 22]]
 Relatório de classificação:
               precision    recall  f1-score   support

           N       0.63      0.94      0.76        18
           Y       0.96      0.69      0.80        32

    accuracy                           0.78        50
   macro avg       0.79   

#### Naive Bayes com TF-IDF

In [155]:
import numpy as np

In [156]:
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
X_tfidf_train = tfidf_vectorizer.fit_transform(X_train)
X_tfidf_test = tfidf_vectorizer.transform(X_test)

In [157]:
def classifica_NB_tfidf_alpha(alpha, verb=True):
    """Classifica TF-DF para diferentes valores de alpha"""
    print(f'\n{"-"*55}')
    print(f'Naive Bayes - TF-IDF')
    print(f'{"-"*55}\nAlpha = {alpha:.2f}:\n{"-"*55}')
    classificador_tfidf = MultinomialNB(alpha=alpha)
    classificador_tfidf.fit(X_tfidf_train, y_train.values.ravel())
    y_pred_tfidf = classificador_tfidf.predict(X_tfidf_test)
    if verb:
        avalia_resultado(y_test, y_pred_tfidf)
    
    return classificador_tfidf

In [158]:
# Varia o parâmetro alpha para checar qual o melhor
alphas = np.arange(0.01, 1, 0.2)
alphas

array([0.01, 0.21, 0.41, 0.61, 0.81])

In [159]:
for alpha in alphas:
    classifica_NB_tfidf_alpha(alpha)


-------------------------------------------------------
Naive Bayes - TF-IDF
-------------------------------------------------------
Alpha = 0.01:
-------------------------------------------------------
 Acurácia:	86.00 %
 Matriz de Confusão:
 [[15  3]
 [ 4 28]]
 Relatório de classificação:
               precision    recall  f1-score   support

           N       0.79      0.83      0.81        18
           Y       0.90      0.88      0.89        32

    accuracy                           0.86        50
   macro avg       0.85      0.85      0.85        50
weighted avg       0.86      0.86      0.86        50


-------------------------------------------------------
Naive Bayes - TF-IDF
-------------------------------------------------------
Alpha = 0.21:
-------------------------------------------------------
 Acurácia:	76.00 %
 Matriz de Confusão:
 [[11  7]
 [ 5 27]]
 Relatório de classificação:
               precision    recall  f1-score   support

           N       0.69      0

In [160]:
# Extraindo as características do melhor modelo
# alpha foi 0.01
nb_classifier = classifica_NB_tfidf_alpha(alpha=0.01, verb=False)

labels = nb_classifier.classes_
feature_names = tfidf_vectorizer.get_feature_names()
feat_with_weights = sorted(zip(nb_classifier.coef_[0], feature_names))

print(f'{labels[0]}:\n{feat_with_weights[:15]}')
print(f'\n{labels[1]}:\n{feat_with_weights[-15:]}')


-------------------------------------------------------
Naive Bayes - TF-IDF
-------------------------------------------------------
Alpha = 0.01:
-------------------------------------------------------
N:
[(-11.491381210629868, '09'), (-11.491381210629868, '127'), (-11.491381210629868, '129'), (-11.491381210629868, '17th'), (-11.491381210629868, '20211'), (-11.491381210629868, '202117'), (-11.491381210629868, '202122'), (-11.491381210629868, '20222021'), (-11.491381210629868, '2030'), (-11.491381210629868, '222'), (-11.491381210629868, '2329'), (-11.491381210629868, '238'), (-11.491381210629868, '250'), (-11.491381210629868, '354'), (-11.491381210629868, '371')]

Y:
[(-5.706052659827187, 'acceptance'), (-5.6530697358048325, 'grant'), (-5.615985102368938, 'letter'), (-5.5612565809041214, 'country'), (-5.557736143665154, 'science'), (-5.540016450905657, 'organisation'), (-5.531964545281696, 'year'), (-5.521393413465941, 'br'), (-5.484795388809013, 'project'), (-5.314406280219622, 'scho

### SVM

In [161]:
from sklearn.svm import SVC

In [162]:
tipos_svn = ['linear', 'rbf', 'sigmoid']
C = [0.1, 0.3, 0.6, 0.8]

In [163]:
def avalia_svm(descricao, tipo, X_train, y_train, X_test, y_test, C):
    print(f'\n{"-"*55}\nSVM - {descricao}\n{"-"*55}')
    print(f'Kernel = {tipo}, C = {C}\n')
    clf = SVC(kernel=tipo, C=C)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    avalia_resultado(y_test, y_pred)

#### SVM - Bag of Words

In [164]:
for tipo in tipos_svn:
    for c in C:
        avalia_svm("Bag of Words", tipo, X_bow_train, y_train.values.ravel(), X_bow_test, y_test, C=c)


-------------------------------------------------------
SVM - Bag of Words
-------------------------------------------------------
Kernel = linear, C = 0.1

 Acurácia:	78.00 %
 Matriz de Confusão:
 [[12  6]
 [ 5 27]]
 Relatório de classificação:
               precision    recall  f1-score   support

           N       0.71      0.67      0.69        18
           Y       0.82      0.84      0.83        32

    accuracy                           0.78        50
   macro avg       0.76      0.76      0.76        50
weighted avg       0.78      0.78      0.78        50


-------------------------------------------------------
SVM - Bag of Words
-------------------------------------------------------
Kernel = linear, C = 0.3

 Acurácia:	78.00 %
 Matriz de Confusão:
 [[12  6]
 [ 5 27]]
 Relatório de classificação:
               precision    recall  f1-score   support

           N       0.71      0.67      0.69        18
           Y       0.82      0.84      0.83        32

    accuracy 

#### SVM - TFIDF

In [165]:
for tipo in tipos_svn:
    for c in C:
        avalia_svm("TF-IDF", tipo, X_tfidf_train, y_train.values.ravel(), X_tfidf_test,y_test, C=c)


-------------------------------------------------------
SVM - TF-IDF
-------------------------------------------------------
Kernel = linear, C = 0.1

 Acurácia:	64.00 %
 Matriz de Confusão:
 [[ 0 18]
 [ 0 32]]
 Relatório de classificação:
               precision    recall  f1-score   support

           N       0.00      0.00      0.00        18
           Y       0.64      1.00      0.78        32

    accuracy                           0.64        50
   macro avg       0.32      0.50      0.39        50
weighted avg       0.41      0.64      0.50        50


-------------------------------------------------------
SVM - TF-IDF
-------------------------------------------------------
Kernel = linear, C = 0.3

 Acurácia:	68.00 %
 Matriz de Confusão:
 [[ 2 16]
 [ 0 32]]
 Relatório de classificação:
               precision    recall  f1-score   support

           N       1.00      0.11      0.20        18
           Y       0.67      1.00      0.80        32

    accuracy             

### Random Forest

In [166]:
from numpy.core.umath_tests import inner1d
from sklearn.ensemble import RandomForestClassifier

In [167]:
n_estimadores = [5, 10, 100, 500, 1000]

In [168]:
def avalia_random_forest(descricao, X_train, y_train, X_test, n_est):
    print(f'\n{"-"*60}\nRandom Forest - {descricao}\n{"-"*60}')
    print(f'No estimadores = {n_est}\n')
    classifier = RandomForestClassifier(n_estimators=n_est)
    classifier.fit(X_train, y_train) 
    y_pred = classifier.predict(X_test)
    avalia_resultado(y_test, y_pred)

#### Random Forest - Bag of Words

In [169]:
for n_est in n_estimadores:
    avalia_random_forest('BoW', X_bow_train, y_train.values.ravel(), X_bow_test, n_est=n_est)


------------------------------------------------------------
Random Forest - BoW
------------------------------------------------------------
No estimadores = 5

 Acurácia:	68.00 %
 Matriz de Confusão:
 [[ 6 12]
 [ 4 28]]
 Relatório de classificação:
               precision    recall  f1-score   support

           N       0.60      0.33      0.43        18
           Y       0.70      0.88      0.78        32

    accuracy                           0.68        50
   macro avg       0.65      0.60      0.60        50
weighted avg       0.66      0.68      0.65        50


------------------------------------------------------------
Random Forest - BoW
------------------------------------------------------------
No estimadores = 10

 Acurácia:	82.00 %
 Matriz de Confusão:
 [[14  4]
 [ 5 27]]
 Relatório de classificação:
               precision    recall  f1-score   support

           N       0.74      0.78      0.76        18
           Y       0.87      0.84      0.86        32

  

#### Random Forest -TF-IDF

In [170]:
for n_est in n_estimadores:
    avalia_random_forest('BoW', X_tfidf_train, y_train.values.ravel(), X_tfidf_test, n_est=n_est)


------------------------------------------------------------
Random Forest - BoW
------------------------------------------------------------
No estimadores = 5

 Acurácia:	72.00 %
 Matriz de Confusão:
 [[11  7]
 [ 7 25]]
 Relatório de classificação:
               precision    recall  f1-score   support

           N       0.61      0.61      0.61        18
           Y       0.78      0.78      0.78        32

    accuracy                           0.72        50
   macro avg       0.70      0.70      0.70        50
weighted avg       0.72      0.72      0.72        50


------------------------------------------------------------
Random Forest - BoW
------------------------------------------------------------
No estimadores = 10

 Acurácia:	82.00 %
 Matriz de Confusão:
 [[13  5]
 [ 4 28]]
 Relatório de classificação:
               precision    recall  f1-score   support

           N       0.76      0.72      0.74        18
           Y       0.85      0.88      0.86        32

  

### Cross-Validation BOW

In [198]:
from sklearn.model_selection import cross_val_score
models = [MultinomialNB(alpha=0.01), # 0.01, 0.21, 0.41, 0.61, 0.81
MultinomialNB(alpha=0.21),MultinomialNB(alpha=0.41),MultinomialNB(alpha=0.61),MultinomialNB(alpha=0.81),
# tipos_svn = ['linear', 'rbf', 'sigmoid'] C = [0.1, 0.3, 0.6, 0.8]
 SVC(kernel = 'linear', C = 0.1),SVC(kernel = 'linear', C = 0.3), SVC(kernel = 'linear', C = 0.6),SVC(kernel = 'linear', C = 0.8),
 SVC(kernel = 'rbf', C = 0.1),SVC(kernel = 'rbf', C = 0.3), SVC(kernel = 'rbf', C = 0.6),SVC(kernel = 'rbf', C = 0.8),
 SVC(kernel = 'sigmoid', C = 0.1),SVC(kernel = 'sigmoid', C = 0.3), SVC(kernel = 'sigmoid', C = 0.6),SVC(kernel = 'sigmoid', C = 0.8),
 #[5, 10, 100, 500, 1000]
RandomForestClassifier(5),RandomForestClassifier(10),RandomForestClassifier(100),RandomForestClassifier(500),RandomForestClassifier(1000)]
# score de cada modelo
models_scores = []
for model in models:
    val_scores = cross_val_score(model, X_bow_train, y_train.values.ravel(), cv=50)
    #nome_modelo = type(model).__name__ # somente para exibição
    print(model)
    #parametro_modelo=type(model).__kargs__
    print('Média: {:.2} | Desvio: {:.2}'.format( np.mean(val_scores), np.std(val_scores)))

MultinomialNB(alpha=0.01)
Média: 0.79 | Desvio: 0.24
MultinomialNB(alpha=0.21)
Média: 0.76 | Desvio: 0.24
MultinomialNB(alpha=0.41)
Média: 0.75 | Desvio: 0.24
MultinomialNB(alpha=0.61)
Média: 0.75 | Desvio: 0.24
MultinomialNB(alpha=0.81)
Média: 0.75 | Desvio: 0.24
SVC(C=0.1, kernel='linear')
Média: 0.78 | Desvio: 0.23
SVC(C=0.3, kernel='linear')
Média: 0.77 | Desvio: 0.23
SVC(C=0.6, kernel='linear')
Média: 0.77 | Desvio: 0.23
SVC(C=0.8, kernel='linear')
Média: 0.77 | Desvio: 0.23
SVC(C=0.1)
Média: 0.65 | Desvio: 0.065
SVC(C=0.3)
Média: 0.65 | Desvio: 0.065
SVC(C=0.6)
Média: 0.65 | Desvio: 0.079
SVC(C=0.8)
Média: 0.68 | Desvio: 0.15
SVC(C=0.1, kernel='sigmoid')
Média: 0.65 | Desvio: 0.065
SVC(C=0.3, kernel='sigmoid')
Média: 0.61 | Desvio: 0.14
SVC(C=0.6, kernel='sigmoid')
Média: 0.6 | Desvio: 0.15
SVC(C=0.8, kernel='sigmoid')
Média: 0.59 | Desvio: 0.14
RandomForestClassifier(n_estimators=5)
Média: 0.78 | Desvio: 0.2
RandomForestClassifier(n_estimators=10)
Média: 0.79 | Desvio: 0.22
Rand

In [172]:

# Cross-Validation TFIDF


In [199]:
models = [MultinomialNB(alpha=0.01), # 0.01, 0.21, 0.41, 0.61, 0.81
MultinomialNB(alpha=0.21),MultinomialNB(alpha=0.41),MultinomialNB(alpha=0.61),MultinomialNB(alpha=0.81),
# tipos_svn = ['linear', 'rbf', 'sigmoid'] C = [0.1, 0.3, 0.6, 0.8]
 SVC(kernel = 'linear', C = 0.1),SVC(kernel = 'linear', C = 0.3), SVC(kernel = 'linear', C = 0.6),SVC(kernel = 'linear', C = 0.8),
 SVC(kernel = 'rbf', C = 0.1),SVC(kernel = 'rbf', C = 0.3), SVC(kernel = 'rbf', C = 0.6),SVC(kernel = 'rbf', C = 0.8),
 SVC(kernel = 'sigmoid', C = 0.1),SVC(kernel = 'sigmoid', C = 0.3), SVC(kernel = 'sigmoid', C = 0.6),SVC(kernel = 'sigmoid', C = 0.8),
 #[5, 10, 100, 500, 1000]
RandomForestClassifier(5),RandomForestClassifier(10),RandomForestClassifier(100),RandomForestClassifier(500),RandomForestClassifier(1000)]
# score de cada modelo
models_scores = []
for model in models:
    val_scores = cross_val_score(model, X_tfidf_train, y_train.values.ravel(), cv=50)
    print(model)
    #parametro_modelo=type(model).__kargs__
    print('Média: {:.2} | Desvio: {:.2}'.format( np.mean(val_scores), np.std(val_scores)))

MultinomialNB(alpha=0.01)
Média: 0.75 | Desvio: 0.24
MultinomialNB(alpha=0.21)
Média: 0.75 | Desvio: 0.25
MultinomialNB(alpha=0.41)
Média: 0.77 | Desvio: 0.24
MultinomialNB(alpha=0.61)
Média: 0.71 | Desvio: 0.21
MultinomialNB(alpha=0.81)
Média: 0.7 | Desvio: 0.19
SVC(C=0.1, kernel='linear')
Média: 0.65 | Desvio: 0.065
SVC(C=0.3, kernel='linear')
Média: 0.68 | Desvio: 0.11
SVC(C=0.6, kernel='linear')
Média: 0.71 | Desvio: 0.23
SVC(C=0.8, kernel='linear')
Média: 0.77 | Desvio: 0.23
SVC(C=0.1)
Média: 0.65 | Desvio: 0.065
SVC(C=0.3)
Média: 0.65 | Desvio: 0.065
SVC(C=0.6)
Média: 0.68 | Desvio: 0.15
SVC(C=0.8)
Média: 0.73 | Desvio: 0.2
SVC(C=0.1, kernel='sigmoid')
Média: 0.65 | Desvio: 0.065
SVC(C=0.3, kernel='sigmoid')
Média: 0.65 | Desvio: 0.065
SVC(C=0.6, kernel='sigmoid')
Média: 0.67 | Desvio: 0.24
SVC(C=0.8, kernel='sigmoid')
Média: 0.74 | Desvio: 0.23
RandomForestClassifier(n_estimators=5)
Média: 0.78 | Desvio: 0.2
RandomForestClassifier(n_estimators=10)
Média: 0.77 | Desvio: 0.23
Rand

### Resultados