# Importando Pacotes

In [1]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
import strsimpy
import numpy as np
from strsimpy.normalized_levenshtein import NormalizedLevenshtein
from strsimpy.jaro_winkler import JaroWinkler
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors

# Paths

In [2]:
path_bases = "C:\\Users\\aamma\\OneDrive\\ENCE\\TCC\\Bases"
path_embeddings = "C:\\Users\\aamma\\OneDrive\\ENCE\\TCC\\Embeddings"
path_resultados = "C:\\Users\\aamma\\OneDrive\\ENCE\\TCC\\Resultados"

# Leitura de Bases

In [3]:
os.chdir(path_bases)
descricoes_pof = pd.read_excel("Descricoes_POF.xlsx")
descricoes_snipc = pd.read_excel("Descricoes_SNIPC.xlsx")
descricoes_pof = descricoes_pof.values.tolist() # Converte dataframe para lista de listas
descricoes_snipc = descricoes_snipc.values.tolist() # Coverte dataframe para lista de listas
descricoes_pof = [item for sublist in descricoes_pof for item in sublist] # Converte lista de listas para lista
descricoes_snipc = [item for sublist in descricoes_snipc for item in sublist] # Converte lista de listas para lista

# Criando Matrizes de Similaridade

## Separando Descrições por Pesquisa

## Levenshtein

In [4]:
df_levenshtein = np.zeros((len(descricoes_pof), len(descricoes_snipc)))
df_levenshtein = pd.DataFrame(df_levenshtein, columns = descricoes_snipc,
index = descricoes_pof)

## Jaro

In [5]:
df_jaro = np.zeros((len(descricoes_pof), len(descricoes_snipc)))
df_jaro = pd.DataFrame(df_jaro, columns = descricoes_snipc,
index = descricoes_pof)

## Jaccard

In [6]:
df_jaccard = np.zeros((len(descricoes_pof), len(descricoes_snipc)))
df_jaccard = pd.DataFrame(df_jaccard, columns = descricoes_snipc,
index = descricoes_pof)

## tf-idf

In [7]:
df_tfidf = np.zeros((len(descricoes_pof), len(descricoes_snipc)))
df_tfidf = pd.DataFrame(df_tfidf, columns = descricoes_snipc,
index = descricoes_pof)

## Word2Vec

In [14]:
df_word2vec_soma = np.zeros((len(descricoes_pof), len(descricoes_snipc)))
df_word2vec_soma = pd.DataFrame(df_word2vec_soma, columns = descricoes_snipc,
index = descricoes_pof)
df_word2vec_media = np.zeros((len(descricoes_pof), len(descricoes_snipc)))
df_word2vec_media = pd.DataFrame(df_word2vec_media, columns = descricoes_snipc,
index = descricoes_pof)

# Calculando Similaridade

## Levenshtein

In [9]:
levenshtein = NormalizedLevenshtein()
contador = 0
for palavra_pof in df_levenshtein.index:
    for palavra_snipc in df_levenshtein.columns:
        df_levenshtein.at[palavra_pof, palavra_snipc] = levenshtein.similarity(palavra_pof, palavra_snipc)
    contador = contador + 1
    if contador % 700 == 0 or contador == 3391:
        print(contador / df_levenshtein.shape[0] * 100, "%")

20.642878207018576 %
41.28575641403715 %
61.928634621055735 %
82.5715128280743 %
100.0 %


In [10]:
path_resultados = "C:\\Users\\aamma\\OneDrive\\ENCE\\TCC\\Resultados"
os.chdir(path_resultados)
df_levenshtein.to_excel("Levenshtein_MS.xlsx")

## Jaro

In [12]:
jaro = JaroWinkler()
contador = 0
for palavra_pof in df_jaro.index:
    for palavra_snipc in df_jaro.columns:
        df_jaro.at[palavra_pof, palavra_snipc] = jaro.similarity(palavra_pof, palavra_snipc)
    contador = contador + 1
    if contador % 700 == 0 or contador == 3391:
        print(contador / df_jaro.shape[0] * 100, "%")

20.642878207018576 %
41.28575641403715 %
61.928634621055735 %
82.5715128280743 %
100.0 %


In [13]:
os.chdir(path_resultados)
df_jaro.to_excel("Jaro_MS.xlsx")

## Jaccard

In [14]:
contador = 0
for palavra_pof in df_jaccard.index:
    for palavra_snipc in df_jaccard.columns:
        tok_pof = set(palavra_pof.split())
        tok_snipc = set(palavra_snipc.split())
        numerador = tok_pof & tok_snipc
        denominador = tok_pof | tok_snipc
        df_jaccard.at[palavra_pof, palavra_snipc] = len(numerador) / len(denominador)
    contador = contador + 1
    if contador % 700 == 0 or contador == 3391:
        print(contador / df_jaccard.shape[0] * 100, "%")

20.642878207018576 %
41.28575641403715 %
61.928634621055735 %
82.5715128280743 %
100.0 %


In [15]:
os.chdir(path_resultados)
df_jaccard.to_excel("Jaccard_MS.xlsx")

## TF-IDF

In [16]:
vectorizer = TfidfVectorizer()

# Juntando Todas as Descrições e Removendo duplicatas
todas_descricoes = descricoes_pof + descricoes_snipc
todas_descricoes = pd.Series(todas_descricoes).drop_duplicates().to_list()

# Calculando valores TF-IDF
tfidf_matrix = vectorizer.fit_transform(todas_descricoes)

# Transformando Matriz TF-IDF em DF
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=todas_descricoes, columns=vectorizer.get_feature_names_out())

# Filtrando por pesquisas
tfidf_snipc = tfidf_df.filter(items = descricoes_snipc, axis = 0)
tfidf_pof = tfidf_df.filter(items = descricoes_pof, axis = 0)

## Calculando Cosseno e transformando em DF
cosine_sim = cosine_similarity(tfidf_pof, tfidf_snipc)
df_tfidf = pd.DataFrame(cosine_sim, index = descricoes_pof, columns = descricoes_snipc)

In [17]:
os.chdir(path_resultados)
df_tfidf.to_excel("TFIDF_MS.xlsx")

## Word2Vec

### Lendo Embeddings

In [3]:
os.chdir(path_embeddings)
os.getcwd()
model = KeyedVectors.load_word2vec_format("skip_s300.txt")

# Testando base nova (sem sentencas de palavras unicas nao contidas nos embeddings)
os.chdir(path_bases)
descricoes_pof = pd.read_excel("Descricoes_POF_CE.xlsx")
descricoes_snipc = pd.read_excel("Descricoes_SNIPC_CE.xlsx")
descricoes_pof = descricoes_pof.values.tolist() # Converte dataframe para lista de listas
descricoes_snipc = descricoes_snipc.values.tolist() # Coverte dataframe para lista de listas
descricoes_pof = [item for sublist in descricoes_pof for item in sublist] # Converte lista de listas para lista
descricoes_snipc = [item for sublist in descricoes_snipc for item in sublist] # Converte lista de listas para lista

In [3]:
tokens_all = pd.read_excel("Tokens_All_CE.xlsx")
tokens_all = tokens_all.values.tolist()
tokens_all = [item for sublist in tokens_all for item in sublist]
vocab_set = set(model.index_to_key)
keep_set = set(tokens_all)
drop_set = vocab_set - keep_set
for word in drop_set:
    del model.key_to_index[word]
model.save_word2vec_format("testando_emb.txt", binary=False)

NameError: name 'os' is not defined

In [48]:
model["samurai"]

KeyError: "Key 'samurai' not present"

### Word2Vec Soma

In [5]:
def embedding_soma(string):
    palavras = string.split()
    palavras = [item for item in palavras if item in model] # remove palavras não contidas no modelo note que seguindo dessa forma a sentenca "samurai caxaco" vai ter o mesmo embedding que "samurai" pois "caxaco" nao está contido no embedding
    resultado = sum([model[palavra] for palavra in palavras])
    return resultado

## Cria Embeddings das Sentenças por Soma

In [6]:
def embedding_soma(sentence, model, num_features, word_set):
    words = sentence.split()
    feature_vec = np.zeros((num_features, ), dtype = "float32")
    for word in words:
        if word in word_set:
            feature_vec = np.add(feature_vec, model[word])
    model[sentence] = feature_vec

In [49]:
word_set = set(model.index_to_key)
todas_descricoes = descricoes_pof + descricoes_snipc
contador = 0
for descricao in todas_descricoes:
    embedding_soma(descricao, model, 300, word_set)
    if contador % 10 == 0 or contador == 4592:
        print(contador / len(todas_descricoes) * 100, "%")
    contador = contador + 1

0.0 %
0.21777003484320556 %
0.4355400696864111 %
0.6533101045296167 %
0.8710801393728222 %
1.088850174216028 %
1.3066202090592334 %
1.524390243902439 %
1.7421602787456445 %
1.95993031358885 %
2.177700348432056 %
2.3954703832752613 %
2.6132404181184667 %
2.8310104529616726 %
3.048780487804878 %
3.266550522648084 %
3.484320557491289 %
3.702090592334495 %
3.9198606271777 %
4.137630662020906 %
4.355400696864112 %
4.573170731707317 %
4.790940766550523 %
5.008710801393728 %
5.2264808362369335 %
5.444250871080139 %
5.662020905923345 %
5.879790940766551 %
6.097560975609756 %
6.315331010452961 %
6.533101045296168 %
6.7508710801393725 %
6.968641114982578 %
7.186411149825784 %
7.40418118466899 %
7.621951219512195 %
7.8397212543554 %
8.057491289198607 %
8.275261324041812 %
8.493031358885016 %
8.710801393728223 %
8.928571428571429 %
9.146341463414634 %
9.36411149825784 %
9.581881533101045 %
9.79965156794425 %
10.017421602787456 %
10.235191637630663 %
10.452961672473867 %
10.670731707317072 %
10.888

88.63240418118467 %
88.85017421602788 %
89.06794425087108 %
89.28571428571429 %
89.50348432055749 %
89.72125435540069 %
89.9390243902439 %
90.1567944250871 %
90.37456445993031 %
90.59233449477352 %
90.81010452961672 %
91.02787456445994 %
91.24564459930313 %
91.46341463414635 %
91.68118466898954 %
91.89895470383276 %
92.11672473867596 %
92.33449477351915 %
92.55226480836237 %
92.77003484320558 %
92.98780487804879 %
93.20557491289199 %
93.42334494773519 %
93.6411149825784 %
93.8588850174216 %
94.07665505226481 %
94.29442508710801 %
94.51219512195121 %
94.72996515679442 %
94.94773519163763 %
95.16550522648085 %
95.38327526132404 %
95.60104529616724 %
95.81881533101046 %
96.03658536585365 %
96.25435540069687 %
96.47212543554006 %
96.68989547038328 %
96.90766550522648 %
97.12543554006969 %
97.3432055749129 %
97.5609756097561 %
97.77874564459931 %
97.99651567944251 %
98.21428571428571 %
98.43205574912892 %
98.64982578397212 %
98.86759581881533 %
99.08536585365853 %
99.30313588850174 %
99.520

In [56]:
df_word2vec_soma.columns

Index(['abóbora', 'abóbora d água', 'abóbora moranga', 'abobrinha',
       'absorvente higiênico', 'açaí emulsão', 'acendedor fogão',
       'acesso internet', 'acessórios impressora',
       'acessórios máquina fotográfica',
       ...
       'vinagre maçã', 'vinho', 'vinho orgânico', 'víscera carneiro',
       'víscera porco', 'vitamina fortificante', 'wasabi', 'windsurfe',
       'xampu carpete', 'xampu veículo'],
      dtype='object', length=1216)

In [57]:
contador = 0
for palavra_pof in df_word2vec_soma.index:
    for palavra_snipc in df_word2vec_soma.columns:
        df_word2vec_soma.at[palavra_pof, palavra_snipc] = model.similarity(palavra_pof, palavra_snipc)
    contador = contador + 1
    if contador % 10 == 0 or contador == 3376:
        print(contador / df_word2vec_soma.shape[0] * 100, "%")

0.2962085308056872 %
0.5924170616113744 %
0.8886255924170616 %
1.1848341232227488 %
1.481042654028436 %
1.7772511848341233 %
2.0734597156398102 %
2.3696682464454977 %
2.665876777251185 %
2.962085308056872 %
3.2582938388625595 %
3.5545023696682465 %
3.850710900473934 %
4.1469194312796205 %
4.443127962085308 %
4.739336492890995 %
5.035545023696683 %
5.33175355450237 %
5.627962085308057 %
5.924170616113744 %
6.220379146919431 %
6.516587677725119 %
6.812796208530806 %
7.109004739336493 %
7.4052132701421804 %
7.701421800947868 %
7.9976303317535535 %
8.293838862559241 %
8.590047393364928 %
8.886255924170616 %
9.182464454976303 %
9.47867298578199 %
9.774881516587678 %
10.071090047393366 %
10.367298578199053 %
10.66350710900474 %
10.959715639810426 %
11.255924170616113 %
11.552132701421801 %
11.848341232227488 %
12.144549763033176 %
12.440758293838861 %
12.736966824644549 %
13.033175355450238 %
13.329383886255924 %
13.625592417061611 %
13.921800947867299 %
14.218009478672986 %
14.5142180094786

In [58]:
os.chdir(path_resultados)
df_word2vec_soma.to_excel("Word2VecSoma_MS.xlsx")

In [59]:
model.similarity("abóbora cabotian", "acesso internet")

-0.0394747

In [51]:
contador = 0
for palavra_pof in df_word2vec_soma.index:
    soma_pof = embedding_soma(palavra_pof)
    model[palavra_pof] = soma_pof # Se uma palavra de uma sentenca nao esta contida no embedding, mantemos ela mas ela nao altera a soma
    for palavra_snipc in df_word2vec_soma.columns:
        soma_snipc = embedding_soma(palavra_snipc)
        model[palavra_snipc] = soma_snipc
        df_word2vec_soma.at[palavra_pof, palavra_snipc] = model.similarity(palavra_pof, palavra_snipc)
    contador = contador + 1
    if contador % 700 == 0 or contador == 3391:
        print(contador / df_word2vec_soma.shape[0] * 100, "%")

KeyboardInterrupt: 

In [None]:
contador = 0
for palavra_pof in df_word2vec_soma.index:
    soma_pof = embedding_soma(palavra_pof)
    model[palavra_pof] = soma_pof # Se uma palavra de uma sentenca nao esta contida no embedding, mantemos ela mas ela nao altera a soma
    for palavra_snipc in df_word2vec_soma.columns:
        soma_snipc = embedding_soma(palavra_snipc)
        model[palavra_snipc] = soma_snipc
        df_word2vec_soma.at[palavra_pof, palavra_snipc] = model.similarity(palavra_pof, palavra_snipc)
    contador = contador + 1
    if contador % 700 == 0 or contador == 3391:
        print(contador / df_word2vec_soma.shape[0] * 100, "%")

In [84]:
for index, row in df_word2vec_soma.iterrows():
    soma_pof = embedding_soma(index)
    soma_snipc

In [85]:
i = 0
for index, row in df_word2vec_soma.iterrows():
    print(row.index)

array([ 0.176912,  0.15762 ,  0.10616 ,  0.546431,  0.366537, -0.014038,
       -0.15314 ,  0.153705,  0.175341, -0.381698, -0.05135 , -0.093753,
       -0.113221,  0.360536,  0.178126,  0.219126,  0.082147,  0.358772,
       -0.336903, -0.253408, -0.027332, -0.318542, -0.176809,  0.20854 ,
       -0.021603,  0.198546,  0.150315,  0.308164,  0.315833, -0.320132,
        0.249261,  0.026871, -0.633253,  0.058548,  0.220274, -0.066906,
        0.359219, -0.417362, -0.314825, -0.207338,  0.018806,  0.103853,
        0.166398,  0.171834, -0.035437,  0.012877,  0.349015, -0.310928,
        0.488949,  0.18013 , -0.195094, -0.490868, -0.032826, -0.055094,
       -0.281857,  0.032413,  0.066699,  0.367216, -0.212529, -0.025361,
        0.058342, -0.144595, -0.151717, -0.126274, -0.284667, -0.020377,
       -0.273635,  0.21694 ,  0.080277,  0.028561,  0.34737 ,  0.051262,
        0.037115, -0.274287, -0.072886,  0.028719, -0.174581,  0.264425,
        0.012051,  0.00577 ,  0.652825, -0.126999, 

In [72]:
df_word2vec_soma["teste"]

KeyError: 'teste'

In [44]:
model["antianêmico"] #problema em antianemico pois essa palavra nao está contida e está sozinha, diferente de "samurai caxaco"

KeyError: "Key 'antianêmico' not present"

In [53]:
os.chdir(path_resultados)
df_word2vec_soma.to_excel("W2V_Soma_MS.xlsx")

### Word2Vec Média

In [None]:
# deleta
def embedding_media(string):
    palavras = string.split()
    n_palavras = len(palavras)
    soma = sum([model[palavra] for palavra in palavras])
    resultado = soma / n_palavras
    return resultado

In [5]:
# versao nova
def embedding_media(sentence, model, num_features, word_set):
    words = sentence.split()
    n_words = 0
    feature_vec = np.zeros((num_features, ), dtype = "float32")
    for word in words:
        if word in word_set:
            feature_vec = np.add(feature_vec, model[word])
            n_words = n_words + 1
    model[sentence] = feature_vec / n_words

In [9]:
embedding_media("abóbora branca", model, 300, set(model.index_to_key))

In [13]:
model["abóbora branca"] == (model["abóbora"] + model["branca"])/2

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [12]:
# versao nova
word_set = set(model.index_to_key)
todas_descricoes = descricoes_pof + descricoes_snipc
contador = 0
for descricao in todas_descricoes:
    embedding_media(descricao, model, 300, word_set)
    if contador % 10 == 0 or contador == 4592:
        print(contador / len(todas_descricoes) * 100, "%")
    contador = contador + 1

0.0 %
0.21777003484320556 %
0.4355400696864111 %
0.6533101045296167 %
0.8710801393728222 %
1.088850174216028 %
1.3066202090592334 %
1.524390243902439 %
1.7421602787456445 %
1.95993031358885 %
2.177700348432056 %
2.3954703832752613 %
2.6132404181184667 %
2.8310104529616726 %
3.048780487804878 %
3.266550522648084 %
3.484320557491289 %
3.702090592334495 %
3.9198606271777 %
4.137630662020906 %
4.355400696864112 %
4.573170731707317 %
4.790940766550523 %
5.008710801393728 %
5.2264808362369335 %
5.444250871080139 %
5.662020905923345 %
5.879790940766551 %
6.097560975609756 %
6.315331010452961 %
6.533101045296168 %
6.7508710801393725 %
6.968641114982578 %
7.186411149825784 %
7.40418118466899 %
7.621951219512195 %
7.8397212543554 %
8.057491289198607 %
8.275261324041812 %
8.493031358885016 %
8.710801393728223 %
8.928571428571429 %
9.146341463414634 %
9.36411149825784 %
9.581881533101045 %
9.79965156794425 %
10.017421602787456 %
10.235191637630663 %
10.452961672473867 %
10.670731707317072 %
10.888

88.63240418118467 %
88.85017421602788 %
89.06794425087108 %
89.28571428571429 %
89.50348432055749 %
89.72125435540069 %
89.9390243902439 %
90.1567944250871 %
90.37456445993031 %
90.59233449477352 %
90.81010452961672 %
91.02787456445994 %
91.24564459930313 %
91.46341463414635 %
91.68118466898954 %
91.89895470383276 %
92.11672473867596 %
92.33449477351915 %
92.55226480836237 %
92.77003484320558 %
92.98780487804879 %
93.20557491289199 %
93.42334494773519 %
93.6411149825784 %
93.8588850174216 %
94.07665505226481 %
94.29442508710801 %
94.51219512195121 %
94.72996515679442 %
94.94773519163763 %
95.16550522648085 %
95.38327526132404 %
95.60104529616724 %
95.81881533101046 %
96.03658536585365 %
96.25435540069687 %
96.47212543554006 %
96.68989547038328 %
96.90766550522648 %
97.12543554006969 %
97.3432055749129 %
97.5609756097561 %
97.77874564459931 %
97.99651567944251 %
98.21428571428571 %
98.43205574912892 %
98.64982578397212 %
98.86759581881533 %
99.08536585365853 %
99.30313588850174 %
99.520

In [15]:
contador = 0
for palavra_pof in df_word2vec_media.index:
    for palavra_snipc in df_word2vec_media.columns:
        df_word2vec_media.at[palavra_pof, palavra_snipc] = model.similarity(palavra_pof, palavra_snipc)
    contador = contador + 1
    if contador % 10 == 0 or contador == 3376:
        print(contador / df_word2vec_media.shape[0] * 100, "%")

0.2962085308056872 %
0.5924170616113744 %
0.8886255924170616 %
1.1848341232227488 %
1.481042654028436 %
1.7772511848341233 %
2.0734597156398102 %
2.3696682464454977 %
2.665876777251185 %
2.962085308056872 %
3.2582938388625595 %
3.5545023696682465 %
3.850710900473934 %
4.1469194312796205 %
4.443127962085308 %
4.739336492890995 %
5.035545023696683 %
5.33175355450237 %
5.627962085308057 %
5.924170616113744 %
6.220379146919431 %
6.516587677725119 %
6.812796208530806 %
7.109004739336493 %
7.4052132701421804 %
7.701421800947868 %
7.9976303317535535 %
8.293838862559241 %
8.590047393364928 %
8.886255924170616 %
9.182464454976303 %
9.47867298578199 %
9.774881516587678 %
10.071090047393366 %
10.367298578199053 %
10.66350710900474 %
10.959715639810426 %
11.255924170616113 %
11.552132701421801 %
11.848341232227488 %
12.144549763033176 %
12.440758293838861 %
12.736966824644549 %
13.033175355450238 %
13.329383886255924 %
13.625592417061611 %
13.921800947867299 %
14.218009478672986 %
14.5142180094786

In [16]:
os.chdir(path_resultados)
df_word2vec_media.to_excel("W2V_Media_MS.xlsx")