# Importando Pacotes

In [1]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
import strsimpy
import numpy as np
from strsimpy.normalized_levenshtein import NormalizedLevenshtein
from strsimpy.jaro_winkler import JaroWinkler
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors

# Paths

In [2]:
# Folders used by the notebook: input spreadsheets, pre-trained embeddings, output spreadsheets.
# NOTE(review): absolute, machine-specific Windows paths; adjust before running elsewhere.
path_bases = r"C:\Users\aamma\OneDrive\ENCE\TCC\REFAZENDO EXPERIMENTO\Bases"
path_embeddings = r"C:\Users\aamma\OneDrive\ENCE\TCC\REFAZENDO EXPERIMENTO\Embeddings"
path_resultados = r"C:\Users\aamma\OneDrive\ENCE\TCC\REFAZENDO EXPERIMENTO\Resultados"

# Leitura de Bases

In [3]:
# Read both description spreadsheets from the data folder (the working directory is switched to it).
os.chdir(path_bases)
descricoes_pof = pd.read_excel("Descricoes_POF_g1.xlsx")
descricoes_snipc = pd.read_excel("Descricoes_SNIPC_g1.xlsx")
# Flatten each frame, row by row, into a single flat Python list of cell values.
descricoes_pof = descricoes_pof.values.ravel().tolist()
descricoes_snipc = descricoes_snipc.values.ravel().tolist()

# Criando Matrizes de Similaridade

In [4]:
def _matriz_zerada():
    """Return a new zero-filled frame with POF descriptions as rows and SNIPC descriptions as columns."""
    zeros = np.zeros((len(descricoes_pof), len(descricoes_snipc)))
    return pd.DataFrame(zeros, index = descricoes_pof, columns = descricoes_snipc)

# One independent result matrix per similarity measure.

## Levenshtein
df_levenshtein = _matriz_zerada()

## Jaro
df_jaro = _matriz_zerada()

## Jaccard
df_jaccard = _matriz_zerada()

## TF-IDF
df_tfidf = _matriz_zerada()

## Word2Vec (sum of word vectors / mean of word vectors)
df_word2vec_soma = _matriz_zerada()
df_word2vec_media = _matriz_zerada()

# Calculando Similaridade

## Levenshtein

In [5]:
# Fill df_levenshtein with the normalized Levenshtein similarity of every
# (POF description, SNIPC description) pair.
levenshtein = NormalizedLevenshtein()
contador = 0
for palavra_pof in df_levenshtein.index:
    for palavra_snipc in df_levenshtein.columns:
        df_levenshtein.at[palavra_pof, palavra_snipc] = levenshtein.similarity(palavra_pof, palavra_snipc)
    contador = contador + 1
    # Progress report every 700 rows and on the last row. The last-row check uses
    # the real row count instead of a hard-coded total, so 100 % is always reported.
    if contador % 700 == 0 or contador == df_levenshtein.shape[0]:
        print(contador / df_levenshtein.shape[0] * 100, "%")

41.91616766467065 %
83.8323353293413 %


In [6]:
# Write the Levenshtein similarity matrix to the results folder.
# path_resultados is (re)assigned here so this cell can run on its own.
path_resultados = "C:\\Users\\aamma\\OneDrive\\ENCE\\TCC\\REFAZENDO EXPERIMENTO\\Resultados"
os.chdir(path_resultados)
df_levenshtein.to_excel(excel_writer = "Levenshtein_g1_MS.xlsx")

## Jaro-Winkler

In [7]:
# Fill df_jaro with the Jaro-Winkler similarity (strsimpy's JaroWinkler) of every
# (POF description, SNIPC description) pair.
jaro = JaroWinkler()
contador = 0
for palavra_pof in df_jaro.index:
    for palavra_snipc in df_jaro.columns:
        df_jaro.at[palavra_pof, palavra_snipc] = jaro.similarity(palavra_pof, palavra_snipc)
    contador = contador + 1
    # Progress report every 700 rows and on the last row. The last-row check uses
    # the real row count instead of a hard-coded total, so 100 % is always reported.
    if contador % 700 == 0 or contador == df_jaro.shape[0]:
        print(contador / df_jaro.shape[0] * 100, "%")

41.91616766467065 %
83.8323353293413 %


In [8]:
# Write the Jaro-Winkler similarity matrix to the results folder.
os.chdir(path_resultados)
df_jaro.to_excel(excel_writer = "Jaro_g1_MS.xlsx")

## Jaccard

In [9]:
# Fill df_jaccard with the token-set Jaccard similarity |A & B| / |A | B| of every
# (POF description, SNIPC description) pair, where tokens are whitespace-separated words.

# Tokenize each SNIPC description once instead of once per POF row.
tokens_snipc = {descricao: set(descricao.split()) for descricao in df_jaccard.columns}

contador = 0
for palavra_pof in df_jaccard.index:
    tok_pof = set(palavra_pof.split())
    for palavra_snipc in df_jaccard.columns:
        tok_snipc = tokens_snipc[palavra_snipc]
        numerador = tok_pof & tok_snipc
        denominador = tok_pof | tok_snipc
        # Two descriptions without any token have an empty union; score them 0.0
        # instead of dividing by zero.
        df_jaccard.at[palavra_pof, palavra_snipc] = len(numerador) / len(denominador) if denominador else 0.0
    contador = contador + 1
    # Progress report every 700 rows and on the last row. The last-row check uses
    # the real row count instead of a hard-coded total, so 100 % is always reported.
    if contador % 700 == 0 or contador == df_jaccard.shape[0]:
        print(contador / df_jaccard.shape[0] * 100, "%")

41.91616766467065 %
83.8323353293413 %


In [10]:
# Write the Jaccard similarity matrix to the results folder.
os.chdir(path_resultados)
df_jaccard.to_excel(excel_writer = "Jaccard_g1_MS.xlsx")

## TF-IDF

In [11]:
vectorizer = TfidfVectorizer()

# Pool both description lists and drop duplicates, so each distinct description
# is one document (and one uniquely-labelled row) in the TF-IDF matrix.
todas_descricoes = descricoes_pof + descricoes_snipc
todas_descricoes = pd.Series(todas_descricoes).drop_duplicates().to_list()

# Fit the vocabulary / IDF weights and compute the TF-IDF vectors.
tfidf_matrix = vectorizer.fit_transform(todas_descricoes)

# Dense frame: one row per distinct description, one column per vocabulary term.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=todas_descricoes, columns=vectorizer.get_feature_names_out())

# Select the rows of each survey with .loc and the original lists: on a unique index
# this returns exactly one row per requested label, in list order (repeating rows for
# repeated labels), so the rows line up with the index/columns labels attached below.
# DataFrame.filter(items=...) gives no such guarantee.
tfidf_snipc = tfidf_df.loc[descricoes_snipc]
tfidf_pof = tfidf_df.loc[descricoes_pof]

# Cosine similarity between every POF row and every SNIPC row, labelled by description.
cosine_sim = cosine_similarity(tfidf_pof, tfidf_snipc)
df_tfidf = pd.DataFrame(cosine_sim, index = descricoes_pof, columns = descricoes_snipc)

In [12]:
# Write the TF-IDF cosine similarity matrix to the results folder.
os.chdir(path_resultados)
df_tfidf.to_excel(excel_writer = "TFIDF_g1_MS.xlsx")

# Word2Vec

In [13]:
# Load the pre-trained word vectors (word2vec text format) from the embeddings folder.
# NOTE(review): the file name suggests 300-dimensional skip-gram vectors; confirm.
os.chdir(path_embeddings)
os.getcwd()  # value is discarded; kept only as in the original cell
model = KeyedVectors.load_word2vec_format(fname = "skip_s300.txt")

In [14]:
# Restrict the model's lookup table to the tokens listed in "Tokens_All_g1.xlsx".
os.chdir(path_bases)
tokens_all = pd.read_excel("Tokens_All_g1.xlsx")
tokens_all = tokens_all.values.tolist()
tokens_all = [item for sublist in tokens_all for item in sublist]  # flatten rows into one list

# Build the vocabulary from key_to_index (the mapping actually being pruned), not from
# index_to_key: this cell only deletes from key_to_index, so index_to_key still lists
# removed words and re-running the cell would raise KeyError on the `del` below.
vocab_set = set(model.key_to_index)
keep_set = set(tokens_all)
drop_set = vocab_set - keep_set
for word in drop_set:
    # Only the key -> index mapping is removed here; index_to_key and the vector
    # matrix are not modified by this cell.
    del model.key_to_index[word]

In [15]:
# Read "Tokens_All_NC_g1.xlsx" and flatten it, row by row, into one flat list of tokens.
# The file name is relative: this relies on the working directory set by an earlier cell.
tokens_nc = pd.read_excel("Tokens_All_NC_g1.xlsx").values.ravel().tolist()

In [16]:
# Display the tokens loaded from "Tokens_All_NC_g1.xlsx".
tokens_nc

['cabotian',
 'caxaco',
 'moganga',
 'ajiru',
 'guajuru',
 'uajuru',
 'cuxa',
 'avoador',
 'beribá',
 'brajola',
 'encartonado',
 'drageado',
 'fagotini',
 'gevral',
 'capeleti',
 'semipronta',
 'raviole',
 'rotolone',
 'multiervas',
 'nibis',
 'polenguinho',
 'multigrãos',
 'abrote',
 'anujá',
 'caicanha',
 'corcoroca',
 'guaivira',
 'piranambu',
 'pirapeua',
 'sairu',
 'tiravira',
 'aracapuri',
 'aruiri',
 'chaperema',
 'cujuba',
 'sambuio',
 'mucum',
 'muzarella',
 'parmezon',
 'rambutan',
 'aloá',
 'sarolho',
 'schimier',
 'steinharger',
 'sustagem',
 'sazon',
 'enxofrão',
 'fisális',
 'semipreparada',
 'pirapitanga',
 'rambutã']

In [17]:
# Register every token of tokens_nc in the model with an all-zero 300-dimensional
# float32 vector, so later lookups of these tokens succeed and add nothing to a sum.
matriz_aux = np.zeros(300, dtype = np.float32)
for token in tokens_nc:
    model[token] = matriz_aux

## Soma de Embeddings

In [18]:
def embedding_soma(sentence, model, num_features, word_set):
    """Store in ``model[sentence]`` the sum of the vectors of the sentence's known words.

    The sentence is split on whitespace; words not contained in ``word_set`` are
    ignored. A sentence with no known word is stored as an all-zero vector.

    Args:
        sentence: Text whose embedding is computed; also used as the key written to ``model``.
        model: Mapping-like object supporting ``model[word]`` lookup and ``model[key] = vector`` assignment.
        num_features: Length of the zero vector the sum starts from.
        word_set: Container of words that may be looked up in ``model``.

    Returns:
        None. The result is written to ``model[sentence]``.
    """
    acumulado = np.zeros((num_features, ), dtype = "float32")
    conhecidas = [palavra for palavra in sentence.split() if palavra in word_set]
    for palavra in conhecidas:
        acumulado = np.add(acumulado, model[palavra])
    model[sentence] = acumulado

In [19]:
# Compute the sum-of-word-vectors embedding of every description and store it in the
# model under the description text itself.

# Membership is checked against key_to_index, the mapping used for lookups. Keys that
# were deleted from key_to_index are still listed in index_to_key, so they would pass
# the membership test and then raise KeyError inside embedding_soma.
word_set = set(model.key_to_index)
todas_descricoes = descricoes_pof + descricoes_snipc
contador = 0
for descricao in todas_descricoes:
    embedding_soma(descricao, model, 300, word_set)
    contador = contador + 1
    # Progress report every 10 descriptions and on the last one. The last-item check
    # uses the real list length instead of a hard-coded total.
    if contador % 10 == 0 or contador == len(todas_descricoes):
        print(contador / len(todas_descricoes) * 100, "%")

0.0 %
0.463821892393321 %
0.927643784786642 %
1.391465677179963 %
1.855287569573284 %
2.3191094619666046 %
2.782931354359926 %
3.2467532467532463 %
3.710575139146568 %
4.174397031539889 %
4.638218923933209 %
5.1020408163265305 %
5.565862708719852 %
6.029684601113173 %
6.493506493506493 %
6.957328385899815 %
7.421150278293136 %
7.884972170686456 %
8.348794063079778 %
8.812615955473097 %
9.276437847866418 %
9.740259740259742 %
10.204081632653061 %
10.667903525046382 %
11.131725417439704 %
11.595547309833023 %
12.059369202226346 %
12.523191094619666 %
12.987012987012985 %
13.45083487940631 %
13.91465677179963 %
14.37847866419295 %
14.842300556586272 %
15.306122448979592 %
15.769944341372913 %
16.233766233766232 %
16.697588126159555 %
17.161410018552875 %
17.625231910946194 %
18.089053803339517 %
18.552875695732837 %
19.01669758812616 %
19.480519480519483 %
19.944341372912803 %
20.408163265306122 %
20.871985157699445 %
21.335807050092765 %
21.799628942486084 %
22.263450834879407 %
22.72727

In [20]:
# Fill df_word2vec_soma with model.similarity for every (POF description, SNIPC
# description) pair, using the description vectors currently stored in the model.
contador = 0
for palavra_pof in df_word2vec_soma.index:
    for palavra_snipc in df_word2vec_soma.columns:
        df_word2vec_soma.at[palavra_pof, palavra_snipc] = model.similarity(palavra_pof, palavra_snipc)
    contador = contador + 1
    # Progress report every 10 rows and on the last row. The last-row check uses
    # the real row count instead of a hard-coded total, so 100 % is always reported.
    if contador % 10 == 0 or contador == df_word2vec_soma.shape[0]:
        print(contador / df_word2vec_soma.shape[0] * 100, "%")

0.5988023952095809 %
1.1976047904191618 %
1.7964071856287425 %
2.3952095808383236 %
2.9940119760479043 %
3.592814371257485 %
4.191616766467066 %
4.790419161676647 %
5.389221556886228 %
5.9880239520958085 %
6.58682634730539 %
7.18562874251497 %
7.784431137724551 %
8.383233532934131 %
8.982035928143713 %
9.580838323353294 %
10.179640718562874 %
10.778443113772456 %
11.377245508982035 %
11.976047904191617 %
12.574850299401197 %
13.17365269461078 %
13.77245508982036 %
14.37125748502994 %
14.97005988023952 %
15.568862275449103 %
16.16766467065868 %
16.766467065868262 %
17.365269461077844 %
17.964071856287426 %
18.562874251497004 %
19.16167664670659 %
19.760479041916167 %
20.35928143712575 %
20.958083832335326 %
21.55688622754491 %
22.15568862275449 %
22.75449101796407 %
23.353293413173652 %
23.952095808383234 %
24.550898203592812 %
25.149700598802394 %
25.748502994011975 %
26.34730538922156 %
26.94610778443114 %
27.54491017964072 %
28.143712574850298 %
28.74251497005988 %
29.34131736526946 

In [21]:
# Write the sum-based Word2Vec similarity matrix to the results folder.
os.chdir(path_resultados)
df_word2vec_soma.to_excel(excel_writer = "Word2VecSoma_g1_MS.xlsx")

## Word2Vec Média

In [22]:
def embedding_media(sentence, model, num_features, word_set):
    """Store in ``model[sentence]`` the mean vector of the sentence's known words.

    The sentence is split on whitespace; words not contained in ``word_set`` are
    ignored. A sentence with no known word is stored as an all-zero vector
    (instead of the NaN vector a division by zero would produce).

    Args:
        sentence: Text whose embedding is computed; also used as the key written to ``model``.
        model: Mapping-like object supporting ``model[word]`` lookup and ``model[key] = vector`` assignment.
        num_features: Length of the zero vector the sum starts from.
        word_set: Container of words that may be looked up in ``model``.

    Returns:
        None. The result is written to ``model[sentence]``.
    """
    words = sentence.split()
    n_words = 0
    feature_vec = np.zeros((num_features, ), dtype = "float32")
    for word in words:
        if word in word_set:
            feature_vec = np.add(feature_vec, model[word])
            n_words = n_words + 1
    # Divide only when at least one word contributed; otherwise keep the zero vector
    # rather than computing 0 / 0.
    if n_words > 0:
        feature_vec = feature_vec / n_words
    model[sentence] = feature_vec

In [23]:
# Compute the mean-of-word-vectors embedding of every description and store it in the
# model under the description text itself (overwriting any vector already stored
# under that key).

# Membership is checked against key_to_index, the mapping used for lookups. Keys that
# were deleted from key_to_index are still listed in index_to_key, so they would pass
# the membership test and then raise KeyError inside embedding_media.
word_set = set(model.key_to_index)
todas_descricoes = descricoes_pof + descricoes_snipc
contador = 0
for descricao in todas_descricoes:
    embedding_media(descricao, model, 300, word_set)
    contador = contador + 1
    # Progress report every 10 descriptions and on the last one. The last-item check
    # uses the real list length instead of a hard-coded total.
    if contador % 10 == 0 or contador == len(todas_descricoes):
        print(contador / len(todas_descricoes) * 100, "%")

0.0 %
0.463821892393321 %
0.927643784786642 %
1.391465677179963 %
1.855287569573284 %
2.3191094619666046 %
2.782931354359926 %
3.2467532467532463 %
3.710575139146568 %
4.174397031539889 %
4.638218923933209 %
5.1020408163265305 %
5.565862708719852 %
6.029684601113173 %
6.493506493506493 %
6.957328385899815 %
7.421150278293136 %
7.884972170686456 %
8.348794063079778 %
8.812615955473097 %
9.276437847866418 %
9.740259740259742 %
10.204081632653061 %
10.667903525046382 %
11.131725417439704 %
11.595547309833023 %
12.059369202226346 %
12.523191094619666 %
12.987012987012985 %
13.45083487940631 %
13.91465677179963 %
14.37847866419295 %
14.842300556586272 %
15.306122448979592 %
15.769944341372913 %
16.233766233766232 %
16.697588126159555 %
17.161410018552875 %
17.625231910946194 %
18.089053803339517 %
18.552875695732837 %
19.01669758812616 %
19.480519480519483 %
19.944341372912803 %
20.408163265306122 %
20.871985157699445 %
21.335807050092765 %
21.799628942486084 %
22.263450834879407 %
22.72727

In [24]:
# Fill df_word2vec_media with model.similarity for every (POF description, SNIPC
# description) pair, using the description vectors currently stored in the model.
# NOTE(review): if model.similarity is a cosine similarity (as gensim documents), it is
# unaffected by scaling a vector, so mean-based scores should match the sum-based ones
# up to rounding. Confirm whether a separate mean-based output is intended.
contador = 0
for palavra_pof in df_word2vec_media.index:
    for palavra_snipc in df_word2vec_media.columns:
        df_word2vec_media.at[palavra_pof, palavra_snipc] = model.similarity(palavra_pof, palavra_snipc)
    contador = contador + 1
    # Progress report every 10 rows and on the last row. The last-row check uses
    # the real row count instead of a hard-coded total, so 100 % is always reported.
    if contador % 10 == 0 or contador == df_word2vec_media.shape[0]:
        print(contador / df_word2vec_media.shape[0] * 100, "%")

0.5988023952095809 %
1.1976047904191618 %
1.7964071856287425 %
2.3952095808383236 %
2.9940119760479043 %
3.592814371257485 %
4.191616766467066 %
4.790419161676647 %
5.389221556886228 %
5.9880239520958085 %
6.58682634730539 %
7.18562874251497 %
7.784431137724551 %
8.383233532934131 %
8.982035928143713 %
9.580838323353294 %
10.179640718562874 %
10.778443113772456 %
11.377245508982035 %
11.976047904191617 %
12.574850299401197 %
13.17365269461078 %
13.77245508982036 %
14.37125748502994 %
14.97005988023952 %
15.568862275449103 %
16.16766467065868 %
16.766467065868262 %
17.365269461077844 %
17.964071856287426 %
18.562874251497004 %
19.16167664670659 %
19.760479041916167 %
20.35928143712575 %
20.958083832335326 %
21.55688622754491 %
22.15568862275449 %
22.75449101796407 %
23.353293413173652 %
23.952095808383234 %
24.550898203592812 %
25.149700598802394 %
25.748502994011975 %
26.34730538922156 %
26.94610778443114 %
27.54491017964072 %
28.143712574850298 %
28.74251497005988 %
29.34131736526946 

In [25]:
# Write the mean-based Word2Vec similarity matrix to the results folder.
os.chdir(path_resultados)
df_word2vec_media.to_excel(excel_writer = "Word2VecMedia_g1_MS.xlsx")

In [28]:
# Display the current value of the progress counter left by the last loop that ran.
contador

3305