# Importando Pacotes

In [1]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
import strsimpy
import numpy as np
from strsimpy.normalized_levenshtein import NormalizedLevenshtein
from strsimpy.jaro_winkler import JaroWinkler
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors

# Paths

In [2]:
# Absolute, machine-specific project folders (Windows), all built from one shared root.
path_bases, path_embeddings, path_resultados = (
    r"C:\Users\aamma\OneDrive\ENCE\TCC\REFAZENDO EXPERIMENTO" + "\\" + pasta
    for pasta in ("Bases", "Embeddings", "Resultados")
)

# Leitura de Bases

In [3]:
def _ler_descricoes(arquivo):
    """Read an Excel workbook and return all of its cells as one flat list, row by row."""
    linhas = pd.read_excel(arquivo).values.tolist()  # DataFrame -> list of row lists
    return [celula for linha in linhas for celula in linha]  # list of lists -> flat list


# The workbooks are opened by bare file name, so the data folder must be the working directory.
os.chdir(path_bases)
descricoes_pof = _ler_descricoes("Descricoes_POF_TodosSinonimos.xlsx")
descricoes_snipc = _ler_descricoes("Descricoes_SNIPC_TodosSinonimos.xlsx")

# Criando Matrizes de Similaridade

In [4]:
def _matriz_zerada():
    """Return a new zero-filled DataFrame with POF descriptions as rows and SNIPC descriptions as columns."""
    zeros = np.zeros((len(descricoes_pof), len(descricoes_snipc)))
    return pd.DataFrame(zeros, index = descricoes_pof, columns = descricoes_snipc)


# One independent matrix per similarity measure.

## Levenshtein
df_levenshtein = _matriz_zerada()

## Jaro
df_jaro = _matriz_zerada()

## Jaccard
df_jaccard = _matriz_zerada()

## TF-IDF
df_tfidf = _matriz_zerada()

## Word2Vec (summed and averaged embeddings)
df_word2vec_soma = _matriz_zerada()
df_word2vec_media = _matriz_zerada()

# Calculando Similaridade

## Levenshtein

In [5]:
# Fill the POF x SNIPC matrix with normalized Levenshtein similarities, one row at a time.
levenshtein = NormalizedLevenshtein()
# Row total comes from the matrix itself instead of a hard-coded 3305, so the
# final progress line is still printed if the input workbook changes size.
total_pof = df_levenshtein.shape[0]
contador = 0
for palavra_pof in df_levenshtein.index:
    for palavra_snipc in df_levenshtein.columns:
        df_levenshtein.at[palavra_pof, palavra_snipc] = levenshtein.similarity(palavra_pof, palavra_snipc)
    contador = contador + 1
    # Report progress every 700 rows and once more when the last row is done.
    if contador % 700 == 0 or contador == total_pof:
        print(contador / total_pof * 100, "%")

21.18003025718608 %


KeyboardInterrupt: 

In [None]:
# Re-declares the results folder (same literal as in the paths cell), makes it the
# working directory and exports the Levenshtein matrix to an Excel workbook there.
path_resultados = "C:\\Users\\aamma\\OneDrive\\ENCE\\TCC\\REFAZENDO EXPERIMENTO\\Resultados"
os.chdir(path_resultados)
df_levenshtein.to_excel("Levenshtein_TodosSinonimos_MS.xlsx")

## Jaro

In [9]:
# Fill the POF x SNIPC matrix with Jaro-Winkler similarities, one row at a time.
jaro = JaroWinkler()
# Row total comes from the matrix itself instead of a hard-coded 3305, so the
# final progress line is still printed if the input workbook changes size.
total_pof = df_jaro.shape[0]
contador = 0
for palavra_pof in df_jaro.index:
    for palavra_snipc in df_jaro.columns:
        df_jaro.at[palavra_pof, palavra_snipc] = jaro.similarity(palavra_pof, palavra_snipc)
    contador = contador + 1
    # Report progress every 700 rows and once more when the last row is done.
    if contador % 700 == 0 or contador == total_pof:
        print(contador / total_pof * 100, "%")

21.18003025718608 %
42.36006051437216 %
63.54009077155824 %
84.72012102874432 %
100.0 %


In [10]:
# Make the results folder the working directory and export the Jaro-Winkler matrix to Excel.
os.chdir(path_resultados)
df_jaro.to_excel("Jaro_TodosSinonimos_MS.xlsx")

## Jaccard

In [11]:
# Token-level Jaccard similarity: |A & B| / |A | B| over whitespace-separated tokens.
total_pof = df_jaccard.shape[0]  # taken from the matrix instead of a hard-coded 3305
# Column token sets do not change between rows, so build them once up front.
tokens_por_coluna = [(coluna, set(coluna.split())) for coluna in df_jaccard.columns]
contador = 0
for palavra_pof in df_jaccard.index:
    tok_pof = set(palavra_pof.split())  # built once per row, not once per cell
    for palavra_snipc, tok_snipc in tokens_por_coluna:
        numerador = tok_pof & tok_snipc
        denominador = tok_pof | tok_snipc
        # Two descriptions without any token have an empty union; score them 0.0
        # instead of raising ZeroDivisionError.
        df_jaccard.at[palavra_pof, palavra_snipc] = len(numerador) / len(denominador) if denominador else 0.0
    contador = contador + 1
    # Report progress every 700 rows and once more when the last row is done.
    if contador % 700 == 0 or contador == total_pof:
        print(contador / total_pof * 100, "%")

21.18003025718608 %
42.36006051437216 %
63.54009077155824 %
84.72012102874432 %
100.0 %


In [12]:
# Make the results folder the working directory and export the Jaccard matrix to Excel.
os.chdir(path_resultados)
df_jaccard.to_excel("Jaccard_TodosSinonimos_MS.xlsx")

## TF-IDF

In [13]:
vectorizer = TfidfVectorizer()

# Pool the descriptions of both surveys and drop repeats, so every distinct
# text is exactly one document (and one uniquely labelled row) of the corpus.
todas_descricoes = descricoes_pof + descricoes_snipc
todas_descricoes = pd.Series(todas_descricoes).drop_duplicates().to_list()

# Compute the TF-IDF values
tfidf_matrix = vectorizer.fit_transform(todas_descricoes)

# Turn the TF-IDF matrix into a DataFrame: one row per distinct description, one column per term
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=todas_descricoes, columns=vectorizer.get_feature_names_out())

# Select the rows of each survey by label with .loc: it returns one row per
# requested label, in the requested order (repeats included). The cosine matrix
# below is relabelled with descricoes_pof / descricoes_snipc, so its rows and
# columns must line up with those lists exactly; filter(items=...) does not
# guarantee that.
tfidf_snipc = tfidf_df.loc[descricoes_snipc]
tfidf_pof = tfidf_df.loc[descricoes_pof]

## Compute cosine similarity and wrap it in a DataFrame (POF rows x SNIPC columns)
cosine_sim = cosine_similarity(tfidf_pof, tfidf_snipc)
df_tfidf = pd.DataFrame(cosine_sim, index = descricoes_pof, columns = descricoes_snipc)

In [14]:
# Make the results folder the working directory and export the TF-IDF cosine matrix to Excel.
os.chdir(path_resultados)
df_tfidf.to_excel("TFIDF_TodosSinonimos_MS.xlsx")

# Word2vec

In [15]:
# Make the embeddings folder the working directory and load the vectors stored in
# word2vec format in "skip_s300.txt" (opened by bare file name, hence the chdir).
os.chdir(path_embeddings)
os.getcwd()  # result is discarded: this is not the cell's last expression, so nothing is displayed
model = KeyedVectors.load_word2vec_format("skip_s300.txt")

In [16]:
# Read every token used by the descriptions and unregister all other words
# from the model's key -> index mapping.
os.chdir(path_bases)
tokens_all = pd.read_excel("Tokens_All_TodosSinonimos.xlsx")
tokens_all = tokens_all.values.tolist()  # DataFrame -> list of row lists
tokens_all = [item for sublist in tokens_all for item in sublist]  # list of lists -> flat list
vocab_set = set(model.index_to_key)
keep_set = set(tokens_all)
drop_set = vocab_set - keep_set
# Only key_to_index is modified here; model.index_to_key is left as loaded.
# Because vocab_set is rebuilt from index_to_key, running this cell again selects
# the same words, so pop(..., None) is used to make the repeat a no-op instead of
# a KeyError.
# NOTE(review): the vector storage is not touched by this cell either, so this
# presumably does not reduce the model's memory footprint - confirm the intent.
for word in drop_set:
    model.key_to_index.pop(word, None)

In [17]:
# Read the "Tokens_All_NC" workbook (bare file name, resolved against the current
# working directory) and flatten its cells, row by row, into one list.
tokens_nc = [
    token
    for linha in pd.read_excel("Tokens_All_NC_TodosSinonimos.xlsx").values.tolist()
    for token in linha
]

In [18]:
# Display the whole token list for inspection.
tokens_nc

['cabotian',
 'caxaco',
 'moganga',
 'moganguice',
 'mogiganga',
 'monada',
 'moquenquice',
 'maratimba',
 'pioca',
 'restingueiro',
 'tapiocano',
 'absorbente',
 'feminal',
 'femíneo',
 'fêmeo',
 'avizinhação',
 'adiáforo',
 'suplementário',
 'catamênio',
 'circunstanciativo',
 'minudencioso',
 'legente',
 'celulado',
 'celulífero',
 'celuloso',
 'nosocômico',
 'blandície',
 'melifluidade',
 'indefinito',
 'edulcorativo',
 'pousadia',
 'abrigadoiro',
 'abrigadouro',
 'gabinardo',
 'ameninado',
 'menineiro',
 'meninil',
 'apendente',
 'esgotadura',
 'escoadoiro',
 'anorgânico',
 'anhanho',
 'apancadado',
 'apancado',
 'bajoujo',
 'bucuva',
 'inhenho',
 'maninelo',
 'acúmen',
 'picaroto',
 'desalagar',
 'desempachar',
 'desentaipar',
 'desentravar',
 'desestorvar',
 'laxar',
 'ajiru',
 'guajuru',
 'uajuru',
 'tranqueta',
 'átlas',
 'adustível',
 'abstergência',
 'abstersão',
 'alimpadura',
 'alimpamento',
 'limpamento',
 'mundícia',
 'mundície',
 'abastamento',
 'amaestrar',
 'aviventaç

In [19]:
# Give every token listed in tokens_nc an all-zero, 300-dimensional float32 vector in the model.
matriz_aux = np.zeros(300, dtype = np.float32)
for token in tokens_nc:
    model[token] = matriz_aux

## Soma de Embeddings

In [20]:
def embedding_soma(sentence, model, num_features, word_set):
    """Store in ``model[sentence]`` the sum of the vectors of the sentence's known words.

    Args:
        sentence: Text to embed; it is split on whitespace.
        model: Mapping-like object read with ``model[word]`` and written with
            ``model[sentence] = vector``.
        num_features: Length of the float32 accumulator the sum starts from.
        word_set: Container of words that may be looked up in ``model``;
            words outside it are skipped.

    Returns:
        None. The result is written into ``model``; a sentence without any
        known word gets the all-zero vector.
    """
    total = np.zeros((num_features, ), dtype = "float32")
    conhecidas = (token for token in sentence.split() if token in word_set)
    for token in conhecidas:
        total = total + model[token]
    model[sentence] = total

In [21]:
# Store a summed embedding in the model for every POF and SNIPC description.
word_set = set(model.index_to_key)
todas_descricoes = descricoes_pof + descricoes_snipc
# Total comes from the list itself: the previous hard-coded 4592 did not match the
# list length in the recorded run, so the final progress line was never printed.
total_descricoes = len(todas_descricoes)
contador = 0
for descricao in todas_descricoes:
    embedding_soma(descricao, model, 300, word_set)
    # Count the finished item before reporting, so the percentage reflects
    # completed work and the last report is exactly 100 %.
    contador = contador + 1
    if contador % 10 == 0 or contador == total_descricoes:
        print(contador / total_descricoes * 100, "%")

0.0 %
0.224517287831163 %
0.449034575662326 %
0.673551863493489 %
0.898069151324652 %
1.1225864391558151 %
1.347103726986978 %
1.5716210148181409 %
1.796138302649304 %
2.020655590480467 %
2.2451728783116303 %
2.469690166142793 %
2.694207453973956 %
2.918724741805119 %
3.1432420296362817 %
3.3677593174674447 %
3.592276605298608 %
3.816793893129771 %
4.041311180960934 %
4.265828468792097 %
4.4903457566232605 %
4.7148630444544235 %
4.939380332285586 %
5.163897620116749 %
5.388414907947912 %
5.612932195779075 %
5.837449483610238 %
6.0619667714414005 %
6.2864840592725635 %
6.511001347103727 %
6.7355186349348894 %
6.960035922766053 %
7.184553210597216 %
7.409070498428378 %
7.633587786259542 %
7.858105074090704 %
8.082622361921867 %
8.307139649753031 %
8.531656937584193 %
8.756174225415357 %
8.980691513246521 %
9.205208801077683 %
9.429726088908847 %
9.654243376740009 %
9.878760664571171 %
10.103277952402335 %
10.327795240233497 %
10.552312528064661 %
10.776829815895823 %
11.001347103726987 %

91.15401885945218 %
91.37853614728334 %
91.6030534351145 %
91.82757072294567 %
92.05208801077683 %
92.27660529860799 %
92.50112258643915 %
92.72563987427031 %
92.95015716210148 %
93.17467444993265 %
93.39919173776381 %
93.62370902559498 %
93.84822631342612 %
94.07274360125729 %
94.29726088908846 %
94.52177817691963 %
94.74629546475079 %
94.97081275258195 %
95.19533004041311 %
95.41984732824427 %
95.64436461607544 %
95.8688819039066 %
96.09339919173776 %
96.31791647956892 %
96.5424337674001 %
96.76695105523126 %
96.99146834306241 %
97.21598563089357 %
97.44050291872475 %
97.66502020655591 %
97.88953749438707 %
98.11405478221823 %
98.3385720700494 %
98.56308935788056 %
98.78760664571172 %
99.01212393354288 %
99.23664122137404 %
99.4611585092052 %
99.68567579703638 %
99.91019308486754 %


In [22]:
# Fill the POF x SNIPC matrix with model.similarity between the description
# embeddings currently stored in the model under each description's text.
# Row total comes from the matrix itself: the previous hard-coded 3376 did not
# match the row count of the recorded run (3305), so 100 % was never reported.
total_pof = df_word2vec_soma.shape[0]
contador = 0
for palavra_pof in df_word2vec_soma.index:
    for palavra_snipc in df_word2vec_soma.columns:
        df_word2vec_soma.at[palavra_pof, palavra_snipc] = model.similarity(palavra_pof, palavra_snipc)
    contador = contador + 1
    # Report progress every 10 rows and once more when the last row is done.
    if contador % 10 == 0 or contador == total_pof:
        print(contador / total_pof * 100, "%")

0.30257186081694404 %
0.6051437216338881 %
0.9077155824508321 %
1.2102874432677762 %
1.5128593040847202 %
1.8154311649016641 %
2.118003025718608 %
2.4205748865355523 %
2.723146747352496 %
3.0257186081694405 %
3.3282904689863844 %
3.6308623298033282 %
3.9334341906202726 %
4.236006051437216 %
4.53857791225416 %
4.841149773071105 %
5.143721633888049 %
5.446293494704992 %
5.748865355521937 %
6.051437216338881 %
6.354009077155824 %
6.656580937972769 %
6.959152798789712 %
7.2617246596066565 %
7.564296520423601 %
7.866868381240545 %
8.169440242057489 %
8.472012102874432 %
8.774583963691377 %
9.07715582450832 %
9.379727685325264 %
9.68229954614221 %
9.984871406959153 %
10.287443267776098 %
10.59001512859304 %
10.892586989409985 %
11.195158850226928 %
11.497730711043873 %
11.800302571860817 %
12.102874432677762 %
12.405446293494705 %
12.708018154311649 %
13.010590015128592 %
13.313161875945537 %
13.61573373676248 %
13.918305597579424 %
14.22087745839637 %
14.523449319213313 %
14.826021180030258

In [23]:
# Make the results folder the working directory and export the summed-embedding similarity matrix to Excel.
os.chdir(path_resultados)
df_word2vec_soma.to_excel("Word2VecSoma_TodosSinonimos_MS.xlsx")

## Word2Vec Média

In [24]:
def embedding_media(sentence, model, num_features, word_set):
    """Store in ``model[sentence]`` the mean of the vectors of the sentence's known words.

    Args:
        sentence: Text to embed; it is split on whitespace.
        model: Mapping-like object read with ``model[word]`` and written with
            ``model[sentence] = vector``.
        num_features: Length of the float32 accumulator the sum starts from.
        word_set: Container of words that may be looked up in ``model``;
            words outside it are skipped and do not count towards the mean.

    Returns:
        None. The result is written into ``model``. A sentence without any
        known word gets the all-zero vector (the same result embedding_soma
        produces for it) instead of a vector of NaN from dividing by zero.
    """
    words = sentence.split()
    n_words = 0
    feature_vec = np.zeros((num_features, ), dtype = "float32")
    for word in words:
        if word in word_set:
            feature_vec = np.add(feature_vec, model[word])
            n_words = n_words + 1
    # Only divide when at least one word contributed; 0 / 0 would fill the
    # embedding with NaN and poison every similarity computed from it.
    if n_words > 0:
        feature_vec = feature_vec / n_words
    model[sentence] = feature_vec

In [25]:
# Store an averaged embedding in the model for every POF and SNIPC description.
word_set = set(model.index_to_key)
todas_descricoes = descricoes_pof + descricoes_snipc
# Total comes from the list itself: the previous hard-coded 4592 did not match the
# list length in the recorded run, so the final progress line was never printed.
total_descricoes = len(todas_descricoes)
contador = 0
for descricao in todas_descricoes:
    embedding_media(descricao, model, 300, word_set)
    # Count the finished item before reporting, so the percentage reflects
    # completed work and the last report is exactly 100 %.
    contador = contador + 1
    if contador % 10 == 0 or contador == total_descricoes:
        print(contador / total_descricoes * 100, "%")

0.0 %
0.224517287831163 %
0.449034575662326 %
0.673551863493489 %
0.898069151324652 %
1.1225864391558151 %
1.347103726986978 %
1.5716210148181409 %
1.796138302649304 %
2.020655590480467 %
2.2451728783116303 %
2.469690166142793 %
2.694207453973956 %
2.918724741805119 %
3.1432420296362817 %
3.3677593174674447 %
3.592276605298608 %
3.816793893129771 %
4.041311180960934 %
4.265828468792097 %
4.4903457566232605 %
4.7148630444544235 %
4.939380332285586 %
5.163897620116749 %
5.388414907947912 %
5.612932195779075 %
5.837449483610238 %
6.0619667714414005 %
6.2864840592725635 %
6.511001347103727 %
6.7355186349348894 %
6.960035922766053 %
7.184553210597216 %
7.409070498428378 %
7.633587786259542 %
7.858105074090704 %
8.082622361921867 %
8.307139649753031 %
8.531656937584193 %
8.756174225415357 %
8.980691513246521 %
9.205208801077683 %
9.429726088908847 %
9.654243376740009 %
9.878760664571171 %
10.103277952402335 %
10.327795240233497 %
10.552312528064661 %
10.776829815895823 %
11.001347103726987 %

91.15401885945218 %
91.37853614728334 %
91.6030534351145 %
91.82757072294567 %
92.05208801077683 %
92.27660529860799 %
92.50112258643915 %
92.72563987427031 %
92.95015716210148 %
93.17467444993265 %
93.39919173776381 %
93.62370902559498 %
93.84822631342612 %
94.07274360125729 %
94.29726088908846 %
94.52177817691963 %
94.74629546475079 %
94.97081275258195 %
95.19533004041311 %
95.41984732824427 %
95.64436461607544 %
95.8688819039066 %
96.09339919173776 %
96.31791647956892 %
96.5424337674001 %
96.76695105523126 %
96.99146834306241 %
97.21598563089357 %
97.44050291872475 %
97.66502020655591 %
97.88953749438707 %
98.11405478221823 %
98.3385720700494 %
98.56308935788056 %
98.78760664571172 %
99.01212393354288 %
99.23664122137404 %
99.4611585092052 %
99.68567579703638 %
99.91019308486754 %


In [26]:
# Fill the POF x SNIPC matrix with model.similarity between the description
# embeddings currently stored in the model under each description's text.
# NOTE(review): if model.similarity is cosine similarity, dividing a summed
# embedding by a positive word count does not change the result, so this matrix
# should equal the summed-embedding one wherever both are defined - confirm that
# the averaged variant adds information.
# Row total comes from the matrix itself: the previous hard-coded 3376 did not
# match the row count of the recorded run (3305), so 100 % was never reported.
total_pof = df_word2vec_media.shape[0]
contador = 0
for palavra_pof in df_word2vec_media.index:
    for palavra_snipc in df_word2vec_media.columns:
        df_word2vec_media.at[palavra_pof, palavra_snipc] = model.similarity(palavra_pof, palavra_snipc)
    contador = contador + 1
    # Report progress every 10 rows and once more when the last row is done.
    if contador % 10 == 0 or contador == total_pof:
        print(contador / total_pof * 100, "%")

0.30257186081694404 %
0.6051437216338881 %
0.9077155824508321 %
1.2102874432677762 %
1.5128593040847202 %
1.8154311649016641 %
2.118003025718608 %
2.4205748865355523 %
2.723146747352496 %
3.0257186081694405 %
3.3282904689863844 %
3.6308623298033282 %
3.9334341906202726 %
4.236006051437216 %
4.53857791225416 %
4.841149773071105 %
5.143721633888049 %
5.446293494704992 %
5.748865355521937 %
6.051437216338881 %
6.354009077155824 %
6.656580937972769 %
6.959152798789712 %
7.2617246596066565 %
7.564296520423601 %
7.866868381240545 %
8.169440242057489 %
8.472012102874432 %
8.774583963691377 %
9.07715582450832 %
9.379727685325264 %
9.68229954614221 %
9.984871406959153 %
10.287443267776098 %
10.59001512859304 %
10.892586989409985 %
11.195158850226928 %
11.497730711043873 %
11.800302571860817 %
12.102874432677762 %
12.405446293494705 %
12.708018154311649 %
13.010590015128592 %
13.313161875945537 %
13.61573373676248 %
13.918305597579424 %
14.22087745839637 %
14.523449319213313 %
14.826021180030258

In [27]:
# Make the results folder the working directory and export the averaged-embedding similarity matrix to Excel.
os.chdir(path_resultados)
df_word2vec_media.to_excel("Word2VecMedia_TodosSinonimos_MS.xlsx")

In [28]:
# Display the final value of the row counter left by the last similarity loop.
contador

3305