# Importando Pacotes

In [1]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
import strsimpy
import numpy as np
from strsimpy.normalized_levenshtein import NormalizedLevenshtein
from strsimpy.jaro_winkler import JaroWinkler
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors

# Paths

In [2]:
# Absolute Windows directories used throughout the notebook:
#   path_bases      - input workbooks read with pd.read_excel
#   path_embeddings - pre-trained word-vector file
#   path_resultados - output workbooks written with DataFrame.to_excel
# NOTE(review): these are machine-specific paths; adjust them before running elsewhere.
path_bases = "C:\\Users\\aamma\\OneDrive\\ENCE\\TCC\\REFAZENDO EXPERIMENTO\\Bases"
path_embeddings = "C:\\Users\\aamma\\OneDrive\\ENCE\\TCC\\REFAZENDO EXPERIMENTO\\Embeddings"
path_resultados = "C:\\Users\\aamma\\OneDrive\\ENCE\\TCC\\REFAZENDO EXPERIMENTO\\Resultados"

# Leitura de Bases

In [3]:
# Work from the data directory so the workbooks can be opened by bare file name.
os.chdir(path_bases)

# Read each workbook and flatten all of its cells, row by row, into one plain list.
descricoes_pof = [
    celula
    for linha in pd.read_excel("Descricoes_POF_g2.xlsx").values.tolist()
    for celula in linha
]
descricoes_snipc = [
    celula
    for linha in pd.read_excel("Descricoes_SNIPC_g2.xlsx").values.tolist()
    for celula in linha
]

# Criando Matrizes de Similaridade

In [4]:
# Each similarity table starts as a float grid of zeros with one row per POF
# description and one column per SNIPC description.

# Levenshtein
df_levenshtein = pd.DataFrame(0.0, index=descricoes_pof, columns=descricoes_snipc)

# Jaro
df_jaro = pd.DataFrame(0.0, index=descricoes_pof, columns=descricoes_snipc)

# Jaccard
df_jaccard = pd.DataFrame(0.0, index=descricoes_pof, columns=descricoes_snipc)

# TF-IDF
df_tfidf = pd.DataFrame(0.0, index=descricoes_pof, columns=descricoes_snipc)

# Word2Vec (summed and averaged sentence embeddings)
df_word2vec_soma = pd.DataFrame(0.0, index=descricoes_pof, columns=descricoes_snipc)
df_word2vec_media = pd.DataFrame(0.0, index=descricoes_pof, columns=descricoes_snipc)

# Calculando Similaridade

## Levenshtein

In [5]:
# Fill df_levenshtein with the normalized Levenshtein similarity between each
# row label (POF description) and each column label (SNIPC description).
levenshtein = NormalizedLevenshtein()
contador = 0
for palavra_pof in df_levenshtein.index:
    for palavra_snipc in df_levenshtein.columns:
        df_levenshtein.at[palavra_pof, palavra_snipc] = levenshtein.similarity(palavra_pof, palavra_snipc)
    contador = contador + 1
    # Report progress every 700 rows and on the last row. The last-row test
    # uses the table's own row count; the previous hard-coded 3305 only fired
    # for a table with exactly that many rows.
    if contador % 700 == 0 or contador == df_levenshtein.shape[0]:
        print(contador / df_levenshtein.shape[0] * 100, "%")

In [6]:
# Write the Levenshtein similarity table to the results directory.
# path_resultados is assigned again here with the same value as in the Paths
# cell, so this cell works even if that cell was not run.
path_resultados = "C:\\Users\\aamma\\OneDrive\\ENCE\\TCC\\REFAZENDO EXPERIMENTO\\Resultados"
os.chdir(path_resultados)
df_levenshtein.to_excel("Levenshtein_g2_MS.xlsx")

## Jaro

In [7]:
# Fill df_jaro with the string similarity between each row label (POF
# description) and each column label (SNIPC description).
# NOTE(review): the variable and output file say "Jaro", but the metric
# instantiated here is JaroWinkler.
jaro = JaroWinkler()
contador = 0
for palavra_pof in df_jaro.index:
    for palavra_snipc in df_jaro.columns:
        df_jaro.at[palavra_pof, palavra_snipc] = jaro.similarity(palavra_pof, palavra_snipc)
    contador = contador + 1
    # Report progress every 700 rows and on the last row. The last-row test
    # uses the table's own row count; the previous hard-coded 3305 only fired
    # for a table with exactly that many rows.
    if contador % 700 == 0 or contador == df_jaro.shape[0]:
        print(contador / df_jaro.shape[0] * 100, "%")

In [8]:
# Write the Jaro similarity table to the results directory.
os.chdir(path_resultados)
df_jaro.to_excel("Jaro_g2_MS.xlsx")

## Jaccard

In [9]:
# Fill df_jaccard with the token-level Jaccard index: shared whitespace-separated
# tokens divided by the tokens present in either description.
contador = 0
# Tokenize every column label once instead of once per row.
tokens_por_coluna = [(palavra_snipc, set(palavra_snipc.split())) for palavra_snipc in df_jaccard.columns]
for palavra_pof in df_jaccard.index:
    # The row's token set does not change inside the inner loop.
    tok_pof = set(palavra_pof.split())
    for palavra_snipc, tok_snipc in tokens_por_coluna:
        numerador = tok_pof & tok_snipc
        denominador = tok_pof | tok_snipc
        # Two blank descriptions give an empty union; score them 0.0 instead of
        # raising ZeroDivisionError.
        if denominador:
            df_jaccard.at[palavra_pof, palavra_snipc] = len(numerador) / len(denominador)
        else:
            df_jaccard.at[palavra_pof, palavra_snipc] = 0.0
    contador = contador + 1
    # Report progress every 700 rows and on the last row. The last-row test
    # uses the table's own row count rather than a hard-coded 3305.
    if contador % 700 == 0 or contador == df_jaccard.shape[0]:
        print(contador / df_jaccard.shape[0] * 100, "%")

In [10]:
# Write the Jaccard similarity table to the results directory.
os.chdir(path_resultados)
df_jaccard.to_excel("Jaccard_g2_MS.xlsx")

## TF-IDF

In [11]:
vectorizer = TfidfVectorizer()

# Corpus for fitting: both description lists together, each distinct text once.
todas_descricoes = descricoes_pof + descricoes_snipc
todas_descricoes = pd.Series(todas_descricoes).drop_duplicates().to_list()

# TF-IDF weights for every distinct description.
tfidf_matrix = vectorizer.fit_transform(todas_descricoes)

# Dense table: one row per distinct description, one column per vocabulary term.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=todas_descricoes, columns=vectorizer.get_feature_names_out())

# Pick the rows of each survey by label. .loc returns them in exactly the order
# (and multiplicity) of the list passed in, which is what the positional
# labelling of cosine_sim below depends on; filter(items=...) only subsets the
# labels and does not promise that alignment.
tfidf_snipc = tfidf_df.loc[descricoes_snipc]
tfidf_pof = tfidf_df.loc[descricoes_pof]

# Cosine similarity between every POF row and every SNIPC row, labelled by description.
cosine_sim = cosine_similarity(tfidf_pof, tfidf_snipc)
df_tfidf = pd.DataFrame(cosine_sim, index = descricoes_pof, columns = descricoes_snipc)

In [12]:
# Write the TF-IDF cosine similarity table to the results directory.
os.chdir(path_resultados)
df_tfidf.to_excel("TFIDF_g2_MS.xlsx")

# Word2Vec

In [13]:
# Load the pre-trained word vectors (word2vec text format) from the embeddings
# directory. The stray os.getcwd() call was dropped: its value was discarded.
os.chdir(path_embeddings)
model = KeyedVectors.load_word2vec_format("skip_s300.txt")

In [14]:
# Restrict the model's vocabulary to the tokens listed in Tokens_All_g2.xlsx.
os.chdir(path_bases)
# Read the workbook and flatten all of its cells into one list of tokens.
tokens_all = pd.read_excel("Tokens_All_g2.xlsx")
tokens_all = tokens_all.values.tolist()
tokens_all = [item for sublist in tokens_all for item in sublist]
# Keys present in the model but absent from the token list are the ones to drop.
vocab_set = set(model.index_to_key)
keep_set = set(tokens_all)
drop_set = vocab_set - keep_set
# NOTE(review): this loop only deletes entries from model.key_to_index. It does
# not modify model.index_to_key (which later cells read to build word_set) or
# the stored vectors - confirm against the gensim KeyedVectors documentation
# that leaving those untouched is intended.
for word in drop_set:
    del model.key_to_index[word]

In [15]:
# Read the "NC" token workbook (resolved against the current working directory)
# and flatten all of its cells, row by row, into one plain list.
tokens_nc = [
    celula
    for linha in pd.read_excel("Tokens_All_NC_g2.xlsx").values.tolist()
    for celula in linha
]

In [16]:
# Display the tokens read from Tokens_All_NC_g2.xlsx for visual inspection.
tokens_nc

['baraticida',
 'bicama',
 'triliche',
 'sovadeira',
 'fogotini',
 'omeleteira',
 'ourinol',
 'marroada',
 'shampu',
 'descártaveis',
 'mp3',
 'mp4',
 'mp5',
 'desintupidor',
 'esperemedor']

In [17]:
# Give every token in tokens_nc an all-zero, 300-position float32 vector in the model.
matriz_aux = np.zeros(300, dtype=np.float32)
for token in tokens_nc:
    model[token] = matriz_aux

## Soma de Embeddings

In [18]:
def embedding_soma(sentence, model, num_features, word_set):
    """Store the sum of a sentence's word vectors in ``model`` under the sentence text.

    Args:
        sentence: Text to embed; it is split on whitespace into tokens.
        model: Mapping-like object that is read with ``model[token]`` and
            written with ``model[sentence] = vector``.
        num_features: Length of the zero vector the sum starts from.
        word_set: Collection of tokens allowed to contribute; tokens not in
            it are skipped.

    Returns:
        None. The result is written to ``model[sentence]``; a sentence with no
        token in ``word_set`` gets the all-zero float32 vector.
    """
    acumulado = np.zeros((num_features, ), dtype="float32")
    conhecidos = (token for token in sentence.split() if token in word_set)
    for token in conhecidos:
        acumulado = acumulado + model[token]
    model[sentence] = acumulado

In [19]:
# Compute the summed embedding of every POF and SNIPC description and store it
# in the model under the description text.
word_set = set(model.index_to_key)
todas_descricoes = descricoes_pof + descricoes_snipc
contador = 0
for descricao in todas_descricoes:
    # 300 is the vector length passed to embedding_soma.
    # NOTE(review): presumably the width of the vectors in skip_s300.txt - confirm.
    embedding_soma(descricao, model, 300, word_set)
    # Count the description first so the percentage reflects finished work, and
    # compare against the real list length (not a hard-coded 4592) so the last
    # description always reports 100 %.
    contador = contador + 1
    if contador % 10 == 0 or contador == len(todas_descricoes):
        print(contador / len(todas_descricoes) * 100, "%")

0.0 %
1.0351966873706004 %
2.070393374741201 %
3.1055900621118013 %
4.140786749482402 %
5.175983436853002 %
6.211180124223603 %
7.246376811594203 %
8.281573498964804 %
9.316770186335404 %
10.351966873706004 %
11.387163561076605 %
12.422360248447205 %
13.457556935817806 %
14.492753623188406 %
15.527950310559005 %
16.563146997929607 %
17.598343685300208 %
18.633540372670808 %
19.66873706004141 %
20.70393374741201 %
21.73913043478261 %
22.77432712215321 %
23.809523809523807 %
24.84472049689441 %
25.87991718426501 %
26.91511387163561 %
27.95031055900621 %
28.985507246376812 %
30.020703933747413 %
31.05590062111801 %
32.091097308488614 %
33.126293995859214 %
34.161490683229815 %
35.196687370600415 %
36.231884057971016 %
37.267080745341616 %
38.302277432712216 %
39.33747412008282 %
40.37267080745342 %
41.40786749482402 %
42.44306418219462 %
43.47826086956522 %
44.51345755693582 %
45.54865424430642 %
46.58385093167702 %
47.61904761904761 %
48.65424430641822 %
49.68944099378882 %
50.7246376811

In [20]:
# Fill df_word2vec_soma with model.similarity between the stored embedding of
# each row label (POF description) and each column label (SNIPC description).
contador = 0
for palavra_pof in df_word2vec_soma.index:
    for palavra_snipc in df_word2vec_soma.columns:
        df_word2vec_soma.at[palavra_pof, palavra_snipc] = model.similarity(palavra_pof, palavra_snipc)
    contador = contador + 1
    # Report progress every 10 rows and on the last row. The last-row test uses
    # the table's own row count rather than a hard-coded 3376.
    if contador % 10 == 0 or contador == df_word2vec_soma.shape[0]:
        print(contador / df_word2vec_soma.shape[0] * 100, "%")

1.4326647564469914 %
2.865329512893983 %
4.297994269340974 %
5.730659025787966 %
7.163323782234957 %
8.595988538681947 %
10.028653295128938 %
11.461318051575931 %
12.893982808022923 %
14.326647564469914 %
15.759312320916905 %
17.191977077363894 %
18.624641833810887 %
20.057306590257877 %
21.48997134670487 %
22.922636103151863 %
24.355300859598856 %
25.787965616045845 %
27.22063037249284 %
28.653295128939828 %
30.08595988538682 %
31.51862464183381 %
32.95128939828081 %
34.38395415472779 %
35.816618911174785 %
37.249283667621775 %
38.68194842406877 %
40.114613180515754 %
41.54727793696275 %
42.97994269340974 %
44.412607449856736 %
45.845272206303726 %
47.277936962750715 %
48.71060171919771 %
50.1432664756447 %
51.57593123209169 %
53.00859598853869 %
54.44126074498568 %
55.873925501432666 %
57.306590257879655 %
58.73925501432665 %
60.17191977077364 %
61.60458452722063 %
63.03724928366762 %
64.46991404011462 %
65.90257879656161 %
67.3352435530086 %
68.76790830945558 %
70.20057306590259 %
7

In [21]:
# Write the summed-embedding similarity table to the results directory.
os.chdir(path_resultados)
df_word2vec_soma.to_excel("Word2VecSoma_g2_MS.xlsx")

## Word2Vec Média

In [22]:
def embedding_media(sentence, model, num_features, word_set):
    """Store the mean of a sentence's word vectors in ``model`` under the sentence text.

    Args:
        sentence: Text to embed; it is split on whitespace into tokens.
        model: Mapping-like object that is read with ``model[token]`` and
            written with ``model[sentence] = vector``.
        num_features: Length of the zero vector the sum starts from.
        word_set: Collection of tokens allowed to contribute; tokens not in
            it are skipped and do not count towards the mean.

    Returns:
        None. The result is written to ``model[sentence]``. A sentence with no
        token in ``word_set`` gets the all-zero vector, matching what
        ``embedding_soma`` stores in that case.
    """
    words = sentence.split()
    n_words = 0
    feature_vec = np.zeros((num_features, ), dtype = "float32")
    for word in words:
        if word in word_set:
            feature_vec = np.add(feature_vec, model[word])
            n_words = n_words + 1
    # With no known token the sum is still all zeros; dividing it by
    # n_words == 0 would store a vector of NaNs, so the division is skipped.
    if n_words > 0:
        feature_vec = feature_vec / n_words
    model[sentence] = feature_vec

In [23]:
# Compute the averaged embedding of every POF and SNIPC description and store
# it in the model under the description text.
word_set = set(model.index_to_key)
todas_descricoes = descricoes_pof + descricoes_snipc
contador = 0
for descricao in todas_descricoes:
    # 300 is the vector length passed to embedding_media.
    # NOTE(review): presumably the width of the vectors in skip_s300.txt - confirm.
    embedding_media(descricao, model, 300, word_set)
    # Count the description first so the percentage reflects finished work, and
    # compare against the real list length (not a hard-coded 4592) so the last
    # description always reports 100 %.
    contador = contador + 1
    if contador % 10 == 0 or contador == len(todas_descricoes):
        print(contador / len(todas_descricoes) * 100, "%")

0.0 %
1.0351966873706004 %
2.070393374741201 %
3.1055900621118013 %
4.140786749482402 %
5.175983436853002 %
6.211180124223603 %
7.246376811594203 %
8.281573498964804 %
9.316770186335404 %
10.351966873706004 %
11.387163561076605 %
12.422360248447205 %
13.457556935817806 %
14.492753623188406 %
15.527950310559005 %
16.563146997929607 %
17.598343685300208 %
18.633540372670808 %
19.66873706004141 %
20.70393374741201 %
21.73913043478261 %
22.77432712215321 %
23.809523809523807 %
24.84472049689441 %
25.87991718426501 %
26.91511387163561 %
27.95031055900621 %
28.985507246376812 %
30.020703933747413 %
31.05590062111801 %
32.091097308488614 %
33.126293995859214 %
34.161490683229815 %
35.196687370600415 %
36.231884057971016 %
37.267080745341616 %
38.302277432712216 %
39.33747412008282 %
40.37267080745342 %
41.40786749482402 %
42.44306418219462 %
43.47826086956522 %
44.51345755693582 %
45.54865424430642 %
46.58385093167702 %
47.61904761904761 %
48.65424430641822 %
49.68944099378882 %
50.7246376811

In [24]:
# Fill df_word2vec_media with model.similarity between the stored embedding of
# each row label (POF description) and each column label (SNIPC description).
contador = 0
for palavra_pof in df_word2vec_media.index:
    for palavra_snipc in df_word2vec_media.columns:
        df_word2vec_media.at[palavra_pof, palavra_snipc] = model.similarity(palavra_pof, palavra_snipc)
    contador = contador + 1
    # Report progress every 10 rows and on the last row. The last-row test uses
    # the table's own row count rather than a hard-coded 3376.
    if contador % 10 == 0 or contador == df_word2vec_media.shape[0]:
        print(contador / df_word2vec_media.shape[0] * 100, "%")

1.4326647564469914 %
2.865329512893983 %
4.297994269340974 %
5.730659025787966 %
7.163323782234957 %
8.595988538681947 %
10.028653295128938 %
11.461318051575931 %
12.893982808022923 %
14.326647564469914 %
15.759312320916905 %
17.191977077363894 %
18.624641833810887 %
20.057306590257877 %
21.48997134670487 %
22.922636103151863 %
24.355300859598856 %
25.787965616045845 %
27.22063037249284 %
28.653295128939828 %
30.08595988538682 %
31.51862464183381 %
32.95128939828081 %
34.38395415472779 %
35.816618911174785 %
37.249283667621775 %
38.68194842406877 %
40.114613180515754 %
41.54727793696275 %
42.97994269340974 %
44.412607449856736 %
45.845272206303726 %
47.277936962750715 %
48.71060171919771 %
50.1432664756447 %
51.57593123209169 %
53.00859598853869 %
54.44126074498568 %
55.873925501432666 %
57.306590257879655 %
58.73925501432665 %
60.17191977077364 %
61.60458452722063 %
63.03724928366762 %
64.46991404011462 %
65.90257879656161 %
67.3352435530086 %
68.76790830945558 %
70.20057306590259 %
7

In [25]:
# Write the averaged-embedding similarity table to the results directory.
os.chdir(path_resultados)
df_word2vec_media.to_excel("Word2VecMedia_g2_MS.xlsx")

In [26]:
# Display the final value of the row counter left by the last similarity loop.
contador

698