In [1]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Installing collected packages: nltk
Successfully installed nltk-3.8.1


In [2]:
import pandas as pd
import os
import re
import nltk
# Fetch the NLTK resources this notebook relies on. The stopword corpus is read
# later through `stopwords.words(...)`. 'punkt' is a tokenizer package; no visible
# cell calls a tokenizer, so that download may be unnecessary — TODO confirm.
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from typing import List
# Do not truncate long text cells when DataFrames are displayed (titles are long).
pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm

In [4]:
# Treat everything before the first '/examples/' path component of the working
# directory as the project root; if that component is absent the whole working
# directory is used as the root.
project_root = os.getcwd().partition('/examples/')[0]
input_filename = project_root + '/data/initial_data/items_titles.csv'

# Raw item titles, one row per item.
df_titles = pd.read_csv(input_filename)

In [5]:
# Peek at the last rows to sanity-check the load.
df_titles.tail()

Unnamed: 0,ITE_ITEM_TITLE
29995,Tênis Vans Old Skool I Love My Vans - Usado - Feminino
29996,Tênis Feminino Preto Moleca 5296155
29997,Tenis Botinha Com Pelo Via Marte Original Lançamento
29998,Tênis Slip On Feminino Masculino Original Sapato Xadrez Mule
29999,Bicicleta Nathor Rosa Infantil Sem Pedal Balance Aro 12


In [6]:
# NLTK's Portuguese stopword list; passed to remove_stopwords() during preprocessing.
list_stopwords = stopwords.words('portuguese')


In [7]:
# Lower-case the titles in place (this overwrites the raw column). The stopword
# check in remove_stopwords() is a case-sensitive membership test, so titles must
# be lower-cased before it runs.
df_titles['ITE_ITEM_TITLE'] = df_titles['ITE_ITEM_TITLE'].str.lower()

In [8]:
def remove_stopwords(xs: str, list_stopwords: List) -> str:
    """Strip punctuation from ``xs`` and drop every word found in ``list_stopwords``.

    Args:
        xs: Text to clean. Matching against the stopwords is case-sensitive, so
            lower-case the text first if the stopwords are lower-case.
        list_stopwords: Words to remove. Any iterable of strings works; a ``set``
            or ``frozenset`` is used as-is for constant-time lookups.

    Returns:
        The surviving words joined by single spaces, with no leading or trailing
        whitespace and no empty tokens. Callers that concatenate several results
        must insert their own separator.
    """
    # Delete every character that is neither a word character nor whitespace.
    xs = re.sub(r'[^\w\s]', '', xs)
    # Hash-based membership test instead of a linear scan of the list per word.
    if not isinstance(list_stopwords, (set, frozenset)):
        list_stopwords = set(list_stopwords)
    # split() without an argument collapses runs of whitespace, so the gaps left
    # behind by removed punctuation (e.g. " - ") do not produce empty tokens.
    return ' '.join(a_word for a_word in xs.split() if a_word not in list_stopwords)

In [9]:
# Clean every title: punctuation stripped and Portuguese stopwords removed.
# Series.apply forwards the keyword argument to remove_stopwords for each title.
df_titles['ITE_ITEM_TITLE_PREPROC'] = df_titles['ITE_ITEM_TITLE'].apply(
    remove_stopwords, list_stopwords=list_stopwords
)

In [10]:
# Compare the lower-cased titles with their preprocessed versions side by side.
df_titles.head()

Unnamed: 0,ITE_ITEM_TITLE,ITE_ITEM_TITLE_PREPROC
0,tênis ascension posh masculino - preto e vermelho,tênis ascension posh masculino preto vermelho
1,tenis para caminhada super levinho spider corrida,tenis caminhada super levinho spider corrida
2,tênis feminino le parc hocks black/ice original envio já,tênis feminino le parc hocks blackice original envio
3,tênis olympikus esportivo academia nova tendência triunfo,tênis olympikus esportivo academia nova tendência triunfo
4,inteligente led bicicleta tauda luz usb bicicleta carregáve,inteligente led bicicleta tauda luz usb bicicleta carregáve


In [11]:
# One string holding every preprocessed title. The explicit space separator keeps
# the last word of one title from fusing with the first word of the next, whether
# or not the individual titles carry leading/trailing whitespace.
full_corpus = ' '.join(df_titles['ITE_ITEM_TITLE_PREPROC'].tolist())

In [12]:
# Bag-of-words over unigrams and bigrams. A float min_df is interpreted by
# scikit-learn as a proportion of documents, so n-grams appearing in fewer than
# 0.01% of the titles are dropped from the vocabulary.
vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=0.0001)
# Sparse count matrix: one row per title, one column per vocabulary n-gram.
X_features = vectorizer.fit_transform(df_titles['ITE_ITEM_TITLE_PREPROC'])

In [13]:
# Vocabulary size after min_df pruning.
# maximum vocabulary: 85828 (presumably the size without min_df pruning — TODO confirm)
len(vectorizer.vocabulary_)

13612

In [19]:
# Row 0 versus all later rows, kept as sparse slices. cosine_similarity accepts
# sparse input and still returns a dense ndarray, so densifying the (n - 1)-row
# slice with .toarray() only costs memory.
vec_i = X_features[0, :]
vec_j = X_features[0+1:, :]

In [20]:
# One row for every title after row 0, one column per vocabulary n-gram.
vec_j.shape

(29999, 13612)

In [25]:
# Single-pair spot check: vec_j starts at row 1 of X_features, so vec_j[1:2] is
# row 2, i.e. this is the similarity between title 0 and title 2.
cosine_similarity(vec_i, vec_j[1:2,:])

array([[0.11785113]])

In [21]:
# Title 0 against every later title in a single call (one similarity per column).
cosine_similarity(vec_i, vec_j)

array([[0.        , 0.11785113, 0.10660036, ..., 0.        , 0.18898224,
        0.        ]])

In [22]:
# The result is a 1 x (number of rows in vec_j) array.
cosine_similarity(vec_i, vec_j).shape

(1, 29999)

In [None]:
# Upper triangle of the pairwise cosine-similarity matrix, stored row by row:
# similarity_dict[i] has shape (1, n_titles - 1 - i) and its column k holds the
# similarity between title i and title i + 1 + k.
similarity_dict = {}
n_titles = df_titles.shape[0]
# Stop one row early: the last title has no later titles to compare with, and
# cosine_similarity raises ValueError when handed a 0-row second argument.
for i in tqdm(range(n_titles - 1)):
    # Keep both slices sparse. cosine_similarity accepts sparse input and returns
    # a dense ndarray, so calling .toarray() on the remaining rows every iteration
    # would only allocate a large dense matrix for nothing.
    vec_i = X_features[i, :]
    vec_j = X_features[i+1:, :]
    similarity_dict[i] = cosine_similarity(vec_i, vec_j)

  0%|          | 0/30000 [00:00<?, ?it/s]

In [31]:
# Highest key stored so far (iterating a dict yields its keys).
max(similarity_dict)

1556

In [36]:
import pickle

# Write the current contents of similarity_dict to disk (presumably a checkpoint
# of a partially completed run — TODO confirm).
# NOTE(review): only load pickle files back from trusted sources.
with open('pairwise_similarity_v2_1.pkl', 'wb') as handle:
    pickle.dump(similarity_dict, handle)

In [None]:
# Resume the row-vs-rest computation into the existing similarity_dict.
# Hard-coded restart index; presumably the number of rows already stored when the
# previous loop was interrupted — verify against len(similarity_dict) before re-running.
resume_from = 3878
# Stop one row early: the last title has no later titles to compare with, and
# cosine_similarity raises ValueError when handed a 0-row second argument.
for i in tqdm(range(resume_from, df_titles.shape[0] - 1)):
    # Sparse slices are accepted by cosine_similarity (the result is still a dense
    # ndarray), which avoids densifying the remaining rows on every iteration.
    vec_i = X_features[i, :]
    vec_j = X_features[i+1:, :]
    similarity_dict[i] = cosine_similarity(vec_i, vec_j)

  0%|          | 0/26122 [00:00<?, ?it/s]

In [51]:
# Dense bag-of-words frame: one row per title, one column per vocabulary n-gram.
# NOTE: .toarray() materialises the full count matrix in memory.
df_bow = pd.DataFrame(X_features.toarray(), columns=vectorizer.get_feature_names_out())

In [59]:
# How many titles have each total n-gram count: the row sum is the number of
# in-vocabulary unigram/bigram occurrences in a title, sorted ascending by that
# total. A total of 0 means none of the title's n-grams survived pruning.
(
    df_bow
    .sum(axis=1)
    .value_counts()
    .reset_index()
    .sort_values('index')
)

Unnamed: 0,index,0
19,0,26
16,1,180
14,2,369
12,3,686
10,4,1118
8,5,1658
7,6,1940
5,7,2665
4,8,2785
1,9,3240


In [52]:
# Inspect the first rows and the n-gram column names of the dense count frame.
df_bow.head()

Unnamed: 0,0007,001,002,003,01,01 preto,01ac,02,02 pares,02 tênis,...,öus imigrante,öus naccarato,öus phibo,öus skate,últimas,últimas peças,últimas unidades,último,única,único
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
# (number of titles, vocabulary size)
X_features.shape

(30000, 13612)

In [None]:
# Pairwise cosine similarity keyed by (i, j) with j > i.
# NOTE: the dict ends up with n_titles * (n_titles - 1) / 2 entries, so the full
# run needs a lot of memory; consider the row-wise layout if that is a problem.
similarity_dict = {}
n_titles = df_titles.shape[0]
# The last row has no j > i partner, so the outer loop can stop one row early.
# A single progress bar replaces the per-row inner bars that flooded the output.
for i in tqdm(range(n_titles - 1)):
    # One vectorized call per row (sparse slices are accepted) instead of one
    # cosine_similarity call per pair; row_sims[k] is the similarity between
    # title i and title i + 1 + k.
    row_sims = cosine_similarity(X_features[i, :], X_features[i+1:, :])[0]
    similarity_dict.update(
        ((i, j), sim) for j, sim in enumerate(row_sims, start=i + 1)
    )

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

In [None]:
import pickle

# Write similarity_dict to disk so the pairwise similarities can be reloaded
# without recomputing them.
# NOTE(review): only load pickle files back from trusted sources.
with open('pairwise_similarity.pkl', 'wb') as handle:
    pickle.dump(similarity_dict, handle)

In [35]:
# Number of entries currently stored in similarity_dict.
# NOTE(review): the recorded output carries execution count 35, i.e. it comes from
# an earlier out-of-order run and will differ after a full top-to-bottom execution.
len(similarity_dict)

3878