<a href="https://colab.research.google.com/github/carlos-dani-dev/carlos-dani-dev/blob/main/notebooks/Operacao_8_Vectorizers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Monte o Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Download do dataset
!gdown --id 1_Elg5O_H0fJ1mIWEKkb6q4ul9x7SIhnk -O /content/drive/MyDrive/pln_data/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Downloading...
From: https://drive.google.com/uc?id=1_Elg5O_H0fJ1mIWEKkb6q4ul9x7SIhnk
To: /content/drive/MyDrive/pln_data/b2w_reviews_samsung_cleaned.csv
100% 801k/801k [00:00<00:00, 39.4MB/s]


In [2]:
!pip install gensim
# Executar no início dos notebooks
!pip install gensim sentence-transformers
!pip install wordcloud plotly

# Para downloads do dataset
!pip install gdown



In [28]:
# Verificar uso de RAM
!cat /proc/meminfo | head -n 3

# Verificar GPU disponível
!nvidia-smi

MemTotal:       13289424 kB
MemFree:         7132144 kB
MemAvailable:   10344660 kB
/bin/bash: line 1: nvidia-smi: command not found


In [29]:
# Bibliotecas principais
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Pré-processamento
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [30]:
# TensorFlow/Keras
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

# Word Embeddings
import gensim
from gensim.models import Word2Vec, Doc2Vec
from sentence_transformers import SentenceTransformer

# NLTK
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
reviews = pd.read_csv('/content/drive/MyDrive/PLN_B2W_Reviews/data/processed/b2w_reviews_samsung_modified.csv')
reviews

Unnamed: 0,review_text,overall_rating,recommend_to_a_friend
0,Uma tela impecável. Se sua prioridade é tela e...,4,1
1,Melhor custo benefício. Exatamente como anunc...,5,1
2,"Muito útil, para pesquisas e baixar publicaçõe...",5,1
3,A entrega sempre no prazo e muitas vezes até a...,4,1
4,"Recomendo tanto o produto quanto a loja, produ...",4,1
...,...,...,...
5748,O produto é muito bom. O designer é perfeito. ...,5,1
5749,"Excelente produto. Acima do esperado, Recomend...",5,1
5750,"Amei o produto muito bom ,chegou bem antes do ...",5,1
5751,"Qualidade de imagem e som são excelentes, além...",5,1


In [32]:
all_reviews = reviews['review_text']
all_reviews

Unnamed: 0,review_text
0,Uma tela impecável. Se sua prioridade é tela e...
1,Melhor custo benefício. Exatamente como anunc...
2,"Muito útil, para pesquisas e baixar publicaçõe..."
3,A entrega sempre no prazo e muitas vezes até a...
4,"Recomendo tanto o produto quanto a loja, produ..."
...,...
5748,O produto é muito bom. O designer é perfeito. ...
5749,"Excelente produto. Acima do esperado, Recomend..."
5750,"Amei o produto muito bom ,chegou bem antes do ..."
5751,"Qualidade de imagem e som são excelentes, além..."


In [26]:
count_vect = CountVectorizer()
'''
count_vect = CountVectorizer(
    stop_words="portuguese",
    min_df=2,
    max_df=0.8
)
'''
X_count = count_vect.fit_transform(all_reviews)
X_count = X_count.toarray()
'''
print("Matriz CountVectorizer (bag of words):")
print(X_count)  # matriz densa com cada review vetorizada
print("Vocabulário (features):")
print(count_vect.get_feature_names_out())
'''
df = pd.DataFrame(X_count, columns=count_vect.get_feature_names_out())
print(df)

      00  000  01  015  02  03  04  05  06  07  ...  ùnico  última  último  \
0      0    0   0    0   0   0   0   0   0   0  ...      0       0       0   
1      0    0   0    0   0   0   0   0   0   0  ...      0       0       0   
2      0    0   0    0   0   0   0   0   0   0  ...      0       0       0   
3      0    0   0    0   0   0   0   0   0   0  ...      0       0       0   
4      0    0   0    0   0   0   0   0   0   0  ...      0       0       0   
...   ..  ...  ..  ...  ..  ..  ..  ..  ..  ..  ...    ...     ...     ...   
5748   0    0   0    0   0   0   0   0   0   0  ...      0       0       0   
5749   0    0   0    0   0   0   0   0   0   0  ...      0       0       0   
5750   0    0   0    0   0   0   0   0   0   0  ...      0       0       0   
5751   0    0   0    0   0   0   0   0   0   0  ...      0       0       0   
5752   0    0   0    0   0   0   0   0   0   0  ...      0       0       0   

      últimos  úmida  única  únicas  único  úteis  útil  
0    

In [27]:
tfidf_vect = TfidfVectorizer()
'''
tfidf_vect = TfidfVectorizer(
    stop_words="portuguese",
    min_df=2,
    max_df=0.8
)
'''
X_tfidf = tfidf_vect.fit_transform(all_reviews)
X_tfidf = X_tfidf.toarray()

'''
print("\nMatriz TF-IDF:")
print(X_tfidf.toarray())
print("Vocabulário (features):")
print(tfidf_vect.get_feature_names_out())
'''
df = pd.DataFrame(X_tfidf, columns=tfidf_vect.get_feature_names_out())
print(df)

       00  000   01  015   02   03   04   05   06   07  ...  ùnico  última  \
0     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0     0.0   
1     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0     0.0   
2     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0     0.0   
3     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0     0.0   
4     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0     0.0   
...   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...    ...     ...   
5748  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0     0.0   
5749  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0     0.0   
5750  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0     0.0   
5751  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0     0.0   
5752  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0     0.0   

      último  últimos  úmida  única  únicas  único  úteis      

In [36]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [38]:
reviews_tokens = [r.split() for r in all_reviews]
reviews_tokens

[['Uma',
  'tela',
  'impecável.',
  'Se',
  'sua',
  'prioridade',
  'é',
  'tela',
  'e',
  'câmera,',
  'esse',
  'é',
  'o',
  'cara.',
  'O',
  'ponto',
  'fraco',
  'fica',
  'por',
  'conta',
  'do',
  'armazenamento,',
  'apenas',
  '16GB,',
  'e',
  'pela',
  'câmera',
  'frontal,',
  'de',
  'apenas',
  '5Mpixels',
  'sem',
  'flash.',
  'No',
  'entanto,',
  'a',
  'câmera',
  'principal',
  'de',
  '13Mpixels',
  'é',
  'perfeita.'],
 ['Melhor',
  'custo',
  'benefício.',
  'Exatamente',
  'como',
  'anunciado.',
  'Recomendo',
  '.'],
 ['Muito',
  'útil,',
  'para',
  'pesquisas',
  'e',
  'baixar',
  'publicações',
  'e',
  'vídeos,',
  'usando',
  'o',
  'app',
  'jw',
  'library.',
  'Parabéns!'],
 ['A',
  'entrega',
  'sempre',
  'no',
  'prazo',
  'e',
  'muitas',
  'vezes',
  'até',
  'antes.',
  'O',
  'produto',
  'com',
  'um',
  'custo',
  'benefício',
  'muito',
  'bom.',
  'Recomendo',
  'tanto',
  'o',
  'site',
  'quanto',
  'o',
  'produto.'],
 ['Recomendo',

In [39]:
tagged_data = [TaggedDocument(words=words, tags=[str(i)]) for i, words in enumerate(reviews_tokens)]

# Treinando o Doc2Vec
model_doc2vec = Doc2Vec(
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    epochs=20,
    dm=1
)

model_doc2vec.build_vocab(tagged_data)
model_doc2vec.train(tagged_data, total_examples=model_doc2vec.corpus_count, epochs=model_doc2vec.epochs)

In [40]:
vectors_doc2vec = [model_doc2vec.dv[str(i)] for i in range(len(reviews))]

print("Embedding do primeiro review:", vectors_doc2vec[0])

Embedding do primeiro review: [ 0.0568716  -0.00213303  0.22847192 -0.09440938 -0.10316108  0.02458283
  0.06794484  0.10001874 -0.05416399  0.05485205  0.20392825 -0.23299882
  0.19295175 -0.26137835 -0.0601193  -0.06035749 -0.02534163 -0.15174112
  0.13499539  0.12208906  0.00903674 -0.18118437  0.08767585 -0.02879934
 -0.14330311  0.11594518  0.1325343  -0.05863864 -0.07962441 -0.18684584
 -0.00503118  0.15673508 -0.13318622 -0.16216925  0.002423   -0.14078967
 -0.14062832 -0.00072987 -0.19160207 -0.01077004  0.08074228  0.11386324
 -0.00493257  0.05837388 -0.08879232 -0.12210812 -0.13682203  0.14269635
  0.04143201 -0.13275674  0.09780004  0.2088471  -0.0694993  -0.1711469
  0.17192976 -0.19675392 -0.05227141  0.17637295  0.13434148 -0.13629429
 -0.14829144 -0.04312493 -0.12379533  0.01763203 -0.21263094  0.01047457
  0.16641839  0.14880452 -0.14652579 -0.01669189  0.05133184  0.09221348
 -0.27117932  0.16067795 -0.05249866  0.15470357 -0.11399572 -0.18923903
 -0.17740457  0.235361

In [41]:
from sentence_transformers import SentenceTransformer

In [42]:
model_sbert = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [45]:
vectors_sbert = model_sbert.encode(all_reviews)

print("Dimensão dos embeddings SBERT:", vectors_sbert.shape)
print("Embedding do primeiro review:", vectors_sbert[0])

Dimensão dos embeddings SBERT: (5753, 384)
Embedding do primeiro review: [ 0.08199719  0.16245247 -0.00396649 -0.08919409  0.4117052  -0.15496434
 -0.0201785   0.10905787  0.23906264  0.00617858  0.0913043  -0.06408727
  0.21467522  0.13031565 -0.15362039 -0.14905216  0.3078782  -0.16238216
 -0.08420039  0.24038944  0.00123922 -0.48313272  0.2069793  -0.21068874
  0.09793292  0.0121033  -0.08144653 -0.13229631 -0.07648881 -0.31643838
  0.02896918  0.12536278  0.4107051  -0.04095189  0.05685949 -0.10103796
 -0.11654656 -0.21766287 -0.12755494 -0.1335192  -0.10608496  0.2346628
  0.14204168  0.03057055  0.21936306 -0.02349702 -0.22156864  0.03316034
  0.01743152 -0.02981755 -0.17637911  0.02810703 -0.08774266  0.23701732
 -0.03288361  0.06951696  0.01687621 -0.12618518  0.04875147 -0.04671011
  0.2807747  -0.06237713 -0.01509817 -0.13089861  0.10479917 -0.08030988
 -0.14809878 -0.20986429 -0.11360933  0.33300906 -0.03322508  0.05231756
 -0.14719893 -0.07598647  0.19201177 -0.35069862  0.