In [90]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
import transformers as ppb
from gensim.models import KeyedVectors
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Carregando dataset
df = pd.read_csv('../datasets/buscape_preprocessed_balanced.csv')

In [4]:
# Batch com 2000 instâncias para melhor performance
batch_1 = df[:1000]

In [5]:
batch_1['rating'].value_counts()

rating
1    906
0     94
Name: count, dtype: int64

In [45]:
# Carregando distribuição bert
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

# Carregando modelo pré-treinado e tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [61]:
# Tokenizando
#tokenized = batch_1['review_text'].apply((lambda x: tokenizer.encode(x, padding=True, return_tensors='pt', truncation=True, max_length=50, add_special_tokens=True)))
tokenized = batch_1['review_text'].apply((lambda x: tokenizer.encode(x, truncation=True, max_length=50, add_special_tokens=True)))

In [64]:
# Padding
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)
print(max_len)
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

50


In [65]:
# Masking
attention_mask = np.where(padded != 0, 1, 0)

In [66]:
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

In [67]:
input_ids.shape

torch.Size([1000, 50])

In [68]:
attention_mask.shape

torch.Size([1000, 50])

In [69]:
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [72]:
features = last_hidden_states[0][:,0,:].numpy()

In [74]:
features.shape

(1000, 768)

In [75]:

labels = batch_1['rating']

In [76]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.2, random_state=42)

In [None]:
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

In [82]:
lr_clf.score(test_features, test_labels)

0.904

In [86]:
clf = DummyClassifier()
scores = cross_val_score(clf, train_features, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.907 (+/- 0.00)


In [None]:
# Referências:
# https://github.com/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb