In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoModel, BertTokenizerFast
from sklearn.pipeline import make_pipeline
from transformers import BertTokenizer, BertModel
from sklearn.linear_model import LogisticRegression


# specify GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [2]:
df = pd.read_table('sentiment labelled sentences/amazon_cells_labelled.txt')

In [3]:
df.columns = ['text','label']

In [4]:
df

Unnamed: 0,text,label
0,"Good case, Excellent value.",1
1,Great for the jawbone.,1
2,Tied to charger for conversations lasting more...,0
3,The mic is great.,1
4,I have to jiggle the plug to get it to line up...,0
...,...,...
994,The screen does get smudged easily because it ...,0
995,What a piece of junk.. I lose more calls on th...,0
996,Item Does Not Match Picture.,0
997,The only thing that disappoint me is the infra...,0


In [5]:
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.3)

In [6]:
model = transformers.BertModel.from_pretrained("bert-base-uncased")
tokenizer =  transformers.BertTokenizer.from_pretrained('bert-base-uncased')

def transform(sentences):
    tokenized = sentences.apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
    # calculer la taille la plus grande, pour ensuite completer avec 
    # la difference pour avoir une taille unique maximale de taille max_len
    max_len = 0
    for i in tokenized.values:
        if len(i) > max_len:
            max_len = len(i)
    # on rajoute le padding 
    tokenized_pad = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
    input_ids = torch.tensor(tokenized_pad)
    # on detach le gradient
    with torch.no_grad(): 
        embeddings  = model(input_ids)
    return embeddings[0][:, 0, :].numpy()


Remarque:
Il existe un moyen de faire le padding avec un parametre du tokenizer, mais on se retrouve avec un array de sequence list, et pour retransformer le tout, ca prend beaucoup de temps et de memoire.

In [12]:
tokens = transform(x_train)

clf = LogisticRegression(max_iter=1000)
clf.fit(tokens, y_train)

LogisticRegression(max_iter=1000)

In [13]:
test_tokens =transform(x_test)

In [14]:
predicted = clf.predict(test_tokens)

In [15]:
clf.score(test_tokens,y_test)

0.91

In [16]:
print(classification_report(y_test,predicted))

              precision    recall  f1-score   support

           0       0.89      0.93      0.91       148
           1       0.93      0.89      0.91       152

    accuracy                           0.91       300
   macro avg       0.91      0.91      0.91       300
weighted avg       0.91      0.91      0.91       300

