Amazon review data

https://www.kaggle.com/datasets/bittlingmayer/amazonreviews

In [5]:
import bz2
from pathlib import Path

import numpy as np
import pandas as pd


def get_data(file_name: Path):
    reviews = bz2.BZ2File(file_name).readlines()
    reviews = [review.decode("utf-8") for review in reviews]
    target = {'1':'Negative', '2':'Positive'}
    
    label = [target[label[9]] for label in reviews]
    reviews = [review[11:] for review in reviews]
    df = pd.DataFrame(data = {"label":label, "review": reviews})
    return df

In [6]:
path = Path("../../data")
train = get_data(path / "train.ft.txt.bz2").sample(140000, random_state=0)
test = get_data(path / "test.ft.txt.bz2")

In [7]:
test.head()

Unnamed: 0,label,review
0,Positive,Great CD: My lovely Pat has one of the GREAT v...
1,Positive,One of the best game music soundtracks - for a...
2,Negative,Batteries died within a year ...: I bought thi...
3,Positive,"works fine, but Maha Energy is better: Check o..."
4,Positive,Great for the non-audiophile: Reviewed quite a...


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [15]:
vec = TfidfVectorizer(
    ngram_range=(1,2),
    min_df=3, 
    max_df=0.9, 
    strip_accents='unicode', 
    use_idf=True,
    smooth_idf=True, 
    sublinear_tf=True
)

encoder = LabelEncoder()

In [16]:
X_train = vec.fit_transform(train['review'])
X_test = vec.transform(test['review'])
Y_train = encoder.fit_transform(train['label'])
Y_test = encoder.transform(test['label'])

In [17]:
log_model = LogisticRegression(
    C=4, dual=True, solver='liblinear', random_state=42
)
log_model.fit(X_train, Y_train)

In [18]:
from sklearn.metrics import classification_report

y_pred = log_model.predict(X_test)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92    200000
           1       0.92      0.92      0.92    200000

    accuracy                           0.92    400000
   macro avg       0.92      0.92      0.92    400000
weighted avg       0.92      0.92      0.92    400000



In [34]:
weight = pd.Series(log_model.coef_[0, :], index=vec.get_feature_names_out()).sort_values()

In [37]:
weight.head(10)

not              -20.348428
disappointing    -16.972698
disappointed     -16.215576
not worth        -15.808484
worst            -15.705498
boring           -15.428859
poor             -15.214610
disappointment   -14.966582
terrible         -12.943469
horrible         -12.828058
dtype: float64

In [38]:
weight.tail(10)

better than     9.679964
wonderful      10.604837
love           10.793873
best           11.316830
amazing        11.461895
good           11.688614
awesome        12.913146
perfect        13.743777
excellent      18.595586
great          22.434792
dtype: float64

## BERT

In [39]:
from typing import List, Mapping

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoConfig, AutoModel, AutoTokenizer

device = torch.device("cpu")
MODEL_NAME = 'distilbert-base-uncased'
NUM_EPOCHS = 1
LEARNING_RATE = 5e-5
MAX_SEQ_LENGTH = 512
BATCH_SIZE = 100

class ReviewDataset(Dataset):

    def __init__(
        self,
        sentences: List[str],
        labels: List[str] = None,
        max_seq_length: int = MAX_SEQ_LENGTH,
        model_name: str = 'distilbert-base-uncased'
    ):

        self.sentences = sentences
        self.labels = labels
        self.max_seq_length = max_seq_length

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        
    def __len__(self):
        return len(self.sentences)

    
    def __getitem__(self, index) -> Mapping[str, torch.Tensor]:
        sentence = self.sentences[index]
        encoded = self.tokenizer.encode_plus(
            sentence, 
            add_special_tokens=True, 
            padding="max_length", 
            max_length=self.max_seq_length,
            truncation=True,
            return_token_type_ids=True
        )
        
        output = {
            'input_ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'targets': torch.tensor(self.labels[index], dtype=torch.long)
        }
        
        return output

In [40]:
train = train.sample(1000)
test = test.sample(1000)

train_dataset = ReviewDataset(
    sentences=train['review'].values.tolist(),
    labels=train['label'].map({"Negative": 0, "Positive": 1}).values,
    max_seq_length=MAX_SEQ_LENGTH,
    model_name=MODEL_NAME
)

valid_dataset = ReviewDataset(
    sentences=test['review'].values.tolist(),
    labels=test['label'].map({"Negative": 0, "Positive": 1}).values,
    max_seq_length=MAX_SEQ_LENGTH,
    model_name=MODEL_NAME
)

In [41]:
training_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE, 
    shuffle=True, 
    num_workers=2
)

valid_loader = DataLoader(
    dataset=valid_dataset,
    batch_size=BATCH_SIZE, 
    shuffle=False, 
    num_workers=2,
)

In [42]:
class DistilBert(nn.Module):

    def __init__(self, pretrained_model_name: str = MODEL_NAME, num_classes: int = 2):

        super().__init__()

        config = AutoConfig.from_pretrained(
             pretrained_model_name
        )
        self.distilbert = AutoModel.from_pretrained(
            pretrained_model_name,
            config=config
        )
        self.pre_classifier = nn.Linear(config.dim, config.dim)
        self.classifier = nn.Linear(config.dim, num_classes)
        self.dropout = nn.Dropout(config.seq_classif_dropout)

    def forward(self, input_ids, attention_mask):

        distilbert_output = self.distilbert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        hidden_state = distilbert_output[0] 
        pooled_output = hidden_state[:, 0] 
        pooled_output = self.pre_classifier(pooled_output)
        pooled_output = F.relu(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        return logits

In [43]:
model = DistilBert()

optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=LEARNING_RATE
)

In [44]:
model.train()

for epoch in range(NUM_EPOCHS):
    for step, data in enumerate(training_loader):
        
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.long)

        outputs = model(ids, mask)

        optimizer.zero_grad()

        loss = F.nll_loss(outputs, targets, weight=torch.tensor([1., 1.]).to(device))
        print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Epoch: 0, Loss:  0.005630552768707275
Epoch: 0, Loss:  -0.22734108567237854
Epoch: 0, Loss:  -0.48337310552597046
Epoch: 0, Loss:  -0.7381134033203125
Epoch: 0, Loss:  -1.0398310422897339
Epoch: 0, Loss:  -1.2750431299209595
Epoch: 0, Loss:  -1.5029046535491943
Epoch: 0, Loss:  -1.731217861175537
Epoch: 0, Loss:  -1.8920376300811768
Epoch: 0, Loss:  -2.0423271656036377


In [45]:
model.eval()

labels = []
confidences = []
probs = []
with torch.no_grad():
    for data in valid_loader:
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)

        outputs = model(ids, mask)
        prob = outputs.data.exp()
        confidence, idx = torch.max(prob, dim=-1)

        labels.append(idx)
        confidences.append(confidence.tolist())
        probs.append(prob.tolist())

labels = np.concatenate(labels, axis=0)
confidences = np.concatenate(confidences, axis=0)
probs = np.concatenate(probs, axis=0)