In [1]:
import pandas as pd
import nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

In [2]:
import nltk
nltk.download(['punkt', 'stopwords'])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
df = pd.read_csv("selected_data.csv")

In [7]:
df.head()

Unnamed: 0,overall,reviewText
0,2,I have an older URC-WR7 remote and thought thi...
1,5,First time I've EVER had a remote that needed ...
2,4,Got them and only 2 of them worked. company ca...
3,5,I got tired of the remote being on the wrong s...
4,5,After purchasing cheap cords from another webs...


In [8]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)

    # Remove punctuation and special characters
    tokens = [word for word in tokens if word.isalnum()]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    # Join the tokens back into a string
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

In [9]:
df_preprocessed = df.copy()
df_preprocessed['reviewText'] = df_preprocessed['reviewText'].apply(preprocess_text)
df_preprocessed.to_csv('data_preprocessed.csv', index=False)


In [10]:
#Split to Train and Validation
from sklearn.model_selection import train_test_split
df_preprocessed = pd.read_csv('data_preprocessed.csv')
train_data, valid_data = train_test_split(df_preprocessed, test_size=0.2, random_state=42)
train_data.to_csv('train_data.csv', index=False)
valid_data.to_csv('valid_data.csv', index=False)

In [12]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m75.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m77.3 MB/s[0m eta [36m0:00:00[0m
Col

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import torch
from sklearn.metrics import accuracy_score
from torch import nn
from torch.optim import Adam

In [14]:
train_data = pd.read_csv('train_data.csv')
valid_data = pd.read_csv('valid_data.csv')

In [20]:
train_data.head()

Unnamed: 0,overall,reviewText
0,4,dual monitor pc one dvi one vga port adapt per...
1,4,excel stabil 18 power binocular cheap worth pr...
2,1,first time order len receiv wrong len wrong si...
3,5,ok fed new modern electron devic everi singl b...
4,2,unit good deal work 2 major complaint go back ...


In [21]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return {'text': str(self.texts.iloc[idx]), 'label': int(self.labels.iloc[idx])}

In [16]:
train_dataset = CustomDataset(train_data['reviewText'], train_data['overall'])
valid_dataset = CustomDataset(valid_data['reviewText'], valid_data['overall'])

In [22]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(list(train_dataset.texts), truncation=True, padding=True)
valid_encodings = tokenizer(list(valid_dataset.texts), truncation=True, padding=True)

In [23]:
# Convert data to PyTorch tensors
train_labels = torch.tensor(list(train_dataset.labels))
valid_labels = torch.tensor(list(valid_dataset.labels))

In [24]:
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [25]:
train_dataset = CustomDataset(train_encodings, train_labels)
valid_dataset = CustomDataset(valid_encodings, valid_labels)

In [26]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=6)


Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
optimizer = AdamW(model.parameters(), lr=5e-5)
loss_fn = nn.CrossEntropyLoss()



In [28]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=8, shuffle=False)

In [29]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [32]:
epochs = 4

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}', position=0, leave=True):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Adjust labels to be in the range 0 to 5
        labels = labels - 1

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_loader)
    print(f'Average training loss for Epoch {epoch + 1}: {avg_loss}')

    # Validation loop
    model.eval()
    val_preds = []
    val_true = []

    with torch.no_grad():
        for batch in tqdm(valid_loader, desc=f'Validation', position=0, leave=True):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Adjust labels to be in the range 0 to 5
            labels = labels - 1

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_true.extend(labels.cpu().numpy())

    val_accuracy = accuracy_score(val_true, val_preds)
    print(f'Validation Accuracy for Epoch {epoch + 1}: {val_accuracy}')

  item['labels'] = torch.tensor(self.labels[idx])
Epoch 1: 100%|██████████| 773/773 [04:55<00:00,  2.62it/s]


Average training loss for Epoch 1: 0.6826124079766434


Validation: 100%|██████████| 194/194 [00:26<00:00,  7.34it/s]


Validation Accuracy for Epoch 1: 0.6168284789644013


  item['labels'] = torch.tensor(self.labels[idx])
Epoch 2: 100%|██████████| 773/773 [04:54<00:00,  2.63it/s]


Average training loss for Epoch 2: 0.4679629945570909


Validation: 100%|██████████| 194/194 [00:26<00:00,  7.32it/s]


Validation Accuracy for Epoch 2: 0.6362459546925566


  item['labels'] = torch.tensor(self.labels[idx])
Epoch 3: 100%|██████████| 773/773 [04:54<00:00,  2.62it/s]


Average training loss for Epoch 3: 0.3322795400104424


Validation: 100%|██████████| 194/194 [00:26<00:00,  7.36it/s]


Validation Accuracy for Epoch 3: 0.626537216828479


  item['labels'] = torch.tensor(self.labels[idx])
Epoch 4: 100%|██████████| 773/773 [04:54<00:00,  2.63it/s]


Average training loss for Epoch 4: 0.25761845928495136


Validation: 100%|██████████| 194/194 [00:26<00:00,  7.32it/s]

Validation Accuracy for Epoch 4: 0.6233009708737864





In [33]:
from sklearn.metrics import f1_score, precision_score, recall_score

val_f1_micro = f1_score(val_true, val_preds, average='micro')
val_precision_micro = precision_score(val_true, val_preds, average='micro')
val_recall_micro = recall_score(val_true, val_preds, average='micro')

print(f'Validation F1 Score (Micro): {val_f1_micro}')
print(f'Validation Precision (Micro): {val_precision_micro}')
print(f'Validation Recall (Micro): {val_recall_micro}')

Validation F1 Score (Micro): 0.6233009708737864
Validation Precision (Micro): 0.6233009708737864
Validation Recall (Micro): 0.6233009708737864


In [34]:
import matplotlib.pyplot as plt

plt.plot(range(1, epochs + 1), train_losses, label='Training Loss')
plt.plot(range(1, epochs + 1), valid_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()


NameError: ignored