In [44]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch


tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

def prepare_data(texts, labels, max_length=512):
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('imdb_dataset.csv', engine = 'python')

In [33]:
from sklearn.model_selection import train_test_split

class ReviewDataset(Dataset):
    def __init__(self, reviews, labels, tokenizer, max_length=512):
        self.encodings = tokenizer(
            reviews.tolist(),
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt'
        )
        self.labels = torch.tensor([1 if label == 'positive' else 0 for label in labels])

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

def train_model(train_dataloader, model, epochs=6):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in train_dataloader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        avg_loss = total_loss / len(train_dataloader)
        print(f'Epoch {epoch + 1}, Average Loss: {avg_loss:.4f}')

# Main execution
def prepare_and_train(df, test_size=0.2, batch_size=32):
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=2
    )


    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df['review'].values,
        df['sentiment'].values,
        test_size=test_size
    )


    train_dataset = ReviewDataset(train_texts, train_labels, tokenizer)
    val_dataset = ReviewDataset(val_texts, val_labels, tokenizer)


    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=batch_size
    )


    train_model(train_dataloader, model)

    return model, tokenizer, val_dataloader



In [34]:
# Can only use 3000 rows because of techinical and time constraints. (RAM memory overload)
model, tokenizer, val_dataloader  = prepare_and_train(df[0:3000])

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Average Loss: 0.5144
Epoch 2, Average Loss: 0.2122
Epoch 3, Average Loss: 0.1171
Epoch 4, Average Loss: 0.0461
Epoch 5, Average Loss: 0.0309
Epoch 6, Average Loss: 0.0247


In [36]:
# prompt: Write a function to evaluate metrics of the model

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def evaluate_model(model, dataloader, tokenizer):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            predicted_labels = torch.argmax(logits, dim=1).cpu().numpy()
            predictions.extend(predicted_labels)
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

# Example usage (assuming you have val_dataloader from prepare_and_train)
evaluate_model(model, val_dataloader, tokenizer)


Accuracy: 0.8950
Precision: 0.8960
Recall: 0.8950
F1-score: 0.8950


In [28]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')




In [None]:
def compute(data):
  tokenized = tokenizer(
            data,
            truncation=True,
            padding='max_length',
            max_length=512,
            return_tensors='pt'
        )

  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  tokenized = {k: v.to(device) for k, v in tokenized.items()}

  model.eval()

  with torch.no_grad():  # Disable gradient calculations during inference
      outputs = model(**tokenized)  # Pass tokenized input to the model
      logits = outputs.logits  # Get raw model outputs (logits)
      predicted_class = torch.argmax(logits, dim=1)  # Get predicted class index

  print('Positive' if predicted_class.tolist()[0] else 'Negative')  # Print the predicted class index (0 or 1)



In [39]:
while True:

  compute(input())

I love this movie!
Positive
I hate this movie!
Negative
I recommend everybody to watch this movie!
Positive
This movie sucks, don't watch it!
Negative


KeyboardInterrupt: Interrupted by user

In [42]:
import os

# Save the model weights and configuration
def save_model(model, tokenizer, output_dir='sentiment_model'):
    # Create output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Save model weights and configuration
    model.save_pretrained(output_dir)

    # Save tokenizer
    tokenizer.save_pretrained(output_dir)

# Load the model
def load_model(model_dir='sentiment_model'):
    model = DistilBertForSequenceClassification.from_pretrained(model_dir)
    tokenizer = DistilBertTokenizer.from_pretrained(model_dir)
    return model, tokenizer

In [43]:
save_model(model, tokenizer, 'imdb_sentiment_model')

# model, tokenizer = load_model('imdb_sentiment_model')