In [1]:
import os
import re
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import DistilBertModel, DistilBertTokenizer, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, mean_squared_error, r2_score, precision_recall_curve
from tqdm import tqdm
import matplotlib.pyplot as plt

In [2]:
def load_imdb_data(data_dir):
    """Load one split of the aclImdb corpus into a DataFrame.

    Reads every ``*.txt`` file under ``<data_dir>/pos`` and ``<data_dir>/neg``.
    The star rating is parsed from the file name, which must end in
    ``_<rating>.txt`` (for example ``123_8.txt``).

    Args:
        data_dir: Directory that contains the ``pos`` and ``neg`` sub-directories.

    Returns:
        pandas.DataFrame with the columns ``review`` (file contents),
        ``sentiment`` (1 for ``pos``, 0 for ``neg``) and ``rating`` (int).
        Rows are ordered ``pos`` first, then ``neg``, each sorted by file name.

    Raises:
        ValueError: If a ``.txt`` file name does not end in ``_<rating>.txt``.
    """
    rating_pattern = re.compile(r'_(\d+)\.txt$')
    reviews = []
    sentiments = []
    ratings = []
    for label_type in ['pos', 'neg']:
        dir_name = os.path.join(data_dir, label_type)
        sentiment = 1 if label_type == 'pos' else 0
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # resulting frame identical across runs and machines.
        for fname in sorted(os.listdir(dir_name)):
            if not fname.endswith(".txt"):
                continue
            match = rating_pattern.search(fname)
            if match is None:
                # Fail with the offending path instead of an opaque
                # "'NoneType' object has no attribute 'group'".
                raise ValueError(
                    f"Cannot parse rating from file name: {os.path.join(dir_name, fname)}"
                )
            with open(os.path.join(dir_name, fname), 'r', encoding='utf-8') as f:
                reviews.append(f.read())
            sentiments.append(sentiment)
            ratings.append(int(match.group(1)))
    return pd.DataFrame({'review': reviews, 'sentiment': sentiments, 'rating': ratings})

# Load the labelled train and test splits. The paths are relative to the
# notebook's working directory, so the aclImdb folder must sit next to it.
train_data = load_imdb_data('aclImdb/train')
test_data = load_imdb_data('aclImdb/test')

In [3]:
# Tokenizer for the 'distilbert-base-uncased' checkpoint; shared by dataset
# preparation, saving and the interactive prediction cell.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def prepare_data(df, tokenizer, max_length=256):
    """Tokenize a review frame and bundle it with its targets.

    Args:
        df: DataFrame providing the ``review``, ``sentiment`` and ``rating`` columns.
        tokenizer: Callable invoked once on the full list of reviews with
            truncation and padding enabled; it must return a mapping holding
            ``input_ids`` and ``attention_mask`` tensors.
        max_length: Forwarded to the tokenizer as ``max_length``.

    Returns:
        TensorDataset yielding ``(input_ids, attention_mask, sentiment, rating)``
        where ``sentiment`` is ``torch.long`` and ``rating`` is ``torch.float``.
    """
    review_texts = df['review'].tolist()
    encoded = tokenizer(
        review_texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt',
    )
    sentiment_targets = torch.tensor(df['sentiment'].tolist(), dtype=torch.long)
    rating_targets = torch.tensor(df['rating'].tolist(), dtype=torch.float)
    return TensorDataset(
        encoded['input_ids'],
        encoded['attention_mask'],
        sentiment_targets,
        rating_targets,
    )

# Tokenize both splits up front with prepare_data's default max_length.
train_dataset = prepare_data(train_data, tokenizer)
test_dataset = prepare_data(test_data, tokenizer)

# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)



In [4]:
class DistilBertMultitaskModel(nn.Module):
    """DistilBERT encoder feeding two linear heads.

    One head emits ``num_labels`` sentiment logits, the other a single rating
    value. Both heads read the dropout-regularised hidden state of the first
    token position.
    """

    def __init__(self, num_labels=2):
        """Load the pretrained encoder and create both task heads.

        Args:
            num_labels: Output width of the sentiment classification head.
        """
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        hidden_size = self.distilbert.config.hidden_size
        self.dropout = nn.Dropout(0.1)
        self.sentiment_classifier = nn.Linear(hidden_size, num_labels)
        self.rating_regressor = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and both heads.

        Args:
            input_ids: Token ids passed to the encoder as ``input_ids``.
            attention_mask: Mask passed to the encoder as ``attention_mask``.

        Returns:
            Tuple ``(sentiment_logits, rating)`` as produced by the two heads.
        """
        encoder_out = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        # Index 0 along the second axis selects the first token's hidden state.
        first_token_state = encoder_out.last_hidden_state[:, 0]
        features = self.dropout(first_token_state)
        return self.sentiment_classifier(features), self.rating_regressor(features)

# Seed PyTorch's global RNG before the model is built so that the randomly
# initialised task heads, the dropout masks and the batch order drawn by the
# shuffling train_loader repeat from one "Restart & Run All" to the next.
# (GPU kernels may still introduce small run-to-run differences.)
torch.manual_seed(42)

model = DistilBertMultitaskModel()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# A single optimizer updates the encoder and both heads. The training loss is
# the unweighted sum of the classification and regression losses below.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
sentiment_loss_fn = nn.CrossEntropyLoss()
rating_loss_fn = nn.MSELoss()

def train(model, train_loader, optimizer, sentiment_loss_fn, rating_loss_fn, device, epochs=3):
    """Fine-tune the multitask model on sentiment and rating jointly.

    Each step minimises ``sentiment_loss + rating_loss`` and advances a linear
    learning-rate schedule (no warm-up) spanning ``len(train_loader) * epochs``
    steps. The progress bar shows the mean loss over the current epoch so far.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning ``(sentiment_logits, rating_pred)``.
        train_loader: Sized iterable of
            ``(input_ids, attention_mask, sentiment_labels, rating_labels)`` batches.
        optimizer: Optimizer over the model parameters.
        sentiment_loss_fn: Loss applied to ``(sentiment_logits, sentiment_labels)``.
        rating_loss_fn: Loss applied to ``(rating_pred, rating_labels)``.
        device: Device every batch tensor is moved to.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. The model and optimizer are updated in place.
    """
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}')
        # Count steps explicitly: tqdm refreshes its own `.n` counter lazily, so
        # dividing by it can misreport the running mean.
        for step, batch in enumerate(progress_bar, start=1):
            optimizer.zero_grad()
            input_ids, attention_mask, sentiment_labels, rating_labels = [b.to(device) for b in batch]
            sentiment_logits, rating_pred = model(input_ids, attention_mask)
            sentiment_loss = sentiment_loss_fn(sentiment_logits, sentiment_labels)
            # squeeze(-1) drops only the trailing size-1 axis. A bare squeeze()
            # would also drop the batch axis for a batch of one and make the
            # loss broadcast a 0-d prediction against a (1,) target.
            rating_loss = rating_loss_fn(rating_pred.squeeze(-1), rating_labels)
            loss = sentiment_loss + rating_loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
            progress_bar.set_postfix({'loss': total_loss / step})

def evaluate(model, test_loader, device):
    """Run inference over a loader and collect predictions with their targets.

    The model is switched to eval mode (and left in it) and gradients are
    disabled for the whole pass.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning ``(sentiment_logits, rating_pred)``.
        test_loader: Iterable of
            ``(input_ids, attention_mask, sentiment_labels, rating_labels)`` batches.
        device: Device every batch tensor is moved to.

    Returns:
        Tuple of four flat Python lists, in loader order:
        ``(sentiment_predictions, rating_predictions, true_sentiments, true_ratings)``.
        Sentiment predictions are the argmax over the logits' second axis.
    """
    model.eval()
    sentiment_predictions = []
    rating_predictions = []
    true_sentiments = []
    true_ratings = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc='Evaluating'):
            input_ids, attention_mask, sentiment_labels, rating_labels = [b.to(device) for b in batch]
            sentiment_logits, rating_pred = model(input_ids, attention_mask)
            sentiment_preds = torch.argmax(sentiment_logits, dim=1)
            sentiment_predictions.extend(sentiment_preds.cpu().tolist())
            # squeeze(-1) keeps the batch axis. A bare squeeze() turns a batch
            # of one into a 0-d tensor whose tolist() is a float, which
            # list.extend() rejects with a TypeError.
            rating_predictions.extend(rating_pred.squeeze(-1).cpu().tolist())
            true_sentiments.extend(sentiment_labels.cpu().tolist())
            true_ratings.extend(rating_labels.cpu().tolist())
    return sentiment_predictions, rating_predictions, true_sentiments, true_ratings

def calculate_metrics(sentiment_predictions, rating_predictions, true_sentiments, true_ratings):
    """Summarise classification and regression quality in one dictionary.

    Args:
        sentiment_predictions: Predicted sentiment labels.
        rating_predictions: Predicted ratings.
        true_sentiments: Reference sentiment labels.
        true_ratings: Reference ratings.

    Returns:
        dict with the keys ``sentiment_accuracy``, ``sentiment_precision``,
        ``sentiment_recall``, ``sentiment_f1`` (binary averaging),
        ``rating_mse``, ``rating_rmse`` (square root of the MSE) and
        ``rating_r2``, in that order.
    """
    report = {'sentiment_accuracy': accuracy_score(true_sentiments, sentiment_predictions)}

    prec, rec, f_score, _support = precision_recall_fscore_support(
        true_sentiments, sentiment_predictions, average='binary'
    )
    report['sentiment_precision'] = prec
    report['sentiment_recall'] = rec
    report['sentiment_f1'] = f_score

    squared_error = mean_squared_error(true_ratings, rating_predictions)
    report['rating_mse'] = squared_error
    report['rating_rmse'] = squared_error ** 0.5
    report['rating_r2'] = r2_score(true_ratings, rating_predictions)
    return report

# End-to-end run: fine-tune with train()'s default epoch count, then score the
# held-out test split.
print("Training the model...")
train(model, train_loader, optimizer, sentiment_loss_fn, rating_loss_fn, device)

# evaluate() switches the model to eval mode and returns four flat lists.
print("\nEvaluating the model...")
sentiment_predictions, rating_predictions, true_sentiments, true_ratings = evaluate(model, test_loader, device)

print("\nCalculating metrics...")
metrics = calculate_metrics(sentiment_predictions, rating_predictions, true_sentiments, true_ratings)

# Report every metric with four decimal places.
print("\nModel Performance Metrics:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")

Training the model...


Epoch 1: 100%|██████████| 782/782 [05:31<00:00,  2.36it/s, loss=5.24]
Epoch 2: 100%|██████████| 782/782 [05:36<00:00,  2.32it/s, loss=2.81]
Epoch 3: 100%|██████████| 782/782 [05:35<00:00,  2.33it/s, loss=2.04]



Evaluating the model...


Evaluating: 100%|██████████| 782/782 [01:53<00:00,  6.87it/s]


Calculating metrics...

Model Performance Metrics:
sentiment_accuracy: 0.9101
sentiment_precision: 0.9078
sentiment_recall: 0.9130
sentiment_f1: 0.9104
rating_mse: 3.0796
rating_rmse: 1.7549
rating_r2: 0.7473





In [7]:
# Persist only the weights (state_dict); loading them back requires building a
# DistilBertMultitaskModel first and calling load_state_dict on it.
# NOTE(review): "distilbertbert" in the file name looks like a typo; it is left
# unchanged because other code may already load this exact path.
torch.save(model.state_dict(), 'film_review_predict_distilbertbert.pth')
# The tokenizer files go to a separate directory, relative to the working dir.
tokenizer.save_pretrained('model/tokenizer')

('model/tokenizer\\tokenizer_config.json',
 'model/tokenizer\\special_tokens_map.json',
 'model/tokenizer\\vocab.txt',
 'model/tokenizer\\added_tokens.json')

In [None]:
def predict_sentiment_and_rating(text, model, tokenizer, device):
    """Classify a single review and estimate its rating.

    Args:
        text: Review text handed to the tokenizer.
        model: Callable invoked with the tokenizer's output as keyword
            arguments; returns ``(sentiment_logits, rating_pred)``.
        tokenizer: Callable returning a mapping of tensors; called with
            truncation, padding and ``max_length=256``.
        device: Device the encoded tensors are moved to.

    Returns:
        Tuple ``(sentiment, sentiment_confidence, rating)``. ``sentiment`` is
        ``"Positive"`` when the softmax probability at index 1 exceeds 0.5,
        otherwise ``"Negative"``; ``sentiment_confidence`` is the probability
        of the reported class; ``rating`` is the regression output as a float.
    """
    model.eval()
    tokens = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=256)
    model_inputs = {name: tensor.to(device) for name, tensor in tokens.items()}
    with torch.no_grad():
        sentiment_logits, rating_pred = model(**model_inputs)

    # Probabilities of the first (only) example in the batch.
    class_probs = torch.softmax(sentiment_logits, dim=1)[0]
    if class_probs[1] > 0.5:
        sentiment, sentiment_confidence = "Positive", class_probs[1].item()
    else:
        sentiment, sentiment_confidence = "Negative", class_probs[0].item()
    return sentiment, sentiment_confidence, rating_pred.squeeze().item()

# Interactive demo: keep scoring reviews typed by the user until they type
# 'quit', close stdin or interrupt the kernel.
while True:
    try:
        user_review = input("\nEnter a movie review (or type 'quit' to exit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the cell was interrupted: stop quietly instead of
        # ending the cell with a traceback.
        break
    if user_review.lower() == 'quit':
        break
    if not user_review:
        # Nothing to score; ask again rather than predicting on empty text.
        continue
    sentiment, confidence, rating = predict_sentiment_and_rating(user_review, model, tokenizer, device)
    print(f"Predicted sentiment: {sentiment}")
    print(f"Sentiment confidence: {confidence:.2f}")
    print(f"Predicted rating: {rating:.1f}/10")