In [14]:
!pip install unidecode transformers tensorflow-addons
!pip uninstall -y torch torchvision torchaudio transformers
!pip install torch
!pip install transformers==4.30.2
!pip install unidecode datasets

  pid, fd = os.forkpty()


Found existing installation: torch 2.4.1
Uninstalling torch-2.4.1:
  Successfully uninstalled torch-2.4.1
[0mFound existing installation: transformers 4.30.2
Uninstalling transformers-4.30.2:
  Successfully uninstalled transformers-4.30.2
Collecting torch
  Using cached torch-2.4.1-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Using cached torch-2.4.1-cp310-cp310-manylinux1_x86_64.whl (797.1 MB)
Installing collected packages: torch
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
easyocr 1.7.2 requires torchvision>=0.5, which is not installed.
fastai 2.7.17 requires torchvision>=0.11, which is not installed.
timm 1.0.9 requires torchvision, which is not installed.[0m[31m
[0mSuccessfully installed torch-2.4.1
Collecting transformers==4.30.2
  Using cached transformers-4.30.2-py3-none-any.whl.metadata (113 kB)
Using cached transformers-4.30.2-py3-non

In [15]:
# Install necessary packages

# Import libraries
import torch
import pandas as pd
import numpy as np
import re
from unidecode import unidecode
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizer, DistilBertModel, Trainer, TrainingArguments, DataCollatorWithPadding, EarlyStoppingCallback
from datasets import Dataset
device = torch.device("cuda") 

print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)

PyTorch version: 2.4.1+cu121
CUDA available: True
CUDA version: 12.1


**2. Data Preprocessing**

In [16]:
# Download NLTK stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Define text cleaning functions
def remove_non_alphanum(string):
    if isinstance(string, str):
        string = unidecode(string)
        string = re.sub(r'[^a-zA-Z0-9\s]', '', string)
    return string

def lowercase_and_remove_stopwords(string):
    words = string.lower().split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

# Load the data
train = pd.read_csv('/kaggle/input/movie-plots/train.txt', 
                    delimiter='\t', 
                    names=["Title", "Industry", "Genre", "Director", "Plot"])
test = pd.read_csv('/kaggle/input/movie-plots/test_no_labels.txt', 
                   delimiter='\t', 
                   names=["Title", "Industry", "Director", "Plot"])

# Clean the 'Plot' and 'Director' columns
for df in [train, test]:
    df['Plot'] = df['Plot'].apply(remove_non_alphanum).apply(lowercase_and_remove_stopwords)
    df['Director'] = df['Director'].apply(remove_non_alphanum).apply(lowercase_and_remove_stopwords)


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**3. Handle Class Imbalance **

In [17]:
# Check for class imbalance
genre_counts = train['Genre'].value_counts()
print("Genre counts in training data:")
print(genre_counts)

Genre counts in training data:
Genre
drama        1676
comedy       1193
horror       1108
action       1059
romance       886
western       829
crime         541
animation     535
sci-fi        214
Name: count, dtype: int64


**3. Data Preparation**

a. Split Data into Training and Validation Sets

In [18]:
def split_data(data):
    # Use only the first 100 samples for faster execution
    train_data, tmp_data = train_test_split(
        data, 
        test_size=0.2, 
        shuffle=True,
        random_state=42
    )

    validation_data, test_data = train_test_split(
        tmp_data, 
        test_size=0.5, 
        shuffle=True,
        random_state=42
    )
    return train_data, validation_data, test_data

train_data, val_data, test_data = split_data(train)

b. Encode Labels and Directors

In [19]:
# Encode genres
label_encoder = LabelEncoder()
train_data['label'] = label_encoder.fit_transform(train_data['Genre'])
val_data['label'] = label_encoder.transform(val_data['Genre'])
test_data['label'] = label_encoder.transform(test_data['Genre'])

**4. Tokenization**

In [20]:
# Initialize tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
tokenizer.add_special_tokens({'additional_special_tokens': ['[DIRECTOR]']})

# Function to tokenize data
def tokenize_function(examples):
    director_names = [f"[DIRECTOR] {director}" for director in examples['Director']]
    texts_with_directors = [f"{director_name} {text}" for director_name, text in zip(director_names, examples['Plot'])]
    return tokenizer(texts_with_directors, padding='max_length', truncation=True, max_length=512)

# Prepare the datasets for Hugging Face
train_dataset = Dataset.from_pandas(train_data[['Plot', 'Director', 'label']])
val_dataset = Dataset.from_pandas(val_data[['Plot', 'Director', 'label']])
test_dataset = Dataset.from_pandas(test_data[['Plot', 'Director', 'label']])

# Apply the tokenization
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set the format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])




Map:   0%|          | 0/6432 [00:00<?, ? examples/s]

Map:   0%|          | 0/804 [00:00<?, ? examples/s]

Map:   0%|          | 0/805 [00:00<?, ? examples/s]

**5. Define the model**

In [28]:
import torch.nn as nn

# Define the model class
class DistilBertForGenreClassification(nn.Module):
    def __init__(self, num_labels):
        super(DistilBertForGenreClassification, self).__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.classifier = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(self.bert.config.hidden_size, num_labels)
        )
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        # Get BERT output
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = bert_outputs.last_hidden_state[:, 0, :]  # CLS token output
        # Get logits from classifier
        logits = self.classifier(pooled_output)
        # Compute loss if labels are provided
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)
        return {'loss': loss, 'logits': logits}


**6. Instantiate and Train the Model**

In [30]:
# Instantiate the model
num_labels = len(label_encoder.classes_)
model = DistilBertForGenreClassification(num_labels=num_labels)
model.bert.resize_token_embeddings(len(tokenizer))
# model = model.to(device)

# Check if the model's parameters are on the GPU
#for param in model.parameters():
    #print(param.device)
    
    
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    num_train_epochs=10,  # Reduced for faster execution
    # per_device_train_batch_size=4,  # Adjust batch size as needed
    # per_device_eval_batch_size=4,
    weight_decay=0.1,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    greater_is_better=True,
    logging_dir='./logs',
    logging_steps=10,
    report_to=[],
)

# Define compute_metrics function for evaluation
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy = (predictions == labels).mean()
    return {'accuracy': accuracy}

# Data collator
data_collator = DataCollatorWithPadding(tokenizer)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Train the model
trainer.train()


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2127,0.906176,0.670398
2,0.6426,0.894131,0.690299
3,0.5204,0.915814,0.715174
4,0.2584,1.261523,0.696517
5,0.3526,1.49835,0.707711


  state_dict = torch.load(best_model_path, map_location="cpu")


TrainOutput(global_step=4020, training_loss=0.5522524178250512, metrics={'train_runtime': 933.9738, 'train_samples_per_second': 68.867, 'train_steps_per_second': 8.608, 'total_flos': 0.0, 'train_loss': 0.5522524178250512, 'epoch': 5.0})

**7. Evaluate the Model on Test Data**

In [31]:
# Evaluate the model's performance on the test data
metrics = trainer.evaluate(test_dataset)
print(metrics)


{'eval_loss': 0.8825925588607788, 'eval_accuracy': 0.7329192546583851, 'eval_runtime': 7.2712, 'eval_samples_per_second': 110.711, 'eval_steps_per_second': 13.89, 'epoch': 5.0}


**8. Make Predictions on the Test Set**

In [25]:
def test_sample(test_dataset, model):
    # Set model to evaluation mode
    model.eval()
    model.to(device)  # Move model to device
    
    # Use DataLoader for batching
    from torch.utils.data import DataLoader
    test_dataloader = DataLoader(test_dataset, batch_size=4)
    
    predictions = []
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)        # Move input_ids to device
            attention_mask = batch['attention_mask'].to(device)  # Move attention_mask to device
            
            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs['logits']
            preds = torch.softmax(logits, dim=1)
            
            # Move predictions to CPU and convert to numpy
            predictions.extend(preds.cpu().numpy())
    return np.array(predictions)


9. Interpret Predictions and Calculate Metrics

In [27]:
# Decode predictions to labels
predicted_class_indices = np.argmax(test_predictions, axis=1)
predicted_classes = label_encoder.inverse_transform(predicted_class_indices)

# True labels
true_class_indices = test_dataset['label']
true_classes = label_encoder.inverse_transform(true_class_indices)

# Calculate accuracy and F1 score
from sklearn.metrics import accuracy_score, f1_score

accuracy = accuracy_score(true_classes, predicted_classes)
f1 = f1_score(true_classes, predicted_classes, average='weighted')

print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Test F1 Score: {f1:.4f}")


NameError: name 'test_predictions' is not defined

**10. Make Predictions on the Unlabeled Test Data**

In [None]:
# Prepare the unlabeled test data
unlabeled_test = test.copy()  # Assuming 'test' is your test_no_labels.txt data

# Since we don't have labels, we can use the same process without labels
unlabeled_test_dataset = Dataset.from_pandas(unlabeled_test[['Plot', 'Director']])

# Tokenize the data
unlabeled_test_dataset = unlabeled_test_dataset.map(tokenize_function, batched=True)

# Set the format for PyTorch
unlabeled_test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Get predictions
def test_sample_unlabeled(test_dataset, model):
    # Set model to evaluation mode
    model.eval()
    model.to(device)  # Move model to device
    
    # Use DataLoader for batching
    from torch.utils.data import DataLoader
    test_dataloader = DataLoader(test_dataset, batch_size=4)
    
    predictions = []
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)        # Move input_ids to device
            attention_mask = batch['attention_mask'].to(device)  # Move attention_mask to device
            
            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs['logits']
            preds = torch.softmax(logits, dim=1)
            
            # Move predictions to CPU and convert to numpy
            predictions.extend(preds.cpu().numpy())
    return np.array(predictions)


unlabeled_predictions = test_sample_unlabeled(unlabeled_test_dataset, model)

# Decode predictions to labels
unlabeled_predicted_class_indices = np.argmax(unlabeled_predictions, axis=1)
unlabeled_predicted_classes = label_encoder.inverse_transform(unlabeled_predicted_class_indices)


11. Save the Results

In [None]:
# Write predictions to a text file
with open('results.txt', 'w') as f:
    for title, pred_label in zip(unlabeled_test['Title'], unlabeled_predicted_classes):
        f.write(f"{title}: {pred_label}\n")
