In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for current_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(current_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install torch transformers datasets wandb nltk




In [3]:
import torch
from torch import nn
from transformers import RobertaModel, RobertaTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import wandb


2024-08-04 06:56:38.085519: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-04 06:56:38.085628: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-04 06:56:38.202758: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
def print_gpu_info():
    """Print a short hardware report for CUDA device 0.

    When CUDA is available, prints the device name, its total memory in
    decimal gigabytes (bytes / 1e9) and the CUDA version PyTorch was built
    against. Otherwise prints a single notice that the CPU will be used.

    Returns:
        None. The function only writes to stdout.
    """
    if not torch.cuda.is_available():
        print("No GPU available. Using CPU.")
        return

    device_index = 0
    device_name = torch.cuda.get_device_name(device_index)
    total_bytes = torch.cuda.get_device_properties(device_index).total_memory
    total_gb = total_bytes / 1e9  # bytes -> GB

    report_lines = (
        f"GPU: {device_name}",
        f"Total GPU Memory: {total_gb:.2f} GB",
        f"CUDA Version: {torch.version.cuda}",
    )
    for report_line in report_lines:
        print(report_line)
    

In [5]:
# Report which accelerator (if any) this session has before building the model.
print_gpu_info()

GPU: Tesla P100-PCIE-16GB
Total GPU Memory: 17.06 GB
CUDA Version: 12.1


In [6]:
# Never embed the W&B API key in the notebook: anything committed here is published
# together with the notebook and its outputs. Set WANDB_API_KEY in the environment
# before running (on Kaggle, load it from a notebook secret). When the variable is
# unset, key=None lets wandb fall back to its own credential lookup / login prompt.
# SECURITY: the key that used to be hard-coded on this line is exposed and should be revoked.
# (`os` is imported in the first cell of this notebook.)
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [7]:
import torch
from torch import nn
from transformers import RobertaModel, RobertaTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import nltk
from nltk.corpus import stopwords
import wandb

print("Starting the script...")

# Fetch the NLTK stop-word corpus (quiet=True suppresses the download log) and keep
# the English list as a set; remove_stop_words() does membership tests against it.
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))
print("Stop words loaded.")

class SentimentModel(nn.Module):
    """Sequence classifier: an encoder followed by a two-layer feed-forward head.

    The hidden state of the first token of each sequence is passed through
    ``fc1`` (ReLU) and ``fc2`` to produce per-class logits.

    Args:
        roberta_model: Encoder module. It is called with ``input_ids`` and
            ``attention_mask`` keyword arguments and must return an object
            exposing ``last_hidden_state``.
        hidden_size: Width of the encoder's hidden states (input size of ``fc1``).
        num_classes: Number of output classes (output size of ``fc2``).
    """

    def __init__(self, roberta_model, hidden_size=768, num_classes=2):
        super().__init__()
        self.roberta = roberta_model
        # Remembered so the loss reshapes logits to the configured class count
        # instead of assuming a binary problem.
        self.num_classes = num_classes
        # The first-token slice taken in forward() is already 2-D, so this layer is
        # a no-op there; it is kept so the module's attributes stay unchanged.
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(hidden_size, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, input_ids, attention_mask, labels=None):
        """Compute logits and, when labels are supplied, the cross-entropy loss.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            labels: Optional class indices; when given, the loss is computed.

        Returns:
            ``logits`` alone when ``labels`` is None, otherwise the tuple
            ``(loss, logits)``.
        """
        hidden_states = self.roberta(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        # First-token representation (`<s>` for RoBERTa, the counterpart of BERT's [CLS]).
        first_token = self.flatten(hidden_states[:, 0, :])
        logits = self.fc2(torch.relu(self.fc1(first_token)))

        if labels is None:
            return logits

        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.num_classes), labels.view(-1))
        return (loss, logits)

print("Model class defined.")

# Load the IMDB dataset by its Hugging Face Hub id. The recorded run output shows
# three splits being generated: train, test and unsupervised.
print("Loading IMDB dataset...")
imdb_dataset = load_dataset("imdb")
print("IMDB dataset loaded.")

# Initialize the RoBERTa tokenizer and encoder from the same 'roberta-base' checkpoint.
# The run log warns that the pooler weights are newly initialized; SentimentModel
# reads only `last_hidden_state`, so the pooler output is not used by this script.
print("Initializing RoBERTa tokenizer and model...")
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaModel.from_pretrained('roberta-base')
print("RoBERTa tokenizer and model initialized.")

# Function to remove stop words
def remove_stop_words(text):
    """Drop stop words from ``text``.

    The text is split on whitespace; a token is discarded when its lower-cased
    form is in the module-level ``stop_words`` set. Surviving tokens keep their
    original casing and are re-joined with single spaces.

    Args:
        text: Input string.

    Returns:
        The filtered string.
    """
    kept_tokens = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Preprocess and tokenize the dataset
def preprocess_and_tokenize(examples):
    """Strip stop words from a batch of texts and tokenize the result.

    Args:
        examples: Batch mapping whose ``'text'`` entry is an iterable of strings.
            The entry is overwritten in place with the stop-word-free texts.

    Returns:
        The output of the module-level ``tokenizer`` for the cleaned texts,
        padded/truncated to exactly 512 tokens per example.
    """
    # Step 1: stop-word removal, written back onto the batch.
    examples['text'] = list(map(remove_stop_words, examples['text']))
    # Step 2: fixed-length tokenization.
    tokenizer_options = dict(padding="max_length", truncation=True, max_length=512)
    return tokenizer(examples["text"], **tokenizer_options)

# Apply preprocess_and_tokenize to the dataset in batches.
# NOTE(review): the map runs over every split of imdb_dataset; the recorded output
# shows a third 50000-example pass (the unsupervised split), which the Trainer
# set up later in this script never uses.
print("Preprocessing and tokenizing dataset...")
tokenized_datasets = imdb_dataset.map(preprocess_and_tokenize, batched=True)
print("Dataset preprocessed and tokenized.")

# Wrap the pretrained encoder with the classification head, using the class
# defaults (hidden_size=768, num_classes=2).
model = SentimentModel(roberta_model)
print("SentimentModel initialized.")

# Define the compute_metrics function
def compute_metrics(eval_pred):
    """Score predictions for the Trainer's evaluation loop.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class for each
            row is the index of the largest logit along the last axis.

    Returns:
        Dict with ``accuracy``, ``f1``, ``precision`` and ``recall``; the last
        three are computed with ``average='binary'``.
    """
    raw_logits, gold_labels = eval_pred
    predicted = torch.tensor(raw_logits).argmax(dim=-1)

    prf = precision_recall_fscore_support(gold_labels, predicted, average='binary')
    precision_value, recall_value, f1_value = prf[0], prf[1], prf[2]
    accuracy_value = accuracy_score(gold_labels, predicted)

    return dict(
        accuracy=accuracy_value,
        f1=f1_value,
        precision=precision_value,
        recall=recall_value,
    )

# Initialize wandb
# Start a W&B run in the "imdb-sentiment-analysis" project.
# NOTE(review): the run name says "gru", but the SentimentModel defined in this
# script has no GRU layer (only two Linear layers on top of the encoder) —
# confirm the name is intentional.
print("Initializing wandb...")
wandb.init(project="imdb-sentiment-analysis", name="roberta-gru-run")
print("wandb initialized.")

# Set up training arguments
print("Setting up training arguments...")
training_args = TrainingArguments(
    output_dir='./results',  # checkpoints are written here
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,  # learning-rate warmup over the first 500 optimizer steps
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,  # log training metrics every 10 steps
    eval_strategy="epoch",  # evaluate once per epoch ...
    save_strategy="epoch",  # ... and checkpoint on the same schedule
    # Reload the best checkpoint when training ends. No metric_for_best_model is
    # given, so per the Trainer docs selection defaults to evaluation loss; this is
    # consistent with the recorded final evaluation matching epoch 2 (lowest
    # validation loss) rather than epoch 3 (highest accuracy).
    load_best_model_at_end=True,
    report_to="wandb",  # send training/eval logs to Weights & Biases
    run_name="roberta-gru-run"  # same run name as passed to wandb.init
)
print("Training arguments set up.")

# Initialize the Trainer
# Build the Trainer around the custom SentimentModel and the tokenized splits.
# NOTE(review): eval_dataset is the "test" split and load_best_model_at_end=True
# picks the checkpoint by its evaluation score, so the checkpoint is selected on
# the same data the final metrics are reported on. Consider holding out a
# validation split from "train" for model selection.
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,  # adds accuracy/F1/precision/recall to each evaluation
)
print("Trainer initialized.")

# Train the model
# Run the full training loop (per-epoch evaluation and checkpointing as configured above).
print("Starting training...")
trainer.train()
print("Training completed.")

# Evaluate the model held by the trainer after training on the eval dataset and
# print the resulting metrics dict.
print("Evaluating the model...")
eval_results = trainer.evaluate()
print("Evaluation results:")
print(eval_results)

# Close the W&B run that wandb.init() opened earlier in this script.
print("Ending wandb run...")
wandb.finish()
print("Script completed.")

Starting the script...
Stop words loaded.
Model class defined.
Loading IMDB dataset...


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

IMDB dataset loaded.
Initializing RoBERTa tokenizer and model...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RoBERTa tokenizer and model initialized.
Preprocessing and tokenizing dataset...


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Currently logged in as: [33maravinthakshan[0m ([33maravinthakshan-manipal-institute-of-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin


Dataset preprocessed and tokenized.
SentimentModel initialized.
Initializing wandb...


[34m[1mwandb[0m: wandb version 0.17.5 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.17.4
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240804_070042-j67hw4ni[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mroberta-gru-run[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/aravinthakshan-manipal-institute-of-technology/imdb-sentiment-analysis[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/aravinthakshan-manipal-institute-of-technology/imdb-sentiment-analysis/runs/j67hw4ni[0m


wandb initialized.
Setting up training arguments...
Training arguments set up.
Initializing Trainer...
Trainer initialized.
Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2544,0.268512,0.90788,0.906735,0.91815,0.8956
2,0.2339,0.218014,0.92568,0.923564,0.950627,0.898
3,0.1102,0.254984,0.93128,0.930882,0.936306,0.92552


Training completed.
Evaluating the model...


Evaluation results:
{'eval_loss': 0.2180139273405075, 'eval_accuracy': 0.92568, 'eval_f1': 0.9235642586802699, 'eval_precision': 0.9506266937669376, 'eval_recall': 0.898, 'eval_runtime': 467.5954, 'eval_samples_per_second': 53.465, 'eval_steps_per_second': 0.836, 'epoch': 3.0}
Ending wandb run...


[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:           eval/accuracy ▁▆█▆
[34m[1mwandb[0m:                 eval/f1 ▁▆█▆
[34m[1mwandb[0m:               eval/loss █▁▆▁
[34m[1mwandb[0m:          eval/precision ▁█▅█
[34m[1mwandb[0m:             eval/recall ▁▂█▂
[34m[1mwandb[0m:            eval/runtime ▁▅▃█
[34m[1mwandb[0m: eval/samples_per_second █▃▆▁
[34m[1mwandb[0m:   eval/steps_per_second ▁▁▁▁
[34m[1mwandb[0m:             train/epoch ▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
[34m[1mwandb[0m:       train/global_step ▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
[34m[1mwandb[0m:         train/grad_norm ▁▄▂▁▁▁▂▁▁▂▂▂▂▁▁▁▁▄▂▁▂▂▁▂▁▂▁▁▂▂▂█▁▁▁▂▁▁▂▁
[34m[1mwandb[0m:     train/learning_rate ▂▃▄▆████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
[34m[1mwandb[0m:              train/loss █▆▅▄▄▄▄▅▅▄▅▃▅▃▃▅▂▃▂▄▂▃▄▃▂▂▃▁▂▂▃▁▁▂▃▂▃▂▂▂
[34m[1mwandb[0m: 
[34m[1

Script completed.
