<a href="https://colab.research.google.com/github/abdullahzunorain/Sentiment140_DistilBERT_FineTuning/blob/main/Sentiment140_DistilBERT_FineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#  Step 1: Install necessary libraries
!pip install transformers
!pip install datasets
!pip install torch torchvision torchaudio  # PyTorch
!pip install nltk
!pip install pandas
!pip install matplotlib
!pip install seaborn
!pip install scikit-learn
!pip install tqdm



In [19]:
# Step 2: Import libraries
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

In [20]:
# Step 3: Load the Sentiment140 dataset
# Fetched by name through `datasets.load_dataset`; the result is indexed by
# split name ('train' / 'test') further down.
dataset = load_dataset(path='sentiment140')

In [21]:
# Step 4: Preprocess the dataset
# 'text' and 'sentiment' are the columns of interest; from here on the
# sentiment column is exposed under the name 'target'.
dataset = dataset.rename_column(
    original_column_name="sentiment",
    new_column_name="target",
)

In [22]:
# Step 5: Map the sentiment values (0 for negative, 4 for positive) to 0 and 1
# Sentiment140 additionally uses 2 for neutral tweets (they occur in the test
# split). Drop those rows first: otherwise the `else 1` branch below would
# silently relabel every neutral tweet as positive and skew evaluation.
# `input_columns` hands the lambda only the 'target' value, so the filter does
# not have to materialise the other columns.
dataset = dataset.filter(lambda target: target != 2, input_columns='target')

# Re-running this cell is harmless: once the values are 0/1 the filter keeps
# every row and the mapping leaves them unchanged.
dataset = dataset.map(lambda x: {'target': 0 if x['target'] == 0 else 1})

In [24]:
# # Step 4 & 5: Preprocess and map sentiment values
# dataset = dataset.rename_column("sentiment", "labels")  # rename target column to 'labels' for Trainer compatibility
# dataset = dataset.map(lambda x: {'labels': 0 if x['labels'] == 0 else 1})  # ensure binary labels (0 and 1)

# # Step 6: Tokenize the texts
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# def tokenize_function(examples):
#     return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

# tokenized_datasets = dataset.map(tokenize_function, batched=True)

# # Step 7: Set format for PyTorch
# tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# # Step 8: Split the dataset into train and test
# train_dataset = tokenized_datasets['train']
# test_dataset = tokenized_datasets['test']

# # Continue with Step 9 and beyond as before
# # Steps 9 to 12 are unchanged


In [25]:
# Step 6: Tokenize the texts
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    rows come out with the same length.
    """
    return tokenizer(
        examples['text'],
        max_length=128,
        padding='max_length',
        truncation=True,
    )


# batched=True passes many examples per call instead of one row at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)



In [26]:
# Step 7: Set format for PyTorch
# The sequence-classification model only computes a loss when it receives a
# `labels` input, and Trainer forwards columns by name. With the column still
# called 'target' the model never sees its labels, so expose it as 'labels'
# before selecting the columns to return as tensors.
# The membership check keeps this cell safe to re-run after the rename.
if 'target' in tokenized_datasets['train'].column_names:
    tokenized_datasets = tokenized_datasets.rename_column('target', 'labels')

tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [27]:
# Step 8: Split the dataset into train and test
# Pull both splits out of the tokenized dataset in one unpacking step.
train_dataset, test_dataset = (
    tokenized_datasets[split_name] for split_name in ('train', 'test')
)

In [28]:
# Step 9: Initialize the model
# Pre-trained BERT encoder with a freshly initialised two-way classification
# head (the head weights are what the "newly initialized" warning refers to).
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# Step 10: Set training arguments
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=100,
    # Optimisation schedule
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Per-device batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch; the two strategies match so the
    # best checkpoint can be reloaded when training finishes.
    # NOTE(review): newer transformers releases spell the first one
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [40]:
# Step 11: Initialize Trainer
# Wire the model, its training configuration and the two splits together.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

In [61]:
# Step 1: Install necessary libraries
!pip install transformers datasets torch torchvision torchaudio nltk pandas matplotlib seaborn scikit-learn tqdm

# Step 2: Import libraries
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

# Step 3: Load a smaller subset of the Sentiment140 dataset for faster training
dataset = load_dataset('sentiment140')

# Step 4: Preprocess the dataset
# Rename the 'sentiment' column to 'labels' for Trainer compatibility
dataset = dataset.rename_column("sentiment", "labels")

# Step 5: Map the sentiment values (0 for negative, 4 for positive) to 0 and 1
dataset = dataset.map(lambda x: {'labels': 0 if x['labels'] == 0 else 1})

# Step 6: Tokenize the texts using DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Step 7: Set format for PyTorch
tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# Step 8: Take a smaller subset for quicker training
small_train_dataset = tokenized_datasets['train'].shuffle(seed=42).select(range(10000))  # Adjust as needed based on training set size
small_test_dataset = tokenized_datasets['test'].shuffle(seed=42).select(range(min(2000, len(tokenized_datasets['test']))))  # Adjust to avoid out of range


# Step 9: Initialize the smaller DistilBERT model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Step 10: Set optimized training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,                        # Reduced epochs for quicker training
    per_device_train_batch_size=16,            # Increased batch size
    per_device_eval_batch_size=16,
    warmup_steps=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=True,                                 # Enable mixed precision for faster training (if supported)
)

# Step 11: Initialize Trainer with the subset datasets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_test_dataset,
)

# Step 12: Fine-tune the model on the subset data
trainer.train()




Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,0.5262,0.387896


TrainOutput(global_step=625, training_loss=0.46659184341430665, metrics={'train_runtime': 60.5881, 'train_samples_per_second': 165.049, 'train_steps_per_second': 10.316, 'total_flos': 331168496640000.0, 'train_loss': 0.46659184341430665, 'epoch': 1.0})

In [57]:
# # Step 12: Fine-tune the model
# trainer.train()

In [58]:
# # Step 1: Install necessary libraries
# # Install transformers for model support, datasets for dataset handling, torch for PyTorch backend, and other data-related libraries
# !pip install transformers
# !pip install datasets
# !pip install torch torchvision torchaudio  # PyTorch
# !pip install nltk
# !pip install pandas
# !pip install matplotlib
# !pip install seaborn
# !pip install scikit-learn
# !pip install tqdm

# # Step 2: Import libraries
# # Import libraries required for processing and handling datasets, models, tokenization, and training
# import pandas as pd
# import torch
# from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
# from datasets import load_dataset

# # Step 3: Load the Sentiment140 dataset
# # Load the Sentiment140 dataset, a dataset used for sentiment analysis on Twitter data
# dataset = load_dataset('sentiment140')

# # Step 4: Preprocess the dataset
# # Rename the 'sentiment' column to 'labels' for compatibility with the Trainer, as it expects the target variable to be labeled 'labels'
# dataset = dataset.rename_column("sentiment", "labels")

# # Step 5: Map the sentiment values (0 for negative, 4 for positive) to 0 and 1
# # Map the 'labels' values: change 0 (negative) to 0 and 4 (positive) to 1 for binary classification
# dataset = dataset.map(lambda x: {'labels': 0 if x['labels'] == 0 else 1})

# # Step 6: Tokenize the texts
# # Load a pre-trained BERT tokenizer to tokenize the tweets into a format BERT can process
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# # Define a tokenization function that truncates or pads each text to a fixed length of 128 tokens
# def tokenize_function(examples):
#     return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

# # Apply tokenization function to the entire dataset in batches for efficiency
# tokenized_datasets = dataset.map(tokenize_function, batched=True)

# # Step 7: Set format for PyTorch
# # Set format for PyTorch with required columns ('input_ids', 'attention_mask', and 'labels') for compatibility with the Trainer
# tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# # Step 8: Split the dataset into train and test
# # Divide the tokenized dataset into training and testing sets for model evaluation
# train_dataset = tokenized_datasets['train']
# test_dataset = tokenized_datasets['test']

# # Step 9: Initialize the model
# # Load a pre-trained BERT model for sequence classification with 2 output labels (for binary classification)
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# # Step 10: Set training arguments
# # Configure training parameters, including output directory, number of epochs, batch size, and logging settings
# training_args = TrainingArguments(
#     output_dir='./results',              # Directory for saving model results
#     num_train_epochs=1,                  # Number of training epochs
#     per_device_train_batch_size=8,       # Batch size for training
#     per_device_eval_batch_size=8,        # Batch size for evaluation
#     warmup_steps=1,                      # Warmup steps for learning rate scheduler
#     weight_decay=0.01,                   # Weight decay for regularization
#     logging_dir='./logs',                # Directory for logging
#     logging_steps=0.2,                    # Interval for logging
#     evaluation_strategy="epoch",         # Evaluate the model at the end of each epoch
#     save_strategy="epoch",               # Save the model at the end of each epoch
#     load_best_model_at_end=True,         # Load the best model based on evaluation during training
# )

# # Step 11: Initialize Trainer
# # Set up the Trainer, a helper class for training and evaluation, with the model, training arguments, and datasets
# trainer = Trainer(
#     model=model,                         # The model to train
#     args=training_args,                  # The training configuration
#     train_dataset=train_dataset,         # Training dataset
#     eval_dataset=test_dataset,           # Evaluation dataset
# )

# # Step 12: Fine-tune the model
# # Start training the model with the specified training arguments
# trainer.train()


In [62]:
# Step 13: Evaluate the model
# Run evaluation on the Trainer's eval dataset and show the metrics dict as
# the cell output.
eval_metrics = trainer.evaluate()
eval_metrics

{'eval_loss': 0.38789647817611694,
 'eval_runtime': 1.0045,
 'eval_samples_per_second': 495.763,
 'eval_steps_per_second': 31.856,
 'epoch': 1.0}

In [42]:
# # Step 11: Initialize Trainer

# def compute_metrics(pred):
#     """Computes and returns a dictionary of metrics (accuracy, f1, etc.)

#     Args:
#         pred: Predictions from the model

#     Returns:
#         A dictionary containing the computed metrics.
#     """
# # Step 11: Initialize Trainer

# def compute_metrics(pred):
#     """Computes and returns a dictionary of metrics (accuracy, f1, etc.)

#     Args:
#         pred: Predictions from the model

#     Returns:
#         A dictionary containing the computed metrics.
#     """
#     labels = pred.label_ids
#     preds = pred.predictions.argmax(-1)
#     precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
#     acc = accuracy_score(labels, preds)
#     return {
#         'accuracy': acc,
#         'f1': f1,
#         'precision': precision,
#         'recall': recall
#     }

In [43]:
# from torch.utils.data import Dataset

# class YourDataset(Dataset):
#     def __init__(self, encodings, labels):
#         self.encodings = encodings
#         self.labels = labels

#     def __getitem__(self, idx):
#         item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
#         item['labels'] = torch.tensor(self.labels[idx])  # Add the 'labels' key here
#         return item

#     def __len__(self):
#         return len(self.labels)

In [35]:
# # Step 11: Initialize Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset,
#     compute_metrics=compute_metrics, # Add this line
# )


# def compute_metrics(pred):
#     """Computes and returns a dictionary of metrics (accuracy, f1, etc.)

#     Args:
#         pred: Predictions from the model

#     Returns:
#         A dictionary containing the computed metrics.
#     """
#     labels = pred.label_ids
#     preds = pred.predictions.argmax(-1)
#     precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
#     acc = accuracy_score(labels, preds)
#     return {
#         'accuracy': acc,
#         'f1': f1,
#         'precision': precision,
#         'recall': recall
#     }

In [44]:
# from transformers import AutoModelForSequenceClassification

# model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")


In [45]:
# from torch.nn import CrossEntropyLoss

# def compute_loss(model, inputs):
#     outputs = model(**inputs)
#     logits = outputs.logits
#     labels = inputs["labels"]
#     loss_fct = CrossEntropyLoss()
#     loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
#     return loss


In [46]:
# training_args = TrainingArguments(
#     output_dir="your_output_dir",
#     evaluation_strategy="steps",
#     per_device_train_batch_size=4,
#     per_device_eval_batch_size=4,
#     num_train_epochs=3,
#     run_name="custom_run_name"  # Update the run name here
# )


In [63]:
# Step 14: Save the model
# Model weights and tokenizer files go into the same directory so the pair
# can be reloaded together from one path.
output_dir = './sentiment_model'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./sentiment_model/tokenizer_config.json',
 './sentiment_model/special_tokens_map.json',
 './sentiment_model/vocab.txt',
 './sentiment_model/added_tokens.json')