<a href="https://colab.research.google.com/github/bhanudeergasi/NullClass_Data_science_internship/blob/main/model_training1_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


# # Setup and Imports


In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"


import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
import numpy as np


In [None]:
project_path = '/content/drive/MyDrive/TASK1'  # Change if your folder is named differently

 Data Loading and Initial Preprocessing


In [None]:

# Load the dataset from CSV
# Use on_bad_lines='skip' to handle potential parsing errors and quoting=3 for QUOTE_NONE
df = pd.read_csv(f'{project_path}/IMDB Dataset.csv', on_bad_lines='skip', quoting=3)


# Drop empty/null reviews and empty strings first from the full dataset
df = df[df['review'].notnull()]
df = df[df['review'].astype(str).str.strip() != '']

# Encode the sentiment labels for the cleaned full dataframe
df['sentiment'] = pd.to_numeric(df['sentiment'].map({'positive': 1, 'negative': 0}), errors='coerce').astype('Int64').fillna(0)

# Separate positive and negative reviews
positive_df = df[df['sentiment'] == 1]
negative_df = df[df['sentiment'] == 0]

# Sample from the larger class to match the size of the smaller class, or sample a fixed number from each
# Let's aim for a balanced sample, e.g., 500 positive and 500 negative, totaling 1000
sample_size_per_class = 500 # Define sample size per class
if len(positive_df) >= sample_size_per_class and len(negative_df) >= sample_size_per_class:
    sampled_positive = positive_df.sample(n=sample_size_per_class, random_state=42)
    sampled_negative = negative_df.sample(n=sample_size_per_class, random_state=42)
    sampled_df = pd.concat([sampled_positive, sampled_negative]).sample(frac=1, random_state=42).reset_index(drop=True)
else:
    # Handle case where one class has fewer than sample_size_per_class instances
    print("Warning: One class has fewer instances than the desired sample size per class. Using all available instances from the smaller class.")
    min_class_size = min(len(positive_df), len(negative_df))
    sampled_positive = positive_df.sample(n=min_class_size, random_state=42)
    sampled_negative = negative_df.sample(n=min_class_size, random_state=42)
    sampled_df = pd.concat([sampled_positive, sampled_negative]).sample(frac=1, random_state=42).reset_index(drop=True)


# Print value counts for the sampled and cleaned dataframe
print("Sentiment distribution in sampled and cleaned dataset (Balanced):")
print(sampled_df['sentiment'].value_counts())


Sentiment distribution in sampled and cleaned dataset (Balanced):
sentiment
0    190
1    190
Name: count, dtype: Int64


Tokenization and Dataset Creation


In [None]:

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the reviews from the sampled DataFrame
tokens = tokenizer(
    sampled_df['review'].tolist(), # Use review column from sampled_df
    padding=True,
    truncation=True,
    max_length=128, # Keeping max_length consistent for now
    return_tensors='pt' # Return PyTorch tensors
)

# Get labels from the sampled_df as a PyTorch tensor
# The sentiment column is now guaranteed to have integer values (0 or 1) after balancing
labels = torch.tensor(sampled_df['sentiment'].values.astype('int64'))

# Print shapes to confirm data is ready
print("Shape of input_ids tensor:", tokens['input_ids'].shape)
print("Shape of attention_mask tensor:", tokens['attention_mask'].shape)
print("Shape of labels tensor:", labels.shape)


# Create a custom Dataset class
class IMDBDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Get a single item (input_ids, attention_mask)
        item = {key: val[idx] for key, val in self.encodings.items()}
        # Get the corresponding label and convert to torch.long
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        # The number of items in the dataset is the number of labels
        return len(self.labels)





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Shape of input_ids tensor: torch.Size([380, 128])
Shape of attention_mask tensor: torch.Size([380, 128])
Shape of labels tensor: torch.Size([380])


Train Test split (Split tokens and label)


In [None]:

# Split based on indices to ensure corresponding inputs and masks are split together
import numpy as np # Import numpy if not already imported

# Get indices for splitting
train_idx, val_idx = train_test_split(np.arange(len(labels)), test_size=0.2, random_state=42, stratify=labels) # --- Fix 2: Add stratify=labels for stratified split ---

# Use indices to create training and validation datasets
train_dataset = IMDBDataset(
    {key: tokens[key][train_idx] for key in tokens.keys()}, # Select token data using train_idx
    labels[train_idx] # Select labels using train_idx
)
val_dataset = IMDBDataset(
    {key: tokens[key][val_idx] for key in tokens.keys()}, # Select token data using val_idx
    labels[val_idx] # Select labels using val_idx
)


# Print dataset sizes to verify the split
print("Training dataset size:", len(train_dataset))
print("Validation dataset size:", len(val_dataset))

# Print sentiment distribution in train and validation sets after stratified split
print("\nSentiment distribution in training dataset:")
train_labels_list = [train_dataset[i]['labels'].item() for i in range(len(train_dataset))]
print(pd.Series(train_labels_list).value_counts())

print("\nSentiment distribution in validation dataset:")
val_labels_list = [val_dataset[i]['labels'].item() for i in range(len(val_dataset))]
print(pd.Series(val_labels_list).value_counts())


Training dataset size: 304
Validation dataset size: 76

Sentiment distribution in training dataset:
1    152
0    152
Name: count, dtype: int64

Sentiment distribution in validation dataset:
1    38
0    38
Name: count, dtype: int64


  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


Load the BERT model for sequence classification



In [None]:

# Explicitly set problem_type for binary classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2, problem_type="single_label_classification")


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(train_labels_list),
                                     y=train_labels_list)
weights = torch.tensor(class_weights, dtype=torch.float)
print("\nComputed Class Weights:", weights)

# If running on GPU, move weights to GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
weights = weights.to(device)
model.to(device) # Move model to device




Computed Class Weights: tensor([1., 1.])


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

Define Training Arguments


In [None]:

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',  # Directory to save results and checkpoints
    num_train_epochs=5,  # --- Fix 4: Increased number of training epochs ---
    per_device_train_batch_size=8,  # Batch size per GPU/CPU for training
    per_device_eval_batch_size=8,  # Batch size per GPU/CPU for evaluation
    warmup_steps=500,  # Number of steps for learning rate warmup
    weight_decay=0.01,  # Strength of weight decay
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=10,  # Log training metrics every 10 steps
    eval_strategy='epoch', # Evaluate the model at the end of each epoch
    save_strategy='epoch', # Save a checkpoint at the end of each epoch
    load_best_model_at_end=True, # Load the best model (based on eval_loss) at the end of training
    metric_for_best_model='eval_loss', # Metric to monitor for determining the best model
    greater_is_better=False # For 'eval_loss', smaller is better

)


trainer = Trainer(
    model=model,  # The BERT model to train
    args=training_args,  # The training arguments defined above
    train_dataset=train_dataset,  # The dataset for training
    eval_dataset=val_dataset,  # The dataset for validation (evaluation during training)
    tokenizer=tokenizer,  # The tokenizer (used by the Trainer for potential tokenization if needed, though we pre-tokenized)

)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [None]:
# Train the model
print("Starting model training...")
trainer.train()
print("Training finished.")



Starting model training...


  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


Epoch,Training Loss,Validation Loss
1,0.6581,0.534939
2,0.4922,0.330041
3,0.3922,0.293305


  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


Epoch,Training Loss,Validation Loss
1,0.6581,0.534939
2,0.4922,0.330041
3,0.3922,0.293305
4,0.2761,0.297116
5,0.2033,0.325191


  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


Training finished.


Saving thr trained Model and tokenzer

In [None]:
# Save the trained model and tokenizer
print("Saving model and tokenizer...")

model.save_pretrained(f'{project_path}/imdb_model')
tokenizer.save_pretrained(f'{project_path}/imdb_model')

print("Model and tokenizer saved to ./imdb_model")

# Evaluate the model on the validation dataset
print("\nEvaluating the model on the validation set...")
predictions = trainer.predict(val_dataset)


Saving model and tokenizer...
Model and tokenizer saved to ./imdb_model

Evaluating the model on the validation set...


  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


In [None]:
# Get the predicted class IDs (index of the highest logit)
pred_labels = predictions.predictions.argmax(-1)

# Get the true labels from the validation dataset
# We iterate through the val_dataset to get the original labels
# val_labels_list was already created after stratified split
true_labels = val_labels_list



Printing the evaluation metrics

In [None]:
# Print evaluation metrics
print("\nEvaluation Metrics:")
print(confusion_matrix(true_labels, pred_labels))
print(classification_report(true_labels, pred_labels, digits=4)) # Use digits for more precision



Evaluation Metrics:
[[30  8]
 [ 0 38]]
              precision    recall  f1-score   support

           0     1.0000    0.7895    0.8824        38
           1     0.8261    1.0000    0.9048        38

    accuracy                         0.8947        76
   macro avg     0.9130    0.8947    0.8936        76
weighted avg     0.9130    0.8947    0.8936        76



In [None]:
# Print individual metrics
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
print("accuracy score:", accuracy_score(true_labels, pred_labels))
print("precision score:", precision_score(true_labels, pred_labels))
print("recall score:", recall_score(true_labels, pred_labels))
print("f1_score:", f1_score(true_labels, pred_labels))


accuracy score: 0.8947368421052632
precision score: 0.8260869565217391
recall score: 1.0
f1_score: 0.9047619047619048


In [None]:
# Check the sentiment distribution of the original sampled and cleaned data again (optional)
print("\nSentiment distribution in sampled and cleaned dataset:")
print(sampled_df['sentiment'].value_counts())



Sentiment distribution in sampled and cleaned dataset:
sentiment
0    190
1    190
Name: count, dtype: Int64
