In [16]:
import torch
import pandas as pd
from tqdm.notebook import tqdm

In [17]:
import torch

print("CUDA available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU found")

CUDA available: True
GPU name: Quadro M1200


In [18]:
df = pd.read_csv('https://raw.githubusercontent.com/campusx-team/Text-Datasets/refs/heads/main/Reddit_Data.csv')
df.dropna(inplace=True)
df.index.name = 'id'

In [19]:
df.rename(columns={'clean_comment':'text'},inplace=True)

In [20]:
df.head()

Unnamed: 0_level_0,text,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [21]:
df.text.iloc[0]

' family mormon have never tried explain them they still stare puzzled from time time like some kind strange creature nonetheless they have come admire for the patience calmness equanimity acceptance and compassion have developed all the things buddhism teaches '

In [22]:
df.category.value_counts()

category
 1    15830
 0    13042
-1     8277
Name: count, dtype: int64

In [23]:
df.category.value_counts()

category
 1    15830
 0    13042
-1     8277
Name: count, dtype: int64

In [24]:
possible_labels = df.category.unique()

In [25]:
possible_labels.sort()

In [26]:
# -1:0, 0:1, 1:2
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

In [27]:
df['label'] = df.category.replace(label_dict)

In [28]:
df.head()

Unnamed: 0_level_0,text,category,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,family mormon have never tried explain them t...,1,2
1,buddhism has very much lot compatible with chr...,1,2
2,seriously don say thing first all they won get...,-1,0
3,what you have learned yours and only yours wha...,0,1
4,for your own benefit you may want read living ...,1,2


In [29]:
df['label'].value_counts()

label
2    15830
1    13042
0     8277
Name: count, dtype: int64

# Training/Validation Split

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
X_train, X_val, y_train, y_val = train_test_split(df.index.values, df.label.values, test_size = 0.15, random_state=17, stratify = df.label.values)

In [32]:
df['data_type'] = ['not_set']*df.shape[0]

In [33]:
df.head()

Unnamed: 0_level_0,text,category,label,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,family mormon have never tried explain them t...,1,2,not_set
1,buddhism has very much lot compatible with chr...,1,2,not_set
2,seriously don say thing first all they won get...,-1,0,not_set
3,what you have learned yours and only yours wha...,0,1,not_set
4,for your own benefit you may want read living ...,1,2,not_set


In [34]:
import sys
print(sys.executable)

c:\Users\saeed\AppData\Local\Programs\Python\Python39\python.exe


In [35]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [44]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [46]:
print("Number of training samples:", df[df.data_type == 'train'].shape[0])
print("Columns in DataFrame:", df.columns.tolist())

Number of training samples: 0
Columns in DataFrame: ['text', 'category', 'label', 'data_type']


In [47]:
print("Unique values in 'data_type':", df['data_type'].unique())

Unique values in 'data_type': ['not_set']


In [48]:
from sklearn.model_selection import train_test_split

# Split data (e.g., 80% train, 20% test)
train_texts, test_texts = train_test_split(df['text'].tolist(), test_size=0.2, random_state=42)

In [50]:
# Encode training data
encoded_data_train = tokenizer.batch_encode_plus(
    train_texts,  # Use the manually split training data
    add_special_tokens=True,
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

In [51]:
# encoded_data_train = tokenizer.batch_encode_plus(df[df.data_type=='train'].text.values, add_special_tokens=True,return_attention_mask=True,pad_to_max_length=True,max_length=256,return_tensors='pt')

In [53]:
from sklearn.model_selection import train_test_split

# First split: 80% train, 20% temp (val + test)
train_texts, temp_texts = train_test_split(df['text'].tolist(), test_size=0.2, random_state=42)

# Second split: 50% val, 50% test (10% each of total)
val_texts, test_texts = train_test_split(temp_texts, test_size=0.5, random_state=42)

# Encode validation data
encoded_data_val = tokenizer.batch_encode_plus(
    val_texts,
    add_special_tokens=True,
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

In [54]:
# encoded_data_val= tokenizer.batch_encode_plus(df[df.data_type=='val'].text.values, add_special_tokens=True,return_attention_mask=True,pad_to_max_length=True,max_length=256,return_tensors='pt')

In [55]:
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

In [57]:
input_ids_val = encoded_data_val['input_ids']
attention_masks_val= encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

In [62]:
print(f"Total samples: {len(df)}")
print(f"Train texts: {len(train_texts)}")
print(f"Val texts: {len(val_texts)}")
print(f"Test texts: {len(test_texts)}")
print(f"Original labels: {df['label'].shape}")

Total samples: 37149
Train texts: 29719
Val texts: 3715
Test texts: 3715
Original labels: (37149,)


In [63]:
# Get all labels as numpy array first
all_labels = df['label'].values

# Split labels to exactly match text splits
train_labels, temp_labels = train_test_split(all_labels, test_size=0.2, random_state=42)
val_labels, test_labels = train_test_split(temp_labels, test_size=0.5, random_state=42)

# Convert to tensors
labels_train = torch.tensor(train_labels)
labels_val = torch.tensor(val_labels)
labels_test = torch.tensor(test_labels)

In [65]:
# First encode the test texts (we already did train and val)
encoded_data_test = tokenizer.batch_encode_plus(
    test_texts,
    add_special_tokens=True,
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

# Extract test tensors
input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']

In [66]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

print(f"Train dataset size: {len(dataset_train)}")
print(f"Val dataset size: {len(dataset_val)}")
print(f"Test dataset size: {len(dataset_test)}")

Train dataset size: 29719
Val dataset size: 3715
Test dataset size: 3715


In [67]:
len(dataset_train)

29719

In [68]:
len(dataset_val)

3715

#  Setting up BERT Pretrained Model

In [69]:
from transformers import BertForSequenceClassification

In [70]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = len(label_dict), output_attentions=False,output_hidden_states=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [71]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [72]:
batch_size = 16 #32

In [73]:
dataloader_train = DataLoader(dataset_train,sampler=RandomSampler(dataset_train),batch_size=batch_size)

In [74]:
dataloader_val = DataLoader(dataset_val,sampler=RandomSampler(dataset_val),batch_size=32)

In [75]:
import sys
print(sys.executable)
print(sys.version)

c:\Users\saeed\AppData\Local\Programs\Python\Python39\python.exe
3.9.4 (tags/v3.9.4:1f2e308, Apr  6 2021, 13:40:21) [MSC v.1928 64 bit (AMD64)]


In [79]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup


In [80]:
import torch
import transformers
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")

PyTorch version: 2.5.1+cu124
Transformers version: 4.47.1


In [81]:
optimizer = AdamW(model.parameters(),lr=1e-5, #2e-5 > 5e-5
                 eps=1e-8)

In [82]:
epochs=10

scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=len(dataloader_train)*epochs)

# Defining our Performance Metrics

In [84]:
import numpy as np
from sklearn.metrics import f1_score

In [85]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

In [86]:
def accuracy_per_class(preds, labels):
    labels_dict_inverse = {v: k for k,v in label_dict.items()}

    preds_flat = np.argmax(preds,axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {labels_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

# Creating our Training Loop

In [87]:
import random

seed_val = 8
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [88]:
import torch
print(torch.__version__)  # Check PyTorch version
print(torch.version.cuda)  # Check the CUDA version PyTorch is using

2.5.1+cu124
12.4


In [89]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [90]:
def evaluate(dataloader_val):

    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader_val:

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total/len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals


In [91]:
import torch
import os
from sklearn.metrics import accuracy_score


# Define checkpoint directory
checkpoint_dir = '/kaggle/working/checkpoints'
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

# Initialize variables to track the best validation loss
best_val_loss = float('inf')

for epoch in tqdm(range(1, epochs + 1)):

    model.train()

    loss_train_total = 0
    correct_predictions = 0  # To keep track of correct predictions
    total_predictions = 0     # To keep track of total predictions

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)

    for batch in progress_bar:

        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }

        outputs = model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()

        # Get predictions
        preds = torch.argmax(outputs[1], dim=1)  # Assuming the logits are in outputs[1]

        # Update correct and total counts
        correct_predictions += (preds == batch[2]).sum().item()
        total_predictions += batch[2].size(0)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch))})

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total / len(dataloader_train)
    training_accuracy = correct_predictions / total_predictions  # Calculate training accuracy

    tqdm.write(f'Training loss: {loss_train_avg}')
    tqdm.write(f'Training Accuracy: {training_accuracy:.3f}')  # Print training accuracy

    # Evaluate on the validation set
    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)

    # Calculate validation accuracy
    predictions = np.argmax(predictions, axis=1).flatten()
    val_accuracy = accuracy_score(true_vals.flatten(), predictions)

    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')
    tqdm.write(f'Validation Accuracy: {val_accuracy:.3f}')

    # Save checkpoint if the validation loss improved
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch}.pth')
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'loss': val_loss
        }, checkpoint_path)
        tqdm.write(f'Checkpoint saved at {checkpoint_path}')

# Save the final model after training
final_model_path = './final_model.pth'
torch.save(model.state_dict(), final_model_path)
tqdm.write(f'Final model saved at {final_model_path}')


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/1858 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [92]:
import torch
import os
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import numpy as np

# Check GPU availability and set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
print(f"GPU name: {torch.cuda.get_device_name(0)}") if torch.cuda.is_available() else print("No GPU available")

# Define checkpoint directory
checkpoint_dir = '/kaggle/working/checkpoints'
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

# Initialize variables to track the best validation loss
best_val_loss = float('inf')

# Move model to GPU
model = model.to(device)

for epoch in tqdm(range(1, epochs + 1)):
    model.train()
    loss_train_total = 0
    correct_predictions = 0
    total_predictions = 0

    progress_bar = tqdm(dataloader_train, desc=f'Epoch {epoch}', leave=False)

    for batch in progress_bar:
        model.zero_grad()
        
        # Move batch tensors to GPU
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }

        outputs = model(**inputs)
        loss = outputs[0]
        loss_train_total += loss.item()

        # Get predictions
        logits = outputs[1] if isinstance(outputs, tuple) else outputs.logits
        preds = torch.argmax(logits, dim=1)
        
        correct_predictions += (preds == batch[2]).sum().item()
        total_predictions += batch[2].size(0)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({'training_loss': f'{loss.item()/len(batch):.3f}'})

    # Training metrics
    loss_train_avg = loss_train_total / len(dataloader_train)
    training_accuracy = correct_predictions / total_predictions
    
    print(f'\nEpoch {epoch}')
    print(f'Training loss: {loss_train_avg}')
    print(f'Training Accuracy: {training_accuracy:.3f}')

    # Validation
    model.eval()
    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)
    predictions = np.argmax(predictions, axis=1).flatten()
    val_accuracy = accuracy_score(true_vals.flatten(), predictions)

    print(f'Validation loss: {val_loss}')
    print(f'F1 Score (weighted): {val_f1}')
    print(f'Validation Accuracy: {val_accuracy:.3f}')

    # Save checkpoint if improved
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch}.pth')
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'loss': val_loss
        }, checkpoint_path)
        print(f'Checkpoint saved at {checkpoint_path}')

# Final model save
final_model_path = './final_model.pth'
torch.save(model.state_dict(), final_model_path)
print(f'Final model saved at {final_model_path}')

Using device: cuda
GPU name: Quadro M1200


  0%|          | 0/10 [06:27<?, ?it/s]


KeyboardInterrupt: 