# ***Imports***

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import re


from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import nltk
import csv
from nltk.corpus import stopwords
nltk.download('stopwords')
from tqdm import tqdm

# Training
import torch
import torch.nn as nn
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset
from datasets import Dataset
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import StepLR
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, AdamW, LlamaTokenizer, LlamaModel, AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
#from nlpaug.augmenter.word import SynonymAug, BackTranslationAug
from sklearn.metrics import f1_score, accuracy_score


# Evaluating and plotting 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.auto import tqdm
import math

from transformers import RobertaForMaskedLM, DataCollatorForLanguageModeling
from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import TrainingArguments, Trainer


[nltk_data] Downloading package stopwords to /home/amitfi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [None]:
from nlpaug.augmenter.word import SynonymAug, BackTranslationAug

## ***MLM task***

In [None]:
df = pd.read_csv("reddit_opinion_PSE_ISR.csv",  on_bad_lines='skip') # correct line to load the data 

In [None]:
sub_df_100_000 = df.sample(n=100_000, random_state=209122282)
sub_df_300_000 = df.sample(n=300_000, random_state=209122282)
sub_df_500_000 = df.sample(n=500_000, random_state=209122282)

In [None]:
df['self_text'].iloc[10000]

In [None]:
from datasets import Dataset
from transformers import RobertaTokenizer, RobertaForMaskedLM
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize(examples):
    texts = [text for text in examples['self_text'] if isinstance(text, str)]
    return tokenizer(texts, truncation=True, padding='max_length', max_length=512)

sub_df_100_000 = sub_df_100_000.reset_index(drop=True)  # Reset index and drop the old one
sub_df_100_000 = sub_df_100_000.drop('Unnamed: 0', axis=1)  # Remove the problematic column
dataset_100_000 = Dataset.from_pandas(sub_df_100_000)
tokenized_dataset_100_000 = dataset_100_000.map(tokenize, batched=True, remove_columns=dataset_100_000.column_names)
tokenized_dataset_100_000.save_to_disk('./tokenized_reddit_data_100_000')
print("done 100_000")

sub_df_300_000 = sub_df_300_000.reset_index(drop=True)  # Reset index and drop the old one
sub_df_300_000 = sub_df_300_000.drop('Unnamed: 0', axis=1)  # Remove the problematic column
dataset_300_000 = Dataset.from_pandas(sub_df_300_000)
tokenized_dataset_300_000 = dataset_300_000.map(tokenize, batched=True, remove_columns=dataset_300_000.column_names)
tokenized_dataset_300_000.save_to_disk('./tokenized_reddit_data_300_000')
print("done 300_000")

sub_df_500_000 = sub_df_500_000.reset_index(drop=True)  # Reset index and drop the old one
sub_df_500_000 = sub_df_500_000.drop('Unnamed: 0', axis=1)  # Remove the problematic column
dataset_500_000 = Dataset.from_pandas(sub_df_500_000)
tokenized_dataset_500_000 = dataset_500_000.map(tokenize, batched=True, remove_columns=dataset_500_000.column_names)
tokenized_dataset_500_000.save_to_disk('./tokenized_reddit_data_500_000')
print("done 500_000")

In [103]:
from datasets import load_from_disk
data_set_size = '300_000'
loaded_dataset = load_from_disk(f'./tokenized_reddit_data_{data_set_size}')

In [104]:
tokenized_dataset = loaded_dataset

In [None]:
# Initialize the model
model = RobertaForMaskedLM.from_pretrained('roberta-base')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# Create data collator for MLM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15  # Standard masking probability
)

In [106]:
# Calculate split sizes
total_size = len(tokenized_dataset)
val_size = int(total_size * 0.1)
train_size = total_size - val_size

print(f"Total dataset size: {total_size}")
print(f"Training set size: {train_size}")
print(f"Validation set size: {val_size}")

# Initialize empty lists for train and validation indices
train_indices = []
val_indices = []

# Create batched index assignment with progress bar
batch_size = 10000
num_batches = math.ceil(total_size / batch_size)

print("Splitting dataset into train and validation sets...")
for i in tqdm(range(num_batches)):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, total_size)
    
    # Generate random numbers for this batch
    batch_indices = np.random.rand(end_idx - start_idx)
    
    # Assign indices based on split ratio
    for j, rand_val in enumerate(batch_indices):
        if rand_val < 0.1:  # 10% for validation
            val_indices.append(start_idx + j)
        else:
            train_indices.append(start_idx + j)


# Create the split datasets
print("Creating training dataset...")
train_dataset = tokenized_dataset.select(train_indices)
print("Creating validation dataset...")
val_dataset = tokenized_dataset.select(val_indices)

# Optional: Save the split datasets
print("Saving split datasets...")
train_dataset.save_to_disk(f'./train_dataset_{data_set_size}')
val_dataset.save_to_disk(f'./val_dataset_{data_set_size}')

print("Split complete!")

Total dataset size: 299997
Training set size: 269998
Validation set size: 29999
Splitting dataset into train and validation sets...


  0%|          | 0/30 [00:00<?, ?it/s]

Creating training dataset...
Creating validation dataset...
Saving split datasets...


Saving the dataset (0/2 shards):   0%|          | 0/269905 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/30092 [00:00<?, ? examples/s]

Split complete!


In [107]:
# Define metrics for MLM evaluation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    # Identify valid positions (non-padding and actually masked tokens)
    mask_positions = labels != -100
    
    if not np.any(mask_positions):
        return {"mlm_accuracy": 0.0}
    
    # Calculate accuracy only on masked positions
    correct_predictions = predictions[mask_positions] == labels[mask_positions]
    accuracy = correct_predictions.mean()
    
    # Add more detailed metrics
    return {
        "mlm_accuracy": float(accuracy),
        "num_masked_tokens": int(mask_positions.sum()),
        "num_correct_predictions": int(correct_predictions.sum())
    }


In [108]:
training_args = TrainingArguments(
    output_dir= "./roberta-reddit-mlm-final_300_000",
    learning_rate=1e-5,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,    # Increased eval batch size
    gradient_accumulation_steps=8,
    warmup_steps=300,
    weight_decay=0.01,
    logging_steps=50,                  # More frequent logging
    save_strategy="steps",
    save_steps=1000,                    # Less frequent saving
    evaluation_strategy="steps",
    eval_steps=5000,                     # More frequent evaluation
    fp16=True,
    gradient_checkpointing=True,
    eval_accumulation_steps=32,           # Increased eval accumulation
    log_level="info",                      # Ensure logs appear
    report_to="none"                       # Prevent logging to external tools
)

from transformers import TrainerCallback

class LossLoggerCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs:
            print(f"Step: {state.global_step}, Logs: {logs}")





Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.add_callback(LossLoggerCallback())

print("Starting training...")
training_output = trainer.train()
print("\nTraining completed!")
print(f"Final training metrics: {training_output.metrics}")

In [110]:
# Save both the model and tokenizer
output_dir = f"./roberta-reddit-mlm-final_{data_set_size}"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

Saving model checkpoint to ./roberta-reddit-mlm-final_300_000
Configuration saved in ./roberta-reddit-mlm-final_300_000/config.json
Model weights saved in ./roberta-reddit-mlm-final_300_000/pytorch_model.bin
tokenizer config file saved in ./roberta-reddit-mlm-final_300_000/tokenizer_config.json
Special tokens file saved in ./roberta-reddit-mlm-final_300_000/special_tokens_map.json


('./roberta-reddit-mlm-final_300_000/tokenizer_config.json',
 './roberta-reddit-mlm-final_300_000/special_tokens_map.json',
 './roberta-reddit-mlm-final_300_000/vocab.json',
 './roberta-reddit-mlm-final_300_000/merges.txt',
 './roberta-reddit-mlm-final_300_000/added_tokens.json')

In [None]:
from transformers import RobertaForMaskedLM, RobertaTokenizer
output_dir = 'roberta-reddit-mlm-final_300_000/'

from transformers import RobertaForSequenceClassification

model = RobertaForSequenceClassification.from_pretrained(
    output_dir, 
    num_labels=3 
)

In [113]:
# Load the dataset
file_path_train = 'train_data_new.csv'
file_path_test = 'test_data_new.csv'

train_df = pd.read_csv(file_path_train)
test_df = pd.read_csv(file_path_test)


In [114]:
len(train_df), len(test_df)

(888, 222)

In [115]:
def preprocess_data(texts, labels, tokenizer, test=False):
    # Convert labels to float first
    labels = [float(label) for label in labels]
    # Train-test split
    if test:
        train_texts, val_texts, train_labels, val_labels = texts, texts, labels, labels
    else:
        train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)
        
    # Initialize LabelEncoder
    label_encoder = LabelEncoder()
    
    # Convert the float labels to the desired mapping: -1.0 -> 0, 0.0 -> 1, 1.0 -> 2
    mapping = {-1.0: 0, 0.0: 1, 1.0: 2}
    train_labels = [mapping[label] for label in train_labels]
    val_labels = [mapping[label] for label in val_labels]
    
    # Now fit and transform with these mapped labels
    train_labels = label_encoder.fit_transform(train_labels)
    val_labels = label_encoder.transform(val_labels)
    
    # Save the label mapping
    label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
    
    # Tokenize the text
    train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
    val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)
    
    return train_encodings, val_encodings, train_labels, val_labels, label_mapping

In [116]:

training_args = TrainingArguments(
    output_dir="./roberta-classification",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    warmup_steps=50,
    save_strategy="epoch",
    # Early stopping related arguments
    load_best_model_at_end=True,  # Load the best model at the end of training
    metric_for_best_model="loss",  # Metric to track for saving best model
    greater_is_better=False,       # Set to False since we want to minimize loss
)

def compute_classification_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1': f1_score(labels, predictions, average='weighted')
    }

Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [117]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-reddit-mlm-final_300_000")

# Your texts and labels lists should be defined here
texts = train_df['self_text'].tolist()
labels = train_df['Label'].tolist()

# Preprocess data
train_encodings, val_encodings, train_labels, val_labels, label_mapping = preprocess_data(texts, labels, tokenizer, False)

# Create datasets
train_dataset = RedditDataset(train_encodings, train_labels)
val_dataset = RedditDataset(val_encodings, val_labels)

# Create dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

loading file vocab.json
loading file merges.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


In [None]:
# Trainer-
trained_model_path = "roberta_after_mlm_finetuned_300.pt"
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_classification_metrics
)

# Train
trainer.train()

trainer.save_model(trained_model_path)
# Evaluate
results = trainer.evaluate()
print(results)

In [119]:
test_texts = test_df['self_text'].tolist()
test_labels = test_df['Label'].tolist()

test_encodings, _, test_labels, _, mapping = preprocess_data(test_texts, test_labels, tokenizer, test=True)
test_dataset = RedditDataset(test_encodings, test_labels)



In [None]:
# Load the model from the saved directory
loaded_model = RobertaForSequenceClassification.from_pretrained('roberta-classification/checkpoint-534')

# Create trainer for the loaded model
eval_trainer = Trainer(
    model=loaded_model,
    args=TrainingArguments(
        output_dir="./eval_results",
        per_device_eval_batch_size=4,
    ),
    compute_metrics=compute_classification_metrics
)

# Evaluate
results = eval_trainer.evaluate(eval_dataset=test_dataset)
print(results)

# Get predictions
predictions = eval_trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)

# ***Preprocess and try data augmentation***

In [None]:
# Load the dataset
file_path_train = 'train_data.csv'
file_path_test = 'test_data.csv'

train_df = pd.read_csv(file_path_train)
test_df = pd.read_csv(file_path_test)


In [None]:
from nlpaug.augmenter.word import SynonymAug

# Initialize with just the essential settings
syn_aug = SynonymAug(
    aug_src='wordnet',
    lang='eng'
)

# Augment data
augmented_texts = []
augmented_labels = []
for text, label in tqdm(zip(train_df['self_text'].tolist()[:-100], train_df['Label'].tolist()[:-100])):
    aug_syn = syn_aug.augment(text)[0]
    augmented_texts.append(aug_syn)
    augmented_labels.append(label)

In [None]:
# Augment data
augmented_texts = []
augmented_labels = []
for text, label in tqdm(zip(train_df['self_text'].tolist()[:-100], train_df['Label'].tolist()[:-100])):
    aug_syn = syn_aug.augment(text)[0]
    augmented_texts.append(aug_syn)
    augmented_labels.append(label)

# Combine original and augmented data
augmention_training_df = pd.DataFrame({'self_text': augmented_texts + train_df['self_text'].tolist(), 'Label': augmented_labels + train_df['Label'].tolist()})


In [None]:
train_df = train_df[['self_text', 'Label']]
test_df = test_df[['self_text', 'Label']]

In [None]:
# len(train_df), len(augmention_training_df), len(test_df)
len(train_df), len(test_df)

In [None]:
# Replace 'text_column' and 'label_column' with actual column names
texts = train_df['self_text'].tolist()
labels = train_df['Label'].tolist()

In [43]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-reddit-mlm-final_100_000")

# Your texts and labels lists should be defined here
texts = train_df['self_text'].tolist()
labels = train_df['Label'].tolist()

# Preprocess data
train_encodings, val_encodings, train_labels, val_labels, label_mapping = preprocess_data(texts, labels, tokenizer, True)

# Create datasets
train_dataset = RedditDataset(train_encodings, train_labels)
val_dataset = RedditDataset(val_encodings, val_labels)

# Create dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

loading file vocab.json
loading file merges.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


In [None]:
# Train
trainer.train()

trainer.save_model(trained_model_path)
# Evaluate
results = trainer.evaluate()
print(results)

# ***Gradual Unfreezing***

## ***Gradual Unfreezing Training***

In [None]:
def preprocess_data(texts, labels, tokenizer, augmention=True, test=False):
    # Convert labels to float first
    labels = [float(label) for label in labels]
    
    # Train-test split
    if test:
        train_texts, val_texts, train_labels, val_labels = texts, texts, labels, labels
    elif augmention: 
        train_texts, val_texts, train_labels, val_labels = texts[:-100], texts[-100:], labels[:-100], labels[-100:]
    else:
        train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)
        
    # Initialize LabelEncoder
    label_encoder = LabelEncoder()
    
    # Convert the float labels to the desired mapping: -1.0 -> 0, 0.0 -> 1, 1.0 -> 2
    mapping = {-1.0: 0, 0.0: 1, 1.0: 2}
    train_labels = [mapping[label] for label in train_labels]
    val_labels = [mapping[label] for label in val_labels]
    
    # Now fit and transform with these mapped labels
    train_labels = label_encoder.fit_transform(train_labels)
    val_labels = label_encoder.transform(val_labels)
    
    # Save the label mapping
    label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
    
    # Tokenize the text
    train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
    val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)
    
    return train_encodings, val_encodings, train_labels, val_labels, label_mapping


# Modified optimizer creation function to include weight decay
def create_optimizer_with_discriminative_fine_tuning(model, base_lr=2e-5):
    layer_parameters = []
    
    # Add embeddings with lowest learning rate
    layer_parameters.append({
        'params': model.distilbert.embeddings.parameters(),
        'lr': base_lr/2.6
    })
    
    # Add transformer layers with gradually increasing learning rate
    for layer_idx in range(len(model.distilbert.transformer.layer)):
        layer = model.distilbert.transformer.layer[layer_idx]
        layer_parameters.append({
            'params': layer.parameters(),
            'lr': base_lr/(2.6 - (layer_idx * 0.3))
        })
    
    # Classification head gets the highest learning rate
    layer_parameters.append({
        'params': model.classifier.parameters(),
        'lr': base_lr
    })
    
    # Add weight decay to optimizer
    return AdamW(layer_parameters, weight_decay=0.1)


def train_epoch(model, train_dataloader, val_dataloader, optimizer, num_epochs, patience=3):
    best_val_loss = float('inf')
    patience_counter = 0
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        
        for batch in train_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, 
                          attention_mask=attention_mask,
                          labels=labels)
            loss = outputs.loss
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_dataloader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                
                outputs = model(input_ids=input_ids,
                              attention_mask=attention_mask,
                              labels=labels)
                val_loss += outputs.loss.item()
        
        print(f"Epoch {epoch}: Train Loss = {total_loss/len(train_dataloader):.4f}, Val Loss = {val_loss/len(val_dataloader):.4f}")
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), 'best_model.pt')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping triggered at epoch {epoch}")
                return True  # Signal early stopping
    
    return False  # No early stopping triggered

# Training functions
def train_with_gradual_unfreezing(model, train_dataloader, val_dataloader, epochs_per_unfreeze=3):
    # Add dropout to classification layer
    model.classifier.dropout = nn.Dropout(p=0.3)
    
    # First freeze all layers except the classifier
    for param in model.distilbert.parameters():
        param.requires_grad = False
    
    # Train only the classifier first with weight decay
    optimizer = AdamW(model.classifier.parameters(), lr=2e-5, weight_decay=0.1)
    early_stop = train_epoch(model, train_dataloader, val_dataloader, optimizer, epochs_per_unfreeze)
    if early_stop:
        return model
    
    # Only unfreeze the top 3 layers (instead of all 6)
    for layer_idx in reversed(range(4, 6)): 
        print(f"\nUnfreezing layer {layer_idx}")
        
        # Unfreeze current layer
        for param in model.distilbert.transformer.layer[layer_idx].parameters():
            param.requires_grad = True
        
        # Create optimizer with weight decay
        optimizer = create_optimizer_with_discriminative_fine_tuning(model)
        early_stop = train_epoch(model, train_dataloader, val_dataloader, optimizer, epochs_per_unfreeze)
        if early_stop:
            break
    
    # Load best model before returning
    model.load_state_dict(torch.load('best_model.pt'))
    return model
        
def evaluate_model(model, test_dataset, label_mapping):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)
    predictions = []
    actual_labels = []
    
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            
            predictions.extend(preds.cpu().numpy())
            actual_labels.extend(labels.cpu().numpy())
    
    # Convert numeric labels back to original classes
    inv_label_mapping = {v: k for k, v in label_mapping.items()}
    predictions = [inv_label_mapping[pred] for pred in predictions]
    actual_labels = [inv_label_mapping[label] for label in actual_labels]
    
    return predictions, actual_labels


# Convert to PyTorch Dataset
class RedditDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    

def main():
    # Initialize tokenizer
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    
    # Your texts and labels lists should be defined here
    texts = train_df['self_text'].tolist()
    labels = train_df['Label'].tolist()
    
    # Preprocess data
    train_encodings, val_encodings, train_labels, val_labels, label_mapping = preprocess_data(texts, labels, tokenizer, True)
    
    # Create datasets
    train_dataset = RedditDataset(train_encodings, train_labels)
    val_dataset = RedditDataset(val_encodings, val_labels)
    
    # Create dataloaders
    train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)
    
    # Initialize model
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=len(set(train_labels))
    )
    
    # Train model
    train_with_gradual_unfreezing(
        model=model,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        epochs_per_unfreeze=10
    )
    
    # Load best model for evaluation
    model.load_state_dict(torch.load('best_model.pt'))
    
    # Optional: Evaluate on test set
    # test_predictions, test_actual = evaluate_model(model, test_dataset, label_mapping)
    
    return model, label_mapping

if __name__ == "__main__":
    model, label_mapping = main()

## ***Gradual Unfreezing Evaluating***

In [None]:
def evaluate_model(model, test_dataset, label_mapping, batch_size=16):
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    
    # Create DataLoader
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    # Store predictions, labels, and probabilities
    all_predictions = []
    all_labels = []
    all_probs = []
    
    # Create label name mapping
    label_names = {
        0: "Pro-Palestine",
        1: "Neutral",
        2: "Pro-Israel"
    }
    
    # Evaluate the model
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            probs = torch.softmax(outputs.logits, dim=1)
            predictions = torch.argmax(outputs.logits, dim=1)
            
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())

    # Convert numeric predictions and labels to label names
    predictions_labels = [label_names[pred] for pred in all_predictions]
    true_labels = [label_names[label] for label in all_labels]
    
    # List of label names in order
    all_classes = ["Pro-Palestine\n(-1)", "Neutral\n(0)", "Pro-Israel\n(1)"]

    # Classification Report
    print("\nClassification Report:")
    print(classification_report(true_labels, predictions_labels, labels=all_classes, zero_division=0))
    
    # Confusion Matrix
    print("\nConfusion Matrix:")
    cm = confusion_matrix(true_labels, predictions_labels, labels=all_classes)
    
    # Normalize the confusion matrix
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    # Plot both raw and normalized confusion matrices
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

    # Raw Confusion Matrix
    sns.heatmap(cm, annot=True, fmt='d',
                xticklabels=all_classes, yticklabels=all_classes, ax=ax1,
                cmap='Blues', cbar_kws={'label': 'Count'})
    ax1.set_title('Confusion Matrix (Raw Counts)', pad=20)
    ax1.set_ylabel('True Label')
    ax1.set_xlabel('Predicted Label')

    # Normalized Confusion Matrix
    sns.heatmap(cm_normalized, annot=True, fmt='.2',
                xticklabels=all_classes, yticklabels=all_classes, ax=ax2,
                cmap='Blues', cbar_kws={'label': 'Proportion'})
    ax2.set_title('Confusion Matrix (Normalized)', pad=20)
    ax2.set_ylabel('True Label')
    ax2.set_xlabel('Predicted Label')

    plt.tight_layout()
    plt.show()
    print(all_predictions)
    print(all_labels)

# Initialize tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Preprocess data
train_encodings, val_encodings, train_labels, val_labels, label_mapping = preprocess_data(texts, labels, tokenizer, True)

# Create datasets
train_dataset = RedditDataset(train_encodings, train_labels)
val_dataset = RedditDataset(val_encodings, val_labels)

# Create dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Initialize model
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(set(train_labels))
)

In [None]:

# Assuming you have your test data prepared similarly to train/val data:
# Your texts and labels lists should be defined here
test_texts = test_df['self_text'].tolist()
test_labels = test_df['Label'].tolist()

test_encodings, _, test_labels, _, mapping = preprocess_data(test_texts, test_labels, tokenizer, augmention=False, test=True)
test_dataset = RedditDataset(test_encodings, test_labels)

# Load the best model
model.load_state_dict(torch.load('best_model.pt', weights_only=True))

# Run evaluation
results = evaluate_model(model, test_dataset, label_mapping)

In [None]:
# DistilBertTokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# RedditBerTokenizer
#tokenizer = AutoTokenizer.from_pretrained("Fan-s/reddit-tc-bert", use_fast=True)

In [None]:
train_encodings, val_encodings, train_labels, val_labels = preprocess_data(texts, labels, tokenizer, True)

In [None]:
train_dataset = RedditDataset(train_encodings, train_labels)
val_dataset = RedditDataset(val_encodings, val_labels)

## ***Finetuning***

In [None]:
!pip install accelerate -U

In [None]:
# Model
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(set(labels)))

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    # Early stopping related arguments
    load_best_model_at_end=True,  # Load the best model at the end of training
    metric_for_best_model="loss",  # Metric to track for saving best model
    greater_is_better=False,       # Set to False since we want to minimize loss
)

In [None]:
# Evaluation function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}


In [None]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_encodings, val_encodings, train_labels, val_labels, label_mapping = preprocess_data(texts, labels, tokenizer, True)

train_dataset = RedditDataset(train_encodings, train_labels)
val_dataset = RedditDataset(val_encodings, val_labels)

In [None]:
# Trainer
trained_model_path = "distilbert_model_finetuned.pt"
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train
trainer.train()

trainer.save_model(trained_model_path)
# Evaluate
results = trainer.evaluate()
print(results)

In [None]:
test_texts = test_df['self_text'].tolist()
test_labels = test_df['Label'].tolist()

test_encodings, _, test_labels, _, mapping = preprocess_data(test_texts, test_labels, tokenizer, augmention=False, test=True)
test_dataset = RedditDataset(test_encodings, test_labels)

In [None]:
# Load the model from the saved directory
loaded_model = DistilBertForSequenceClassification.from_pretrained('distilbert_model_finetuned')

# Create trainer for the loaded model
eval_trainer = Trainer(
    model=loaded_model,
    args=TrainingArguments(
        output_dir="./eval_results",
        per_device_eval_batch_size=16,
    ),
    compute_metrics=compute_metrics
)

# Evaluate
results = eval_trainer.evaluate(eval_dataset=test_dataset)
print(results)

# Get predictions
predictions = eval_trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)

In [None]:
labels = predictions.label_ids

# Correct label mapping
label_names = {
    0: "Pro-Palestine",
    1: "Neutral",
    2: "Pro-Israel"
}

# Create confusion matrix
cm = confusion_matrix(labels, preds)

# Normalize the confusion matrix
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

# Create figure with two subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,8))

# Plot raw counts
sns.heatmap(cm, annot=True, fmt='d', 
            xticklabels=label_names.values(), 
            yticklabels=label_names.values(),
            ax=ax1)
ax1.set_title('Confusion Matrix (Raw Counts)')
ax1.set_ylabel('True Label')
ax1.set_xlabel('Predicted Label')

# Plot normalized values
sns.heatmap(cm_normalized, annot=True, fmt='.2', 
            xticklabels=label_names.values(), 
            yticklabels=label_names.values(),cmap='Blues', cbar_kws={'label': 'Proportion'},
            ax=ax2)
ax2.set_title('Confusion Matrix (Normalized)')
ax2.set_ylabel('True Label')
ax2.set_xlabel('Predicted Label')

plt.tight_layout()
plt.show()

# Print classification report
from sklearn.metrics import classification_report
print("\nClassification Report:")
print(classification_report(labels, preds, 
                          target_names=list(label_names.values()),
                          zero_division=0))