**CS-7643 - Deep Learning - Summer 2024 - Final Project - news_dataset_adaptertuning.ipynb**

**Group - Big Daaata**

Use SoTA models to the test on a News Category Classification dataset and compare the performance of a fine-tuned model against an adapter-based model. Being a Kaggle dataset from real-world data, it will help us quantify the advantages, if any, of the adapter-based models in a practical context.

This notebook has code to run adapter based tuning experiments for one of the models 
* distilbert-base-uncased

# Installs

In [1]:
!pip3 install datasets
!pip3 install evaluate
!pip3 install transformers
!pip3 install adapters
!pip3 install mlflow

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Using cached transformers-4.43.1-py3-none-any.whl (9.4 MB)
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.40.2
    Uninstalling transformers-4.40.2:
      Successfully uninstalled transformers-4.40.2
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
adapters 0.2.2 requires transformers~=4.40.2, but you have transformers 4.43.1 which is incompatible.[0m[31m
[0mSuccessfully installed transformers-4.43.1
Defaulting to user installation because normal site-packages is not writeable
Collecting transformers~=4.40.2
  Usin

In [2]:
!pip show transformers
!pip show adapters
!pip show peft

Name: transformers
Version: 4.40.2
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /home/hice1/ksingh362/.local/lib/python3.10/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: adapters, peft
Name: adapters
Version: 0.2.2
Summary: A Unified Library for Parameter-Efficient and Modular Transfer Learning
Home-page: https://github.com/adapter-hub/adapters
Author: The AdapterHub team and community contributors
Author-email: calpt@mail.de
License: Apache
Location: /home/hice1/ksingh362/.local/lib/python3.10/site-packages
Requires: transformers
Required-by: 
Name: peft
Version: 0.12.0
Summary: Param

In [3]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import load_metric
import numpy as np
import mlflow

# Load the dataset
data_path = '/content/drive/MyDrive/CS7643-final-project/News_Category_Dataset_v3.json'  # set path to dataset
df = pd.read_json(data_path, lines=True)

# Keep relevant columns
df = df[['category', 'headline']]

# Encode labels
label_mapping = {label: idx for idx, label in enumerate(df['category'].unique())}
df['label'] = df['category'].map(label_mapping)

# Split dataset
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'])
train_df, val_df = train_test_split(train_df, test_size=0.1, stratify=train_df['label'])

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df[['headline', 'label']])
val_dataset = Dataset.from_pandas(val_df[['headline', 'label']])
test_dataset = Dataset.from_pandas(test_df[['headline', 'label']])

dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

# Check dataset
print(dataset)

num_labels = len(df['label'].unique())
print(f'Number of labels: {num_labels}')


DatasetDict({
    train: Dataset({
        features: ['headline', 'label', '__index_level_0__'],
        num_rows: 150858
    })
    validation: Dataset({
        features: ['headline', 'label', '__index_level_0__'],
        num_rows: 16763
    })
    test: Dataset({
        features: ['headline', 'label', '__index_level_0__'],
        num_rows: 41906
    })
})
Number of labels: 42


In [None]:
import torch
import numpy as np
import pandas as pd
from datasets import load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments,  default_data_collator
from transformers import TrainerCallback, AutoConfig
import datasets
import matplotlib.pyplot as plt

from adapters import (
    AdapterArguments,
    AdapterTrainer,
    AutoAdapterModel,
    setup_adapter_training,   
    AdapterConfig, BnConfig, PrefixTuningConfig, MAMConfig, CompacterPlusPlusConfig,
    UniPELTConfig, IA3Config, LoRAConfig, PromptTuningConfig, ConfigUnion
)

# Define model names
model_names = ['google/electra-base-discriminator']

# Define adapter configurations
string_to_config = {
#     'seq_bn': BnConfig(mh_adapter=True, output_adapter=True, reduction_factor=16, non_linearity="relu"),
#     'prefix_tuning': PrefixTuningConfig(flat=False, prefix_length=30),
#     'lora': LoRAConfig(r=8, alpha=16),
    "mam_adapter": MAMConfig(),
    'seq_bn_16_2_relu': ConfigUnion(
        BnConfig(mh_adapter=True, output_adapter=False, reduction_factor=16, non_linearity="relu"),
        BnConfig(mh_adapter=False, output_adapter=True, reduction_factor=2, non_linearity="relu"),
    ),
}


# Load metric
metric = load_metric('accuracy')


# Function to compute metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = torch.tensor(logits)
    predictions = torch.argmax(logits, dim=-1).numpy()
    labels = np.array(labels)
    return metric.compute(predictions=predictions, references=labels)

# Initialize DataFrame to store results
results_df = pd.DataFrame(columns=['Model', 'Adapter', 'Epoch', 'Training Loss', 'Validation Loss', 'Accuracy'])

# Callback to record metrics after each epoch
class MetricsCallback(TrainerCallback):
    def __init__(self, adapter):
        self.adapter = adapter
    def on_log(self, args, state, control, logs=None, **kwargs):
        if state.is_world_process_zero:
            logs = logs or {}
            epoch = state.epoch
            model_name = args.output_dir.split('/')[-1]
            adapter_name = args.label_names if self.adapter else 'none'
            training_loss = logs.get('loss', None)
            validation_loss = logs.get('eval_loss', None)
            accuracy = logs.get('eval_accuracy', None)
            new_row = pd.DataFrame([[model_name, adapter_name, epoch, training_loss, validation_loss, accuracy]], columns=['Model', 'Adapter', 'Epoch', 'Training Loss', 'Validation Loss', 'Accuracy'])
            if not new_row.isna().all(axis=1).any():
                global results_df
                results_df = pd.concat([results_df, new_row], ignore_index=True)

# Function to fine-tune a model
def fine_tune_model(model_name, adapter=False, adapter_type=None):
    global results_df

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if adapter:
#         model = AutoAdapterModel.from_pretrained(model_name)
#         model.add_classification_head("mrpc", num_labels=42)
#         model.add_adapter("mrpc", config=adapter_type)
#         model.set_active_adapters("mrpc")
        model = AutoAdapterModel.from_pretrained(model_name)
        model.add_classification_head(adapter_type, num_labels=42)
        adapter_config = string_to_config.get(adapter_type)
        if adapter_type=='prefix_tuning':
            model.eject_prefix_tuning("prefix_tuning")
        model.add_adapter(adapter_type, config=adapter_config)
        model.train_adapter(adapter_type)
        model.set_active_adapters(adapter_type)

            
    else:
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_mapping))
    # Tokenization function
    def tokenize_function(examples):
        return tokenizer(examples['headline'], truncation=True, padding='max_length', max_length=128)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f'./results/{model_name}',
        evaluation_strategy='epoch',
        learning_rate=7e-6,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        num_train_epochs=10,
        weight_decay=0.02,
        save_total_limit=1,
        eval_steps=1,
        save_strategy='epoch',
        logging_strategy='steps',
        logging_first_step=True,
        load_best_model_at_end=True,
        logging_steps=1,
        report_to="mlflow",
    )
    
    # Data collator
    data_collator = default_data_collator
    
    # Trainer
    trainer_class = AdapterTrainer if adapter else Trainer
    trainer = trainer_class(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
        compute_metrics=compute_metrics,
        callbacks=[MetricsCallback(adapter_type)]
    )
    
    # Add print statements to debug batch sizes
    for step, batch in enumerate(trainer.get_train_dataloader()):
        inputs, labels = batch["input_ids"], batch["labels"]
        print(f"Batch {step}: input size {inputs.size()}, label size {labels.size()}")
        if step == 1:  # Print only the first batch
            break


    # Train model
    trainer.train()

    # Evaluate model
    eval_results = trainer.evaluate(eval_dataset=tokenized_datasets['test'])
    print(f"Evaluation results for {model_name}: {eval_results}")



# Fine-tune each model
# for model_name in model_names:
#     fine_tune_model(model_name)
    
# # Fine-tune each model with each adapter
# for model_name in model_names:
#     for adapter_type in string_to_config.keys():
#         print("model_name: ", model_name, " - adapter_type: ", adapter_type )
#         fine_tune_model(model_name, adapter=True, adapter_type=adapter_type)


# Plots

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Function to plot learning curves for all models and adapters
def plot_learning_curves(results_df):
    # Get unique model and adapter names
    model_names = results_df['Model'].unique()
    adapter_names = results_df['Adapter'].unique()

    for model_name in model_names:
        for adapter_name in adapter_names:
            model_adapter_results = results_df[(results_df['Model'] == model_name) & (results_df['Adapter'] == adapter_name)]

            if model_adapter_results.empty:
                continue

            # Round the epoch values to ensure proper grouping
            model_adapter_results['Epoch'] = model_adapter_results['Epoch'].round()

            # Filter to get the last training loss entry per epoch
            avg_training_loss_per_epoch = model_adapter_results.dropna(subset=['Training Loss']).groupby('Epoch')['Training Loss'].mean().reset_index()
            last_validation_loss_per_epoch = model_adapter_results.dropna(subset=['Validation Loss']).groupby('Epoch')['Validation Loss'].last().reset_index()

            # Merge the training and validation losses
            merged_results = pd.merge(avg_training_loss_per_epoch, last_validation_loss_per_epoch, on='Epoch', how='outer')

            plt.plot(merged_results['Epoch'], merged_results['Training Loss'], label='Training Loss', marker='o')
            plt.plot(merged_results['Epoch'], merged_results['Validation Loss'], label='Validation Loss', marker='o')

            plt.xlabel('Epoch')
            plt.ylabel('Loss')
            plt.title(f'Learning Curves for {model_name} with Adapter {adapter_name}')
            plt.legend()
            plt.grid(True)
            plt.show()




In [None]:
# Fine-tune each model with each adapter
for model_name in model_names:
    for adapter_type in string_to_config.keys():
        print("model_name: ", model_name, " - adapter_type: ", adapter_type )
        fine_tune_model(model_name, adapter=True, adapter_type=adapter_type)


In [None]:
results_df.to_csv('model_results.csv', index=False)
plot_learning_curves(results_df)