# Base Model

Attempting to recreate the validation F1 of the following model using stochastic federated averaging.
https://huggingface.co/JeremiahZ/roberta-base-mrpc

# Dependencies

GPU - T4 (note: the recorded cell outputs below were produced on a Tesla P100-PCIE-16GB)

Note: force-upgrade `accelerate`, since the environment defaults to an older version.
Restart the kernel once the cell below has finished executing.

In [2]:
%%capture
!python -m pip install transformers 
!python -m pip install evaluate 
!python -m pip install datasets 
!python -m pip install accelerate
!python -m pip install torch
!python -m pip install evaluate

In [3]:
from datasets import load_dataset, load_from_disk
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, \
 AutoConfig, MobileBertConfig, TrainingArguments, DataCollatorWithPadding, RobertaForSequenceClassification
import numpy as np
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Check whether PyTorch can use a CUDA device in this runtime.
torch.cuda.is_available()

True

In [5]:
# Number of CUDA devices visible to PyTorch.
torch.cuda.device_count()

1

In [6]:
# Index of the currently selected CUDA device.
torch.cuda.current_device()

0

In [7]:
# Human-readable name of CUDA device 0.
torch.cuda.get_device_name(0)

'Tesla P100-PCIE-16GB'

# Defining Training Functions
- Import dataset
- Import Base Model and tokenizer
- Define Training Hyperparameters
- Train Model

In [8]:
def compute_metrics(eval_preds):
    """Score a batch of evaluation predictions with the GLUE "mrpc" metric.

    Args:
        eval_preds: A pair ``(logits, labels)``; the predicted class for each
            example is the argmax of its logits over the last axis.

    Returns:
        Whatever ``metric.compute`` returns for those predictions and labels.
    """
    glue_mrpc_metric = evaluate.load("glue", "mrpc")
    raw_scores, gold_labels = eval_preds
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return glue_mrpc_metric.compute(
        predictions=predicted_classes,
        references=gold_labels,
    )


def init_datasets(dataset_path, tokenizer, is_sharded):
    """Load a train/validation pair and tokenize its sentence pairs.

    Args:
        dataset_path: Only read when ``is_sharded`` is true. In that case it is
            a mapping with the keys "train_path", "validation_path",
            "dataset_id", "train_file_name" and "validation_file_name", whose
            values are concatenated to form the on-disk dataset locations.
        tokenizer: Callable applied to the "sentence1"/"sentence2" columns.
        is_sharded: If true, load one shard from disk; otherwise load the full
            GLUE "mrpc" train and validation splits.

    Returns:
        Tuple ``(tokenized_train_dataset, tokenized_validation_dataset)``.
    """
    if not is_sharded:
        glue_mrpc = load_dataset("glue", "mrpc")
        train_split = glue_mrpc["train"]
        validation_split = glue_mrpc["validation"]
    else:
        train_location = (dataset_path["train_path"]
                          + dataset_path["dataset_id"]
                          + dataset_path["train_file_name"])
        train_split = load_from_disk(train_location)

        validation_location = (dataset_path["validation_path"]
                               + dataset_path["dataset_id"]
                               + dataset_path["validation_file_name"])
        validation_split = load_from_disk(validation_location)

    def encode_pairs(batch):
        # Both sentences are encoded together as one paired input.
        return tokenizer(batch["sentence1"], batch["sentence2"])

    return (
        train_split.map(encode_pairs, batched=True),
        validation_split.map(encode_pairs, batched=True),
    )


def save_model(tokenizer, model, tokenized_train_dataset, tokenized_validation_dataset, training_args, save_path):
    """Wrap ``model`` in a ``Trainer`` and write it to ``save_path``.

    No training or evaluation is run here; the Trainer is built only so that
    ``Trainer.save_model`` can be used to persist the model.

    Args:
        tokenizer: Tokenizer handed to the Trainer and to the padding collator.
        model: Model to save.
        tokenized_train_dataset: Dataset registered as the Trainer's train set.
        tokenized_validation_dataset: Dataset registered as the eval set.
        training_args: ``TrainingArguments`` for the Trainer.
        save_path: Destination passed to ``Trainer.save_model``.
    """
    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_validation_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    )
    Trainer(**trainer_kwargs).save_model(save_path)


def init_local_models(tokenized_train_dataset, tokenized_validation_dataset, tokenizer, model_save_path, base_model_name):
    """Create one local sequence-classification model and save it untrained.

    Args:
        tokenized_train_dataset: Tokenized training data forwarded to ``save_model``.
        tokenized_validation_dataset: Tokenized validation data forwarded to ``save_model``.
        tokenizer: Tokenizer forwarded to ``save_model``.
        model_save_path: Used both as the ``TrainingArguments`` output
            directory and as the location the model is saved to.
        base_model_name: Checkpoint name given to ``from_pretrained``.
    """
    hyperparameters = dict(
        num_train_epochs=1,
        learning_rate=2e-05,
        lr_scheduler_type="linear",
        per_device_train_batch_size=16,
        seed=42,
        per_device_eval_batch_size=8,
        warmup_ratio=0.06,
    )

    local_model = AutoModelForSequenceClassification.from_pretrained(base_model_name)
    local_args = TrainingArguments(model_save_path, **hyperparameters)

    save_model(
        tokenizer=tokenizer,
        model=local_model,
        tokenized_train_dataset=tokenized_train_dataset,
        tokenized_validation_dataset=tokenized_validation_dataset,
        training_args=local_args,
        save_path=model_save_path,
    )






def init_global_model(tokenized_train_dataset, tokenized_validation_dataset, tokenizer, model_save_path, dataset_id, base_model_name):
  """Create the global (federated) model with a zeroed classification head and save it.

  Args:
      tokenized_train_dataset: Tokenized training data forwarded to ``save_model``.
      tokenized_validation_dataset: Tokenized validation data forwarded to ``save_model``.
      tokenizer: Tokenizer saved alongside the model. If ``None``, the
          tokenizer of ``base_model_name`` is loaded instead.
      model_save_path: Used both as the ``TrainingArguments`` output directory
          and as the location the model is saved to.
      dataset_id: Unused; kept so existing callers keep working.
      base_model_name: Checkpoint name given to ``from_pretrained``.
  """
  federated_model = RobertaForSequenceClassification.from_pretrained(base_model_name)

  # Zero the head in place so each tensor keeps its own dtype and device
  # (rebinding .data to torch.zeros(size) would force default-dtype CPU tensors).
  with torch.no_grad():
    for head_layer in (federated_model.classifier.dense, federated_model.classifier.out_proj):
      head_layer.weight.zero_()
      head_layer.bias.zero_()

  # Honour the tokenizer supplied by the caller; previously it was always
  # discarded and replaced by a freshly loaded one.
  if tokenizer is None:
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)

  training_args = TrainingArguments(model_save_path, num_train_epochs=1, learning_rate=2e-05,
                                    lr_scheduler_type="linear", per_device_train_batch_size=16, seed=42,
                                    per_device_eval_batch_size=8, warmup_ratio=0.06)

  save_model(tokenizer, federated_model, tokenized_train_dataset, tokenized_validation_dataset,
             training_args, model_save_path)



In [None]:
# Configuration: base checkpoint, shard locations on disk, and output directory.
base_model_name = "roberta-base"
train_path = "Federated Split-20231015T210206Z-001/Federated Split/train/"
validation_path = "Federated Split-20231015T210206Z-001/Federated Split/validation/"
model_save_path = "Federated SGD/"

train_file_name = "/train.hf"
validation_file_name = "/validation.hf"

num_shards = 8

# The tokenizer does not depend on the shard, so load it once instead of
# once per loop iteration.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Create and save one untrained local model per shard.
for dataset_id in range(num_shards):
  sharded_dict = {"train_path": train_path,
                  "train_file_name": train_file_name,
                  "validation_path": validation_path,
                  "validation_file_name": validation_file_name,
                  "dataset_id": str(dataset_id)
                  }
  tokenized_train_dataset, tokenized_validation_dataset = \
        init_datasets(dataset_path=sharded_dict, tokenizer=tokenizer, is_sharded=True)

  init_local_models(tokenized_train_dataset, tokenized_validation_dataset, tokenizer,
                    model_save_path + str(dataset_id), base_model_name)


# Create and save the global model on the full (unsharded) MRPC data.
global_tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# dataset_path is not read when is_sharded=False, so pass None rather than a
# model name that only looks like a path.
global_tokenized_train_dataset, global_tokenized_validation_dataset = \
      init_datasets(dataset_path=None, tokenizer=global_tokenizer, is_sharded=False)
# Pass the global tokenizer (not the loop's leftover `tokenizer`), and do not
# depend on the leaked loop variable `dataset_id`: init_global_model does not
# use that argument. model_save_path already ends with "/", so no extra slash.
init_global_model(global_tokenized_train_dataset, global_tokenized_validation_dataset, global_tokenizer,
                  model_save_path + "federated_learner", None, base_model_name)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def train_local_model(train_path, validation_path, model_save_path, model_id):
  """Train one local model for a single epoch on its shard, score it, and save it.

  The model is loaded from ``model_save_path`` and written back to the same
  location after training. Its validation metrics are appended to the
  module-level ``validation_scores[model_id]`` list, which must already exist.

  Args:
      train_path: On-disk location of the shard's training dataset.
      validation_path: On-disk location of the shard's validation dataset.
      model_save_path: Directory to load the model from and save it to; also
          used as the ``TrainingArguments`` output directory.
      model_id: Key under which the validation metrics are recorded.
  """
  shard_train = load_from_disk(train_path)
  shard_validation = load_from_disk(validation_path)

  roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
  local_model = AutoModelForSequenceClassification.from_pretrained(model_save_path)

  def encode_pairs(batch):
    # Both sentences are encoded together as one paired input.
    return roberta_tokenizer(batch["sentence1"], batch["sentence2"])

  encoded_train = shard_train.map(encode_pairs, batched=True)
  encoded_validation = shard_validation.map(encode_pairs, batched=True)

  hyperparameters = dict(
      num_train_epochs=1,
      learning_rate=2e-05,
      lr_scheduler_type="linear",
      per_device_train_batch_size=32,
      seed=42,
      per_device_eval_batch_size=16,
      warmup_ratio=0.06,
  )

  local_trainer = Trainer(
      model=local_model,
      args=TrainingArguments(model_save_path, **hyperparameters),
      train_dataset=encoded_train,
      eval_dataset=encoded_validation,
      tokenizer=roberta_tokenizer,
      compute_metrics=compute_metrics,
      data_collator=DataCollatorWithPadding(tokenizer=roberta_tokenizer),
  )
  local_trainer.train()

  # Score the freshly trained model on this shard's validation split.
  validation_output = local_trainer.predict(encoded_validation)
  predicted_classes = np.argmax(validation_output.predictions, axis=-1)
  glue_mrpc_metric = evaluate.load("glue", "mrpc")
  shard_scores = glue_mrpc_metric.compute(predictions=predicted_classes,
                                          references=validation_output.label_ids)
  validation_scores[model_id].append(shard_scores)

  local_trainer.save_model(model_save_path)


In [None]:
def _average_classifier_heads(models):
  """Return the element-wise mean of each classification-head tensor across `models`."""
  averaged = {}
  for layer_name in ("dense", "out_proj"):
    for param_name in ("weight", "bias"):
      stacked = torch.stack([
          getattr(getattr(m.classifier, layer_name), param_name).data for m in models
      ])
      averaged[(layer_name, param_name)] = stacked.mean(dim=0)
  return averaged


def _write_classifier_head(model, averaged):
  """Overwrite `model`'s classification-head tensors with copies of `averaged`."""
  for (layer_name, param_name), tensor in averaged.items():
    # clone() gives every model its own storage instead of aliasing one tensor.
    getattr(getattr(model.classifier, layer_name), param_name).data = tensor.clone()


def compute_FedSGD(model_base_path, num_shards, global_model_name):
  """Average the local models' classification heads, then score and save the global model.

  Steps:
    1. Load the ``num_shards`` local models from ``model_base_path + str(i)``
       and the global model from ``model_base_path + global_model_name``.
    2. Compute the mean of the classifier ``dense`` and ``out_proj`` weights
       and biases over the local models.
    3. Write that mean into the global model and into every local model, and
       save each local model back to its directory.
    4. Evaluate the global model on the GLUE "mrpc" validation split, append
       the metrics to the module-level ``validation_scores[global_model_name]``
       list (which must already exist), and save the global model.

  Only the classification head is averaged; all other weights of each model
  are left exactly as loaded.

  Args:
      model_base_path: Path prefix under which all models are stored.
      num_shards: Number of local models to average.
      global_model_name: Name of the global model below ``model_base_path``;
          also the key used in ``validation_scores``.

  Raises:
      ValueError: If ``num_shards`` is smaller than 1.
  """
  if num_shards < 1:
    raise ValueError("num_shards must be at least 1")

  # Use the function's own path argument rather than a notebook-level global.
  models = [
      RobertaForSequenceClassification.from_pretrained(model_base_path + str(shard_id))
      for shard_id in range(num_shards)
  ]
  federated_model = RobertaForSequenceClassification.from_pretrained(model_base_path + global_model_name)

  # The mean is computed from the local models only. Accumulating on top of the
  # global model's stored head would mix the previous round's average into the
  # sum and bias the result from the second cycle onward.
  averaged_head = _average_classifier_heads(models)

  _write_classifier_head(federated_model, averaged_head)
  for shard_id, local_model in enumerate(models):
    _write_classifier_head(local_model, averaged_head)
    local_model.save_pretrained(model_base_path + str(shard_id))

  # Load the tokenizer here instead of relying on a variable left behind by
  # another notebook cell; same checkpoint as train_local_model uses.
  tokenizer = AutoTokenizer.from_pretrained("roberta-base")

  def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"])

  # Only prediction is run below, so only the validation split is needed.
  validation_dataset = load_dataset("glue", "mrpc")["validation"]
  tokenized_validation_dataset = validation_dataset.map(tokenize_function, batched=True)

  data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  training_args = TrainingArguments(model_base_path, num_train_epochs=1,
                                    learning_rate=2e-05,
                                    lr_scheduler_type="linear",
                                    per_device_train_batch_size=32,
                                    seed=42, per_device_eval_batch_size=16, warmup_ratio=0.06)

  # Evaluate and save the global model itself. Previously the leftover loop
  # variable `model` (the last local model) was scored and saved in its place.
  trainer = Trainer(
    federated_model,
    training_args,
    eval_dataset=tokenized_validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator
    )
  predictions = trainer.predict(tokenized_validation_dataset)
  preds = np.argmax(predictions.predictions, axis=-1)
  metric = evaluate.load("glue", "mrpc")
  validation_scores[global_model_name].append(metric.compute(predictions=preds, references=predictions.label_ids))
  trainer.save_model(model_base_path + global_model_name)


Pass all the paths to the training functions and run the federated training cycles on the sharded dataset.

In [None]:

from collections import defaultdict

# Checkpoint name and on-disk layout of the shards and saved models.
base_model_name = "roberta-base"
train_path = "Federated Split-20231015T210206Z-001/Federated Split/train/"
validation_path = "Federated Split-20231015T210206Z-001/Federated Split/validation/"
model_save_path = "Federated SGD/"
train_file_name = "/train.hf"
validation_file_name = "/validation.hf"

# Number of local models and number of train-then-average rounds.
num_shards = 8
num_cycle = 10

# Metric history, keyed by shard index (local models) or by the global model name.
global_model_name = "federated_learner"
validation_scores = defaultdict(list)

for cycle in range(num_cycle):
  # Train every local model for one epoch on its own shard.
  for dataset_id in range(num_shards):
    shard_suffix = str(dataset_id)
    train_local_model(
        train_path + shard_suffix + train_file_name,
        validation_path + shard_suffix + validation_file_name,
        model_save_path + shard_suffix,
        dataset_id,
    )

  # Combine the local models and score the global model.
  compute_FedSGD(model_save_path, num_shards, global_model_name)

  print("Model Validation Scores after Cycle {} ".format(cycle), validation_scores)

In [None]:

# Show the metrics recorded for the global model, one entry per completed cycle.
validation_scores["federated_learner"]