# Setup

In [None]:
# Install the libraries this notebook uses. Versions are pinned explicitly,
# except tensorboard, which is installed at whatever version pip resolves.
%pip install "torch==2.2.2" tensorboard
%pip install  --upgrade "transformers==4.40.0" "datasets==2.18.0" "accelerate==0.29.3" "evaluate==0.4.1" "bitsandbytes==0.43.1" "huggingface_hub==0.22.2" "trl==0.8.6" "peft==0.10.0"

In [None]:
# Interactive Hugging Face Hub login (prompts for an access token).
# NOTE(review): presumably required to download the meta-llama checkpoint used below - confirm.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
import os
import random
import functools
import csv
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import evaluate
from sklearn.metrics import classification_report
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score

from datasets import Dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)


# Prepare emotion data

In [None]:
# Directory containing the CSV inputs read below (train.csv, test.csv, goemotion.csv).
filepath = "./data/"

In [None]:
# Join directory and file name with os.path.join so loading does not depend
# on `filepath` ending with a path separator.
train = pd.read_csv(os.path.join(filepath, "train.csv"))
test = pd.read_csv(os.path.join(filepath, "test.csv"))
go_emotion = pd.read_csv(os.path.join(filepath, "goemotion.csv"))

In [None]:
# go_emotion columns grouped by the class id that emotion_training_data assigns:
#   pos_emotion -> label 1, neg_emotion -> label 0, fake_emotion -> label 2.
# NOTE(review): "pos"/"neg" appear to name the positive/negative *class*,
# not sentiment polarity - confirm.
neg_emotion = ["gratitude", "joy"]
pos_emotion = ["anger", "disgust"]
fake_emotion = ["surprise"]

In [None]:
def emotion_training_data(Pos_num,Neg_num,Fake_num):
  """Build a labelled text sample from the module-level `go_emotion` frame.

  A row belongs to a group when at least one of the group's emotion columns
  (`pos_emotion`, `neg_emotion`, `fake_emotion`) equals 1. Any text that shows
  up in more than one group is dropped from all of them, so every returned
  text carries exactly one label.

  Args:
    Pos_num: maximum number of texts to take from the positive group (label 1).
    Neg_num: maximum number of texts to take from the negative group (label 0).
    Fake_num: maximum number of texts to take from the fake group (label 2).

  Returns:
    (emotion_contents, emotions_label): two lists of equal length. Texts are
    ordered positive, negative, fake. When a group holds fewer texts than
    requested, only the available ones (and their labels) are returned.
  """
  def _texts_flagged(emotion_columns):
    # One entry per row that has any of the given emotion columns set to 1
    # (a row flagged for two emotions of the same group is not duplicated).
    flagged = go_emotion[emotion_columns].eq(1).any(axis=1)
    return go_emotion.loc[flagged, 'text'].tolist()

  emotions_positive = _texts_flagged(pos_emotion)
  emotions_negative = _texts_flagged(neg_emotion)
  emotions_fake = _texts_flagged(fake_emotion)

  # Texts carrying labels from more than one group are ambiguous; the check
  # covers every pair of groups and uses sets for O(1) membership tests.
  positive_set = set(emotions_positive)
  negative_set = set(emotions_negative)
  fake_set = set(emotions_fake)
  ambiguous = (
      (positive_set & negative_set)
      | (positive_set & fake_set)
      | (negative_set & fake_set)
  )

  emotions_positive_new = [t for t in emotions_positive if t not in ambiguous][:Pos_num]
  emotions_negative_new = [t for t in emotions_negative if t not in ambiguous][:Neg_num]
  emotions_fake_new = [t for t in emotions_fake if t not in ambiguous][:Fake_num]

  emotion_contents = emotions_positive_new + emotions_negative_new + emotions_fake_new
  # Size the labels from what was actually selected so texts and labels stay
  # aligned even when a group has fewer examples than requested.
  emotions_label = (
      [1] * len(emotions_positive_new)
      + [0] * len(emotions_negative_new)
      + [2] * len(emotions_fake_new)
  )
  return emotion_contents,emotions_label

In [None]:
emotion_contents,emotions_label=emotion_training_data(2000,1500,200)
# Build the frame in one step; the dict constructor raises if texts and labels
# ever differ in length instead of silently truncating like zip() would.
train_emotion = pd.DataFrame({'Comments': emotion_contents, 'label': emotions_label})
# Shuffle so the classes are interleaved; the fixed seed makes reruns reproducible.
train_emotion_shuffle=train_emotion.sample(frac=1, random_state=42)
train_emotion_shuffle.head()

# Define helper functions

In [None]:
def get_performance_metrics(df_test):
  """Print evaluation metrics for a frame of predictions.

  Reads the `label` (ground truth) and `predictions` attributes of `df_test`
  and prints the confusion matrix, the classification report, the balanced
  accuracy and the plain accuracy. Returns nothing.
  """
  truth = df_test.label
  predicted = df_test.predictions

  # Sections whose result is printed on its own line below a heading.
  for heading, metric_fn in (
      ("Confusion Matrix:", confusion_matrix),
      ("\nClassification Report:", classification_report),
  ):
    print(heading)
    print(metric_fn(truth, predicted))

  # Scalar scores printed on the same line as their caption.
  for caption, score_fn in (
      ("Balanced Accuracy Score:", balanced_accuracy_score),
      ("Accuracy Score:", accuracy_score),
  ):
    print(caption, score_fn(truth, predicted))

def llama_preprocessing_function(examples, tokenizer=None, max_length=None):
    """Tokenize the 'Comments' field of a batch of examples.

    Args:
        examples: mapping with a 'Comments' entry holding the texts.
        tokenizer: callable tokenizer to use. When omitted, the module-level
            `tokenizer` is used (the behaviour existing callers rely on).
        max_length: truncation length. When omitted, the module-level
            `MAX_LEN` is used if defined, otherwise 512.

    Returns:
        Whatever the tokenizer returns for the batch.

    Raises:
        NameError: if no tokenizer is passed and none is defined at module level.
    """
    module_scope = globals()
    if tokenizer is None:
        tokenizer = module_scope.get('tokenizer')
        if tokenizer is None:
            raise NameError(
                "llama_preprocessing_function needs a tokenizer: pass tokenizer=... "
                "or define a module-level `tokenizer`"
            )
    if max_length is None:
        max_length = module_scope.get('MAX_LEN', 512)
    return tokenizer(examples['Comments'], truncation=True, max_length=max_length)

In [None]:
#define which metrics to compute for evaluation
def compute_metrics(eval_pred):
    """Compute evaluation metrics from a (logits, labels) pair.

    Args:
        eval_pred: pair of (model outputs, reference labels); the outputs are
            reduced to class ids with argmax over axis 1.

    Returns:
        dict with 'balanced_accuracy' and 'accuracy'.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    # scikit-learn metrics expect (y_true, y_pred). Balanced accuracy averages
    # recall per *true* class, so swapping the arguments changes its value.
    return {
        'balanced_accuracy': balanced_accuracy_score(labels, predictions),
        'accuracy': accuracy_score(labels, predictions),
    }

In [None]:
#define custom trainer
class CustomTrainer(Trainer):
    """Trainer that optimises cross-entropy on the model's logits.

    When `class_weights` is given, the loss is weighted per class to
    compensate for imbalanced label distributions.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        """Forward all arguments to Trainer and store the optional class weights.

        Args:
            class_weights: optional sequence or tensor with one weight per
                class; stored as a float32 tensor on the training device.
        """
        super().__init__(*args, **kwargs)
        if class_weights is not None:
            # as_tensor accepts lists, arrays and existing tensors alike;
            # torch.tensor() on a tensor emits a copy-construct UserWarning.
            self.class_weights = torch.as_tensor(class_weights, dtype=torch.float32).to(self.args.device)
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the (optionally class-weighted) cross-entropy loss.

        Args:
            model: model to run the forward pass on.
            inputs: batch mapping; its "labels" entry is removed before the
                forward pass so the model does not compute its own loss.
            return_outputs: when True, return (loss, outputs) instead of loss.
            num_items_in_batch: accepted and ignored, so the override also
                works if the caller passes this extra argument.
        """
        # cross_entropy needs integer class ids.
        labels = inputs.pop("labels").long()

        outputs = model(**inputs)
        logits = outputs.get('logits')

        if self.class_weights is not None:
            # Weight and logits must agree on dtype and device.
            weights = self.class_weights.to(device=logits.device, dtype=logits.dtype)
            loss = F.cross_entropy(logits, labels, weight=weights)
        else:
            loss = F.cross_entropy(logits, labels)

        return (loss, outputs) if return_outputs else loss

def make_predictions(model,df_test,tokenizer=None,batch_size=32,max_length=512):
  """Predict a class id for every row of `df_test` and store it in place.

  Args:
    model: callable model returning a mapping with a 'logits' entry.
    df_test: frame with a `Comments` column; a `predictions` column holding
      the argmax class id per row is added to (or overwritten in) this frame.
    tokenizer: tokenizer used to encode the texts. When omitted, the
      module-level `tokenizer` is used (the behaviour existing callers rely on).
    batch_size: number of texts per forward pass (default 32, as before).
    max_length: truncation length passed to the tokenizer (default 512, as before).

  Raises:
    NameError: if no tokenizer is passed and none is defined at module level.
  """
  if tokenizer is None:
    tokenizer = globals().get('tokenizer')
    if tokenizer is None:
      raise NameError(
          "make_predictions needs a tokenizer: pass tokenizer=... "
          "or define a module-level `tokenizer`"
      )

  sentences = df_test.Comments.tolist()

  # Send inputs to the device the model lives on; fall back to the previous
  # "CUDA if available" rule for objects that expose no `device` attribute.
  device = getattr(model, 'device', None)
  if device is None:
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

  # Inference must run in eval mode so dropout layers are disabled.
  if hasattr(model, 'eval'):
    model.eval()

  all_outputs = []
  for start in range(0, len(sentences), batch_size):
    batch_sentences = sentences[start:start + batch_size]
    inputs = tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
      outputs = model(**inputs)
    # Move each batch off the accelerator right away so memory use does not
    # grow with the size of the test set.
    all_outputs.append(outputs['logits'].detach().cpu())

  if all_outputs:
    predicted_ids = torch.cat(all_outputs, dim=0).argmax(dim=1)
  else:
    # torch.cat raises on an empty list; an empty frame gets an empty column.
    predicted_ids = torch.empty(0, dtype=torch.long)
  df_test['predictions']=predicted_ids.numpy()


# Define model & training

In [None]:
# Base checkpoint to fine-tune; the commented line is an alternative checkpoint.
#model_name = "meta-llama/Meta-Llama-3-8B"
model_name = "meta-llama/Llama-2-7b-hf"

In [None]:
def define_model(model_name, num_labels=3, output_dir='emotion_cyberbullying'):
  """Load a 4-bit quantized sequence classifier wrapped with LoRA adapters.

  Args:
    model_name: identifier of the base checkpoint, passed to `from_pretrained`.
    num_labels: size of the classification head (default 3, the value that
      was previously hard-coded).
    output_dir: directory for Trainer checkpoints (default is the path that
      was previously hard-coded). Pass a distinct directory per experiment to
      keep checkpoints of different runs apart.

  Returns:
    (model, tokenizer, training_args): the PEFT-wrapped model, the matching
    tokenizer (padding with the EOS token) and the TrainingArguments.
  """
  quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # NF4 data type for the quantized weights
    bnb_4bit_use_double_quant = True, # also quantize the quantization constants
    bnb_4bit_compute_dtype = torch.bfloat16 # dtype used for computation
  )
  lora_config = LoraConfig(
    r = 16, # the dimension of the low-rank matrices
    lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.05, # dropout probability of the LoRA layers
    bias = 'none', # do not train bias weights
    task_type = 'SEQ_CLS'
  )
  model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=num_labels
  )
  # Preprocess the quantized model for training, then wrap it with the LoRA config.
  model = prepare_model_for_kbit_training(model)
  model = get_peft_model(model, lora_config)

  # Load the tokenizer and reuse the EOS token for padding.
  tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
  tokenizer.pad_token_id = tokenizer.eos_token_id
  tokenizer.pad_token = tokenizer.eos_token

  # Keep the model config in sync with the tokenizer's padding token.
  model.config.pad_token_id = tokenizer.pad_token_id
  model.config.use_cache = False
  model.config.pretraining_tp = 1

  training_args = TrainingArguments(
    output_dir = output_dir,
    learning_rate = 1e-4,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 2,
    weight_decay = 0.01,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True)
  return model, tokenizer, training_args

In [None]:
def training(dataset,train_class,model,tokenizer,training_args,max_length=512):
  """Tokenize `dataset` and fine-tune `model` with class-weighted cross-entropy.

  Args:
    dataset: DatasetDict with 'train' and 'val' splits holding a 'Comments'
      text column.
    train_class: frame whose `label` column gives the training label
      distribution used for the class weights.
    model: model to fine-tune (trained in place).
    tokenizer: tokenizer used for both tokenization and padding.
    training_args: TrainingArguments for the run.
    max_length: truncation length for tokenization (default 512).

  Returns:
    (model, trainer, train_result): the trained model, the trainer that ran
    the job and the value returned by `trainer.train()`.
  """
  # Inverse-frequency class weights ordered by label id.
  label_share = train_class.label.value_counts(normalize=True).sort_index()
  # Cover every class of the classification head: a class absent from the
  # training frame would otherwise make the weight vector too short for
  # cross_entropy. Absent classes get weight 0.
  num_labels = getattr(getattr(model, 'config', None), 'num_labels', None)
  if num_labels is not None and set(label_share.index).issubset(range(num_labels)):
    label_share = label_share.reindex(range(num_labels), fill_value=0.0)
  inverse_share = [1.0 / share if share > 0 else 0.0 for share in label_share.tolist()]
  class_weights = torch.tensor(inverse_share)
  class_weights = class_weights / class_weights.sum()

  def _tokenize(batch):
    # Use the tokenizer passed to this function rather than a module-level one.
    return tokenizer(batch['Comments'], truncation=True, max_length=max_length)

  tokenized_datasets = dataset.map(_tokenize, batched=True)
  tokenized_datasets.set_format("torch")
  collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

  trainer = CustomTrainer(
      model = model,
      args = training_args,
      train_dataset = tokenized_datasets['train'],
      eval_dataset = tokenized_datasets['val'],
      tokenizer = tokenizer,
      data_collator = collate_fn,
      compute_metrics = compute_metrics,
      # Plain list: the trainer builds its own tensor on the training device.
      class_weights = class_weights.tolist(),
  )
  train_result = trainer.train()

  return model, trainer,train_result


# Baseline

In [None]:
df_train=train
# NOTE(review): validation reuses the first 10 training rows, so the
# per-epoch evaluation is not an independent estimate - confirm this is intended.
df_value=train[:10]
# Predictions are written into df_test in place; work on a copy so the
# shared `test` frame stays untouched for the other experiments.
df_test=test.copy()

dataset_train = Dataset.from_pandas(df_train)
dataset_val = Dataset.from_pandas(df_value)
dataset_test = Dataset.from_pandas(test)

dataset = DatasetDict({
    'train': dataset_train,
    'val': dataset_val,
    'test': dataset_test
  })

model_new,tokenizer_new, training_args_new=define_model(model_name)
# The tokenization/prediction helpers look up `tokenizer` and `MAX_LEN` at
# module level, so publish them here before the helpers are called.
tokenizer = tokenizer_new
MAX_LEN = 512
model_train,trainer,_=training(dataset,df_train,model_new,tokenizer_new, training_args_new)
make_predictions(model_train,df_test)
get_performance_metrics(df_test)

# ZSC

In [None]:

df_train=train_emotion_shuffle
# NOTE(review): validation reuses the first 10 training rows - confirm this is intended.
df_value=train_emotion_shuffle[:10]
# Predictions are written into df_test in place; work on a copy so the
# shared `test` frame stays untouched for the other experiments.
df_test=test.copy()

dataset_train = Dataset.from_pandas(df_train)
dataset_val = Dataset.from_pandas(df_value)
dataset_test = Dataset.from_pandas(test)

dataset = DatasetDict({
    'train': dataset_train,
    'val': dataset_val,
    'test': dataset_test
  })

model_new,tokenizer_new, training_args_new=define_model(model_name)
# The tokenization/prediction helpers look up `tokenizer` and `MAX_LEN` at
# module level, so publish them here before the helpers are called.
tokenizer = tokenizer_new
MAX_LEN = 512
model_train_zsc,trainer,_=training(dataset,df_train,model_new,tokenizer_new, training_args_new)
make_predictions(model=model_train_zsc,df_test=df_test)
get_performance_metrics(df_test)

# FSC - fine-tuning based on the ZSC model

In [None]:
df_train=train
# NOTE(review): validation reuses the first 10 training rows - confirm this is intended.
df_value=train[:10]
# Predictions are written into df_test in place; work on a copy so the
# shared `test` frame stays untouched.
df_test=test.copy()

dataset_train = Dataset.from_pandas(df_train)
dataset_val = Dataset.from_pandas(df_value)
dataset_test = Dataset.from_pandas(test)


dataset = DatasetDict({
    'train': dataset_train,
    'val': dataset_val,
    'test': dataset_test
  })

# Continue training the ZSC model on the target data. Keep the training
# output in `train_result` so it is available when the run is saved.
model_train_FCS,trainer_FCS,train_result=training(dataset=dataset,train_class=df_train,model=model_train_zsc,tokenizer=tokenizer_new,training_args=training_args_new)
make_predictions(model=model_train_FCS,df_test=df_test)
get_performance_metrics(df_test)

## Saving the model

In [None]:
# Persist the final fine-tuning stage, which was driven by `trainer_FCS`.
# The end-of-training summary is read from the trainer's own log history so
# this cell does not depend on a `train_result` variable being in scope.
train_summaries = [entry for entry in trainer_FCS.state.log_history if "train_runtime" in entry]
metrics = dict(train_summaries[-1]) if train_summaries else {}
metrics["train_samples"] = len(dataset_train)
trainer_FCS.log_metrics("train", metrics)
trainer_FCS.save_metrics("train", metrics)
trainer_FCS.save_state()
trainer_FCS.save_model("saved_model")