In [None]:
# Install PyTorch (version pinned) together with TensorBoard (unpinned).
%pip install "torch==2.2.2" tensorboard

# Install the Hugging Face libraries and bitsandbytes; every version is pinned.
%pip install  --upgrade "transformers==4.40.0" "datasets==2.18.0" "accelerate==0.29.3" "evaluate==0.4.1" "bitsandbytes==0.43.1" "huggingface_hub==0.22.2" "trl==0.8.6" "peft==0.10.0"


In [None]:
import os
import random
import functools
import csv
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import evaluate

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score

from datasets import Dataset, DatasetDict, load_dataset
import json
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from huggingface_hub import login
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

In [None]:
# Authenticate against the Hugging Face Hub (huggingface_hub.login).
# NOTE(review): presumably required because the checkpoint loaded later is gated — confirm.
login()

In [None]:

# Load the raw dataset from the Hugging Face Hub
dataset = load_dataset("DonManiek/ICFraud")
print(dataset)

# Check if the dataset is loaded correctly
print(dataset['train'][0])

# Create the label <-> integer mappings from the *sorted* unique labels.
# Enumerating a bare set() of strings follows hash order, which changes
# between kernel restarts, so class ids would differ from run to run.
# Sorted order is also the order pandas uses for inferred categorical codes.
unique_labels = sorted(set(dataset['train']['label']))
label_to_int = {label: i for i, label in enumerate(unique_labels)}
int_to_label = {v: k for k, v in label_to_int.items()}

def encode_labels(example):
    """Add an integer `target` column holding the class id of `example['label']`."""
    example['target'] = label_to_int[example['label']]
    return example

# Apply encoding function to the train dataset
encoded_dataset = dataset.map(encode_labels)
print(encoded_dataset)

# Split the encoded dataset into train, validation, and test sets
# First, split into train (60%) and temp (40%)
train_temp_split = encoded_dataset['train'].train_test_split(test_size=0.4, seed=42)

# Further split the temp set into validation (50% of temp) and test (50% of temp), which results in 20% each of the original dataset
temp_split = train_temp_split['test'].train_test_split(test_size=0.5, seed=42)

train_data = train_temp_split['train']
val_data = temp_split['train']
test_data = temp_split['test']

# Create a DatasetDict with the train, validation, and test datasets
dataset_dict = DatasetDict({
    'train': train_data,
    'validation': val_data,
    'test': test_data
})

# Print first few examples of train, validation, and test sets to verify
print(json.dumps(dataset_dict['train'][:5], indent=4))
print(json.dumps(dataset_dict['validation'][:5], indent=4))
print(json.dumps(dataset_dict['test'][:5], indent=4))

train_targets = dataset_dict['train']['target']

# Inverse-frequency class weights, normalised to sum to 1.
# value_counts() returns rows ordered by frequency, not by class id, so
# sort_index() is needed to make position i hold the weight of class i,
# which is the layout F.cross_entropy(weight=...) expects.
class_counts = pd.Series(train_targets).value_counts().sort_index()
class_weights = 1.0 / class_counts
class_weights = class_weights / class_weights.sum()

class_weights_tensor = torch.tensor(class_weights.values, dtype=torch.float)

print("Class weights:", class_weights)
print("Class weights tensor:", class_weights_tensor)

In [None]:
# Display the label -> class-id mapping.
label_to_int

In [None]:
# Hub id of the base checkpoint; used for both the model and the tokenizer.
model_name = "meta-llama/Meta-Llama-3-8B"

In [None]:
# bitsandbytes settings handed to from_pretrained when the model is loaded.
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant = True, # nested quantization: the quantization constants are quantized as well
    bnb_4bit_compute_dtype = torch.bfloat16 # dtype used for computation
)

In [None]:
# Shuffle the training split with a fixed seed.
dataset_trained_shuffled = train_data.shuffle(seed=42)

In [None]:
# NOTE: this rebinds `dataset`. From here on it is the three-way split
# (shuffled train, validation, test), not the object returned by load_dataset.
dataset = DatasetDict({
    'train': dataset_trained_shuffled,
    'validation': val_data,
    'test': test_data
})
dataset

In [None]:
# Inspect the training split.
dataset['train']

In [None]:
# LoRA adapter configuration for a sequence-classification task; adapters are
# attached to the q/k/v/o projection modules listed in target_modules.
lora_config = LoraConfig(
    r = 16, # the dimension of the low-rank matrices
    lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.05, # dropout probability of the LoRA layers
    bias = 'none', # whether to train bias weights; 'none' trains no bias terms
    task_type = 'SEQ_CLS'
)

In [None]:
# Load the base checkpoint with a sequence-classification head, quantized
# according to quantization_config. The head size is taken from the label
# mapping instead of a hard-coded 4 so it always matches the number of classes
# the targets were encoded with.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=len(label_to_int)
)

model

In [None]:
# Run PEFT's k-bit training preparation on the quantized model.
# NOTE: `model` is rebound to the result, so re-running this cell applies the
# preparation to the already-prepared model.
model = prepare_model_for_kbit_training(model)
model

In [None]:
# Wrap the model with the adapters described by lora_config.
# NOTE: `model` is rebound here as well; re-running wraps the wrapped model again.
model = get_peft_model(model, lora_config)
model

In [None]:
# Convert each split to a pandas DataFrame for the manual inference and metric cells.
df_train = dataset['train'].to_pandas() # TODO(review): the author flagged this conversion as uncertain — confirm it is needed
df_test = dataset['test'].to_pandas()
df_val = dataset['validation'].to_pandas()

In [None]:
# Encode all three splits against ONE shared category list ordered by the
# class ids in int_to_label, i.e. the ids stored in the `target` column the
# model is trained on. Letting each DataFrame infer its own categories yields
# alphabetical codes per split, which need not agree with int_to_label, nor
# with each other when a split happens to lack a class.
label_categories = [int_to_label[class_id] for class_id in sorted(int_to_label)]

for split_df in (df_train, df_test, df_val):
    split_df['label'] = pd.Categorical(split_df['label'], categories=label_categories)
    split_df['target'] = split_df['label'].cat.codes

In [None]:
# Code -> label mapping taken from the categories of df_train['label'].
# NOTE: `category_map` is rebound by each of the next two cells; the binding
# from df_val (the last one) is what later cells use to decode predictions.
category_map = {code: category for code, category in enumerate(df_train['label'].cat.categories)}
category_map

In [None]:
# Same mapping, rebuilt from the categories of df_test['label'].
category_map = {code: category for code, category in enumerate(df_test['label'].cat.categories)}
category_map

In [None]:
# Same mapping, rebuilt from the categories of df_val['label'].
# NOTE(review): the three maps agree only if all splits carry the same categories — confirm.
category_map = {code: category for code, category in enumerate(df_val['label'].cat.categories)}
category_map

In [None]:
# Load the tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably because this tokenizer ships without a pad token — confirm.
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Keep the model config in sync with the tokenizer's padding id and switch off use_cache.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False
model.config.pretraining_tp = 1

In [None]:
# Preview the first two test texts.
sentences = df_test.text.tolist()
sentences[0:2]

In [None]:
# Baseline pass: run the test texts through the model before any fine-tuning
# and collect the logits batch by batch.
sentences = df_test.text.tolist()

# Number of texts per forward pass; lower it if memory is tight
batch_size = 16

# One logits tensor per batch is appended here
all_outputs = []

# Inputs go to the GPU when one is available, otherwise they stay on the CPU
target_device = 'cuda' if torch.cuda.is_available() else 'cpu'

for start in range(0, len(sentences), batch_size):
    chunk = sentences[start:start + batch_size]

    # Pad to the longest text in the chunk and cut anything beyond 512 tokens
    encoded = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
    encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}

    # Gradients are not needed for inference
    with torch.no_grad():
        all_outputs.append(model(**encoded)['logits'])

In [None]:
# Concatenate the per-batch logits along the batch dimension.
final_outputs = torch.cat(all_outputs, dim=0)
final_outputs

In [None]:
# Predicted class id per row: the index of the largest logit.
final_outputs.argmax(axis=1)

In [None]:
# Store the predicted class ids on the test frame.
df_test['predictions']=final_outputs.argmax(axis=1).cpu().numpy()
df_test['predictions']

In [None]:
# How often each class id was predicted.
df_test['predictions'].value_counts()

In [None]:
# Decode class ids to label strings. The ids are recomputed from final_outputs
# instead of being read back from the column, so this cell can be re-run:
# mapping the column in place would raise KeyError the second time because the
# column would already hold label strings rather than ids.
predicted_ids = final_outputs.argmax(axis=1).cpu().numpy().tolist()
df_test['predictions'] = [category_map[class_id] for class_id in predicted_ids]
df_test['predictions']

In [None]:
def get_performance_metrics(df_test):
  """Print evaluation metrics for the predictions stored on a DataFrame.

  Reads the true labels from `df_test.label` and the predicted labels from
  `df_test.predictions`, then prints the confusion matrix, the classification
  report, the balanced accuracy and the plain accuracy. Returns nothing.
  """
  truth, predicted = df_test.label, df_test.predictions

  print("Confusion Matrix:")
  print(confusion_matrix(truth, predicted))

  print("\nClassification Report:")
  print(classification_report(truth, predicted))

  # Both scalar scores share the same call shape, so print them in one loop.
  scalar_scores = (
    ("Balanced Accuracy Score:", balanced_accuracy_score),
    ("Accuracy Score:", accuracy_score),
  )
  for title, scorer in scalar_scores:
    print(title, scorer(truth, predicted))

In [None]:
# Baseline metrics: these predictions were produced before trainer.train() runs.
get_performance_metrics(df_test)

In [None]:
MAX_LEN = 512
col_to_delete = ['label', 'text']

def llama_preprocessing_function(examples):
    """Tokenize a batch of examples for the sequence classifier.

    Args:
        examples: batch dict from `Dataset.map(batched=True)`; only the
            'text' column is read.

    Returns:
        dict of plain Python lists ('input_ids', 'attention_mask'), one entry
        per example, truncated to MAX_LEN tokens and NOT padded.
    """
    # None and whitespace-only texts become "" so the tokenizer always gets a str.
    texts = [text if text is not None and text.strip() else "" for text in examples['text']]

    # No padding here on purpose: the DataCollatorWithPadding given to the
    # trainer pads every batch to its own longest sequence. Padding each
    # example to MAX_LEN up front would make every batch MAX_LEN wide and turn
    # that collator into a no-op. The tokenizer already returns lists, so no
    # tensor round trip is needed either.
    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=MAX_LEN,
        return_attention_mask=True,
    )

    return dict(encoded)

tokenized_datasets = dataset.map(llama_preprocessing_function, batched=True, remove_columns=col_to_delete)
# The integer class id becomes the column the collator/trainer treat as the label.
tokenized_datasets = tokenized_datasets.rename_column("target", "label")
tokenized_datasets.set_format("torch")

In [None]:
# Batch collator built on the tokenizer; passed to the trainer as data_collator.
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def compute_metrics(eval_pred):
    """Compute evaluation metrics for the Trainer.

    Args:
        eval_pred: pair `(logits, labels)`; logits are reduced to class ids
            with an argmax over axis 1.

    Returns:
        dict with 'balanced_accuracy' and 'accuracy'.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)
    # scikit-learn metrics take (y_true, y_pred). Balanced accuracy averages
    # recall per *true* class, so swapping the arguments changes the result.
    return {
        'balanced_accuracy': balanced_accuracy_score(labels, predicted),
        'accuracy': accuracy_score(labels, predicted),
    }

In [None]:
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy over the logits, optionally class-weighted.

    Args:
        *args, **kwargs: forwarded unchanged to `Trainer`.
        class_weights: optional per-class weights, one per class id. Accepts
            a torch tensor, a sequence / NumPy array already ordered by class
            id, or a pandas Series indexed by class id (its row order does not
            matter; it is sorted by index). `None` gives an unweighted loss.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        if class_weights is None:
            self.class_weights = None
            return

        if isinstance(class_weights, pd.Series):
            # A Series derived from value_counts() is ordered by frequency;
            # sort by its index so position i is the weight of class i.
            class_weights = class_weights.sort_index().to_numpy()

        if isinstance(class_weights, torch.Tensor):
            # Copy instead of re-wrapping with torch.tensor(), which warns on tensors.
            weights = class_weights.detach().clone().to(torch.float32)
        else:
            weights = torch.as_tensor(np.asarray(class_weights), dtype=torch.float32)

        self.class_weights = weights.to(self.args.device)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the (weighted) cross-entropy loss, plus the outputs if requested.

        Extra keyword arguments are accepted and ignored so that Trainer
        versions which pass additional arguments to compute_loss keep working.
        """
        # cross_entropy needs integer class ids.
        labels = inputs.pop("labels").long()

        outputs = model(**inputs)
        logits = outputs.get('logits')

        weight = None
        if self.class_weights is not None:
            # Match the logits' device and dtype; cross_entropy rejects mismatches.
            weight = self.class_weights.to(device=logits.device, dtype=logits.dtype)

        loss = F.cross_entropy(logits, labels.to(logits.device), weight=weight)

        return (loss, outputs) if return_outputs else loss

In [None]:
# Hyperparameters for the fine-tuning run.
# NOTE(review): the output directory is called 'sentiment_classification' although
# the data comes from DonManiek/ICFraud; a later cell copies this directory by
# name, so rename both together if it is changed.
training_args = TrainingArguments(
    output_dir = 'sentiment_classification',
    learning_rate = 1e-4,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 2,
    weight_decay = 0.01,
    evaluation_strategy = 'epoch', # evaluate once per epoch
    save_strategy = 'epoch', # checkpoint on the same schedule as evaluation
    load_best_model_at_end = True # reload the best checkpoint when training ends
)

In [None]:
# Build the trainer with the class-weighted loss.
trainer = CustomTrainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['validation'],
    tokenizer = tokenizer,
    data_collator = collate_fn,
    compute_metrics = compute_metrics,
    # Pass a plain array ordered by class id. `class_weights` is a pandas
    # Series built from value_counts(), whose rows are in frequency order, so
    # sort by index rather than rely on an implicit Series -> tensor conversion.
    class_weights=class_weights.sort_index().to_numpy(),
)

In [None]:
# Run fine-tuning; the returned object's `.metrics` is read by a later cell.
train_result = trainer.train()

In [None]:
def make_predictions(model, df_test, batch_size=16, max_length=512):
  """Predict a label for every row of `df_test` and store it in `df_test['predictions']`.

  Uses the notebook-level `tokenizer` to encode `df_test.text` and the
  notebook-level `category_map` to turn predicted class ids into labels.

  Args:
    model: classifier whose output exposes `['logits']`.
    df_test: DataFrame with a `text` column; modified in place.
    batch_size: number of texts per forward pass.
    max_length: texts are truncated to this many tokens.

  Returns:
    None. The result is written to `df_test['predictions']`.
  """
  sentences = df_test.text.tolist()

  # torch.cat() fails on an empty list, so handle an empty frame up front.
  if not sentences:
    df_test['predictions'] = []
    return

  # Send inputs to wherever the model's weights live instead of guessing 'cuda'.
  device = next(model.parameters()).device

  # Inference must not use dropout; restore the previous mode afterwards.
  was_training = model.training
  model.eval()

  all_outputs = []
  try:
    with torch.no_grad():
      for start in range(0, len(sentences), batch_size):
        batch_sentences = sentences[start:start + batch_size]

        # Pad to the longest text in the batch, truncate to max_length tokens.
        inputs = tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Keep only the logits, on the CPU, so batches do not pile up on the accelerator.
        all_outputs.append(model(**inputs)['logits'].cpu())
  finally:
    model.train(was_training)

  predicted_ids = torch.cat(all_outputs, dim=0).argmax(dim=1).tolist()
  df_test['predictions'] = [category_map[class_id] for class_id in predicted_ids]


make_predictions(model,df_test)

In [None]:
# Metrics for the predictions written by make_predictions after training.
get_performance_metrics(df_test)

In [None]:
# Add the training-set size to the metrics returned by trainer.train(), then
# log them and persist metrics and trainer state.
dataset_train = dataset['train']
metrics = train_result.metrics
max_train_samples = len(dataset_train)
# Both arguments equal len(dataset_train), so this min() is just the training-set size.
metrics["train_samples"] = min(max_train_samples, len(dataset_train))
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

In [None]:
# Save the trainer's model under ./saved_model.
trainer.save_model("saved_model")

In [None]:
# Colab-only: mount Google Drive so artifacts can be copied out of the runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Copy the training output directory (TrainingArguments.output_dir) to Drive.
!cp -r sentiment_classification /content/drive/MyDrive/

In [None]:
!cp -r sentiment_classification /content/drive/MyDrive/