In [0]:
#!pip install -r requirements.txt

In [0]:
import os

# Process-wide settings, applied in one go. This cell runs before the cell
# that imports torch / transformers, so the values are already present when
# those libraries are loaded.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "TOKENIZERS_PARALLELISM": "false",
        "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:256",
    }
)

In [0]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split
from llm_main.feature_engineer import DataFrameSplitter
from llm_main.prompt_engineer import PromptGenerator
from llm_main.model_base import QuantizedBaseModelInitializer
from llm_main.predict import ModelPredictor
from llm_main.evaluation import ModelEvaluator
from llm_main.train_with_fine_tuning import PEFTModelTrainer
from llm_main.model_fine_tune import ModelReloader
from datasets import load_dataset
import gc

## 1 Model training and testing pipeline

In [0]:
def clear_gpu():
    """Free unreferenced Python objects and, when CUDA is usable, cached GPU memory.

    The CUDA calls are skipped on hosts without a usable CUDA device, so the
    helper can be called unconditionally (e.g. on a CPU-only machine) without
    raising.

    Returns:
        None.
    """
    # Collect first: objects that are only kept alive by reference cycles
    # must be destroyed before their GPU blocks can be handed back.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

def Preprocess_data(data_type, local=True, label_col="label", text_col="text", target="sentiment",version=None,split="train", train_size=300):
    """Load a dataset, split it into train/test/eval frames and save them as CSV.

    The three frames are written to ``data/<data_type>/X_train_df.csv``,
    ``X_test_df.csv`` and ``X_eval_df.csv`` (without the index).

    Args:
        data_type: Dataset identifier. Used as the sub-folder below ``data/``
            and, when ``local`` is False, as the name given to ``load_dataset``.
        local: If True, read ``data/<data_type>/all_data.csv`` and use the
            frame as-is; ``label_col``, ``text_col`` and ``target`` are not
            applied in this mode, so the file is expected to already contain
            the target column and a ``text`` column. If False, fetch the
            dataset with ``load_dataset``.
        label_col: Name of the label column in the fetched dataset.
        text_col: Name of the text column in the fetched dataset.
        target: Name given to the label column in the saved frames.
        version: Optional second positional argument for ``load_dataset``
            (presumably the dataset configuration name - confirm against the
            ``datasets`` documentation).
        split: Split requested from ``load_dataset``.
        train_size: Forwarded to ``DataFrameSplitter``; defaults to 300, the
            value that used to be hard-coded.
    """
    # Every dataset ends up with 2 columns: the target column and "text".
    folder_path = 'data/' + data_type
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder_path, exist_ok=True)

    if local:
        # sentiment of news: https://www.kaggle.com/code/lucamassaron/fine-tune-llama-2-for-sentiment-analysis/input
        file_path = folder_path + '/all_data.csv'
        df = pd.read_csv(file_path, encoding="utf-8", encoding_errors="replace")
    else:
        if version is None:
            dataset = load_dataset(data_type, split=split)
        else:
            dataset = load_dataset(data_type, version, split=split)

        # Keep only label and text (in that order) and give them the
        # canonical names used by the rest of the pipeline.
        df = dataset.to_pandas()[[label_col, text_col]]
        df.columns = [target, "text"]

    splitter = DataFrameSplitter(df, train_size=train_size)

    # Perform the split to obtain train, test, and eval sets
    X_train_df, X_test_df, X_eval_df = splitter.split()
    for file_name, frame in (
        ("/X_train_df.csv", X_train_df),
        ("/X_test_df.csv", X_test_df),
        ("/X_eval_df.csv", X_eval_df),
    ):
        frame.to_csv(folder_path + file_name, index=False)

def read_dataset(data_type):
    """Load the saved train, test and eval splits of a dataset.

    Args:
        data_type: Dataset identifier; the CSV files are read from
            ``data/<data_type>/``.

    Returns:
        Tuple ``(X_train_df, X_test_df, X_eval_df)`` of DataFrames.
    """
    base_dir = 'data/' + data_type
    split_files = ("/X_train_df.csv", "/X_test_df.csv", "/X_eval_df.csv")
    train_frame, test_frame, eval_frame = (
        pd.read_csv(base_dir + suffix) for suffix in split_files
    )
    return train_frame, test_frame, eval_frame

def training_prompt(X_train_df, X_test_df, X_eval_df):
    """Build prompt frames for the train, test and eval splits.

    The target name is taken from the first column of ``X_train_df``. The
    train and eval splits are generated with ``prompt_type='train'``, the test
    split with ``prompt_type='test'``.

    Returns:
        Tuple ``(train prompts, test prompts, eval prompts)`` - note that the
        test prompts come second.
    """
    generator = PromptGenerator(X_train_df.columns[0])
    build_prompts = generator.generate_dataframe_prompts

    # Same generation order as before: train, eval, then test.
    train_prompts = build_prompts(X_train_df, prompt_type='train')
    eval_prompts = build_prompts(X_eval_df, prompt_type='train')
    test_prompts = build_prompts(X_test_df, prompt_type='test')
    return train_prompts, test_prompts, eval_prompts
  

def fine_tune(base_model_name, data, X_train_prompt, X_eval_prompt,target_modules="all-linear"):
    """Fine-tune a base model on the training prompts and return the trainer.

    Args:
        base_model_name: Name of the base model handed to
            ``QuantizedBaseModelInitializer``.
        data: Dataset identifier; the model directory is
            ``trained_model/<data>/<base_model_name>``.
        X_train_prompt: DataFrame of training prompts.
        X_eval_prompt: DataFrame of evaluation prompts.
        target_modules: Forwarded to ``PEFTModelTrainer``.

    Returns:
        The ``PEFTModelTrainer`` instance after ``train_model()`` has run.
    """
    clear_gpu()

    # Base model and tokenizer come from the project's quantized initializer.
    base_model, tokenizer = QuantizedBaseModelInitializer(base_model_name).initialize()

    train_dataset = Dataset.from_pandas(X_train_prompt)
    eval_dataset = Dataset.from_pandas(X_eval_prompt)

    output_dir = "/".join(("trained_model", data, base_model_name))
    os.makedirs(output_dir, exist_ok=True)

    peft_trainer = PEFTModelTrainer(
        base_model,
        tokenizer,
        train_dataset,
        eval_dataset,
        model_name=output_dir,
        target_modules=target_modules,
    )
    # Start the training process
    peft_trainer.train_model()
    print("Model training is completed")

    # Only unbinds this function's local names; the caller's objects (and
    # whatever the trainer keeps a reference to) are unaffected.
    del base_model, X_train_prompt
    return peft_trainer

def load_model(base_model_name, data):
    """Reload a previously trained model and its tokenizer.

    Args:
        base_model_name: Name of the base model.
        data: Dataset identifier; the trained model is looked up under
            ``trained_model/<data>/<base_model_name>``.

    Returns:
        Tuple ``(model, tokenizer)`` as produced by ``ModelReloader.reload()``.
    """
    clear_gpu()
    checkpoint_dir = "/".join(("trained_model", data, base_model_name))
    reloaded_model, reloaded_tokenizer = ModelReloader(base_model_name, checkpoint_dir).reload()
    return reloaded_model, reloaded_tokenizer

def evaluate_model(model, tokenizer, base_model_name, data,X_test_prompt,y_true):
    """Predict on the test prompts, evaluate, and save per-sample error flags.

    A one-column CSV is written to
    ``result/<data>/<base_model_name>/errors.csv``; the column is named
    ``<data>/<base_model_name>`` and holds 1 where the evaluated true and
    predicted labels differ and 0 where they agree.

    Args:
        model: Model handed to ``ModelPredictor``.
        tokenizer: Tokenizer handed to ``ModelPredictor``.
        base_model_name: Name of the base model (used for the result path).
        data: Dataset identifier (used for the result path).
        X_test_prompt: Test prompts passed to ``ModelPredictor.predict``.
        y_true: Ground-truth labels; its distinct values form the label set.

    Returns:
        The predictions returned by ``ModelPredictor.predict``.

    Raises:
        ValueError: If the evaluated true and predicted labels do not have
            the same shape.
    """
    clear_gpu()
    labels = set(y_true)
    predictor = ModelPredictor(model, tokenizer, labels)
    y_pred = predictor.predict(X_test_prompt)

    y_true_label, y_pred_label = ModelEvaluator().evaluate(y_true, y_pred)
    print("Model evaluation is completed")

    # Compare position by position on plain arrays so the result does not
    # depend on the container type (list, ndarray, Series) or on Series
    # index alignment.
    true_arr = np.asarray(y_true_label)
    pred_arr = np.asarray(y_pred_label)
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            "true and predicted labels differ in shape: "
            + str(true_arr.shape) + " vs " + str(pred_arr.shape)
        )
    errors = (true_arr != pred_arr).astype(int)

    method_name = data + "/" + base_model_name
    result_dir = "result/" + method_name
    os.makedirs(result_dir, exist_ok=True)
    pd.DataFrame({method_name: errors}).to_csv(result_dir + "/errors.csv", index=False)
    return y_pred


# def load_test_loss(data_type, version=0):
#     test_error=pd.read_csv( "test_loss/sklearn_models_"+ data_type +"_"+ str(version)+ ".csv")
#     test_error_nn=pd.read_csv( "test_loss/nn_models_"+ data_type + "_"+ str(version)+".csv")
#     return pd.concat([test_error, test_error_nn], axis=1)

## Preprocess data

In [0]:
# target="review_sentiment"
# data="fancyzhx/amazon_polarity"
# clear_gpu()
#Preprocess_data(data,local=False, label_col="label", text_col="content", target=target)

In [0]:
# Dataset selection: financial phrasebank. `target` is the name the label
# column carries in the saved splits. A later selection cell overwrites both
# names when the notebook is run top to bottom.
data = "takala/financial_phrasebank"
target = "sentiment"
# One-off download + split (uncomment to regenerate the CSVs under data/):
# Preprocess_data(data, local=False, label_col="label", text_col="sentence", target=target, version='sentences_50agree')

In [0]:
# Dataset selection: twitter financial news sentiment. A later selection cell
# overwrites both names when the notebook is run top to bottom.
data = "zeroshot/twitter-financial-news-sentiment"
target = "finance_sentiment"
# One-off download + split (uncomment to regenerate the CSVs under data/):
# Preprocess_data(data, local=False, label_col="label", text_col="text", target=target)

In [0]:
# Dataset selection: emotion-balanced. A later selection cell overwrites both
# names when the notebook is run top to bottom.
data = "AdamCodd/emotion-balanced"
target = "emotion6"
# One-off download + split (uncomment to regenerate the CSVs under data/):
# Preprocess_data(data, local=False, label_col="label", text_col="text", target=target)

In [0]:
# Dataset selection: many_emotions. A later selection cell overwrites both
# names when the notebook is run top to bottom.
data = "ma2za/many_emotions"
target = "emotion7"
# One-off download + split (uncomment to regenerate the CSVs under data/):
# Preprocess_data(data, local=False, label_col="label", text_col="text", target=target, version="raw", split="en")

In [0]:
# Dataset selection: AG news. This is the last selection cell, so these are
# the values in effect when the notebook is run top to bottom.
data = "fancyzhx/ag_news"
target = "news_class"
# One-off download + split (uncomment to regenerate the CSVs under data/):
# Preprocess_data(data, local=False, label_col="label", text_col="text", target=target)

In [0]:
# Load the saved splits of the currently selected dataset.
X_train_df, X_test_df, X_eval_df = read_dataset(data)

# Ground-truth test labels, normalised to strings.
y_true = X_test_df[target].astype(str)

# Prompt frames for training, testing and evaluation.
X_train_prompt, X_test_prompt, X_eval_prompt = training_prompt(
    X_train_df, X_test_df, X_eval_df
)

## Read dataset and modeling

In [0]:
# Sanity check: display one generated training prompt (the "text" entry keyed 101).
X_train_prompt["text"][101]

In [0]:
# #test LLM for a single prompt
# clear_gpu()
# base_model_name= "NousResearch/Llama-2-7b-hf"
# base_model=AutoModelForCausalLM.from_pretrained(
#             base_model_name,
#             torch_dtype='auto',  # Set torch dtype to 'auto' for automatic handling
#             device_map="auto",  # Automatic device mapping for optimal placement
#             # quantization_config=bnb_config,  # Uncomment and adjust if quantization is needed
#         )
# tokenizer = AutoTokenizer.from_pretrained(
#             base_model_name,
#             trust_remote_code=True,  # Enable loading custom/remote tokenizers
#         )
# tokenizer.pad_token = tokenizer.eos_token  # Set pad token
# tokenizer.padding_side = "right" 
# pipe=pipeline(task="text-generation", 
#                              model=base_model, 
#                              tokenizer=tokenizer, 
#                              max_new_tokens=6,  # Number of tokens to generate
#                              temperature=0.001   # Sampling temperature
#                             )
# prompt=X_test_prompt["text"][277]
# result = pipe(prompt)
# answer = result[0]['generated_text'].split("] =")[1].strip()
# answer

In [0]:
# Train model
# Drop any trainer left over from an earlier run of this cell *before*
# clearing the GPU: while it is still referenced, the model it holds cannot
# be garbage-collected.
trainer = None
clear_gpu()
base_model_name = "NousResearch/Llama-2-7b-hf"
trainer = fine_tune(base_model_name, data, X_train_prompt, X_eval_prompt)
# Evaluate model on test set
y_pred = evaluate_model(trainer.model, trainer.tokenizer, base_model_name, data, X_test_prompt, y_true)

In [0]:
# Train model
# Release the previous trainer first, then clear the GPU. Rebinding to None
# (instead of `del`) keeps the cell re-runnable: it does not raise NameError
# when `trainer` is not defined.
trainer = None
clear_gpu()
base_model_name = "mistralai/Mistral-7B-v0.3"
trainer = fine_tune(base_model_name, data, X_train_prompt, X_eval_prompt)
# Evaluate model on test set
y_pred = evaluate_model(trainer.model, trainer.tokenizer, base_model_name, data, X_test_prompt, y_true)

In [0]:
# Train model
# Release the previous trainer first, then clear the GPU. Rebinding to None
# (instead of `del`) keeps the cell re-runnable: it does not raise NameError
# when `trainer` is not defined.
trainer = None
clear_gpu()
base_model_name = "bigscience/bloom-7b1"
trainer = fine_tune(base_model_name, data, X_train_prompt, X_eval_prompt)
# Evaluate model on test set
y_pred = evaluate_model(trainer.model, trainer.tokenizer, base_model_name, data, X_test_prompt, y_true)

In [0]:
# Train model
# Release the previous trainer first, then clear the GPU. Rebinding to None
# (instead of `del`) keeps the cell re-runnable: it does not raise NameError
# when `trainer` is not defined.
trainer = None
clear_gpu()
base_model_name = "tiiuae/falcon-7b"
trainer = fine_tune(base_model_name, data, X_train_prompt, X_eval_prompt)
# Evaluate model on test set
y_pred = evaluate_model(trainer.model, trainer.tokenizer, base_model_name, data, X_test_prompt, y_true)

In [0]:
# Train model
# Release the previous trainer first, then clear the GPU. Rebinding to None
# (instead of `del`) keeps the cell re-runnable: it does not raise NameError
# when `trainer` is not defined.
trainer = None
clear_gpu()
base_model_name = "meta-llama/Meta-Llama-3-8B"
trainer = fine_tune(base_model_name, data, X_train_prompt, X_eval_prompt)
# Evaluate model on test set
y_pred = evaluate_model(trainer.model, trainer.tokenizer, base_model_name, data, X_test_prompt, y_true)

In [0]:
# Train model
# Release the previous trainer first, then clear the GPU. Rebinding to None
# (instead of `del`) keeps the cell re-runnable: it does not raise NameError
# when `trainer` is not defined.
trainer = None
clear_gpu()
base_model_name = "CohereForAI/aya-23-8B"
trainer = fine_tune(base_model_name, data, X_train_prompt, X_eval_prompt)
# Evaluate model on test set
y_pred = evaluate_model(trainer.model, trainer.tokenizer, base_model_name, data, X_test_prompt, y_true)

In [0]:
# Train model
# Release the previous trainer first, then clear the GPU. Rebinding to None
# (instead of `del`) keeps the cell re-runnable: it does not raise NameError
# when `trainer` is not defined.
trainer = None
clear_gpu()
base_model_name = "Qwen/Qwen2-7B"
trainer = fine_tune(base_model_name, data, X_train_prompt, X_eval_prompt)
# Evaluate model on test set
y_pred = evaluate_model(trainer.model, trainer.tokenizer, base_model_name, data, X_test_prompt, y_true)

In [0]:
# Train model
# Release the previous trainer first, then clear the GPU. Rebinding to None
# (instead of `del`) keeps the cell re-runnable: it does not raise NameError
# when `trainer` is not defined.
trainer = None
clear_gpu()
base_model_name = "microsoft/phi-2"
trainer = fine_tune(base_model_name, data, X_train_prompt, X_eval_prompt)
# Evaluate model on test set
y_pred = evaluate_model(trainer.model, trainer.tokenizer, base_model_name, data, X_test_prompt, y_true)