### Installing dependencies

In [None]:
# Notebook dependencies, installed with shell `pip` commands (none of them is version-pinned).
# NOTE(review): `nltk` is imported further down but is not installed here -- presumably it is
# preinstalled in the target runtime; confirm.
!pip install xlsxwriter
# `-q` keeps pip's output quiet for the larger installs.
!pip install -q bitsandbytes datasets accelerate loralib
# transformers and peft are taken from the tip of their GitHub repositories.
# NOTE(review): installing from `main` means the API may drift from what this notebook
# imports -- consider pinning to released versions.
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git
!pip install GPUtil
!pip install rouge_score sentence_transformers

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
import pandas as pd
import os
import re
import numpy as np
import copy
from tqdm import tqdm
from typing import Dict, Optional, Sequence, Tuple
import torch
from torch.utils.data import DataLoader
from transformers import (
  PreTrainedTokenizer,
  AutoModelForCausalLM,
  AutoTokenizer,
  BitsAndBytesConfig,
  AutoConfig,
  default_data_collator,
  DataCollatorForLanguageModeling,
  DataCollatorWithPadding,
  get_linear_schedule_with_warmup,
  TrainingArguments,
  Trainer,
  TrainerCallback
)
from peft import prepare_model_for_int8_training, prepare_model_for_kbit_training, get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, PrefixTuningConfig, LoraConfig, TaskType, PeftType, PeftConfig, PeftModel
from datasets import load_dataset, DatasetDict, Dataset
from GPUtil import showUtilization as gpu_usage

from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util

import warnings
warnings.filterwarnings('ignore')

In [None]:
# ---- Fine-tuning hyper-parameters ----
num_epochs, batch_size = 2, 2
lr = 0.001            # default learning rate of `fine_tune_model`
rank = 8              # LoRA rank (`r` in the LoraConfig)
max_length = 2000     # tokenised sequences are padded / truncated to this many tokens
data_collator = None  # forwarded unchanged to the Trainer

# ---- Base model to fine-tune (alternatives kept for reference) ----
# MODEL_PATH = "gpt2-large"
# MODEL_PATH = "/content/drive/MyDrive/Stocks/raw_models/dolly-v2-3b"
MODEL_PATH = "/content/drive/MyDrive/Stocks/raw_models/Llama-2-7b-chat-hf"

# ---- Output location ----
# Base folder per model, followed by a run tag that encodes the hyper-parameters above.
# Alternative base folders:
#   "/content/drive/MyDrive/Stocks/finetune_models/dolly-v2-3b/"
#   "/content/drive/MyDrive/Stocks/finetune_models/gpt2-large/"
LOCAL_OUTPUT_DIR = (
  "/content/drive/MyDrive/Stocks/finetune_models/Llama-2-7b-chat-hf/"
  + f"ep{num_epochs}_bs{batch_size}_lr{lr}_rank{rank}_maxtoken{max_length}"
)

### Preparing training data

In [None]:
# Prompt template in the Llama-2 chat layout: a system block delimited by
# `<<SYS>>` ... `<</SYS>>` inside a single `[INST] ... [/INST]` turn, followed by the
# "### Report:" header after which the model is expected to write its answer.
# The only placeholder is `{input}` (the announcement text).
# Fix: the system block was previously closed with `<</SYS>` (one `>` missing), which is
# not the delimiter the chat model was trained with.
# NOTE(review): the template starts with a literal `<s>`; if the tokenizer also adds a BOS
# token automatically the sequence may start with two BOS tokens -- confirm.
prompt_format = """<s>[INST] <<SYS>>
You are a stock market analyst working for a brokerage firm. You are going to help me in analyzing the corporate announcement document submitted to the Indian stock exchange by a company.
If you are not able to analyze, please don't share false information.
<</SYS>>

Your task is to analyze the given context and generate a concise report as truthfully as possible by following the provided instructions.
### Instruction:
Extract important short points or keywords that can help me make a decision on whether to purchase or sell the stock of this company?
Based on your response can you also give me a one-liner sentiment(positive/neutral/negative) and a short and crisp conclusion on whether the stock price of the company will go upside or downside?
You have to follow the below format while generating the report.
```
### Key points: (mention the only key points here)

### Sentiment: (mention the sentiment here)

### Conclusion: (mention the final conclusion here)
```

### Context:
{input}

[/INST]
### Report:"""

In [None]:
# prompt_format = """You are a stock market analyst working for a brokerage firm. You are going to help me in analyzing the corporate announcement document submitted to the Indian stock exchange by a company.
# If you are not able to analyze, please don't share false information.

# Your task is to analyze the given context and generate a concise report as truthfully as possible by following the provided instructions.
# ### Instruction:
# Extract important short points or keywords that can help me make a decision on whether to purchase or sell the stock of this company?
# Based on your response can you also give me a one-liner sentiment(positive/neutral/negative) and a short and crisp conclusion on whether the stock price of the company will go upside or downside?
# You have to follow the below format while generating the report.
# ```
# ### Key points: (mention the only key points here)

# ### Sentiment: (mention the sentiment here)

# ### Conclusion: (mention the final conclusion here)
# ```

# ### Context:
# {input}

# ### Report:"""

In [None]:
# Label value written over prompt and padding positions so that only completion
# tokens keep their real ids in `labels`.
IGNORE_INDEX = -100


def _preprocess(
    sources: Sequence[str],
    targets: Sequence[str],
    tokenizer: PreTrainedTokenizer,
    max_length: int,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Tokenize prompt/completion pairs and build labels that cover the completion only.

    Each source is concatenated with its target and tokenized with padding and
    truncation to ``max_length``. The sources are tokenized a second time on their
    own, only to learn how many leading tokens belong to the prompt.

    Args:
        sources: Prompt strings.
        targets: Completion strings, one per source.
        tokenizer: Tokenizer to call; its ``padding_side`` must be "left" or "right".
        max_length: Length every sequence is padded / truncated to.

    Returns:
        ``(input_ids, labels, attention_mask)``. ``labels`` is a copy of ``input_ids``
        with prompt and padding positions replaced by ``IGNORE_INDEX``.

    Raises:
        ValueError: If ``sources`` and ``targets`` differ in length.
        RuntimeError: If ``tokenizer.padding_side`` is neither "left" nor "right".
    """
    if len(sources) != len(targets):
        # zip() below would silently drop the unmatched tail.
        raise ValueError(
            f"sources and targets must have the same length, got {len(sources)} and {len(targets)}"
        )

    padding_side = tokenizer.padding_side
    if padding_side not in ("left", "right"):
        raise RuntimeError(f"Unsupported tokenizer.padding_side: {padding_side!r}")

    sequences = [s + t for s, t in zip(sources, targets)]
    sequences_token = tokenizer(
        sequences, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
    )
    sources_token = tokenizer(
        sources, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
    )

    attention_mask = sequences_token["attention_mask"]
    assert attention_mask.dim() == 2, "seq2seq model should be preprocessed differently"

    labels = sequences_token["input_ids"].clone()
    width = labels.shape[1]
    # Number of real (non-padding) tokens per row, for the prompt alone and for the full sequence.
    source_lens = sources_token["attention_mask"].sum(dim=1).tolist()
    sequence_lens = attention_mask.sum(dim=1).tolist()

    for row, (source_len, sequence_len) in enumerate(zip(source_lens, sequence_lens)):
        if padding_side == "right":
            # |prompt|completion|eos|pad|
            labels[row, :source_len] = IGNORE_INDEX
            # A forward slice is simply empty when the row has no padding.
            labels[row, sequence_len:] = IGNORE_INDEX
        else:
            # |pad|prompt|completion|eos|
            pad_len = width - sequence_len
            labels[row, : pad_len + source_len] = IGNORE_INDEX

    return sequences_token["input_ids"], labels, attention_mask

def SupervisedDataset(
  data: pd.DataFrame,
  tokenizer: PreTrainedTokenizer,
  max_length: int = 512,
):
  """Turn a frame of announcements and reference reports into model inputs.

  Reads the `pdf_extracted_data` column (contexts) and the `chatgpt_prediction`
  column (reference reports) of `data`. Every context is inserted into the global
  `prompt_format`; every reference gets the tokenizer's EOS token appended. The
  pairs are then tokenized by `_preprocess`.

  Returns:
    A dict with the keys `input_ids`, `labels` and `attention_mask`.
  """
  eos = tokenizer.eos_token

  prompts = []
  for context in list(data['pdf_extracted_data']):
    prompts.append(prompt_format.format(input=context))

  completions = []
  for response in list(data['chatgpt_prediction']):
    completions.append(response + eos)

  token_ids, label_ids, mask = _preprocess(prompts, completions, tokenizer, max_length)

  return {"input_ids": token_ids, "labels": label_ids, "attention_mask": mask}

In [None]:
# Training and validation splits, read from Excel files on the mounted drive.
# Both frames are later passed to `SupervisedDataset`, which reads their
# `pdf_extracted_data` and `chatgpt_prediction` columns.
train_df = pd.read_excel("/content/drive/MyDrive/Stocks/web_scraping/model_data/v1/train_data.xlsx")
val_df = pd.read_excel("/content/drive/MyDrive/Stocks/web_scraping/model_data/v1/val_data.xlsx")

In [None]:
def _build_hf_dataset(frame, tokenizer, max_length):
  """Tokenize `frame` and wrap the result in a `datasets.Dataset`.

  Rows whose labels are entirely `IGNORE_INDEX` are dropped: they contain no
  completion token to learn from (this happens when the prompt alone already
  fills `max_length`).

  Args:
    frame: DataFrame accepted by `SupervisedDataset`.
    tokenizer: Tokenizer used for encoding.
    max_length: Padded / truncated sequence length.

  Returns:
    A `Dataset` with the columns `input_ids`, `labels` and `attention_mask`.
  """
  encoded = SupervisedDataset(data=frame, tokenizer=tokenizer, max_length=max_length)
  # One vectorised comparison instead of a Python loop over every token of every row.
  has_target = (encoded["labels"] != IGNORE_INDEX).any(dim=1)
  return Dataset.from_dict(
    {column: tensor[has_target].tolist() for column, tensor in encoded.items()}
  )


# `device_map` is a model-loading option and is not needed to load a tokenizer.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# Reuse EOS as the padding token.
tokenizer.pad_token = tokenizer.eos_token

train_dataset = _build_hf_dataset(train_df, tokenizer, max_length)
eval_dataset = _build_hf_dataset(val_df, tokenizer, max_length)

### Training

In [None]:
def load_peft_lora_model(model):
  """Attach LoRA adapters to `model` and report its trainable parameter count.

  The adapters target the `q_proj` and `v_proj` modules, use the global `rank`
  as LoRA rank, and are configured for causal language modelling.

  Returns:
    The model returned by `get_peft_model`.
  """
  lora_settings = dict(
    task_type=TaskType.CAUSAL_LM,
    target_modules=['q_proj', 'v_proj'],
    bias='none',
    r=rank,
    lora_alpha=32,
    lora_dropout=0.01,
  )
  peft_model = get_peft_model(model, LoraConfig(**lora_settings))
  peft_model.print_trainable_parameters()
  return peft_model

def load_model(pretrained_model_name_or_path: str) -> Tuple[AutoModelForCausalLM, int]:
  """Load a causal LM in 4-bit NF4 precision and wrap it with LoRA adapters.

  Args:
    pretrained_model_name_or_path: Model id or local directory to load from.

  Returns:
    `(model, hidden_size)`: the LoRA-wrapped model and the `hidden_size` value
    from its configuration.
  """
  print(f"Loading model for {pretrained_model_name_or_path}")

  quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
  )
  model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path,
    quantization_config=quantization_config,
    device_map='auto'
  )

  # The loaded model already carries its configuration; no second read from disk is needed.
  model_hidden_size = model.config.hidden_size

  model = prepare_model_for_kbit_training(model)
  model = load_peft_lora_model(model)

  return model, model_hidden_size

In [None]:
# Load the 4-bit base model with LoRA adapters attached; `model` is used as a global by the
# training and generation cells below.
model, model_hidden_size = load_model(MODEL_PATH)

In [None]:
def fine_tune_model(
  *,
  local_rank: Optional[int] = None,
  local_output_dir: str = LOCAL_OUTPUT_DIR,
  dbfs_output_dir: Optional[str] = None,
  epochs: int = num_epochs,
  per_device_train_batch_size: int = batch_size,
  per_device_eval_batch_size: int = batch_size,
  lr: float = lr,
  gradient_checkpointing: bool = False,
  gradient_accumulation_steps: int = 6,
  fp16: bool = False,
  bf16: bool = False,
  max_steps: int = 200,
  save_steps: int = 4,
  logging_steps: int = 4,
  eval_steps: int = 4,
  save_total_limit: int = 20,
  warmup_steps: int = 8
):
  """Fine-tune the global `model` with the Hugging Face `Trainer` and save the result.

  Uses the notebook globals `model`, `tokenizer`, `train_dataset`, `eval_dataset`
  and `data_collator`. After training it logs and saves the training metrics,
  runs a final evaluation, prints GPU utilisation, and writes the model and the
  tokenizer to `local_output_dir`.

  Args:
    local_rank: Process rank for distributed training; `None` means "not
      distributed" and is forwarded as `-1`.
    local_output_dir: Directory for checkpoints, logs and the final model.
    dbfs_output_dir: Accepted for compatibility; not used.
    epochs: Number of training epochs.
    per_device_train_batch_size: Training batch size per device.
    per_device_eval_batch_size: Evaluation batch size per device.
    lr: Learning rate.
    gradient_checkpointing: Whether to enable gradient checkpointing.
    gradient_accumulation_steps: Batches accumulated per optimizer step.
    fp16: Enable fp16 mixed precision.
    bf16: Enable bf16 mixed precision.
    max_steps: Accepted for compatibility; not used (the corresponding
      `TrainingArguments` entry is commented out).
    save_steps: Checkpoint interval in steps.
    logging_steps: Logging interval in steps.
    eval_steps: Evaluation interval in steps.
    save_total_limit: Maximum number of checkpoints kept.
    warmup_steps: Number of warm-up steps.

  Returns:
    None.
  """
  # NOTE(review): the CUDA allocator may read this setting only when it is first
  # initialised; if the model is already on the GPU this line may have no effect -- confirm.
  os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

  training_args = TrainingArguments(
    output_dir=local_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_checkpointing=gradient_checkpointing,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=lr,
    num_train_epochs=epochs,
    # NOTE(review): a weight decay of 1 is unusually large -- confirm it is intended.
    weight_decay=1,
    do_eval=True,
    # NOTE(review): newer transformers releases name this argument `eval_strategy`;
    # verify against the installed version.
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    fp16=fp16,
    bf16=bf16,
    logging_strategy="steps",
    logging_steps=logging_steps,
    save_strategy="steps",
    save_steps=save_steps,
    # max_steps=max_steps,
    save_total_limit=save_total_limit,
    # TrainingArguments expects an integer here; -1 is its "not distributed" value.
    local_rank=-1 if local_rank is None else local_rank,
    warmup_steps=warmup_steps,
    report_to=[],
    logging_dir=local_output_dir,
    remove_unused_columns=False,
    label_names=["labels"],
    ddp_find_unused_parameters=False
  )

  trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
  )

  print("Training the model")
  train_results = trainer.train()
  trainer.log_metrics("train", train_results.metrics)
  trainer.save_metrics("train", train_results.metrics)

  val_results = trainer.evaluate()
  print(val_results)

  # `gpu_usage` prints the utilisation table itself; wrapping it in print() only added "None".
  gpu_usage()

  print(f"Saving Model to {local_output_dir}")
  trainer.save_model(output_dir=local_output_dir)
  tokenizer.save_pretrained(local_output_dir)

  print("Training finished.")

In [None]:
# Show current GPU utilisation, then release cached, unused CUDA memory before training starts.
gpu_usage()
torch.cuda.empty_cache()

In [None]:
# Run fine-tuning with the defaults taken from the configuration cell.
fine_tune_model()

### Test prediction

In [None]:
def _generate_report(input_text, **generate_kwargs):
  """Generate a completion for `input_text` with the global `model` and `tokenizer`.

  The prompt is tokenized and moved to the model's device. Only the tokens
  produced after the prompt are decoded, so the result cannot be confused with
  text from the prompt (for example a context that itself contains "Report:").

  Args:
    input_text: Fully formatted prompt.
    **generate_kwargs: Decoding options forwarded to `model.generate`.

  Returns:
    The stripped generated text, or "" if anything fails (best effort; the
    error is printed).
  """
  try:
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    with torch.no_grad():
      outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        **generate_kwargs
      )
    # For a decoder-only model the output starts with the prompt tokens; skip them.
    prompt_len = inputs["input_ids"].shape[1]
    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
  except Exception as e:
    print(f"Error: {e}")
    return ""


def greedy_search_response(input_text):
  """Return the report generated for `input_text` with greedy decoding (up to 400 new tokens)."""
  return _generate_report(input_text, do_sample=False, num_beams=1, max_new_tokens=400)


def top_p_sampling_response(input_text):
  """Return the report generated for `input_text` with top-p sampling (up to 400 new tokens)."""
  return _generate_report(
    input_text,
    max_new_tokens=400,
    do_sample=True,
    top_p=0.75,
    top_k=0,
    temperature=0.2,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
  )

In [None]:
# Held-out test split; its `pdf_extracted_data` column feeds the prompts below and its
# `chatgpt_prediction` column serves as the reference during scoring.
test_df = pd.read_excel("/content/drive/MyDrive/Stocks/web_scraping/model_data/v1/test_data.xlsx")

In [None]:
def func_to_extract_sentiment_label(text):
  """Extract the word that follows "Sentiment:" in a generated report.

  Args:
    text: Report text. Non-string values (e.g. None or NaN for rows that
      produced no summary) are accepted and treated as "no label".

  Returns:
    The first run of letters after "Sentiment:", or the string
    "Sentiment not found" when there is none.
  """
  if not isinstance(text, str):
    # re.search would raise TypeError on None / NaN.
    return "Sentiment not found"

  sentiment_match = re.search(r'Sentiment:\s*([A-Za-z]+)', text)
  if sentiment_match is None:
    return "Sentiment not found"
  return sentiment_match.group(1)

In [None]:
# Folder that receives a snapshot of the predictions every 20 rows.
# NOTE(review): this run tag ("ep1.5_...") is hard-coded and differs from the tag built in
# LOCAL_OUTPUT_DIR -- confirm it points at the intended run.
checkpoint_dir = "/content/drive/MyDrive/Stocks/finetune_models/Llama-2-7b-chat-hf/ep1.5_bs2_lr0.001_rank8_maxtoken2000/checkpoints"
# Create it up front so the first snapshot cannot fail on a missing directory.
os.makedirs(checkpoint_dir, exist_ok=True)

model_summary = []
for ct, context in enumerate(test_df['pdf_extracted_data'], start=1):
  if ct % 10 == 0:
    print(f"Running iteration no {ct}")

  if pd.isnull(context):
    # No announcement text: keep the row aligned with a placeholder.
    model_summary.append(None)
  else:
    complete_prompt = prompt_format.format(input=context)
    model_summary.append(greedy_search_response(complete_prompt))

  if ct % 20 == 0:
    # Persist partial results so a long run can be inspected or recovered.
    tmp_data = test_df.iloc[:ct].copy()
    tmp_data['model_summary'] = model_summary
    tmp_data['sentiment'] = tmp_data['model_summary'].apply(func_to_extract_sentiment_label)
    tmp_data.to_excel(f"{checkpoint_dir}/greedy_test_prediction_after_{ct}.xlsx", index=False)

test_df['model_summary'] = model_summary
test_df['sentiment'] = test_df['model_summary'].apply(func_to_extract_sentiment_label)

In [None]:
# Save the test frame including the `model_summary` and `sentiment` columns added above.
# NOTE(review): the "ep1.5_..." folder in this path is hard-coded and differs from the run tag
# built in LOCAL_OUTPUT_DIR -- confirm it is the intended destination.
test_df.to_excel("/content/drive/MyDrive/Stocks/finetune_models/Llama-2-7b-chat-hf/ep1.5_bs2_lr0.001_rank8_maxtoken2000/greedy_test_prediction.xlsx", index=False)

In [None]:
# Sentence-embedding models already loaded, keyed by model name. Loading one is
# expensive and `get_sentence_similarity` is called once per evaluated row.
_SENTENCE_MODEL_CACHE = {}

def get_sentence_similarity(reference = '', generated = '',  model_name = None):
  '''
  Generate cosine similarity score based on embeddings of two strings
  Parameters:
    reference (str) : Reference string to check similarity
    generated (str) : Generated/Target string to check similarity
    model_name (str) : Sentence transformer model name; 'all-minilm-l6-v2' when None
  Returns:
    Similarity score (float) : Cosine similarity score based on embeddings of the two strings
  '''
  resolved_name = 'all-minilm-l6-v2' if model_name is None else model_name

  # Load each embedding model at most once instead of on every call.
  encoder = _SENTENCE_MODEL_CACHE.get(resolved_name)
  if encoder is None:
    encoder = SentenceTransformer(resolved_name)
    _SENTENCE_MODEL_CACHE[resolved_name] = encoder

  # convert to embeddings
  embedding1 = encoder.encode(reference, convert_to_tensor=True)
  embedding2 = encoder.encode(generated, convert_to_tensor=True)

  # compute similarity scores of two embeddings
  cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)

  return cosine_scores.item()

def get_bleu_score(reference, candidate):
  '''
  Unigram BLEU of `candidate` against a single `reference`.

  Both strings are split on whitespace; the weights (1, 0, 0, 0) restrict the
  score to 1-gram precision.
  '''
  hypothesis_tokens = candidate.split()
  reference_token_lists = [reference.split()]
  unigram_only = (1, 0, 0, 0)
  return sentence_bleu(reference_token_lists, hypothesis_tokens, weights=unigram_only)

def get_evaluation_metrics(actuals, predicted):
  '''
  Generate benchmarking scores on different metrics for generated text

  Parameters:
    actuals (str | list) : Actual text or reference
    predicted (str | list) : Generated text or predictions
  Returns:
    blue_score (float) : Mean BLEU score
    rouge1 (float): Mean ROUGE1 F-measure
    rougeL (float): Mean ROUGEL F-measure
    sentence similarity (float): Mean cosine similarity score on embeddings
    df (pd.DataFrame): Per-row table with the columns 'actuals', 'predicted',
      'blue_score', 'rougue1', 'rougeL' and 'sentence_similarity'
  Raises:
    TypeError: If the arguments are not both lists or both strings
  '''
  if isinstance(actuals, list) and isinstance(predicted, list):
    df = pd.DataFrame({'actuals':actuals,'predicted':predicted})
  elif isinstance(actuals, str) and isinstance(predicted, str):
    df = pd.DataFrame({'actuals':[actuals],'predicted':[predicted]})
  else:
    # Previously this fell through and failed later on an undefined `df`.
    raise TypeError(
      "actuals and predicted must both be lists or both be strings, got "
      f"{type(actuals).__name__} and {type(predicted).__name__}"
    )

  scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
  pairs = list(zip(df['actuals'], df['predicted']))

  df['blue_score'] = [get_bleu_score(actual, pred) for actual, pred in pairs]

  # One scorer call yields both ROUGE variants; score each pair once.
  rouge_scores = [scorer.score(actual, pred) for actual, pred in pairs]
  # The column names (including their spelling) are used by downstream cells; keep them.
  df['rougue1'] = [score['rouge1'].fmeasure for score in rouge_scores]
  df['rougeL'] = [score['rougeL'].fmeasure for score in rouge_scores]

  df['sentence_similarity'] = [
    get_sentence_similarity(reference = actual, generated = pred) for actual, pred in pairs
  ]

  return df['blue_score'].mean(), df['rougue1'].mean(), df['rougeL'].mean(), df['sentence_similarity'].mean(),df

In [None]:
# Score every test prediction against its reference report.
bleu, rouge_one, rouge_l, semantic, scores_df = get_evaluation_metrics(
  test_df['chatgpt_prediction'].tolist(),
  test_df['model_summary'].tolist(),
)

# Attach the identifying columns from the test frame and put them first.
scores_df = scores_df.assign(**{
  'COMPANY NAME': test_df['COMPANY NAME'],
  'ATTACHMENT': test_df['ATTACHMENT'],
  'pdf_extracted_data': test_df['pdf_extracted_data'],
})[['COMPANY NAME','ATTACHMENT','pdf_extracted_data','actuals','predicted','blue_score','rougue1','rougeL','sentence_similarity']]

print(bleu,rouge_one,rouge_l,semantic)

In [None]:
# Save the per-row metric table next to the predictions.
# NOTE(review): the "ep1.5_..." folder in this path is hard-coded and differs from the run tag
# built in LOCAL_OUTPUT_DIR -- confirm it is the intended destination.
scores_df.to_excel("/content/drive/MyDrive/Stocks/finetune_models/Llama-2-7b-chat-hf/ep1.5_bs2_lr0.001_rank8_maxtoken2000/greedy_test_prediction_scores.xlsx", index=False)