In [None]:

# Install Unsloth from PyPI, then remove it and reinstall the "colab-new" extra
# straight from the GitHub repository (no pip cache, upgrade allowed).
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [100]:
from unsloth import FastLanguageModel
import torch
# Passed as max_seq_length to FastLanguageModel.from_pretrained in the base-model load cell.
max_seq_length = 8192
# Passed through unchanged as dtype; presumably None lets the loader pick the dtype — confirm against the Unsloth docs.
dtype = None
# Passed as load_in_4bit to from_pretrained (the base checkpoint name also ends in "bnb-4bit").
load_in_4bit = True 


In [None]:
# Load the base Llama 3.2 3B checkpoint and its tokenizer, using the
# sequence-length / dtype / 4-bit settings defined in the configuration cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

In [None]:
from datasets import load_dataset

# Fetch the FinQA dataset from the Hugging Face hub and show its splits.
dataset = load_dataset("ibm/finqa")
print(dataset)

## Loading model

In [None]:
# Extract model2.zip into /content/model2.
# NOTE(review): the model-loading cell that follows reads from
# /content/drive/MyDrive/DL_Models/model2, not from this directory — confirm which copy is intended.
!unzip model2.zip -d /content/model2


In [None]:
# Load the fine-tuned checkpoint stored on Google Drive and switch it to inference mode.
import os

from unsloth import FastLanguageModel

finetuned_model_dir = "/content/drive/MyDrive/DL_Models/model2"

# The checkpoint lives under /content/drive, but the notebook's drive.mount cell
# comes after this one. Mount here when the directory is not visible yet so the
# cell also works on a fresh top-to-bottom run.
if not os.path.isdir(finetuned_model_dir):
    from google.colab import drive
    drive.mount('/content/drive')

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = finetuned_model_dir,
    max_seq_length = 8192,
    dtype = None,
    load_in_4bit = True,
)
# Trailing semicolon keeps the call's return value out of the cell output.
FastLanguageModel.for_inference(model);


In [None]:
# Mount Google Drive at /content/drive; the fine-tuned model path used in this
# notebook (/content/drive/MyDrive/DL_Models/model2) lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Evaluation runs on the test split; print how many examples it holds.
test_dataset = dataset["test"]
print(len(test_dataset))

In [5]:
# Zero-shot prompt template. It has five positional `{}` slots, filled in this
# order by str.format: pre-text, post-text, table, question, output.
# NOTE(review): the word "caculate" below is part of the runtime prompt text; it is
# left untouched because changing it changes what the model sees — confirm before fixing.
prompt = """You are an expert in the financial domain tasked with solving a given financial problem. Follow these steps:

1. Read the question carefully.
2. Provide the formula for the given question.
3. Use the formula to caculate the final answer.


### Pretext:
{}

### Posttext:
{}

### Table:
{}

### Question:
{}

### Output:
{}"""

In [None]:
# Zero-shot generation over the whole test split. For every example the prompt
# template is filled in, the model generates up to 64 new tokens, and only the
# newly generated text (without the echoed prompt) is stored in final_response.

# Switching the model to inference mode does not depend on the example, so it is
# done once here instead of on every loop iteration.
FastLanguageModel.for_inference(model)

final_response = []
for i in range(len(test_dataset)):
    # Fetch the row once rather than re-indexing the dataset for each field.
    row = test_dataset[i]
    pre_text = row["pre_text"]
    post_text = row["post_text"]
    table = row["table"]
    question = row["question"]

    # The last slot ("Output") is left empty for the model to complete.
    input_prompt = prompt.format(pre_text, post_text, table, question, "")
    inputs = tokenizer([input_prompt], return_tensors = "pt").to("cuda")

    # Length of the prompt in tokens; generated ids after this offset are the answer.
    input_shape = inputs['input_ids'].shape
    input_token_len = input_shape[1]
    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)

    response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
    final_response.append(response[0])

    # Lightweight progress indicator.
    if i%100 == 0:
        print(i)

In [None]:
import re

def parse_number(string):
    """
    Extract the first numerical value (integer, float, or percentage) from a string.

    The first numeric token found in the text is parsed, so free-form text such
    as "The answer is 12.5." yields 12.5 instead of failing on the trailing
    period or gluing several numbers together.

    Args:
        string: Text to scan, e.g. "12.5%", "$1,234.50" or a model response.

    Returns:
        The parsed value as a float. A token directly followed by '%'
        (optionally separated by whitespace) is divided by 100. Returns None
        when the input is not a string or contains no number.
    """
    if not isinstance(string, str):
        return None

    # Optional sign (only when it is not glued to a preceding word character, so
    # "FY-2019" or "2019-2020" do not produce negative numbers), then either
    # digits with optional thousands separators and decimals, or a bare ".5".
    pattern = r'((?:(?<!\w)[-+])?(?:\d+(?:,\d{3}(?!\d))*(?:\.\d+)?|\.\d+))\s*(%?)'
    match = re.search(pattern, string)
    if match is None:
        return None

    number_text, percent_sign = match.groups()
    value = float(number_text.replace(',', ''))
    return value / 100 if percent_sign else value

# Score the zero-shot responses against the gold answers.
# A prediction counts as correct when both sides parse to numbers that agree
# within the relative tolerance; if either side fails to parse, a
# case-insensitive comparison of the stripped strings is used instead.
tolerance = 0.00  
count = 0

for sample_idx in range(len(test_dataset)):
    gold_text = test_dataset[sample_idx]["answer"]
    pred_text = final_response[sample_idx]

    gold_number = parse_number(gold_text)
    pred_number = parse_number(pred_text)

    if gold_number is None or pred_number is None:
        is_correct = pred_text.strip().lower() == gold_text.strip().lower()
    else:
        is_correct = abs(gold_number - pred_number) <= tolerance * abs(gold_number)

    if is_correct:
        count += 1

print(f"Correct predictions within margin: {count}")


In [None]:
# Accuracy in percent: correct predictions divided by the number of test examples.
print(count/len(test_dataset)*100)

## Few Shot Prompting

In [None]:
# Template for one few-shot demonstration. It has eight positional `{}` slots,
# filled in this order by str.format: pre-text, post-text, table, question,
# answer, final result, program_re, gold_inds.
samples_prompt = """These are the examples:

### Pretext:
{}

### Posttext:
{}

### Table:
{}

### Question:
{}

### Answer:
{}

### Final_result:
{}

### Program_re:
{}

### GoldInds:
{}
"""
import numpy as np
# Pick the few-shot demonstrations from the training split and render each one
# with samples_prompt; the rendered strings are collected in `eg`.
train_split = dataset['train']
NUM_FEW_SHOT = 5
# Fixed seed so the same demonstrations are drawn on every run; without it the
# few-shot prompt (and therefore the measured accuracy) changes between runs.
FEW_SHOT_SEED = 42

# len() works on the split directly, so there is no need to copy the whole
# split into a NumPy array just to count its rows.
indices = np.random.default_rng(FEW_SHOT_SEED).choice(len(train_split), NUM_FEW_SHOT, replace=False)
print(indices)

EOS_TOKEN = tokenizer.eos_token 

# Index by row first and read the fields from that row; going column-first
# (split["col"][i]) fetches an entire column for every single lookup.
few_shot_rows = [train_split[int(i)] for i in indices]

pre_text1 = [row["pre_text"] for row in few_shot_rows]
post_text1 = [row["post_text"] for row in few_shot_rows]
table1 = [row["table"] for row in few_shot_rows]
question1 = [row["question"] for row in few_shot_rows]
answer1 = [row["answer"] for row in few_shot_rows]
final_result1 = [row["final_result"] for row in few_shot_rows]
program_re1 = [row["program_re"] for row in few_shot_rows]
gold_inds1 = [row["gold_inds"] for row in few_shot_rows]

# One rendered demonstration per sampled row, each terminated with the EOS token.
eg = [
    samples_prompt.format(a, b, c, d, e, f, g, h) + EOS_TOKEN
    for a, b, c, d, e, f, g, h in zip(
        pre_text1, post_text1, table1, question1,
        answer1, final_result1, program_re1, gold_inds1,
    )
]

In [10]:
# Few-shot prompt template. It has seven positional `{}` slots, filled in this
# order by str.format: examples, pre-text, post-text, table, question,
# gold_inds, output.
# NOTE(review): the word "caculate" below is part of the runtime prompt text; it is
# left untouched because changing it changes what the model sees — confirm before fixing.
final_prompt = """You are an expert in the financial domain tasked with solving a given financial problem. Follow these steps:

1. Look at the examples to see how similar problems are solved.
2. Read the question carefully.
3. Provide the formula for the given question.
4. Use the formula to caculate the final answer.

### Examples:
{}

### Pretext:
{}

### Posttext:
{}

### Table:
{}

### Question:
{}

### GoldInds:
{}

### Output:
{}"""


# End-of-sequence token string taken from the tokenizer; appended to formatted texts below.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func2(examples):
    """
    Render a batch of dataset rows into full few-shot prompt texts.

    Args:
        examples: Column-oriented batch (mapping from column name to a list of
            values) providing "pre_text", "post_text", "table", "question",
            "gold_inds" and "answer".

    Returns:
        A dict with a single key "text" holding one formatted string per row,
        each terminated with EOS_TOKEN.
    """
    # `eg` is a list of rendered demonstrations. Join it into one block of text;
    # formatting the list itself would embed its Python repr (brackets, quotes
    # and escaped newlines) in the prompt.
    examples_block = "\n".join(eg)

    columns = zip(
        examples["pre_text"],
        examples["post_text"],
        examples["table"],
        examples["question"],
        examples["gold_inds"],
        examples["answer"],
    )
    # Slot order of final_prompt: examples, pre-text, post-text, table,
    # question, gold_inds, output (the gold answer).
    texts = [
        final_prompt.format(examples_block, pre, post, tbl, qst, gold, ans) + EOS_TOKEN
        for pre, post, tbl, qst, gold, ans in columns
    ]
    return { "text" : texts, }


In [None]:
# Few-shot generation over the whole test split. Each prompt contains the sampled
# demonstrations followed by the test example; only the newly generated text
# (without the echoed prompt) is stored in final_response2.

# Switching the model to inference mode does not depend on the example, so it is
# done once here instead of on every loop iteration.
FastLanguageModel.for_inference(model)

# `eg` is a list of rendered demonstrations. Join it into one block of text;
# formatting the list itself would embed its Python repr (brackets, quotes and
# escaped newlines) in the prompt.
examples_block = "\n".join(eg)

final_response2 = []
for i in range(len(test_dataset)):
    # Fetch the row once rather than re-indexing the dataset for each field.
    row = test_dataset[i]
    pre_text = row["pre_text"]
    post_text = row["post_text"]
    table = row["table"]
    question = row["question"]
    # No gold evidence is supplied at test time, so that slot stays empty.
    gold_inds = ""

    # final_prompt has exactly seven slots: examples, pre-text, post-text, table,
    # question, gold_inds, output. Pass exactly seven values so that each one
    # lands in its intended slot and none is silently discarded.
    input_prompt = final_prompt.format(
        examples_block,
        pre_text,
        post_text,
        table,
        question,
        gold_inds,
        "",
    )
    inputs = tokenizer([input_prompt], return_tensors = "pt").to("cuda")

    # Length of the prompt in tokens; generated ids after this offset are the answer.
    input_shape = inputs['input_ids'].shape
    input_token_len = input_shape[1]
    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)

    response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
    final_response2.append(response[0])

    # Lightweight progress indicator.
    if i%100 == 0:
        print(i)

In [None]:

import re

def parse_number(string):
    """
    Extract the first numerical value (integer, float, or percentage) from a string.

    The first numeric token found in the text is parsed, so free-form text such
    as "The answer is 12.5." yields 12.5 instead of failing on the trailing
    period or gluing several numbers together.

    Args:
        string: Text to scan, e.g. "12.5%", "$1,234.50" or a model response.

    Returns:
        The parsed value as a float. A token directly followed by '%'
        (optionally separated by whitespace) is divided by 100. Returns None
        when the input is not a string or contains no number.
    """
    if not isinstance(string, str):
        return None

    # Optional sign (only when it is not glued to a preceding word character, so
    # "FY-2019" or "2019-2020" do not produce negative numbers), then either
    # digits with optional thousands separators and decimals, or a bare ".5".
    pattern = r'((?:(?<!\w)[-+])?(?:\d+(?:,\d{3}(?!\d))*(?:\.\d+)?|\.\d+))\s*(%?)'
    match = re.search(pattern, string)
    if match is None:
        return None

    number_text, percent_sign = match.groups()
    value = float(number_text.replace(',', ''))
    return value / 100 if percent_sign else value

# Score the few-shot responses against the gold answers.
# A prediction counts as correct when both sides parse to numbers that agree
# within the relative tolerance; if either side fails to parse, a
# case-insensitive comparison of the stripped strings is used instead.
tolerance = 0.05  
count = 0

for sample_idx in range(len(test_dataset)):
    gold_text = test_dataset[sample_idx]["answer"]
    pred_text = final_response2[sample_idx]

    gold_number = parse_number(gold_text)
    pred_number = parse_number(pred_text)

    if gold_number is None or pred_number is None:
        is_correct = pred_text.strip().lower() == gold_text.strip().lower()
    else:
        is_correct = abs(gold_number - pred_number) <= tolerance * abs(gold_number)

    if is_correct:
        count += 1

print(f"Correct predictions within margin: {count}")


In [None]:
# Accuracy in percent: correct predictions divided by the number of test examples.
print(count/len(test_dataset)*100)