In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import torch
import numpy as np
import random
import pandas as pd

# Data Preprocessing

In [None]:
# Load the raw data from the CSV file (this is a plain CSV read, not a dataset split).
train_dataset = pd.read_csv("data.csv")

# `read_csv` already returns a DataFrame, so there is nothing to convert.
# Work on an explicit copy so that columns added to `df` later never leak
# back into the freshly loaded frame.
df = train_dataset.copy()

In [None]:
# Print a concise summary of the loaded DataFrame (columns, non-null counts, dtypes).
df.info()

In [None]:
import pandas as pd
import json

def build_prompt_verbose(data):
    """Build one training prompt from a single question/answer row.

    Args:
        data: Mapping-like row (for example the ``pandas.Series`` handed over by
            ``df.apply(..., axis=1)``) with ``'question'``, ``'context'`` and
            ``'answer'`` entries.

    Returns:
        dict: ``{'text': prompt}``. The prompt uses the "With no context"
        wording when the context is missing, and the "In the context of ..."
        wording otherwise.
    """
    context = data['context']
    # Empty CSV cells are read by pandas as NaN (not None or ""), so a plain
    # `is None` / `== ""` check would let "nan" leak into the prompt text.
    # Whitespace-only contexts carry no information either.
    has_context = not pd.isna(context) and str(context).strip() != ""

    if has_context:
        prompt = f"Question: {data['question']}\nIn the context of {context}, what is the answer?\nAnswer: {data['answer']}\n"
    else:
        prompt = f"Question: {data['question']}\nWith no context, what is the answer?\nAnswer: {data['answer']}\n"
    return {'text': prompt}

# Build one {'text': ...} record per row. The helper defined above is named
# `build_prompt_verbose`; the previous call referenced an undefined name.
df['prompt'] = df.apply(build_prompt_verbose, axis=1)

result = df['prompt'].to_list()

# Save prompts to a JSON file. `ensure_ascii=False` writes non-ASCII characters
# verbatim, so the file is opened explicitly as UTF-8 instead of relying on the
# platform's default encoding.
with open('prompts.json', 'w', encoding='utf-8') as outfile:
    json.dump(result, outfile, ensure_ascii=False)


In [None]:
# Remove the column-width limit so long text cells are displayed without truncation.
pd.set_option('display.max_colwidth', None)

# Model

In [None]:
import pandas as pd
import torch
import json
from transformers import BloomTokenizerFast, BloomForCausalLM, TrainingArguments, Trainer
model_name = "bigscience/bloom-560m"

# Pick the device explicitly. Combining `device_map="auto"` with a manual
# `.to("cuda")` asks two different mechanisms to place the model, and the
# hard-coded "cuda" fails outright on a machine without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the BLOOM base model and its tokenizer from the same checkpoint.
tokenizer = BloomTokenizerFast.from_pretrained(model_name)
model = BloomForCausalLM.from_pretrained(model_name).to(device)

In [None]:
# `load_dataset` is used below but was never imported in this notebook.
from datasets import load_dataset

# Load prompts.json (written by the preprocessing cell) as a dataset with a
# single "text" column; the JSON loader exposes it as the "train" split.
dataset = load_dataset("json", data_files="prompts.json")

# prepare the data for training
def prepare_train_data(data):
    """Tokenize a batch of prompts and attach causal-LM labels.

    Args:
        data: Batch dict with a ``'text'`` entry holding the prompt strings
            (prompt + completion), as passed by ``Dataset.map(batched=True)``.

    Returns:
        The tokenizer output (``input_ids`` and ``attention_mask``) with an added
        ``labels`` entry: a copy of ``input_ids`` in which padded positions
        are replaced by -100.
    """
    # Pad every example to the same fixed length. Padding only to the longest
    # item of the current map batch gives different lengths across batches,
    # which cannot be stacked into one training batch later on.
    tokenized_input = tokenizer(
        data['text'],
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=256,
    )

    # Generative models: labels are the input ids themselves. Use a copy, not
    # an alias, and mask padding with -100 (the index the loss ignores) so the
    # model is not trained to predict pad tokens.
    labels = tokenized_input['input_ids'].clone()
    labels[tokenized_input['attention_mask'] == 0] = -100
    tokenized_input['labels'] = labels
    return tokenized_input

# Tokenize the "train" split in batches and drop the raw "text" column, which
# is no longer needed once the token ids exist.
train_split = dataset['train']
train_dataset = train_split.map(
    prepare_train_data,
    remove_columns=["text"],
    batched=True,
)

In [None]:
# Hyperparameters for the fine-tuning run; outputs are written to the
# "Clone" directory.
training_arguments = TrainingArguments(
    output_dir='Clone',
    # schedule
    num_train_epochs=20,
    learning_rate=2e-5,
    weight_decay=0.01,
    optim="adafactor",
    # batching: 2 examples per device, gradients accumulated over 4 steps
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # memory savers
    gradient_checkpointing=True,
    fp16=True,
)

In [None]:
# Fine-tune the model on the tokenized prompts.
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
)

trainer.train()

# Save the final weights to the configured output directory and store the
# tokenizer next to them, so the directory can be reloaded on its own.
trainer.save_model()
tokenizer.save_pretrained(training_arguments.output_dir)

In [None]:
import shutil

from google.colab import files

# Nothing earlier in the notebook creates /content/model.zip, so build the
# archive from the saved model directory before asking Colab to download it.
archive_path = shutil.make_archive("/content/model", "zip", "/content/Clone")
files.download(archive_path)

In [None]:
from google.colab import drive

# Mount Google Drive so files can be copied into it from this runtime.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Recursively copy the "Clone" output directory into the mounted Drive folder.
!cp -r "/content/Clone" "/content/drive/MyDrive"

# Testing

In [None]:
# Load the CSV file again for testing (this is a plain CSV read, not a dataset split).
# NOTE(review): this is the same "data.csv" that was read for training, and
# `test_dataset` is not used by any later cell visible here — confirm intent.
test_dataset = pd.read_csv("data.csv")


In [None]:
import torch
from transformers import pipeline
from transformers import BloomTokenizerFast, BloomForCausalLM
import textwrap

# Load the fine-tuned weights saved in the local "Clone" directory.
# The tokenizer comes from "bigscience/bloom-560m", the base checkpoint the
# model was fine-tuned from, rather than from the separate bloomz checkpoint,
# so tokenizer and model always refer to the same vocabulary.
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")
model = BloomForCausalLM.from_pretrained("Clone")



In [None]:
# Build the text-generation pipeline once. It does not depend on the user
# input, so recreating it on every turn only adds latency.
generator = pipeline('text-generation',
                     model=model,
                     tokenizer=tokenizer,
                     do_sample=True)

while True:
    # Take user input
    user_input = input("You: ").strip()

    # Exit the loop if the user types 'exit'
    if user_input.lower() == 'exit':
        print("Conversation ended.")
        break

    # Nothing to answer: ask again instead of sending an empty question
    if not user_input:
        continue

    # Use the same template the model was fine-tuned on (the "no context"
    # variant of the training prompt); a differently worded prompt does not
    # match what the model learned to complete. The prompt stops right after
    # "Answer:" so the model produces the answer text itself.
    prompt = f"Question: {user_input}\nWith no context, what is the answer?\nAnswer:"
    result = generator(prompt, max_length=256)

    # The pipeline returns prompt + continuation; keep only the continuation
    generated_answer = result[0]['generated_text'][len(prompt):].strip()

    # Wrap the generated answer to fit the screen width
    wrapped_answer = textwrap.fill(generated_answer, width=120)

    # Print the wrapped answer
    print(f"Answer:\n{wrapped_answer}\n")