In [None]:
import pandas as pd
import torch
import json
from transformers import BloomTokenizerFast, BloomForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset

In [None]:
# Load the BLOOMZ-560m tokenizer and causal-LM weights.
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloomz-560m")
# device_map="auto" already asks accelerate to place the weights on the
# available device(s). The former trailing .to("cuda") was redundant,
# hard-coded a GPU, and moved a model that accelerate had already dispatched,
# so it has been removed.
model = BloomForCausalLM.from_pretrained("bigscience/bloomz-560m", device_map="auto")

In [None]:
# Read prompts.json with the generic JSON loader of `datasets`.
dataset = load_dataset(
    path="json",
    data_files="prompts.json",
)

def prepare_train_data(data):
    """Tokenize a batch of examples for causal-LM fine-tuning.

    Args:
        data: A batch as passed by ``Dataset.map(..., batched=True)``. Its
            ``'text'`` entry is iterated as a sequence of strings
            (prompt + completion).

    Returns:
        The tokenizer output with an added ``'labels'`` entry: a copy of
        ``input_ids`` in which every padding position is set to ``-100``.
    """
    # prompt + completion, each terminated with the ' </s>' marker
    text_input = [statement + ' </s>' for statement in data['text']]

    # Pad every row to the same fixed length. padding=True only pads to the
    # longest row of the current map batch, so rows coming from different map
    # batches could end up with different lengths and fail to stack into one
    # training batch.
    tokenized_input = tokenizer(
        text_input,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=256,
    )

    # Generative models: labels mirror the inputs. Work on a copy so the
    # inputs are not aliased, and mark padding with -100 (the index the loss
    # ignores) so pad tokens do not contribute to training.
    labels = tokenized_input['input_ids'].clone()
    labels[tokenized_input['attention_mask'] == 0] = -100
    tokenized_input['labels'] = labels
    return tokenized_input


In [None]:
# Tokenize the 'train' split batch-wise; the raw "text" column is dropped so
# only the columns returned by prepare_train_data remain.
train_dataset = dataset['train'].map(
    function=prepare_train_data,
    remove_columns=["text"],
    batched=True,
)

In [None]:
# Hyperparameters for the fine-tuning run; outputs are written to 'Clone'.
training_arguments = TrainingArguments(
    output_dir='Clone',
    num_train_epochs=10,
    # 5 samples per device per step, accumulated over 4 steps per update
    per_device_train_batch_size=5,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=2e-5,
    weight_decay=0.01,
    optim="adafactor",
    fp16=True,
)

In [None]:
# Fine-tune the model on the tokenized prompts.
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
)

trainer.train()

# Persist the fine-tuned weights to the output directory.
trainer.save_model()
# The Trainer was built without the tokenizer, so save_model() does not write
# any tokenizer files. Save them explicitly next to the weights so that both
# the model and the tokenizer can later be reloaded from the same directory.
tokenizer.save_pretrained(training_arguments.output_dir)

# Testing

In [None]:
import torch
from transformers import pipeline
from transformers import BloomTokenizerFast, BloomForCausalLM
import textwrap

# Reload the fine-tuned weights and the tokenizer from the "Clone" directory.
finetuned_dir = "Clone"
model = BloomForCausalLM.from_pretrained(finetuned_dir)
tokenizer = BloomTokenizerFast.from_pretrained(finetuned_dir)



In [None]:
import textwrap
from transformers import pipeline

# Text-generation pipeline around the loaded model, sampling at temperature 1.
generator = pipeline(
    task='text-generation',
    tokenizer=tokenizer,
    model=model,
    temperature=1,
    do_sample=True,
)

# Interactive question/answer loop; type 'exit' to stop.
while True:
    # Take user input
    user_input = input("You: ")

    # Exit the loop if the user types 'exit' (surrounding whitespace ignored)
    if user_input.strip().lower() == 'exit':
        print("Conversation ended.")
        break

    # Generate model response
    prompt = f"Question: {user_input}\nWith no context, what is the answer?\nAnswer: "

    result = generator(prompt, max_length=256)
    generated_text = result[0]['generated_text']

    # Keep only what follows the prompt (this assumes the generated text
    # starts with the prompt, as the original slicing did).
    completion = generated_text[len(prompt):]

    # Crop at the end-of-sequence marker if one is present. The search runs on
    # the completion only, so a "</s>" typed by the user cannot match, and the
    # cropped result is no longer overwritten by a second, uncropped slice.
    eos_index = completion.find("</s>")
    if eos_index != -1:
        completion = completion[:eos_index]
    generated_answer = completion.strip()

    # Wrap the generated answer to fit the screen width
    wrapped_answer = textwrap.fill(generated_answer, width=120)

    # Print the wrapped answer
    print(f"Answer:\n{wrapped_answer}\n")
