In [7]:
import json
import os
from pprint import pprint

import bitsandbytes as bnb
import pandas as pd
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
import pandas as pd
import requests 
from bs4 import BeautifulSoup 
   
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

bin C:\Users\cdoy6\miniconda3\envs\llms\lib\site-packages\bitsandbytes\libbitsandbytes_cuda118_nocublaslt.dll


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import gc
import torch

def flush():
  gc.collect()
  torch.cuda.empty_cache()
  torch.cuda.reset_peak_memory_stats()



In [4]:
flush()
torch.cuda.max_memory_allocated()

0

## Scraping FAQs from Fetch Website

In [6]:
URL = "https://fetch.com/faq#Receipts"
r = requests.get(URL) 
   
soup = BeautifulSoup(r.content,  "lxml") 
   
   
table = soup.find_all("div", attrs={"class":"pt-4"})

In [7]:
soup = BeautifulSoup(r.content, 'html.parser')
for k in (soup.find_all(style_=False)):
    if "mainEntity" in k.text:
        # print(k.text)
        
        dict = json.loads(k.text, strict=False)
        break

In [8]:
new_dict = {"questions":[], "answers":[]}
for i in dict["mainEntity"]:
    new_dict["questions"].append(i["name"])
    new_dict["answers"].append(i["acceptedAnswer"]['text'])

In [None]:
new_dict["questions"].append("Is Fetch safe?")
new_dict["questions"].append("Who is Wes on Fetch?")
new_dict["answers"].append("Fetch is 100% safe to use. All you are doing with Fetch is getting rewarded for the shopping you are already doing, no credit card is needed, just sign up with a phone number and you’re good to start earning free gift cards!")
new_dict["answers"].append("Wes Schroll is the founder and CEO of Fetch, and he is everyone's first friend on the Social tab of the Fetch app! If you want to learn more about Wes, follow his blogs, his Instagram, or his Twitter! Fetch is 100% safe to use.")

In [None]:
json_format = pd.DataFrame(new_dict).to_dict(orient="records")

with open("fetch_dataset.json", "w") as f:
    json.dump(json_format, f)

## Loading open-source Falcon LLM

In [6]:
# MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
MODEL_NAME = "tiiuae/falcon-7b"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [7]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config, cache_dir='E:\code_projects\cache'
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir='E:\code_projects\cache')
tokenizer.pad_token = tokenizer.eos_token



Loading checkpoint shards:   0%|                                                                 | 0/2 [00:07<?, ?it/s]

KeyboardInterrupt



### Applying Lora for optimized fine-tuning

In [12]:
model = prepare_model_for_kbit_training(model,use_gradient_checkpointing=False)

In [13]:
# config = LoraConfig(
#     r=16,
#     lora_alpha=32,
#     target_modules=["query_key_value"],
#     lora_dropout=0.05,
#     bias="none",
#     task_type="CAUSAL_LM",
# )

lora_r = 16
lora_alpha = 64
lora_dropout = 0.1
lora_target_modules = ["query_key_value"]
# lora_target_modules = [
#     "q_proj",
#     "up_proj",
#     "o_proj",
#     "k_proj",
#     "down_proj",
#     "gate_proj",
#     "v_proj",
# ]


config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)

## Finetuning LLM using Fetch FAQs

In [14]:
def generate_prompt(data_point):
    return f"""
<human>: {data_point["questions"]}
<assistant>: {data_point["answers"]}
""".strip()


def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenizer(full_prompt, padding=True, truncation=True)
    return tokenized_full_prompt

In [15]:
data = load_dataset("json", data_files="fetch_dataset.json")
data = data["train"].shuffle().map(generate_and_tokenize_prompt)

Downloading data files: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 998.64it/s]
Extracting data files: 100%|█████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 45.46it/s]
Generating train split: 25 examples [00:00, 595.28 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 352.40 examples/s]


In [16]:
OUTPUT_DIR = "experiments"

In [17]:
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=True,
    save_total_limit=3,
    logging_steps=1,
    output_dir=OUTPUT_DIR,
    max_steps=100,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine"
    # warmup_ratio=0.05,
    # report_to="tensorboard",
)

trainer = transformers.Trainer(
    model=model,
    train_dataset=data,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False
trainer.train()

You are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. It is recommended to update your version as a major bug has been fixed in 8-bit optimizers.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,3.1846
2,2.5666
3,2.6094
4,2.6458
5,2.3176
6,2.6907
7,2.8581
8,2.2147
9,1.9168
10,2.3114


TrainOutput(global_step=100, training_loss=0.9486976008489728, metrics={'train_runtime': 431.8318, 'train_samples_per_second': 0.926, 'train_steps_per_second': 0.232, 'total_flos': 1804684860518400.0, 'train_loss': 0.9486976008489728, 'epoch': 16.0})

In [10]:
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [12]:
DEVICE = "cuda:0"

In [9]:
def generate_response(question: str) -> str:
    prompt = f"""
            <human>: {question}
            <assistant>:
            """.strip()
    encoding = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=encoding.input_ids,
            attention_mask=encoding.attention_mask,
            generation_config=generation_config,
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    assistant_start = "<assistant>:"
    response_start = response.find(assistant_start)
    return response[response_start + len(assistant_start) :].strip()

## Uploading model to huggingface and then load for inferencing

In [None]:
prompt = "how do i download get fetch"
print(generate_response(prompt))

In [22]:
model.save_pretrained("trained-model")

In [23]:
model.push_to_hub(
    "cdy3870/Falcon-Fetch-Bot", use_auth_token=True
)

adapter_model.safetensors: 100%|██████████████████████████████████████████████████| 18.9M/18.9M [00:05<00:00, 3.76MB/s]


CommitInfo(commit_url='https://huggingface.co/cdy3870/Falcon-Fetch-Bot/commit/b60ad933f3506e36674c42526135fb52adbb362b', commit_message='Upload model', commit_description='', oid='b60ad933f3506e36674c42526135fb52adbb362b', pr_url=None, pr_revision=None, pr_num=None)

In [8]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

PEFT_MODEL = "cdy3870/Falcon-Fetch-Bot"

config = PeftConfig.from_pretrained(PEFT_MODEL)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True, cache_dir='E:\code_projects\cache'
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, cache_dir='E:\code_projects\cache')
tokenizer.pad_token = tokenizer.eos_token

model = PeftModel.from_pretrained(model, PEFT_MODEL, cache_dir='E:\code_projects\cache')



Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 2/2 [02:12<00:00, 66.03s/it]


In [20]:
prompt = "who is wes"
print(generate_response(prompt))



Wes Schroll is the founder and CEO of Fetch, and he is everyone's first friend on the Social tab of the Fetch app! If you want to learn more about Wes, follow his blogs, his Instagram, or his Twitter! Fetch is 100% safe to use. All the friend requests you receive through the app are from real people who want to be your friend!
             Fetch is 100% safe to use. All the friend requests you receive through the app are from real people who want to be your friend!
             All you need to do to protect yourself from fake friends is to verify your phone number, and you're good to start making new friends!
             Fetch is not a dating app;


In [9]:

import gradio as gr
import random
import time

generation_config = model.generation_config
generation_config.max_new_tokens = 150
generation_config.temperature = 0.6
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

with gr.Blocks() as demo:

    def update_temp(temp):
        generation_config.temperature = temp

    def update_tokens(tokens):
        generation_config.max_new_tokens = tokens
    
    chatbot = gr.Chatbot(label="Fetch Rewards Chatbot")
    temperature = gr.Slider(0, 1, value=0.6, step=0.1, label="Creativity", interactive=True)
    temperature.change(fn=update_temp, inputs=temperature)

    tokens = gr.Slider(50, 200, value=100, step=50, label="Length", interactive=True)
    tokens.change(fn=update_tokens, inputs=tokens)

    msg = gr.Textbox(label="", placeholder="Ask anything about Fetch!")
    clear = gr.Button("Clear Log")

    def user(user_message, history):
        return "", history + [[user_message, None]]

    def bot(history):
        
        message = history[-1][0]
        prompt = f"""
        <human>: {message}
        <assistant>:
        """.strip()
        
        result = pipeline(
            prompt,
            generation_config=generation_config,
        )
        # print(result)
        parsed_result = result[0]["generated_text"].split("<assistant>:")[1][1:].split("\n")[0]
            
        history[-1][1] = ""
        for character in parsed_result:
            history[-1][1] += character
            time.sleep(0.01)
            yield history

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)
    


In [10]:
demo.queue()
demo.launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


