Reference tutorial followed for this notebook: https://www.mlexpert.io/prompt-engineering/chatbot-with-local-llm-using-langchain#try-the-model

# Install and load packages

In [1]:
# Upgrade pip itself, then quietly install the pinned dependency set used by this notebook.
# NOTE(review): `%pip install` is generally preferred over `!pip install` inside a notebook
# so the packages land in the running kernel's environment — confirm before switching.
!pip install -Uqqq pip --progress-bar off
!pip install -qqq bitsandbytes==0.40.0 --progress-bar off
!pip install -qqq torch==2.0.1 --progress-bar off
!pip install -qqq transformers==4.30.0 --progress-bar off
!pip install -qqq accelerate==0.21.0 --progress-bar off
!pip install -qqq xformers==0.0.20 --progress-bar off
!pip install -qqq einops==0.6.1 --progress-bar off
!pip install -qqq langchain==0.0.233 --progress-bar off

[0m

### Hugging Face login

In [None]:
# Interactive Hugging Face Hub login; the token is entered at runtime, not stored in the notebook.
# presumably needed so the adapter/base model weights can be downloaded later — verify.
from huggingface_hub import notebook_login
notebook_login()

# Load model and start server

### Imports

In [None]:
import re
import warnings
from typing import List

import torch
from langchain import PromptTemplate
from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.llms import HuggingFacePipeline
from langchain.schema import BaseOutputParser
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
    pipeline,
)

warnings.filterwarnings("ignore", category=UserWarning)

# NOTE(review): this install is unpinned and sits in the middle of the import cell;
# consider pinning a version and moving it next to the other dependency installs.
!pip install peft
import bitsandbytes as bnb
from transformers import BitsAndBytesConfig
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training
)

### Load the fine-tuned model with QLoRA weights in a 4-bit configuration

In [None]:
# Adapter repository id; its PEFT config records which base model the adapter was trained on.
PEFT_MODEL = "arvind2626/f6"
config = PeftConfig.from_pretrained(PEFT_MODEL)

# 4-bit NF4 quantization with double quantization, computing in bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# The tokenizer comes from the base model; the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

# Load the quantized base model first, then wrap it with the adapter weights.
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    return_dict=True,
)
model = PeftModel.from_pretrained(base_model, PEFT_MODEL)

### Generation config used by the model

In [4]:
# Reuse the model's own generation config object and override selected fields on it.
generation_config = model.generation_config

# Overrides are applied in this order; EOS is used for both padding and end-of-sequence.
# NOTE(review): `do_sample` is not set here, so whether temperature/top_p take effect
# depends on the config's existing value — confirm the intended decoding mode.
_generation_overrides = {
    "max_new_tokens": 900,
    "temperature": 0,
    "top_p": 0.7,
    "num_return_sequences": 1,
    "pad_token_id": tokenizer.eos_token_id,
    "eos_token_id": tokenizer.eos_token_id,
}
for _option_name, _option_value in _generation_overrides.items():
    setattr(generation_config, _option_name, _option_value)

### Stopping criteria that halt generation on specific tags

In [5]:
class StopGenerationCriteria(StoppingCriteria):
    """Stop generation once the sequence ends with one of the given token sequences.

    Args:
        tokens: Stop sequences, each given as a list of token strings.
        tokenizer: Tokenizer whose ``convert_tokens_to_ids`` maps each token
            string to its id.
        device: Device on which the stop-id tensors are created.
    """

    def __init__(
        self, tokens: List[List[str]], tokenizer: AutoTokenizer, device: torch.device
    ):
        stop_token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]
        self.stop_token_ids = [
            torch.tensor(x, dtype=torch.long, device=device) for x in stop_token_ids
        ]

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
    ) -> bool:
        """Return True when the first sequence in ``input_ids`` ends with a stop sequence.

        Only ``input_ids[0]`` is inspected; ``scores`` and extra kwargs are unused.
        """
        sequence = input_ids[0]
        for stop_ids in self.stop_token_ids:
            stop_len = len(stop_ids)
            # An empty stop sequence can never match, and a sequence shorter than
            # the stop sequence would otherwise yield a shorter slice that
            # torch.eq silently broadcasts against the stop ids.
            if stop_len == 0 or sequence.shape[-1] < stop_len:
                continue
            # Compare on the device of the generated ids so a device mismatch
            # between model shards cannot raise here.
            tail = sequence[-stop_len:]
            if torch.eq(tail, stop_ids.to(tail.device)).all():
                return True
        return False

In [6]:
# Each stop sequence is the speaker name followed by a colon, as separate tokens.
stop_tokens = [["Human", ":"], ["AI", ":"]]

# Build the single custom criterion, then wrap it in the list type generation expects.
speaker_tag_criterion = StopGenerationCriteria(stop_tokens, tokenizer, model.device)
stopping_criteria = StoppingCriteriaList([speaker_tag_criterion])

### Initialize generation pipeline

In [None]:
# Text-generation pipeline wired to the adapter-wrapped model, its tokenizer,
# the generation overrides and the speaker-tag stopping criteria.
generation_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    generation_config=generation_config,
    stopping_criteria=stopping_criteria,
    # presumably the LangChain wrapper strips the prompt itself — verify before changing.
    return_full_text=True,
)

# Expose the pipeline to LangChain as an LLM.
llm = HuggingFacePipeline(pipeline=generation_pipeline)

### Clean the final output

In [8]:
class CleanupOutputParser(BaseOutputParser):
    """Strip leftover speaker tags from the generated text."""

    def parse(self, text: str) -> str:
        """Remove ``\\nUser``, ``\\nHuman:`` and ``\\nAI:`` tags and trim whitespace.

        Args:
            text: Raw text produced by the LLM.

        Returns:
            The text with the speaker tags removed and surrounding whitespace stripped.
        """
        # The AI pattern used to be r"\AI:", which the regex engine reads as the
        # start-of-string anchor \A followed by "I:"; it never matched "\nAI:".
        for tag_pattern in (r"\nUser", r"\nHuman:", r"\nAI:"):
            text = re.sub(tag_pattern, "", text)
        return text.strip()

    @property
    def _type(self) -> str:
        """Identifier string for this parser type."""
        return "output_parser"

### Template given to the AI chatbot

In [9]:
# Hard-coded sample of the user's past purchases, formatted as a parenthesised,
# comma-separated list that is pasted verbatim into the prompt text.
purchase_history = "(Blue Denim Jacket, White Graphic T-Shirt, Red Plaid Flannel Shirt, Gray Hooded Sweatshirt, Navy Blue Chinos, Striped Polo Shirt, Brown Leather Belt, Floral Print Sundress)"
# Will fetch trends generated from scraper here
# Until then, a hard-coded list of trending items in the same format.
trends = "(Chunky Sneakers, Oversized Blazers, High-Waisted Jeans, Bucket Hats, Midi Dresses, Graphic Tees, Sustainable Fashion Pieces, Statement Earrings, Fanny Packs (Belt Bags), Wrap Dresses)"

In [10]:
template = """
System: This is a conversation between a human and an outfit recommendation AI. The AI does a conversation with the user to get to know their various details such as age, height, location, ocassion, preferences. Based on these details it suggests an outfit to the human.

This is a list of past purchases of the human
"""+purchase_history+"""\n

This is a list of trending outfits
"""+trends+"""\n
The AI considers the human's purchase history and current trending outfits and can suggest outfits which might be similar. Outfits need not necessarily belong to these.


The AI must always suggest 3 outfits in collections of topwear, bottomwear, shoes, accesories in the following format:
Outfit 1:
Top: Name of cloth: short description
Bottom: Name of cloth: short description
Footwear:...
Accesories:...

Outfit 2:
Top: ...
Bottom: ...
and so on

The AI does not suggest brands or price. It only suggest different types of clothes

The AI doesn't immediately suggest an outfit. It does a conversation/chat with the human for getting various details. It suggests the outfit only when the Human asks so. It suggests outfit strictly in the pattern mentioned above.


Current conversation:

{history}

Human: {input}
AI:
""".strip()

prompt = PromptTemplate(input_variables=["history", "input"], template=template)

### Start the LangChain conversation chain with Conversation Buffer Window Memory

In [13]:
# Windowed conversation memory exposed to the prompt under the "history" key, with k=6.
# NOTE(review): `return_only_outputs` is passed to the memory constructor here —
# confirm it is a recognised option for this memory class.
memory = ConversationBufferWindowMemory(
    k=6,
    memory_key="history",
    return_only_outputs=True,
)

# Conversation chain tying together the prompt, the LLM, the memory and output cleanup.
chain = ConversationChain(
    prompt=prompt,
    llm=llm,
    memory=memory,
    output_parser=CleanupOutputParser(),
    verbose=True,
)

In [None]:
# Interactive chat loop: read a line from the user, run it through the chain and
# print the cleaned response. Type "exit" or "quit" (or interrupt / send EOF) to stop.
while True:
    try:
        text = input()
    except (EOFError, KeyboardInterrupt):
        # End the session cleanly instead of surfacing a traceback.
        break
    user_message = text.strip()
    if user_message.lower() in {"exit", "quit"}:
        break
    if not user_message:
        # Skip blank lines rather than sending an empty turn to the model.
        continue
    res = chain(text)
    print(res["response"])