Library installations and imports

In [None]:
%%capture
!pip install sentence_transformers
!pip install ctransformers
!pip install gradio
!pip install langchain_pinecone
!pip install pinecone-client
!pip install langchain_community
!pip install langchain
!pip install pinecone
!pip install sentence_transformers

In [None]:
from google.colab import drive
import pandas as pd
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
from concurrent.futures import ThreadPoolExecutor
import os
import pinecone
from pinecone import Pinecone
from transformers import AutoTokenizer, AutoModel
from pinecone import ServerlessSpec
import torch
import re
from transformers import AutoModelForCausalLM
import gradio as gr
from ctransformers import AutoModelForCausalLM, AutoConfig

Indexing RAG-based data in Pinecone

In [None]:
# Mount Google Drive so the dataset stored there is readable from the notebook
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


# Load the raw anime dataset into a DataFrame
ANIME_CSV_PATH = '/content/drive/MyDrive/anime_data.csv'
df = pd.read_csv(ANIME_CSV_PATH)


#Pre-processing
# Convert 'Synopsis' and 'Genre' columns from list to paragraph
def list_to_paragraph(value):
    """Flatten a stringified Python list (e.g. "['a', 'b']") into "a, b".

    Args:
        value: A cell value. Only ``str`` values are transformed; anything
            else (NaN, numbers, ...) is returned unchanged.

    Returns:
        The list items joined with ", " when ``value`` is a well-formed list
        literal. Any other string goes through the legacy clean-up (outer
        brackets stripped, "', '" turned into ", ", single quotes removed).
    """
    if not isinstance(value, str):
        return value

    # Local import keeps this helper self-contained within the cell.
    import ast

    # Parsing the literal keeps apostrophes inside items intact. The plain
    # string replacement below mangles items such as "It's ..." because repr()
    # wraps those in double quotes.
    try:
        parsed = ast.literal_eval(value.strip())
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None

    if isinstance(parsed, list):
        return ", ".join(str(item) for item in parsed)

    # Legacy behaviour for anything that is not a well-formed list literal.
    return value.strip("[]").replace("', '", ", ").replace("'", "")

# Flatten the stringified lists in 'Synopsis' and 'Genre' into plain text
df['Synopsis'] = df['Synopsis'].apply(list_to_paragraph)
df['Genre'] = df['Genre'].apply(list_to_paragraph)


#Lowercase every text (object-dtype) column
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = df[column].str.lower()


#drop duplicates and extra columns
# NOTE(review): drop_duplicates treats missing synopses as equal to each other,
# so only the first row without a synopsis survives - confirm this is intended.
df = df.drop_duplicates(subset=['Synopsis'])
df = df.drop(['Rank', 'Popularity', 'Members'], axis=1)


#Fill null values with hyphen
df = df.fillna('-')
# The null count used to be computed and thrown away; assert the invariant instead.
assert not df.isnull().values.any(), "null values remain after fillna"


#Making it family friendly
# .copy() gives an independent frame, so adding 'combined' below does not
# write into a slice of the unfiltered data.
df = df[~df['Genre'].str.contains('hentai', regex=False, na=False)].copy()


#Concatenating into a single column with headers
df['combined'] = df.apply(lambda row: f"Title: {row['Title']}. Type: {row['Type']}. Status: {row['Status']}. Score: {row['Score']}. Studios: {row['Studios']}. Source: {row['Source']}. Genre: {row['Genre']}. Rated: {row['Rated']}. Synopsis: {row['Synopsis']}.", axis=1)


# Split each row's combined text into chunks, then flatten them in row order
text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=100)
df['chunks'] = df['combined'].apply(text_splitter.split_text)
chunks = []
for row_chunks in df['chunks']:
    chunks.extend(row_chunks)


# Using mpnet embedding
model = SentenceTransformer('all-mpnet-base-v2')
def embed_chunk(chunk):
    """Embed a single chunk and return it as a tensor (kept for ad-hoc use)."""
    return model.encode(chunk, convert_to_tensor=True)


# Encode every chunk in one batched call instead of one encode() per chunk
# spread over threads: the threads all shared the same model and each call
# processed a single text, so no batching ever happened.
# encode() returns a 2-D numpy array; list() turns it into one 1-D array per
# chunk, the same shape of data the rest of the notebook expects.
embeddings = list(
    model.encode(
        chunks,
        batch_size=32,
        convert_to_numpy=True,
        show_progress_bar=True,
    )
)


#Initializing Pinecone
# Read the key from the PINECONE_API_KEY environment variable so the secret is
# not committed with the notebook; the placeholder is only a fallback.
pinecone_client = Pinecone(
    api_key=os.environ.get('PINECONE_API_KEY', 'YOUR_API_KEY_HERE')
)
index_name = 'anime-index'
# Create the index only when it does not exist yet; its dimension is taken from
# the first embedding so it always matches the embedding model.
if index_name not in pinecone_client.list_indexes().names():
    pinecone_client.create_index(index_name,
                                 dimension=embeddings[0].shape[0],
                                 metric="cosine",
                                 spec=ServerlessSpec(
                                     cloud="aws",
                                     region="us-east-1")
    )
index = pinecone_client.Index(index_name)


#Creating Vector and Metadata to store
METADATA_COLUMNS = ['Title', 'Type', 'Status', 'Score', 'Studios', 'Source', 'Genre', 'Rated', 'Synopsis']


def _to_native(value):
    """Return numpy scalars as plain Python values; leave anything else untouched."""
    return value.item() if hasattr(value, 'item') else value


def batch_upsert(index, vectors, batch_size=100):
    """Upsert ``vectors`` into ``index`` in slices of ``batch_size``.

    Args:
        index: Object exposing ``upsert(vectors=...)``.
        vectors: List of ``(id, values, metadata)`` tuples.
        batch_size: Number of vectors sent per request.
    """
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size])


# There is one embedding per chunk, not per row. Record which DataFrame row
# each chunk came from (same flattening order as `chunks`) so every vector gets
# the metadata of its own row even when a row yields zero or several chunks.
chunk_row_positions = [
    row_pos
    for row_pos, row_chunks in enumerate(df['chunks'])
    for _ in row_chunks
]
num_rows = len(df)
vectors = []
for i, (row_pos, embedding) in enumerate(zip(chunk_row_positions, embeddings)):
    row = df.iloc[row_pos]
    metadata = {column: _to_native(row[column]) for column in METADATA_COLUMNS}
    # Send plain lists of floats rather than numpy arrays.
    values = embedding.tolist() if hasattr(embedding, 'tolist') else list(embedding)
    vectors.append((str(i), values, metadata))


# Upsert embeddings with metadata in batches
batch_upsert(index, vectors)

LLM GENERATION

In [None]:
# Initialize Pinecone
def initialize_pinecone(api_key, index_name):
    """Return a handle to the Pinecone index ``index_name``.

    Args:
        api_key: Pinecone API key used to build the client.
        index_name: Name of the index to open.

    Returns:
        The object returned by ``Pinecone(...).Index(index_name)``.
    """
    return Pinecone(api_key=api_key).Index(index_name)


# Load LLAMA Smart Query model
def load_query_refinement_model(model_path, model_file, device):
    """Load the LLaMA model used to refine user queries.

    Args:
        model_path: Model repository / path handed to ``from_pretrained``.
        model_file: Model file name inside ``model_path``.
        device: Value stored on the config's ``device`` attribute.

    Returns:
        The model built by ``AutoModelForCausalLM.from_pretrained``.
    """
    llama_config = AutoConfig.from_pretrained(
        model_path,
        context_length=4096,
        gpu_layers=50,
    )
    llama_config.device = device

    loader_kwargs = {
        "model_file": model_file,
        "model_type": "llama",
        "config": llama_config,
        "hf": False,
    }
    return AutoModelForCausalLM.from_pretrained(model_path, **loader_kwargs)


#Load LLAMA QA Model
def load_models(model_path, model_file, embedding_model_name, device):
    """Load the LLaMA QA model together with the sentence-embedding model.

    Args:
        model_path: Model repository / path handed to ``from_pretrained``.
        model_file: Model file name inside ``model_path``.
        embedding_model_name: SentenceTransformer model name.
        device: Device the embedding model is moved to; also stored on the
            LLaMA config.

    Returns:
        Tuple ``(model, embedding_model)``.
    """
    # The QA model is built exactly like the query-refinement model, so reuse
    # that loader instead of keeping a second copy of the same code.
    model = load_query_refinement_model(model_path, model_file, device)
    embedding_model = SentenceTransformer(embedding_model_name).to(device)
    return model, embedding_model


# Smart Query with LLaMA
def refine_query_with_llama(model, concatenated_query):
    """Ask the model to fix the grammar of ``concatenated_query``.

    Args:
        model: Callable invoked as ``model(prompt, stream=True)`` that yields
            text pieces.
        concatenated_query: Raw query text to be corrected.

    Returns:
        All streamed pieces joined together, with surrounding whitespace
        stripped.
    """
    sys_prompt = """
    Correct the grammar of the following query.

    Do not add any new information about the anime.

    Provide only the corrected query as a single line.

    Do not include starting text telling what you did.
    """
    prompt = f"<s>[INST] <<SYS>>\n{sys_prompt}\n<</SYS>>\n{concatenated_query} [/INST]"

    # Drain the stream and glue the pieces back together.
    streamed_pieces = model(prompt, stream=True)
    return "".join(streamed_pieces).strip()


# Embed query
def embed_query(query, embedding_model):
    """Embed ``query`` and return the vector as a plain Python list.

    Args:
        query: Text to embed.
        embedding_model: Object whose ``encode(query, convert_to_tensor=True)``
            returns a tensor.

    Returns:
        The embedding converted via ``.cpu().numpy().tolist()``.
    """
    query_tensor = embedding_model.encode(query, convert_to_tensor=True)
    query_array = query_tensor.cpu().numpy()
    return query_array.tolist()


# Search Pinecone
def search_pinecone(index, embedded_query, top_k=3):
    """Query ``index`` with an embedded query and return its matches.

    Args:
        index: Object exposing ``query(vector=..., top_k=..., include_metadata=...)``.
        embedded_query: Query vector.
        top_k: Number of matches requested.

    Returns:
        The ``'matches'`` entry of the query response.
    """
    query_kwargs = {
        "vector": embedded_query,
        "top_k": top_k,
        "include_metadata": True,
    }
    response = index.query(**query_kwargs)
    return response['matches']


# Generate response with LLaMA
def generate_response_with_llama(model, conversation_history):
    """Build the chat prompt from the history and start streaming an answer.

    Args:
        model: Callable invoked as ``model(prompt, stream=True)``.
        conversation_history: Sequence of ``(user_msg, assistant_msg)`` pairs.
            Every pair but the last is rendered as a finished exchange; the
            last contributes only its user message.

    Returns:
        Whatever ``model(prompt, stream=True)`` returns.
    """
    sys_prompt = """
    You are an assistant that provides precise information (up to two lines) about anime based on user queries.
    You will receive 3 different contexts. Choose the one closest to the user query without mentioning the contexts in your response.
    """
    segments = [f"<s>[INST] <<SYS>>\n{sys_prompt}\n<</SYS>>\n"]

    final_turn = len(conversation_history) - 1
    for turn_no, (user_msg, assistant_msg) in enumerate(conversation_history):
        if turn_no == final_turn:
            # Open turn: the model is expected to continue from here.
            segments.append(f"{user_msg} [/INST] ")
        else:
            segments.append(f"{user_msg} [/INST] {assistant_msg} </s>\n<s>[INST] ")

    return model("".join(segments).strip(), stream=True)


# Handle user query and maintain history
def handle_user_query(user_query, conversation_history, model, embedding_model, index,refinement_model, max_tokens=2000):
    """Retrieve context for ``user_query`` and start generating the answer.

    Args:
        user_query: The new user message.
        conversation_history: List of ``(user_msg, assistant_msg)`` pairs. A
            new pair for this turn is appended to it in place.
        model: Model used to generate the answer.
        embedding_model: Model used to embed the refined query.
        index: Pinecone index that is searched.
        refinement_model: Model used to refine the retrieval query.
        max_tokens: Budget compared against the summed *character* lengths of
            all history messages plus 56.

    Returns:
        The value returned by ``generate_response_with_llama``.
    """
    # Retrieval query: up to the three latest user messages, the latest
    # assistant message (if any) and the new question.
    recent_turns = conversation_history[-3:]
    try:
        previous_reply = conversation_history[-1][1]
    except IndexError:
        previous_reply = ""

    query_parts = [turn[0] for turn in recent_turns]
    if previous_reply:
        query_parts.append(previous_reply)
    query_parts.append(user_query)
    concatenated_query = " ".join(query_parts)

    refined_query = refine_query_with_llama(refinement_model, concatenated_query)
    matches = search_pinecone(index, embed_query(refined_query, embedding_model))

    # One line of context per retrieved match.
    context_lines = []
    for match in matches:
        meta = match['metadata']
        context_lines.append(
            f"Title: {meta['Title']}. Type: {meta['Type']}. Status: {meta['Status']}. Score: {meta['Score']}. Studios: {meta['Studios']}. Source: {meta['Source']}. Genre: {meta['Genre']}. Rated: {meta['Rated']}. Synopsis: {meta['Synopsis']}.\n"
        )
    context_info = "".join(context_lines)

    # The caller's list is extended in place with this turn.
    conversation_history.append(
        (f"User: {user_query}", f"Context: {context_info}\nAssistant:")
    )

    # Size is measured in characters; 56 is a fixed allowance whose origin is
    # not visible here (presumably prompt scaffolding - TODO confirm).
    history_size = 56
    for user_msg, assistant_msg in conversation_history:
        history_size += len(user_msg) + len(assistant_msg)

    if history_size > max_tokens:
        turn_count = len(conversation_history)
        # Keep the first 20% and the last 50% of the turns (at least one each).
        head = conversation_history[:max(1, int(turn_count * 0.2))]
        tail = conversation_history[-max(1, int(turn_count * 0.5)):]
        # Avoid repeating a turn that sits at the seam of both parts.
        if head[-1] == tail[0]:
            tail = tail[1:]
        # Only the local name is rebound; the caller's list is not trimmed.
        conversation_history = head + tail

    return generate_response_with_llama(model, conversation_history)


# Gradio interface function
def gradio_interface(user_query, history):
    """Stream the assistant's answer into the Gradio chat history.

    Args:
        user_query: Text submitted by the user.
        history: Chat history shown in the Chatbot component; a new entry for
            this turn is appended to it.

    Yields:
        ``(history, "")`` pairs - the updated chat history and an empty string
        for the textbox output.
    """
    # Update conversation history with the user's query
    history.append((user_query, ""))
    # Show the user's message and clear the textbox right away. Without this,
    # nothing is displayed until the first token arrives, and nothing at all
    # when the model yields no tokens.
    yield history, ""
    # Generate the response using the model
    response_generator = handle_user_query(user_query, conversation_history, model, embedding_model, index, model1)
    # Accumulate the streamed response
    accumulated_response = ""
    for response_part in response_generator:
        accumulated_response += response_part
        # Update the last entry in the history with the current accumulated response
        history[-1] = (history[-1][0], accumulated_response)
        yield history, ""
    # Store the final answer in the module-level history in place of the
    # retrieval context that handle_user_query put there.
    if conversation_history:
        conversation_history[-1] = (conversation_history[-1][0], accumulated_response)
    else:
        conversation_history.append((f"User: {user_query}", accumulated_response))


#New Chat function
def clear_history():
    """Start a new chat: reset the module-level history and return an empty list."""
    global conversation_history
    conversation_history = list()
    return list()


# Initialize
# Read the key from the PINECONE_API_KEY environment variable so the secret is
# not committed with the notebook; the placeholder is only a fallback.
API_KEY = os.environ.get('PINECONE_API_KEY', 'YOUR_PINECONE_API_KEY')
INDEX_NAME = 'anime-index'
MODEL_PATH = "TheBloke/Llama-2-7B-Chat-GGUF"
MODEL_FILE = "llama-2-7b-chat.Q5_K_M.gguf"
EMBEDDING_MODEL_NAME = 'all-mpnet-base-v2'
# Fall back to the CPU when no CUDA device is available instead of failing.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

index = initialize_pinecone(API_KEY, INDEX_NAME)
model, embedding_model = load_models(MODEL_PATH, MODEL_FILE, EMBEDDING_MODEL_NAME, DEVICE)
# NOTE(review): this loads the same model file a second time for query
# refinement - confirm a separate instance is really required.
model1 = load_query_refinement_model(MODEL_PATH, MODEL_FILE,DEVICE)
# Module-level chat history shared by gradio_interface and clear_history.
conversation_history = []


# Gradio interface run
# Chat UI: a chat history view, a textbox for the query and a "New chat" button.
with gr.Blocks() as demo:
    with gr.Column():
        chatbot = gr.Chatbot(label="Chat History")
        user_input = gr.Textbox(label="User Query")
        clear_button = gr.Button("New chat")

        # Submitting the textbox runs the generator gradio_interface; each
        # yielded pair updates the chatbot and the textbox.
        user_input.submit(gradio_interface, inputs=[user_input, chatbot], outputs=[chatbot, user_input])
        # "New chat" resets the module-level history and empties the chatbot.
        clear_button.click(fn=clear_history, outputs=[chatbot])

# NOTE(review): share=True presumably requests a public share link - confirm
# that exposing the app publicly is intended.
demo.launch(share=True, debug=False)