In [None]:
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import nltk
from transformers import GPTJForCausalLM, GPT2Tokenizer
import torch

# Download NLTK tokenization data
nltk.download('punkt')

# Sample menu data
menu_data = [
    {
        "itemId": "008341a8-e73c-4400-9143-4521f9e1befd",
        "itemName": "Rava Kichadi",
        "description": "A healthy breakfast option made with semolina and vegetables lightly tempered with spices cooked to perfection",
        "subCategory": "South Indian Favorites",
        "specialInstructions": "MAKE IT LITTLE SPICY",
        "allergicInfo": "NUTS and FISH.",
        "price": "20"
    },
    {
        "itemId": "01cb3741-3755-4c98-a5ea-0262d1948d59",
        "itemName": "South Indian Thali",
        "description": "Steamed rice, sambar, rasam, kootu, poriyal, kuzhambu, yogurt, appalam, chapati, kurma, pickle & Sweet",
        "subCategory": "Thali's",
        "specialInstructions": "This is spl instruction",
        "allergicInfo": "The item has allergic content.",
        "price": "26"
    }
]

# Preprocess menu data
def preprocess_menu(menu_data):
    return [
        f"{item['itemName']} {item['description']} {item.get('specialInstructions', '')} {item.get('allergicInfo', '')} {item.get('price', '')}"
        for item in menu_data
    ]

# Tokenize the menu data
tokenized_menu = [word_tokenize(doc.lower()) for doc in preprocess_menu(menu_data)]

# Create BM25 object
bm25 = BM25Okapi(tokenized_menu)

def bm25_retrieve(query, top_n=3):
    tokenized_query = word_tokenize(query.lower())
    scores = bm25.get_scores(tokenized_query)
    top_n_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]
    retrieved_items = [menu_data[i] for i in top_n_indices]
    return retrieved_items

# Load GPT-J model and tokenizer
model_name = "EleutherAI/gpt-j-6B"
model = GPTJForCausalLM.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

def generate_response(context, query):
    input_text = f"Context:\n{context}\nQuestion: {query}\nAnswer:"

    # Tokenize input
    inputs = tokenizer(input_text, return_tensors="pt")

    # Adjust max_length based on input size
    input_length = inputs.input_ids.size(1)
    max_length = input_length + 100  # Adding extra tokens for generation

    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,  # Increase max_length
            num_beams=4,
            temperature=0.7,
            early_stopping=True,
            pad_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


def conversational_bot(query):
    # Retrieve relevant menu items using BM25
    retrieved_items = bm25_retrieve(query)

    # Format the context to be clear and concise
    context = "\n---\n".join([
        f"Item Name: {item['itemName']}\nDescription: {item['description']}\nSpecial Instructions: {item.get('specialInstructions', '')}\nAllergic Info: {item.get('allergicInfo', '')}\nPrice: {item.get('price', '')}"
        for item in retrieved_items
    ])

    # Generate response using GPT-J based on the context
    response = generate_response(context, query)

    return response

# Example queries
query1 = "give price of rava kichadi?"
response1 = conversational_bot(query1)
print("Response for query 1:", response1)

query2 = "does rava kichadi have nuts?"
response2 = conversational_bot(query2)
print("Response for query 2:", response2)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Response for query 1: Context:
Item Name: Rava Kichadi
Description: A healthy breakfast option made with semolina and vegetables lightly tempered with spices cooked to perfection
Special Instructions: MAKE IT LITTLE SPICY
Allergic Info: NUTS and FISH.
Price: 20
---
Item Name: South Indian Thali
Description: Steamed rice, sambar, rasam, kootu, poriyal, kuzhambu, yogurt, appalam, chapati, kurma, pickle & Sweet
Special Instructions: This is spl instruction
Allergic Info: The item has allergic content.
Price: 26
Question: give price of rava kichadi?
Answer: 20

Context:
Item Name: Rava Kichadi
Description: A healthy breakfast option made with semolina and vegetables lightly tempered with spices cooked to perfection
Special Instructions: MAKE IT LITTLE SPICY
Allergic Info: NUTS and FISH.
Price: 20
---
Item Name: South Indian Thali
Description: Steamed rice, sambar, rasam, kootu, poriyal, kuzhambu,
Response for query 2: Context:
Item Name: Rava Kichadi
Description: A healthy breakfast option

In [20]:
from collections import defaultdict
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi
import nltk

# Download NLTK data for tokenization
nltk.download('punkt')

# Sample menu data (can be expanded with your larger dataset)
menu_data = [
    {
        "itemId": "1",
        "itemName": "Chicken Curry",
        "description": "Hot and spicy chicken dish",
        "specialInstructions": "Extra spicy",
        "allergicInfo": "Contains nuts",
        "price": "15"
    },
    {
        "itemId": "2",
        "itemName": "Mild Chicken Pasta",
        "description": "Creamy pasta with chicken",
        "specialInstructions": "No garlic",
        "allergicInfo": "Dairy",
        "price": "12"
    },
    {
        "itemId": "3",
        "itemName": "Spicy Pasta",
        "description": "Pasta with a spicy tomato sauce",
        "specialInstructions": "",
        "allergicInfo": "",
        "price": "10"
    }
]

# Step 1: Tokenize the documents (menu items)
def tokenize_document(document):
    return word_tokenize(document.lower())

# Step 2: Preprocess menu data into a list of combined item details
def preprocess_menu(menu_data):
    return [
        f"{item['itemName']} {item['description']} {item.get('specialInstructions', '')} {item.get('allergicInfo', '')} {item.get('price', '')}"
        for item in menu_data
    ]

# Step 3: Build an inverted index
def build_inverted_index(menu_data):
    inverted_index = defaultdict(list)
    for doc_id, item in enumerate(menu_data):
        # Tokenize each menu item
        text = f"{item['itemName']} {item['description']} {item.get('specialInstructions', '')} {item.get('allergicInfo', '')}"
        tokens = tokenize_document(text)
        for token in set(tokens):  # Use set to avoid duplicate entries for the same word
            inverted_index[token].append(doc_id)
    return inverted_index

# Step 4: Calculate document frequencies (DF) for each token in the inverted index
def document_frequencies(inverted_index):
    return {term: len(doc_ids) for term, doc_ids in inverted_index.items()}

# Step 5: Retrieve relevant documents using inverted index
def retrieve_documents(query, inverted_index):
    query_tokens = tokenize_document(query)
    relevant_docs = set()

    # Collect documents that contain at least one query term
    for token in query_tokens:
        if token in inverted_index:
            relevant_docs.update(inverted_index[token])

    return list(relevant_docs)

# Step 6: Rank the retrieved documents using BM25
def rank_documents(query, menu_data, relevant_doc_ids):
    # Preprocess the menu data
    preprocessed_menu = preprocess_menu(menu_data)

    # Tokenize the menu data for BM25
    tokenized_menu = [tokenize_document(doc) for doc in preprocessed_menu]

    # Create a BM25 object
    bm25 = BM25Okapi(tokenized_menu)

    # Tokenize the query
    tokenized_query = tokenize_document(query)

    # Get BM25 scores for relevant documents only
    scores = bm25.get_scores(tokenized_query)

    # Filter scores for only the relevant documents
    relevant_scores = [(doc_id, scores[doc_id]) for doc_id in relevant_doc_ids]

    # Sort documents by their BM25 score in descending order
    ranked_docs = sorted(relevant_scores, key=lambda x: x[1], reverse=True)

    # Return the ranked documents and their scores
    return ranked_docs

# Build the inverted index
inverted_index = build_inverted_index(menu_data)
doc_freqs = document_frequencies(inverted_index)

# Example query
query = "give me price of chicken?"

# Retrieve relevant documents based on the query
relevant_docs = retrieve_documents(query, inverted_index)
print(f"Relevant document IDs for query '{query}':", relevant_docs)

# Rank the retrieved documents using BM25
ranked_docs = rank_documents(query, menu_data, relevant_docs)
print(f"Ranked document IDs and scores for query '{query}':", ranked_docs)

# Display the ranked results (menu items)
print("\nRanked Menu Items:")
for doc_id, score in ranked_docs:
    print(f"Item Name: {menu_data[doc_id]['itemName']}, BM25 Score: {score}")


Relevant document IDs for query 'give me price of chicken?': [0, 1]
Ranked document IDs and scores for query 'give me price of chicken?': [(1, 0.11494217953147351), (0, 0.11161230952358747)]

Ranked Menu Items:
Item Name: Mild Chicken Pasta, BM25 Score: 0.11494217953147351
Item Name: Chicken Curry, BM25 Score: 0.11161230952358747


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
from transformers import GPTJForCausalLM, GPT2Tokenizer
import torch

# Load GPT-J model and tokenizer (make sure you have installed necessary packages and model)
model_name = "EleutherAI/gpt-j-6B"
model = GPTJForCausalLM.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

def generate_gpt_response(query, context):
    input_text = f"Context:\n{context}\nUser Query: {query}\nAnswer:"

    # Tokenize the input
    inputs = tokenizer(input_text, return_tensors="pt")

    # Generate the response
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=150, num_beams=5, temperature=0.7, early_stopping=True)

    # Decode the generated response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Generate context based on ranked documents
context = "\n".join([f"Item Name: {menu_data[doc_id]['itemName']}\nDescription: {menu_data[doc_id]['description']}\nPrice: {menu_data[doc_id]['price']}" for doc_id, _ in ranked_docs])

# Generate the GPT response
gpt_response = generate_gpt_response(query, context)
print(gpt_response)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Context:
Item Name: Mild Chicken Pasta
Description: Creamy pasta with chicken
Price: 12
Item Name: Chicken Curry
Description: Hot and spicy chicken dish
Price: 15
User Query: give me price of chicken?
Answer: 12

Context:
Item Name: Mild Chicken Pasta
Description: Creamy pasta with chicken
Price: 12
Item Name: Chicken Curry
Description: Hot and spicy chicken dish
Price: 15
User Query: give me price of chicken?
Answer: 12

Context:
Item Name: Mild Chicken Pasta
Description: Creamy pasta with chicken
Price: 12
Item Name: Chicken Curry
Description: Hot and spicy chicken dish
Price


In [19]:
def generate_template_response(query, ranked_docs, menu_data):
    if not ranked_docs:
        return "Sorry, we don't have anything that matches your query."

    # Construct a response using the top-ranked items
    response = "Here are some items that might interest you:\n"

    for doc_id, score in ranked_docs:
        item = menu_data[doc_id]
        response += f"\n- {item['itemName']}: {item['description']} (Price: ${item['price']})"
        if item['specialInstructions']:
            response += f" | Special Instructions: {item['specialInstructions']}"
        if item['allergicInfo']:
            response += f" | Allergic Info: {item['allergicInfo']}"

    response += "\nWould you like more information about any of these?"
    return response

# Generate a response based on the ranked documents
response = generate_template_response(query, ranked_docs, menu_data)
print(response)


Here are some items that might interest you:

- Spicy Pasta: Pasta with a spicy tomato sauce (Price: $10)
- Mild Chicken Pasta: Creamy pasta with chicken (Price: $12) | Special Instructions: No garlic | Allergic Info: Dairy
Would you like more information about any of these?
