Using any OpenAI model or a Groq (Llama 70B) model, solve the aisle-mapping problem. OpenAI code is provided here. You will do the usual three setup steps (mounting Google Drive, setting your API key, and installing one of the LLM models) and then import pandas. Your goal is to do a model comparison and validation.
**The code below works for the OpenAI gpt-4o model; it has not been tested on the Llama model. Also, running this code on the dataset may blow your budget if you are not careful with the size of the test dataset, so exercise caution.**

In [1]:
# Mount Google Drive at /content/drive/ so that files stored on Drive
# (the requirements.txt and .bashrc read by later cells) are reachable by path.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
!pip install -qqq -r "/content/drive/My Drive/LLMProjects/requirements.txt"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.5/325.5 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.0/974.0 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.5/103.5 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.7/314.7 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.2/125.2 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━

In [3]:
from groq import Groq
import os

# Read and set the environment variables from the .bashrc file:
# every `export NAME=value` line is copied into os.environ.
with open('/content/drive/My Drive/LLMProjects/.bashrc') as file:
    for line in file:
        line = line.strip()
        if not line.startswith('export '):
            continue
        # Split on the FIRST '=' only: a value (e.g. a base64-padded key) may
        # itself contain '=', which made a plain split('=') fail to unpack.
        var, sep, value = line[len('export '):].partition('=')
        var = var.strip()
        if not sep or not var:
            # `export NAME` without an assignment carries no value to load.
            continue
        # Drop surrounding quotes, as a shell would when sourcing the file.
        os.environ[var] = value.strip().strip('"\'')

In [17]:
client2 = Groq()


def get_completion(prompt, model="llama3-8b-8192"):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the one user message of the conversation.
        model: Model identifier passed through to the chat-completions call.

    Returns:
        The `content` of the first choice's message.
    """
    completion = client2.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
        temperature=0,  # degree of randomness of the model's output
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [18]:
import openai
import os

# Path to your .bashrc file
bashrc_path = '/content/drive/MyDrive/LLMProjects/.bashrc'

# Start from any key already present in the environment so the name is always
# bound; previously a .bashrc without the export line raised NameError before
# the "not found" check below could run.
openai_api_key = os.getenv('OPENAI_API_KEY')

# Read the .bashrc file and extract the API key (the last matching line wins)
with open(bashrc_path, 'r') as file:
    for line in file:
        if line.startswith('export OPENAI_API_KEY='):
            # partition keeps everything after the first '=', so a key that
            # itself contains '=' is not truncated.
            openai_api_key = line.partition('=')[2].strip().strip('"\'')

if not openai_api_key:
    raise ValueError("OpenAI API key not found. Please set it in the environment variables.")

# Expose the key both through the environment and on the openai module
os.environ['OPENAI_API_KEY'] = openai_api_key
openai.api_key = openai_api_key

# Confirm success WITHOUT echoing the secret: notebook output is saved and
# shared together with the code.
print("OpenAI API key has been set successfully.")

OpenAI API key has been set successfully: [REDACTED]


In [19]:
!pip install pandas



In [20]:
import pandas as pd
# Load the aisle-mapping spreadsheet into a DataFrame. The path is relative,
# so the file must be in the notebook's current working directory.
df = pd.read_excel('Aisle-Mapping.xlsx')

In [21]:
# Extract the two columns as plain Python lists, dropping empty (NaN) cells.
# Each column is filtered independently, so the two lists are not row-aligned
# and need not have the same length.
groceries = df['Grocery ITEM'].dropna().tolist()
aisles = df['Aisle Category'].dropna().tolist()

In [22]:
# Number of grocery items sent to the model in one request.
batch_size = 50  # Adjust the batch size as needed
# Delimiter used to join list entries when building the prompt text.
separator=","
# OpenAI client used by get_best_matches_batch; constructed with no explicit
# arguments (presumably it picks up OPENAI_API_KEY from the environment — confirm).
client = openai.OpenAI()

# Function to get the best matches for a batch of keywords using chat completion
def get_best_matches_batch(grocery_batch, aisle_list, model="gpt-3.5-turbo"):
    """Ask the chat model to map each grocery item to an aisle category.

    Args:
        grocery_batch: Iterable of grocery items for this request. Items are
            converted with str(), so non-string spreadsheet cells are accepted.
        aisle_list: Iterable of the allowed aisle categories (also str()-ed).
        model: Chat model identifier; defaults to the model this notebook
            originally hard-coded, so existing two-argument calls are unchanged.

    Returns:
        The raw reply text, expected to hold one
        'grocery item -> aisle category' line per item. An empty string is
        returned if the reply carries no text content.
    """
    prompt = _build_match_prompt(grocery_batch, aisle_list)

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that matches grocery items to aisle categories based on a typical grocery store or a supermarket in the USA."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=4096,
        temperature=0.0
    )
    # Never hand None to callers: they join the per-batch replies with "\n".
    return response.choices[0].message.content or ""


def _build_match_prompt(grocery_batch, aisle_list):
    """Assemble the user prompt listing the items and the allowed aisles."""
    # str() guards against numeric/other non-string cells, on which
    # separator.join() would raise TypeError.
    grocery_text = separator.join(str(item) for item in grocery_batch)
    aisle_text = separator.join(str(aisle) for aisle in aisle_list)
    parts = [
        "Match each grocery item in the  grocery list with the most appropriate aisle category from the provided list of aisle categories.\n\n",
        "The grocery list items are separated by commas. The list of aisles are also separated by commas. \n\n",
        "List of grocery items to match:\n\n\n", grocery_text, "\n\n",
        "List of provided aisle categories:\n\n\n", aisle_text, "\n\n",
        "If an appropriate aisle category is not to be found in the list, use Other \n\n",
        # The closing quote was missing, leaving the format example unbalanced.
        "Return the matches in the format 'grocery item -> aisle category' \n\n",
        "You must absolutely make sure that each grocery item is mapped to an aisle category",
    ]
    return "".join(parts)

In [None]:
# Send the grocery list to the model one batch at a time
results = []
# Ceiling division: a trailing partial batch still counts as one batch
num_batches = -(-len(groceries) // batch_size)

for batch_number in range(1, num_batches + 1):
    batch_start = (batch_number - 1) * batch_size
    # Slicing clamps at the end of the list, so the last batch may be shorter
    current_batch = groceries[batch_start:batch_start + batch_size]
    results.append(get_best_matches_batch(current_batch, aisles))

    # Progress indicator
    print(f"Batch {batch_number} processed")

# Concatenate the per-batch replies, one after another
combined_results = "\n".join(results)

# Keep the combined text in a dict under the "matches" key
output_json = {"matches": combined_results}

# Show only the first 50 lines of the combined reply
top_50_results = combined_results.split("\n")[:50]
for position, entry in enumerate(top_50_results, start=1):
    print(f"Result {position}: {entry}")



Batch 1 processed
Batch 2 processed
Batch 3 processed
Batch 4 processed
Batch 5 processed
Batch 6 processed
Batch 7 processed
Batch 8 processed
Batch 9 processed
Batch 10 processed
Batch 11 processed
Batch 12 processed
Batch 13 processed
Batch 14 processed
Batch 15 processed
Batch 16 processed
Batch 17 processed
Batch 18 processed
Batch 19 processed
Batch 20 processed
Batch 21 processed
Batch 22 processed
Batch 23 processed
Batch 24 processed
Batch 25 processed
Batch 26 processed
Batch 27 processed
Batch 28 processed
Batch 29 processed
Batch 30 processed
Batch 31 processed
Batch 32 processed
Batch 33 processed
Batch 34 processed
Batch 35 processed
Batch 36 processed
Batch 37 processed
Batch 38 processed
Batch 39 processed
Batch 40 processed
Batch 41 processed
Batch 42 processed
Batch 43 processed
Batch 44 processed
Batch 45 processed
Batch 46 processed
Batch 47 processed
Batch 48 processed
Batch 49 processed
Batch 50 processed
Batch 51 processed
Batch 52 processed
Batch 53 processed
Ba

#Extract the Output from the First Model


In [None]:
import json

# Example output from model 1 stored in a variable
# NOTE(review): this assignment replaces any `output_json` built by the batch
# run earlier in the notebook; remove it to validate the real model output.
output_json = {
    "matches": "apple -> fruits\nbanana -> fruits\ncarrot -> vegetables\nmilk -> dairy\nbread -> bakery\nchicken -> meat\n"
}


def parse_match_line(line, arrow="->"):
    """Split one 'item -> aisle' line into an [item, aisle] pair.

    Args:
        line: One line of model output.
        arrow: Token separating the item from its aisle.

    Returns:
        [item, aisle] with surrounding whitespace removed, or None when the
        line has no arrow or an empty side (blank lines, headers, chatter).
    """
    item, found, aisle = line.partition(arrow)
    item, aisle = item.strip(), aisle.strip()
    if not (found and item and aisle):
        return None
    return [item, aisle]


# Parse the output into a dictionary
# splitlines() also copes with "\r\n" line endings; malformed lines are
# skipped rather than raising IndexError on a missing right-hand side.
matches = output_json['matches'].splitlines()
key_value_pairs = [pair for pair in map(parse_match_line, matches) if pair is not None]
grocery_aisle_dict = {item: aisle for item, aisle in key_value_pairs}


#Suitable Prompt for Validation Using Groq

In [None]:
# Function to validate each (key, value) pair
def validate_pair(grocery_item, aisle_category):
    """Ask the validation model whether an item belongs in the given aisle.

    Args:
        grocery_item: Name of the grocery item.
        aisle_category: Aisle category that was assigned to the item.

    Returns:
        The model's reply text (a 'yes'/'no' answer, possibly followed by a
        brief explanation).
    """
    prompt = (
        f"Is it correct that the grocery item '{grocery_item}' belongs to the aisle '{aisle_category}'? "
        "Answer 'yes' or 'no' and provide a brief explanation if needed."
    )

    # get_completion() already returns the message text as a str, so indexing
    # it like a response object (.choices[0].message[...]) raised AttributeError.
    return get_completion(prompt)




#Validate Each (Key, Value) Pair

In [None]:
# Run the validator over every item/aisle mapping, keyed by grocery item
validation_results = {}

for pair in grocery_aisle_dict.items():
    # pair is (grocery item, aisle category)
    validation_results[pair[0]] = validate_pair(*pair)

# Report each verdict next to its grocery item
for item in validation_results:
    print(f"{item} -> {validation_results[item]}")


#Observations Based on the Validation Results

In [None]:
# Count the validation results
def _is_affirmative(answer):
    """Return True when the answer's first word is 'yes'.

    A substring test ('yes' anywhere in the text) also counts answers such as
    "No ... yes ..." or words that merely contain "yes", so only the leading
    word is inspected, ignoring surrounding quotes and punctuation.
    """
    words = answer.strip().lower().split()
    return bool(words) and words[0].strip("'\"*_`.,:;!-") == "yes"


correct_validations = sum(1 for result in validation_results.values() if _is_affirmative(result))
incorrect_validations = len(validation_results) - correct_validations

print(f"Correct Validations: {correct_validations}")
print(f"Incorrect Validations: {incorrect_validations}")
# Calculate percentages
total_validations = len(validation_results)
if total_validations:
    correct_percentage = (correct_validations / total_validations) * 100
    incorrect_percentage = (incorrect_validations / total_validations) * 100
else:
    # Nothing was validated: report 0% instead of dividing by zero.
    correct_percentage = incorrect_percentage = 0.0

# Print percentages
print(f"Correct Validations: {correct_percentage:.2f}%")
print(f"Incorrect Validations: {incorrect_percentage:.2f}%")
