Using any OpenAI model or Groq (llama 70b) model, solve the aisle mapping problem.  OpenAI code is provided here. You will do the usual 3 steps in terms of mounting the Google drive, your API key and install one of the LLM models, and import pandas.  Your goal is to do a model comparison and validation.
**The code below works for OpenAI gpt-4o model.  Have not tested it on Llama model.  Also, this code with the dataset  may blow your budget if you are not careful with the size of test dataset, so exercise caution.**

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
!pip install -qqq -r "/content/drive/My Drive/LLMProjects/requirements.txt"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.5/325.5 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.0/974.0 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.5/103.5 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.7/314.7 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.2/125.2 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━

In [52]:
from groq import Groq
import os

# Read and set the environment variable from the .bashrc file
with open('/content/drive/My Drive/LLMProjects/.bashrc') as file:
       for line in file:
                if line.startswith('export '):
                      var, value = line[len('export '):].strip().split('=')
                      os.environ[var] = value

In [53]:
client2 = Groq()

def get_completion2(prompt, model="llama3-8b-8192"):
    messages = [{"role": "user", "content": prompt}]
    response = client2.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0, # this is the degree of randomness of the model's output
    )
    return response.choices[0].message.content

In [55]:
import openai
import os

# Path to your .bashrc file
bashrc_path = '/content/drive/MyDrive/LLMProjects/.bashrc'

# Read the .bashrc file and extract the API key
with open(bashrc_path, 'r') as file:
    lines = file.readlines()
    for line in lines:
        if line.startswith('export OPENAI_API_KEY='):
            openai_api_key = line.split('=')[1].strip().strip('"')

# Set the OpenAI API key
os.environ['OPENAI_API_KEY'] = openai_api_key

# Retrieve the OpenAI API key from the environment variables
openai_api_key = os.getenv('OPENAI_API_KEY')

if openai_api_key is None:
    raise ValueError("OpenAI API key not found. Please set it in the environment variables.")

# Set the OpenAI API key
openai.api_key = openai_api_key

# Now you can use the OpenAI API with the set API key
print(f"OpenAI API key has been set successfully: {openai_api_key}")

OpenAI API key has been set successfully: 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


In [56]:
client = openai.OpenAI()

def get_completion(prompt, model="gpt-3.5-turbo"):
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0
    )
    return response.choices[0].message.content

In [57]:
!pip install pandas



In [58]:
import pandas as pd
df = pd.read_excel('Aisle-Mapping.xlsx')

In [59]:
groceries = df['Grocery ITEM'].dropna().tolist()
aisles = df['Aisle Category'].dropna().tolist()

In [62]:
batch_size = 100  # Adjust the batch size as needed
separator=","

# Function to get the best matches for a batch of keywords using chat completion
def get_best_matches_batch(grocery_batch, aisle_list):
    prompt = "Match each grocery item in the  grocery list with the most appropriate aisle category from the provided list of aisle categories.\n\n"
    prompt += "The grocery list items are separated by commas. The list of aisles are also separated by commas. \n\n"
    prompt += "List of grocery items to match:\n" + "\n"+ "\n"+separator.join(grocery_batch)+  "\n\n"
    prompt += "List of provided aisle categories:\n" + "\n"+ "\n"+separator.join(aisle_list)+ "\n\n"
    prompt += "If an appropriate aisle category is not to be found in the list, use Other \n\n"
    prompt += "Return the matches in the format 'grocery item -> aisle category \n\n"
    prompt+="You must absolutely make sure that each grocery item is mapped to an aisle category"
    #print(prompt)

    matches = get_completion2(prompt)
    return matches

In [63]:
# Process the dataset in batches
results = []
num_batches = len(groceries) // batch_size + (1 if len(groceries) % batch_size != 0 else 0)

for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, len(groceries))
    grocery_batch = groceries[start_idx:end_idx]

    matches = get_best_matches_batch(grocery_batch, aisles)
    results.append(matches)

    # Print the batch number
    print(f"Batch {i+1} processed")

# Combine all results
combined_results = "\n".join(results)

Batch 1 processed
Batch 2 processed
Batch 3 processed
Batch 4 processed
Batch 5 processed
Batch 6 processed
Batch 7 processed
Batch 8 processed
Batch 9 processed
Batch 10 processed
Batch 11 processed
Batch 12 processed
Batch 13 processed
Batch 14 processed
Batch 15 processed
Batch 16 processed
Batch 17 processed
Batch 18 processed
Batch 19 processed
Batch 20 processed
Batch 21 processed
Batch 22 processed
Batch 23 processed
Batch 24 processed
Batch 25 processed
Batch 26 processed
Batch 27 processed
Batch 28 processed
Batch 29 processed
Batch 30 processed
Batch 31 processed
Batch 32 processed
Batch 33 processed
Batch 34 processed
Batch 35 processed
Batch 36 processed
Batch 37 processed
Batch 38 processed
Batch 39 processed
Batch 40 processed
Batch 41 processed
Batch 42 processed
Batch 43 processed
Batch 44 processed
Batch 45 processed
Batch 46 processed
Batch 47 processed
Batch 48 processed
Batch 49 processed
Batch 50 processed
Batch 51 processed
Batch 52 processed
Batch 53 processed
Ba

#Extract the Output from the First Model into JSON format


In [65]:
import json

# Step 1: Parse the combined results
lines = combined_results.split("\n")

# Initialize the dictionary to store results and the counter
grocery_aisle_dict = {}
entry_count = 0

# Step 2: Store each line as a key-value pair in a dictionary and count the entries
for line in lines:
    if " -> " in line:  # Ensure the line contains the separator
        key, value = line.split(" -> ")
        grocery_aisle_dict[key.strip()] = value.strip()
        entry_count += 1

# Step 3: Convert the dictionary to a JSON string
output_json = json.dumps(grocery_aisle_dict, indent=4)

# Step 4: Store the JSON string in a variable
json_result_variable = output_json

# Print the total number of entries
print(f"Total number of entries: {entry_count}")

# Print confirmation that JSON is stored in variable
print("JSON result has been stored in the variable 'json_result_variable'.")

# Step 5: Print top 5 entries of the JSON
top_5_entries = list(grocery_aisle_dict.items())[:5]
print("\nTop 5 entries in the JSON:")
for item, aisle in top_5_entries:
    print(f"{item}: {aisle}")

# Print the number of entries in combined_results directly
combined_results_entries = len([line for line in lines if " -> " in line])
print(f"\nNumber of entries in combined_results: {combined_results_entries}")


Total number of entries: 34128
JSON result has been stored in the variable 'json_result_variable'.

Top 5 entries in the JSON:
1. tropical fruit: Produce
2. whole milk: Dairy
3. pip fruit: Produce
4. other vegetables: Produce
5. pot plants: Garden

Number of entries in combined_results: 34128


#Parse the JSON String:

In [69]:

grocery_aisle_dict = json.loads(output_json)


#Define the Validation Function:

In [84]:
def validate_pair(grocery_item, aisle_category):
    prompt = (
        f"Is it correct that the grocery item '{grocery_item}' belongs to the aisle '{aisle_category}'? "
        "Answer 'yes' or 'no' and provide a brief explanation if needed."
    )
    validation_result = get_completion(prompt)
    return validation_result



#Validate Each Pair in the Dictionary(Only top 50)

In [85]:
validation_results = {}
correct_count = 0
incorrect_count = 0
validation_limit = 50  # Set the validation limit

for i, (item, aisle) in enumerate(grocery_aisle_dict.items()):
    if i >= validation_limit:
        break  # Stop after validating 50 items

    # Validate the current pair
    validation_result = validate_pair(item, aisle)

    # Store the validation result
    validation_results[item] = validation_result

    # Print the validation result
    print(f"Validation for '{item}': {validation_result}")

    # Update counts based on the validation result
    if validation_result.strip().lower().startswith("yes"):
        correct_count += 1
    else:
        incorrect_count += 1




Validation for '1. tropical fruit': Yes, '1. tropical fruit' belongs to the aisle 'Produce'. Tropical fruits are typically fresh fruits that are found in the produce section of a grocery store.
Validation for '2. whole milk': Yes, the grocery item '2. whole milk' belongs to the aisle 'Dairy'. Whole milk is a dairy product, so it would typically be found in the Dairy aisle of a grocery store.
Validation for '3. pip fruit': Yes, the grocery item '3. pip fruit' belongs to the aisle 'Produce'. Pip fruit refers to fruits that contain seeds or pips, such as apples, pears, and plums, which are typically found in the Produce aisle of a grocery store.
Validation for '4. other vegetables': Yes, the grocery item '4. other vegetables' belongs to the aisle 'Produce'. This is because 'other vegetables' are typically found in the produce section of a grocery store along with fruits and other fresh produce items.
Validation for '5. pot plants': Yes, the grocery item '5. pot plants' belongs to the aisl

#Store and Print Summary for 50 data points:




In [86]:
validation_results_json = json.dumps(validation_results, indent=4)
validation_results_variable = validation_results_json

# Print the total number of validated entries
print(f"\nTotal number of validated entries: {len(validation_results)}")

# Print the number of correctly and incorrectly validated entries
print(f"Number of correct validations: {correct_count}")
print(f"Number of incorrect validations: {incorrect_count}")

# Print confirmation that validation results are stored in a variable
print("\nValidation results have been stored in the variable 'validation_results_variable'.")



Total number of validated entries: 50
Number of correct validations: 47
Number of incorrect validations: 3

Validation results have been stored in the variable 'validation_results_variable'.
