# Assignment 4: APIs and Data Enrichment with LLMs

## Task I - Complete Hugging Face and Google Places API setups

In [None]:
# Google Places API setup
# The key is read from the Colab secret named 'google_places' instead of being
# hardcoded in the notebook.

from google.colab import userdata
API_KEY = userdata.get('google_places')  # used by the Places API requests below

In [None]:
import requests

In [None]:
# Hugging face setup to import the LLama model

import torch
from transformers import pipeline

# The Hugging Face access token must be stored in the Colab secret 'HF_TOKEN'.
HF_TOKEN = userdata.get('HF_TOKEN')

# Text-generation pipeline for Llama 3 8B Instruct, loaded in bfloat16 with
# automatic device placement.
pipe = pipeline(
    task="text-generation",
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=HF_TOKEN,
)


## Task II - Call the Google Places API

In [None]:
# Three businesses of a common type whose reviews will be fetched
business_names = [
    "Little Caesars Pizza",
    "Burger King",
    "Del Taco",
]

# Defining a function to get reviews of businesses based on their names

def get_reviews(business):
    """Fetch Google Places reviews for a business looked up by name.

    The name is first resolved to a place ID with the Find Place endpoint
    (only the first candidate is used); the reviews are then read from the
    Place Details endpoint. Every returned review dict gets a 'business_name'
    key holding the name that was searched for.

    Args:
        business: Free-text business name to search for.

    Returns:
        A list of review dicts. The list is empty when no place matches the
        name or the matched place has no reviews.

    Raises:
        requests.HTTPError: If either request returns an HTTP error status.
        requests.Timeout: If either request takes longer than the timeout.
    """
    # Passing the query through `params` lets requests URL-encode it, so names
    # containing '&', '#', '+' or non-ASCII characters reach the API intact.
    find_place_response = requests.get(
        "https://maps.googleapis.com/maps/api/place/findplacefromtext/json",
        params={"input": business, "inputtype": "textquery", "key": API_KEY},
        timeout=10,
    )
    find_place_response.raise_for_status()

    candidates = find_place_response.json().get("candidates", [])
    if not candidates:
        # No match for this name (or the API reported a non-OK status):
        # there is nothing to look up, so report "no reviews".
        return []
    place_id = candidates[0]["place_id"]

    details_response = requests.get(
        "https://maps.googleapis.com/maps/api/place/details/json",
        params={"place_id": place_id, "fields": "reviews", "key": API_KEY},
        timeout=10,
    )
    details_response.raise_for_status()
    reviews = details_response.json().get("result", {}).get("reviews", [])

    # Add the business name to each review dictionary
    for review in reviews:
        review["business_name"] = business
    return reviews

# Gather the reviews of all three businesses into one flat list; each review
# already carries the name of its business.
reviews = [
    fetched_review
    for name in business_names
    for fetched_review in get_reviews(name)
]

# Show every raw review dictionary, one per line
for fetched_review in reviews:
    print(fetched_review)

{'author_name': 'Michael Diblasio', 'author_url': 'https://www.google.com/maps/contrib/102108617232083174319/reviews', 'language': 'en', 'original_language': 'en', 'profile_photo_url': 'https://lh3.googleusercontent.com/a/ACg8ocI1r9UaewsTG6UkiDdvEbLtkhMjgkclRbLkK6KRsp8dKJgXJA=s128-c0x00000000-cc-rp-mo', 'rating': 1, 'relative_time_description': 'in the last week', 'text': 'Gotta love it. So I order for delivery. Tip 25%. They forgot the 2 liter soda. I call the location,  they tell me I need to drive down to pick it up..... First and last time ordering here.', 'time': 1726426023, 'translated': False, 'business_name': 'Little Caesars Pizza'}
{'author_name': 'Linda Pallotta', 'author_url': 'https://www.google.com/maps/contrib/105647517702372625243/reviews', 'language': 'en', 'original_language': 'en', 'profile_photo_url': 'https://lh3.googleusercontent.com/a/ACg8ocLqQzUFp4HuPnk4ucVGFIJuCT7vui9Uj0y4bjDAq-1MkTpYUw=s128-c0x00000000-cc-rp-mo-ba6', 'rating': 3, 'relative_time_description': '5

In [None]:
# Show a trimmed view of each review: only the relevant columns, under
# friendlier key names. Optional fields fall back to an empty string.
optional_fields = {
    'author_name': 'author_name',
    'author_url': 'author_url',
    'rating': 'rating',
    'review_text': 'text',
    'time_description': 'relative_time_description',
}
for entry in reviews:
    trimmed = {'business_name': entry['business_name']}
    for shown_key, source_key in optional_fields.items():
        trimmed[shown_key] = entry.get(source_key, '')
    print(trimmed)

{'business_name': 'Little Caesars Pizza', 'author_name': 'Michael Diblasio', 'author_url': 'https://www.google.com/maps/contrib/102108617232083174319/reviews', 'rating': 1, 'review_text': 'Gotta love it. So I order for delivery. Tip 25%. They forgot the 2 liter soda. I call the location,  they tell me I need to drive down to pick it up..... First and last time ordering here.', 'time_description': 'in the last week'}
{'business_name': 'Little Caesars Pizza', 'author_name': 'Linda Pallotta', 'author_url': 'https://www.google.com/maps/contrib/105647517702372625243/reviews', 'rating': 3, 'review_text': "The employee who helped us didn't have a very professional attitude. At all the other Little Caesar establishments that we've been to, we were always able to get all the flat style chicken wings in our order and none of the drum stick style. She said that they couldn't do it and gave a lame excuse. The taste was a bit off, too. Other than that, things were OK. Our bread sticks were great, a

## Task III - Data Enrichment

In [None]:
# Trial run on a single review before processing the full set

review_1 = reviews[0]['text']
print(review_1)

# Chat prompt asking the model for the critical structured information,
# a predicted rating and the sentiment of the review

system_message = {
    "role": "system",
    "content": "What critical structured information can we gather from this review? What was the customer's sentiment? Return three strings one for the critical structured information (which should contain three phrases not more), one for predicted rating (from 1 to 5 with 1 as very bad and 5 as very good) and one for the customer's sentiment which can be one of the following: (Very Good, Good, Neutral, Bad, Very Bad)",
}
user_message = {"role": "user", "content": review_1}
chat = [system_message, user_message]
print(chat)

# The answer is read from the last message of the generated conversation
chat_response = pipe(chat, max_new_tokens=512)
generated_conversation = chat_response[0]['generated_text']
critical_information_string = generated_conversation[-1]['content']
print(critical_information_string)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Gotta love it. So I order for delivery. Tip 25%. They forgot the 2 liter soda. I call the location,  they tell me I need to drive down to pick it up..... First and last time ordering here.
[{'role': 'system', 'content': "What critical structured information can we gather from this review? What was the customer's sentiment? Return three strings one for the critical structured information (which should contain three phrases not more), one for predicted rating (from 1 to 5 with 1 as very bad and 5 as very good) and one for the customer's sentiment which can be one of the following: (Very Good, Good, Neutral, Bad, Very Bad)"}, {'role': 'user', 'content': 'Gotta love it. So I order for delivery. Tip 25%. They forgot the 2 liter soda. I call the location,  they tell me I need to drive down to pick it up..... First and last time ordering here.'}]
Here are the critical structured information, predicted rating, and customer's sentiment:

**Critical Structured Information:** "Order missing item"

In [None]:
# Creating a function now to do this for every review

def get_critical_information(review):
    """Ask the LLM pipeline for the structured summary of one review.

    Args:
        review: Review text, sent as the user message of the chat.

    Returns:
        The content of the last message of the generated conversation: the
        model's free-text answer with the critical information, the predicted
        rating and the sentiment.
    """
    instructions = "What critical structured information can we gather from this review? What was the customer's sentiment? Return three strings one for the critical structured information (which should contain three phrases not more), one for predicted rating (from 1 to 5 with 1 as very bad and 5 as very good) and one for the customer's sentiment which can be one of the following: (Very Good, Good, Neutral, Bad, Very Bad)"
    conversation = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": review},
    ]
    generated_conversation = pipe(conversation, max_new_tokens=512)[0]['generated_text']
    return generated_conversation[-1]['content']

# Pull the raw text out of every review dictionary
reviews_list = [entry['text'] for entry in reviews]

# Summarize each review with the LLM, ignoring reviews whose text is blank
summarized_reviews = [
    get_critical_information(text) for text in reviews_list if text.strip()
]

# Display summarized reviews
for summarized in summarized_reviews:
    print(summarized)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_to

Here are the critical structured information, predicted rating, and customer's sentiment:

**Critical Structured Information:** "Forgot the 2 liter soda", "Need to drive down to pick it up", "First and last time ordering here"

**Predicted Rating:** 2

**Customer's Sentiment:** Bad
Here are the requested strings:

**Critical Structured Information:** "Unprofessional attitude", "Lame excuse", "Taste was a bit off"

**Predicted Rating:** 3

**Customer's Sentiment:** Bad
Here are the requested strings:

**Critical Structured Information:** Friendly staff, affordable prices, good pizza

**Predicted Rating:** 4

**Customer's Sentiment:** Good
Here are the critical structured information, predicted rating, and customer's sentiment:

**Critical Structured Information:** "Excellent management on dayshift", "Rude manager on night shift", "Difficulty ordering pizza on night shift"

**Predicted Rating:** 3

**Customer's Sentiment:** Bad
Here are the critical structured information, predicted rati

In [None]:
# Defining a function to extract the three columns from summarized_reviews
def extract_critical_information_simple(summarized_text):
    """Pull the three labelled fields out of one LLM summary.

    The model answers in Markdown, e.g. ``**Predicted Rating:** 2``. The bold
    markers are removed before parsing so that the extracted values do not
    start with a stray ``**`` and so that the ``**Label**: value`` variant is
    recognised as well.

    Args:
        summarized_text: The model's answer for a single review.

    Returns:
        A tuple ``(critical_info, predicted_rating, sentiment)`` of strings.
        An element is ``None`` when its label does not appear in the text.
    """
    # Holders for the extracted data; they stay None when a label is missing
    critical_info = None
    predicted_rating = None
    sentiment = None

    # Each field sits on its own line, so scan the answer line by line
    for line in summarized_text.split('\n'):
        # Drop Markdown bold markers so only "Label: value" is left to parse
        line = line.replace('**', '')
        if "Critical Structured Information:" in line:
            critical_info = line.split("Critical Structured Information:")[-1].strip()
        elif "Predicted Rating:" in line:
            predicted_rating = line.split("Predicted Rating:")[-1].strip()
        elif "Customer's Sentiment:" in line:
            sentiment = line.split("Customer's Sentiment:")[-1].strip()

    return critical_info, predicted_rating, sentiment

# Testing the function on a sample summarized review
sample_summarized_review = summarized_reviews[0]
print(extract_critical_information_simple(sample_summarized_review))

('** "Forgot the 2 liter soda", "Need to drive down to pick it up", "First and last time ordering here"', '** 2', '** Bad')


In [None]:
# pandas is first needed in this cell, so it is imported here; without it a
# fresh "Restart & Run All" stops with NameError: name 'pd' is not defined.
import pandas as pd

# Creating lists to store the extracted information, one per output column
critical_info_list = []
predicted_rating_list = []
sentiment_list = []

# Parsing each LLM summary into its three fields
for summarized_review in summarized_reviews:
    critical_info, predicted_rating, sentiment = extract_critical_information_simple(summarized_review)
    critical_info_list.append(critical_info)
    predicted_rating_list.append(predicted_rating)
    sentiment_list.append(sentiment)

# Creating a new dataframe with the extracted columns (one row per summary)
structured_reviews_df = pd.DataFrame({
    'Critical Structured Information': critical_info_list,
    'Predicted Rating': predicted_rating_list,
    "Customer's Sentiment": sentiment_list,
})

# Displaying the structured reviews dataframe
print(structured_reviews_df.head())

                     Critical Structured Information  \
0  ** "Forgot the 2 liter soda", "Need to drive d...   
1  ** "Unprofessional attitude", "Lame excuse", "...   
2   ** Friendly staff, affordable prices, good pizza   
3  ** "Excellent management on dayshift", "Rude m...   
4  ** "Pizza bites are good", "Cheese pizza is ok...   

                                    Predicted Rating  \
0                                               ** 2   
1                                               ** 3   
2                                               ** 4   
3                                               ** 3   
4  ** 3 (The customer seemed to enjoy the pizza b...   

                                Customer's Sentiment  
0                                             ** Bad  
1                                             ** Bad  
2                                            ** Good  
3                                             ** Bad  
4  ** Good (The customer had some positive commen..

In [None]:
import pandas as pd

# Creating a dataframe for reviews (one row per raw review)
reviews_df = pd.DataFrame(reviews)

# Summaries were only generated for reviews with non-blank text, so keep the
# same subset here. Otherwise the positional join below would attach every
# summary after a skipped review to the wrong review.
if 'text' in reviews_df.columns:
    reviews_df = reviews_df[reviews_df['text'].str.strip() != '']

if len(reviews_df) != len(structured_reviews_df):
    raise ValueError(
        f"{len(reviews_df)} reviews with text but {len(structured_reviews_df)} "
        "structured summaries; re-run the enrichment cells before merging."
    )

# Creating a matching positional 'index' key for both dataframes.
# structured_reviews_df is not modified: resetting its index in place kept
# adding index columns to it every time this cell was re-run.
reviews_df = reviews_df.reset_index(drop=True).reset_index()
structured_indexed = structured_reviews_df.reset_index(drop=True).reset_index()

# Merging the two dataframes based on index
combined_df = pd.merge(reviews_df, structured_indexed, on='index')

# Converting the comma-separated phrases to a list so each phrase can get its
# own row. Values that are not strings (None when the model's answer had no
# such field) are left as they are instead of raising AttributeError.
combined_df['Critical Structured Information'] = combined_df['Critical Structured Information'].apply(
    lambda phrases: phrases.split(", ") if isinstance(phrases, str) else phrases
)

# Performing the explode operation to create a new row for each item in the 'Critical Structured Information'
exploded_df = combined_df.explode('Critical Structured Information')

# Final dataframe contains all reviews and exploded critical information
print(exploded_df.head())

   index       author_name                                         author_url  \
0      0  Michael Diblasio  https://www.google.com/maps/contrib/1021086172...   
0      0  Michael Diblasio  https://www.google.com/maps/contrib/1021086172...   
0      0  Michael Diblasio  https://www.google.com/maps/contrib/1021086172...   
1      1    Linda Pallotta  https://www.google.com/maps/contrib/1056475177...   
1      1    Linda Pallotta  https://www.google.com/maps/contrib/1056475177...   

  language original_language  \
0       en                en   
0       en                en   
0       en                en   
1       en                en   
1       en                en   

                                   profile_photo_url  rating  \
0  https://lh3.googleusercontent.com/a/ACg8ocI1r9...       1   
0  https://lh3.googleusercontent.com/a/ACg8ocI1r9...       1   
0  https://lh3.googleusercontent.com/a/ACg8ocI1r9...       1   
1  https://lh3.googleusercontent.com/a/ACg8ocLqQz...       3   


## Task IV - Load the data

In [None]:
import sqlite3

# Open (or create) the SQLite database file that will hold the enriched reviews
conn = sqlite3.connect('reviews_database.db')

# Write the exploded frame to the 'reviews' table, overwriting any earlier
# version and leaving out the DataFrame index
exploded_df.to_sql(name='reviews', con=conn, if_exists='replace', index=False)

# Persist the write; the connection stays open for the query cell that follows
conn.commit()

print("Data successfully loaded into SQLite database!")


Data successfully loaded into SQLite database!


## Task V - Query the enriched database

In [None]:
# Read the whole reviews table back into a dataframe
query = "SELECT * FROM reviews"
queried_df = pd.read_sql(sql=query, con=conn)

# The rows are in memory now, so the connection can be closed
conn.close()

# Display the result
print(queried_df.head())

   index       author_name                                         author_url  \
0      0  Michael Diblasio  https://www.google.com/maps/contrib/1021086172...   
1      0  Michael Diblasio  https://www.google.com/maps/contrib/1021086172...   
2      0  Michael Diblasio  https://www.google.com/maps/contrib/1021086172...   
3      1    Linda Pallotta  https://www.google.com/maps/contrib/1056475177...   
4      1    Linda Pallotta  https://www.google.com/maps/contrib/1056475177...   

  language original_language  \
0       en                en   
1       en                en   
2       en                en   
3       en                en   
4       en                en   

                                   profile_photo_url  rating  \
0  https://lh3.googleusercontent.com/a/ACg8ocI1r9...       1   
1  https://lh3.googleusercontent.com/a/ACg8ocI1r9...       1   
2  https://lh3.googleusercontent.com/a/ACg8ocI1r9...       1   
3  https://lh3.googleusercontent.com/a/ACg8ocLqQz...       3   


In [None]:
# Export this notebook to HTML (Colab only).
# Mount Google Drive so the saved copy of the notebook can be reached
import os
from google.colab import drive
drive.mount('/content/drive')

# Copy the notebook from Drive into the current working directory, then
# convert that copy to HTML with nbconvert (`!` runs the line as a shell command)
!cp "/content/drive/MyDrive/Colab Notebooks/de_lab_4_Ahmad_Ahsan.ipynb" ./
!jupyter nbconvert --to html "de_lab_4_Ahmad_Ahsan.ipynb"

Mounted at /content/drive
[NbConvertApp] Converting notebook de_lab_4_Ahmad_Ahsan.ipynb to html
[NbConvertApp] Writing 650839 bytes to de_lab_4_Ahmad_Ahsan.html


I can't convert this notebook to HTML; it is giving me the error "KeyError: 'state'", so I am trying to remove the widget.