In [15]:
import pandas as pd
import requests
import gzip
import json
import io

Read json file in

In [16]:
url = 'https://mcauleylab.ucsd.edu/public_datasets/gdrive/googlelocal/review-Vermont_10.json.gz'

response = requests.get(url, stream = True)
response.raise_for_status() 

with gzip.GzipFile(fileobj = io.BytesIO(response.content), mode = 'rb') as gz_file:
    data_list = [json.loads(line) for line in gz_file]

vt = pd.DataFrame(data_list)

In [17]:
print(vt.shape)
print(vt.columns)
# print(vt.head())

(324725, 8)
Index(['user_id', 'name', 'time', 'rating', 'text', 'pics', 'resp', 'gmap_id'], dtype='object')


In [18]:
url_metadata = 'https://mcauleylab.ucsd.edu/public_datasets/gdrive/googlelocal/meta-Vermont.json.gz'

response_metadata = requests.get(url_metadata, stream = True)
response_metadata.raise_for_status() 

with gzip.GzipFile(fileobj = io.BytesIO(response_metadata.content), mode = 'rb') as gz_file:
    data_list1 = [json.loads(line) for line in gz_file]

vt_metadata = pd.DataFrame(data_list1)

In [19]:
print(vt_metadata.shape)
print(vt_metadata.columns)
# print(vt_metadata.head())

(11291, 15)
Index(['name', 'address', 'gmap_id', 'description', 'latitude', 'longitude',
       'category', 'avg_rating', 'num_of_reviews', 'price', 'hours', 'MISC',
       'state', 'relative_results', 'url'],
      dtype='object')


Data Cleaning

For business data
Drop 'address', 'latitude', 'longitude', 'avg_rating', 'num_of_reviews', 'price', 'hours', 'MISC', 'state', 'relative_results' column, convert all text to lower case, drop duplicates (around 50 duplicates)

In [20]:
print(vt_metadata.shape)
print(vt_metadata.columns)
vt_metadata = vt_metadata.drop(['address', 'latitude', 'longitude', 'avg_rating', 'num_of_reviews', 'price', 'hours', 'MISC', 'state', 'relative_results'], axis = 1)
vt_metadata.columns = vt_metadata.columns.str.lower() 
vt_metadata['description'] = vt_metadata['description'].str.replace(r'\s+', ' ', regex=True)
vt_metadata['category'] = vt_metadata['category'].str.replace(r'\s+', ' ', regex=True)
vt_metadata = vt_metadata.drop_duplicates(subset = ['name','gmap_id', 'gmap_id'])

(11291, 15)
Index(['name', 'address', 'gmap_id', 'description', 'latitude', 'longitude',
       'category', 'avg_rating', 'num_of_reviews', 'price', 'hours', 'MISC',
       'state', 'relative_results', 'url'],
      dtype='object')


For Review data drop 'name' column, convert all text to lower case, drop duplicates (~3000 duplicates)

In [21]:
vt = vt.drop(['name', 'resp'], axis = 1)
vt.columns = vt.columns.str.lower() 
vt['text'] = vt['text'].str.replace(r'\s+', ' ', regex=True)
vt = vt.drop_duplicates(subset = ['user_id', 'text', 'gmap_id', 'time'])

Change the format of the 'pics' column, merge multiple dictionaries into one with list of pic urls

In [22]:
def collapse_pics(pic_list):
    if not pic_list:
        return []  
    urls = []
    for pic_dict in pic_list:
        urls.extend(pic_dict.get('url', []))
    return urls

vt['pics_collapsed'] = vt['pics'].apply(collapse_pics)

Merge by gmap id

In [23]:
vt_merged = pd.merge(vt, vt_metadata, on = 'gmap_id', how = 'inner')
vt_merged['review_id'] = range(len(vt_merged))
column = vt_merged.pop('review_id')
vt_merged.insert(0, 'review_id', column)
print(vt_merged.head(2))

   review_id                user_id           time  rating  \
0          0  118026874392842649478  1620085852324       5   
1          1  101532740754036204131  1580309946474       5   

                                                text  pics  \
0      Always done right from wood stove to screens!  None   
1  A great company to work with. Their sales and ...  None   

                                 gmap_id pics_collapsed         name  \
0  0x89e02445cb9db457:0x37f42bff4edf7a43             []  Royal Group   
1  0x89e02445cb9db457:0x37f42bff4edf7a43             []  Royal Group   

  description  category                                                url  
0        None       NaN  https://www.google.com/maps/place//data=!4m2!3...  
1        None       NaN  https://www.google.com/maps/place//data=!4m2!3...  


Separate dataset into vt1 (rating only) and vt2 (everything else)

In [24]:
vt_text_merged = vt_merged[vt_merged['text'].notna()] # has text
vt_pics_merged = vt_merged[vt_merged['pics'].notna()] # has pic

vt_text_merged = vt_text_merged.drop(['pics'], axis = 1)
vt_pics_merged = vt_pics_merged.drop(['pics'], axis = 1)

print(vt_text_merged.head(2))
print(vt_pics_merged.head(2))

   review_id                user_id           time  rating  \
0          0  118026874392842649478  1620085852324       5   
1          1  101532740754036204131  1580309946474       5   

                                                text  \
0      Always done right from wood stove to screens!   
1  A great company to work with. Their sales and ...   

                                 gmap_id pics_collapsed         name  \
0  0x89e02445cb9db457:0x37f42bff4edf7a43             []  Royal Group   
1  0x89e02445cb9db457:0x37f42bff4edf7a43             []  Royal Group   

  description  category                                                url  
0        None       NaN  https://www.google.com/maps/place//data=!4m2!3...  
1        None       NaN  https://www.google.com/maps/place//data=!4m2!3...  
    review_id                user_id           time  rating  \
17         17  101856865551768948430  1606162783440       1   
78         78  107787438275893909028  1542050139179       5   

      

In [25]:
vt_text_merged.to_csv('vermont_text_merged.csv')
vt_pics_merged.to_csv('vermont_pics_merged.csv')

In [26]:
text = pd.read_csv('vermont_text_merged.csv')
pics = pd.read_csv('vermont_pics_merged.csv')

text = text.drop(text.columns[0], axis = 1)
pics = pics.drop(pics.columns[0], axis = 1)

print(text.head(3))
print(pics.head(3))

   review_id                user_id           time  rating  \
0          0  118026874392842649478  1620085852324       5   
1          1  101532740754036204131  1580309946474       5   
2          2  115404122636203550540  1605195974445       5   

                                                text  \
0      Always done right from wood stove to screens!   
1  A great company to work with. Their sales and ...   
2  Great place to do business with staff was grea...   

                                 gmap_id pics_collapsed         name  \
0  0x89e02445cb9db457:0x37f42bff4edf7a43             []  Royal Group   
1  0x89e02445cb9db457:0x37f42bff4edf7a43             []  Royal Group   
2  0x89e02445cb9db457:0x37f42bff4edf7a43             []  Royal Group   

  description  category                                                url  
0         NaN       NaN  https://www.google.com/maps/place//data=!4m2!3...  
1         NaN       NaN  https://www.google.com/maps/place//data=!4m2!3...  
2     

Detecting Advertisement from image

In [27]:
!pip3 install -q -U transformers
!pip3 install -q -U torch
!pip3 install -q -U huggingface_hub
!pip3 install -q -U dotenv

In [28]:
from huggingface_hub import login
import os
from dotenv import load_dotenv

hf_token = os.getenv("HF_TOKEN")
login(token=hf_token)

HTTPError: Invalid user token. The token from HF_TOKEN environment variable is invalid. Note that HF_TOKEN takes precedence over `hf auth login`.

In [None]:
from transformers import pipeline
import torch

device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")

model_id = "google/gemma-3-4b-it"

pipe = pipeline(
    "image-text-to-text",
    model=model_id,
    device=device,          
    torch_dtype=torch.bfloat16
)

Using device: mps


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use mps


In [None]:
messages = [
    {
        "role": "system",
        "content": [
            {"type": "text", "text": (
                "You are a potential customer looking at Google review images. "
                # "Always respond in two clearly labeled sections:\n"
                # "Answer: <short, factual yes/no/N/A>\n"
                # "Rationale: <explain based on visible features>"
            )}
        ]
    },
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://lh5.googleusercontent.com/p/AF1QipMBzN4BJV9YCObcw_ifNzFPm-u38hO3oimOA8Fb=w150-h150-k-no-p"},
            {"type": "text", "text": (
                "Describe the image in detail."
            )}
        ]
    }
]

output = pipe(text=messages, max_new_tokens=128)
print(output[0]["generated_text"][-1]["content"])

Okay, here's a detailed description of the image I’m seeing:

**Overall Impression:** The image appears to be an interior shot of what looks like a reception or waiting area, possibly for a medical or wellness practice. It has a modern and somewhat minimalist aesthetic.

**Key Elements:**

*   **Ceiling:** The ceiling is a neutral gray, with a slightly textured or popcorn finish. There are recessed lighting fixtures evenly spaced across the ceiling.
*   **Lighting:** There are four pendant lights hanging down from the ceiling. They have a distinctive design – a dark base with a light wooden or painted top.


In [None]:
advertisement_examples = [
    "Billboard with product name and price",
    "Social media post promoting a sale",
    "Banner showing a company logo with a slogan",
    "Flyer with a discount coupon"
]

messages = [
    {
        "role": "system",
        "content": [
            {"type": "text", "text": (
                "You are an AI assistant that classifies images as advertisements. "
                "Always respond in two clearly labeled sections:\n"
                "Answer: <Yes/No/N/A>\n"
                "Rationale: <Explain based on visible features in the image>\n\n"
                f"Examples of advertisements include: {', '.join(advertisement_examples)}"
            )}
        ]
    },
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://images.prismic.io/dive/ZqDG1R5LeNNTxcyj_Creative-Blogcover-.png?auto=format,compress"},  
            {"type": "text", "text": (
                "Is this image an advertisement? If it is not clear, say Answer: N/A."
            )}
        ]
    }
]

output = pipe(text=messages, max_new_tokens=128)

print(output[0]["generated_text"][-1]["content"])

Answer: Yes
Rationale: The image contains text "MONTHLY MUST SHARE PROMO CODES" which is a promotional message encouraging sharing of discount codes. This strongly indicates that the image is an advertisement, likely for a sale or discount promotion.


In [None]:
pics = pics.head(3)

pics['Is_Advertisement'] = ""
pics['Reasoning_for_Ad'] = ""

print(pics)

   review_id                user_id           time  rating  \
0         17  101856865551768948430  1606162783440       1   
1         78  107787438275893909028  1542050139179       5   
2         85  108564695687320799964  1518887224225       5   

                                                text  \
0  I had them replace two broken studs on my righ...   
1  Kingdom Recovery Center is a place for people ...   
2                  Great place to ski and snow shoe.   

                                 gmap_id  \
0  0x89e024446398691f:0x4011d3a0f8636036   
1  0x4cb446270577ed85:0x8b3dfc93cc4846a8   
2  0x4cb43730db4d1a1f:0x82fc64269bef0db3   

                                      pics_collapsed                     name  \
0  ['https://lh5.googleusercontent.com/p/AF1QipPQ...                    Midas   
1  ['https://lh5.googleusercontent.com/p/AF1QipP6...  Kingdom Recovery Center   
2  ['https://lh5.googleusercontent.com/p/AF1QipPH...   Outing Club Skate Park   

  description  category 

In [None]:
# Take first 3 reviews
pics_test = pics.head(3).copy()

# Add columns if missing
for col in ["Image_Description", "Is_Advertisement"]:
    if col not in pics.columns:
        pics[col] = ""

advertisement_examples = [
    "Billboard with product name and price",
    "Social media post promoting a sale",
    "Banner showing a company logo with a slogan",
    "Flyer with a discount coupon"
]

for idx, row in pics_test.iterrows():
    pics_list = ast.literal_eval(row['pics_collapsed'])

    answers = []
    descriptions = []
    
    for image_url in pics_list:
        messages = [
            {
                "role": "system",
                "content": [
                    {"type": "text", "text": (
                        "You are an AI assistant that classifies images as advertisements. "
                        "Always respond in two clearly labeled sections:\n"
                        "Answer: <Yes/No/N/A>\n"
                        "Description: Two sentences to describe what’s happening in the photo, "
                        "and hence the rationale.\n"
                        f"Examples of advertisements include: {', '.join(advertisement_examples)}"
                    )}
                ]
            },
            {
                "role": "user",
                "content": [
                    {"type": "image", "url": image_url},
                    {"type": "text", "text": "Is this image an advertisement? If unclear, say Answer: N/A."}
                ]
            }
        ]

        output = pipe(text=messages, max_new_tokens=128)
        response = output[0]["generated_text"][-1]["content"]

        # Default values
        answer, rationale = "N/A", ""

        for line in response.splitlines():
            if line.startswith("Answer:"):
                answer = line.split(":", 1)[1].strip()
            elif line.startswith("Description:"):
                rationale = line.split(":", 1)[1].strip()

        answers.append(answer)
        descriptions.append(rationale)

    # Final decision rule
    if "Yes" in answers:
        final_answer = "Yes"
    elif "No" in answers:
        final_answer = "No"
    else:
        final_answer = "N/A"

    pics.at[idx, 'Is_Advertisement'] = final_answer
    pics.at[idx, 'Image_Description'] = " | ".join(descriptions)

print(pics[['user_id', 'Image_Description', 'Is_Advertisement']].head(3))

                 user_id                                   Full_Description  \
0  101856865551768948430  The image shows a close-up of a rusty motorcyc...   
1  107787438275893909028  The image shows a family holding a flag of Col...   
2  108564695687320799964  The image shows a person skiing in a snowy lan...   

  Is_Advertisement  
0               No  
1              Yes  
2               No  


In [None]:
# Use .loc to avoid SettingWithCopyWarning
pics_test.loc[:, 'Key_Words'] = ""

# Prompt template
PROMPT = """You are an image analyst. 
Describe this image in detail in the following structured format only:

- Setting: <indoor/outdoor, type of place>
- Objects: <things visible>
- Business type: <restaurant, shop, etc., best guess or N/A>
- Activities: <what is happening>
- Text: <any text visible in the image or N/A>

Do not include any extra commentary, questions, or greetings.  
If unsure about a category, write 'N/A'.
Here is the image: <image>
"""

# Loop through first 3 rows
for idx, row in pics_test.iterrows():
    pics_list = ast.literal_eval(row['pics_collapsed'])  # convert string to list
    descriptions = []

    for i, image_url in enumerate(pics_list, start=1):
        messages = [
            {
                "role": "system",
                "content": [
                    {"type": "text", "text": PROMPT}
                ]
            },
            {
                "role": "user",
                "content": [
                    {"type": "image", "url": image_url},
                    {"type": "text", "text": "Describe the image in the structured format above."}
                ]
            }
        ]

        try:
            output = pipe(text=messages, max_new_tokens=200)
            desc = output[0]["generated_text"][-1]["content"]
            descriptions.append(f"--- Image {i} ---\n{desc}")
        except Exception as e:
            descriptions.append(f"--- Image {i} ---\nN/A (Error processing image: {e})")

    # Combine multiple images into a single string per review
    pics_test.loc[idx, 'Key_Words'] = "\n\n".join(descriptions)

# View results
for idx, row in pics_test.iterrows():
    print(f"Review {idx}:")
    print(row['Key_Words'])
    print("="*80)

Review 0:
--- Image 1 ---
- Setting: Outdoor, garage
- Objects: Brake rotor, bolts, wheel hub, tire, concrete floor
- Business type: N/A
- Activities: Maintenance or repair
- Text: N/A

--- Image 2 ---
- Setting: Indoor
- Objects: Hand, bolt, metal, thread
- Business type: N/A
- Activities: N/A
- Text: N/A
Review 1:
--- Image 1 ---
- Setting: Outdoor, roadside
- Objects: Four people, a large flag, a van, a building, trees, a road, a utility pole
- Business type: N/A
- Activities: Group photo
- Text: “FROM RECOVERY STATE”

--- Image 2 ---
- Setting: Indoor
- Objects: Sign, purple fabric, logos (Recovery Support, Readdy, Phoenix Recovery, and Quantum Health)
- Business type: N/A
- Activities: N/A
- Text: RECOVERY SUPPORT
Review 2:
--- Image 1 ---
- Setting: Outdoor, ski resort
- Objects: Person, skis, snow, trees, buildings, ski lift
- Business type: N/A
- Activities: Skiing, taking a photo
- Text: N/A

--- Image 2 ---
- Setting: Outdoor, snow-covered landscape
- Objects: Snow, trees, sk

In [None]:
print(pics_test[['user_id', 'Image_Description', 'Key_Words','Is_Advertisement']])

                 user_id Full_Description  \
0  101856865551768948430                    
1  107787438275893909028                    
2  108564695687320799964                    

                                           Key_Words Is_Advertisement  
0  --- Image 1 ---\n- Setting: Outdoor, garage\n-...               No  
1  --- Image 1 ---\n- Setting: Outdoor, roadside\...              Yes  
2  --- Image 1 ---\n- Setting: Outdoor, ski resor...               No  
