In [1]:
import pandas as pd
import requests
import gzip
import json
import io

Read the review data (gzipped JSON lines) into a DataFrame

In [2]:
# Download the gzipped JSON-lines review dump and parse it into a DataFrame.
url = 'https://mcauleylab.ucsd.edu/public_datasets/gdrive/googlelocal/review-Vermont_10.json.gz'

# timeout=(connect, read) seconds: without it a stalled connection would hang
# the cell indefinitely. stream=True is not used because `.content` below
# reads the whole body into memory anyway.
response = requests.get(url, timeout=(10, 120))
response.raise_for_status()

# Every non-blank line of the decompressed payload is one JSON record;
# blank lines are skipped so they cannot raise a JSONDecodeError.
with gzip.GzipFile(fileobj=io.BytesIO(response.content), mode='rb') as gz_file:
    data_list = [json.loads(line) for line in gz_file if line.strip()]

vt = pd.DataFrame(data_list)

In [3]:
# Sanity check of the review frame: dimensions first, then column labels.
for summary in (vt.shape, vt.columns):
    print(summary)
# print(vt.head())

(324725, 8)
Index(['user_id', 'name', 'time', 'rating', 'text', 'pics', 'resp', 'gmap_id'], dtype='object')


In [4]:
# Download the gzipped JSON-lines business metadata and parse it into a DataFrame.
url_metadata = 'https://mcauleylab.ucsd.edu/public_datasets/gdrive/googlelocal/meta-Vermont.json.gz'

# timeout=(connect, read) seconds: without it a stalled connection would hang
# the cell indefinitely. stream=True is not used because `.content` below
# reads the whole body into memory anyway.
response_metadata = requests.get(url_metadata, timeout=(10, 120))
response_metadata.raise_for_status()

# Every non-blank line of the decompressed payload is one JSON record;
# blank lines are skipped so they cannot raise a JSONDecodeError.
with gzip.GzipFile(fileobj=io.BytesIO(response_metadata.content), mode='rb') as gz_file:
    data_list1 = [json.loads(line) for line in gz_file if line.strip()]

vt_metadata = pd.DataFrame(data_list1)

In [5]:
# Sanity check of the metadata frame: dimensions first, then column labels.
for summary in (vt_metadata.shape, vt_metadata.columns):
    print(summary)
# print(vt_metadata.head())

(11291, 15)
Index(['name', 'address', 'gmap_id', 'description', 'latitude', 'longitude',
       'category', 'avg_rating', 'num_of_reviews', 'price', 'hours', 'MISC',
       'state', 'relative_results', 'url'],
      dtype='object')


Data Cleaning

For business data
Drop the 'address', 'latitude', 'longitude', 'avg_rating', 'num_of_reviews', 'price', 'hours', 'MISC', 'state' and 'relative_results' columns, lower-case the column names, collapse runs of whitespace in 'description' and 'category', and drop duplicates (around 50 duplicates)

In [6]:
# Show the metadata frame as it looks before cleaning.
print(vt_metadata.shape)
print(vt_metadata.columns)

# Drop the columns that are not used later. errors='ignore' keeps the drop
# from raising KeyError when this cell is re-run after the columns are gone.
vt_metadata = vt_metadata.drop(
    columns=['address', 'latitude', 'longitude', 'avg_rating', 'num_of_reviews',
             'price', 'hours', 'MISC', 'state', 'relative_results'],
    errors='ignore',
)
vt_metadata.columns = vt_metadata.columns.str.lower()

# Collapse every run of whitespace (spaces, tabs, newlines) into one space.
vt_metadata['description'] = vt_metadata['description'].str.replace(r'\s+', ' ', regex=True)
# NOTE(review): `.str.replace` returns NaN for non-string cells. If `category`
# holds lists rather than plain strings this line blanks the column - confirm.
vt_metadata['category'] = vt_metadata['category'].str.replace(r'\s+', ' ', regex=True)

# A business is a duplicate when both its name and gmap_id repeat
# (the original subset listed 'gmap_id' twice, which was redundant).
vt_metadata = vt_metadata.drop_duplicates(subset=['name', 'gmap_id'])

(11291, 15)
Index(['name', 'address', 'gmap_id', 'description', 'latitude', 'longitude',
       'category', 'avg_rating', 'num_of_reviews', 'price', 'hours', 'MISC',
       'state', 'relative_results', 'url'],
      dtype='object')


For review data: drop the 'name' and 'resp' columns, lower-case the column names, collapse runs of whitespace in 'text', and drop duplicates (~3000 duplicates)

In [7]:
# Drop the columns that are not used later. errors='ignore' keeps the drop
# from raising KeyError when this cell is re-run after the columns are gone.
vt = vt.drop(columns=['name', 'resp'], errors='ignore')
vt.columns = vt.columns.str.lower()

# Collapse every run of whitespace (spaces, tabs, newlines) into one space.
vt['text'] = vt['text'].str.replace(r'\s+', ' ', regex=True)

# A review is a duplicate when user, text, place and timestamp all repeat.
vt = vt.drop_duplicates(subset=['user_id', 'text', 'gmap_id', 'time'])

Flatten the 'pics' column: merge each review's picture dictionaries into a single list of picture URLs, stored in a new 'pics_collapsed' column

In [8]:
# vt_pics = vt[vt['pics'].notna()]
# print(vt_pics.head())
# print(vt_pics['pics'])
# import pprint
# pprint.pprint(vt_pics.loc[17, 'pics'])

In [9]:
def collapse_pics(pic_list):
    """Flatten a review's picture entries into one list of URLs.

    Parameters
    ----------
    pic_list : list of dict, or a missing value
        Each dict is expected to hold its URL(s) under the 'url' key,
        either as a list of strings or as a single string. Anything that
        is not a list or tuple (None, float NaN, ...) means "no pictures".

    Returns
    -------
    list
        Every URL found, in the order encountered; [] when there are none.
    """
    # Missing cells may be None or float NaN. NaN is truthy, so a plain
    # `not pic_list` guard would let it through and the loop would raise.
    if not isinstance(pic_list, (list, tuple)):
        return []
    urls = []
    for pic_dict in pic_list:
        if not isinstance(pic_dict, dict):
            continue
        found = pic_dict.get('url') or []  # tolerate a missing or None 'url'
        if isinstance(found, str):
            # A bare string is one URL; extend() would split it into characters.
            urls.append(found)
        else:
            urls.extend(found)
    return urls

# Build the flattened URL list for every review, row by row.
vt['pics_collapsed'] = [collapse_pics(pics) for pics in vt['pics']]

# print(vt['pics_collapsed'].iloc[17])

Merge by gmap id

In [10]:
# Inner join: keep only reviews whose gmap_id also appears in the metadata.
vt_merged = vt.merge(vt_metadata, how='inner', on='gmap_id')

Separate dataset into vt1 (rating only) and vt2 (everything else)

In [11]:
# A review is "rating only" when it has neither pictures nor text.
rating_only = vt_merged['pics'].isna() & vt_merged['text'].isna()

# The raw 'pics' column is dropped from both halves of the split.
vt1 = vt_merged[rating_only].drop(columns='pics')   # rating only
vt2 = vt_merged[~rating_only].drop(columns='pics')  # everything else

for part in (vt1, vt2):
    print(part.shape)

(145565, 10)
(175918, 10)


In [12]:
# Only the text/picture subset is exported; the other exports stay disabled.
# index=True (the pandas default) is spelled out because the index is written
# as the first CSV column.
# vt1.to_csv('vermont_merged_rating only.csv')
vt2.to_csv('vermont_merged_text_and_or_review.csv', index=True)
# vt.to_csv('vermont.csv')
# vt_merged.to_csv('vermont_merged.csv')
# vt_metadata.to_csv('vermont_metadata.csv')

Detecting advertisements from images

In [13]:
!pip3 install -q -U transformers
!pip3 install -q -U torch
!pip3 install -q -U huggingface_hub
!pip3 install -q -U dotenv

In [14]:
from huggingface_hub import login
import os
from dotenv import load_dotenv

# Read a local .env file (if one exists) so HF_TOKEN can be kept out of the
# notebook. load_dotenv was imported but never called before. By default it
# does not override variables already set in the environment.
load_dotenv()

hf_token = os.getenv("HF_TOKEN")
login(token=hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [15]:
from transformers import pipeline
import torch

# Use Apple's Metal backend when it is available, otherwise run on the CPU.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")


model_id = "google/gemma-3-4b-it"

# Image + text in, text out; weights are loaded in bfloat16.
pipe = pipeline(
    task="image-text-to-text",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device=device,
)

Using device: mps


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use mps


In [16]:
# System turn: sets the persona. The stricter answer format is kept disabled:
#   "Always respond in two clearly labeled sections:\n"
#   "Answer: <short, factual yes/no/N/A>\n"
#   "Rationale: <explain based on visible features>"
system_turn = {
    "role": "system",
    "content": [
        {
            "type": "text",
            "text": "You are a potential customer looking at Google review images. ",
        }
    ],
}

# User turn: one review image plus the instruction to describe it.
user_turn = {
    "role": "user",
    "content": [
        {
            "type": "image",
            "url": "https://lh5.googleusercontent.com/p/AF1QipMBzN4BJV9YCObcw_ifNzFPm-u38hO3oimOA8Fb=w150-h150-k-no-p",
        },
        {"type": "text", "text": "Describe the image in detail."},
    ],
}

messages = [system_turn, user_turn]

output = pipe(text=messages, max_new_tokens=128)

# The last entry of the returned conversation is the model's reply.
reply = output[0]["generated_text"][-1]
print(reply["content"])

Okay, here’s a detailed description of the image I’m seeing:

**Overall Impression:** The image shows the reception or waiting area of what appears to be a modern business or service location. It has a clean, somewhat minimalist aesthetic.

**Key Elements:**

*   **Ceiling:** The ceiling is a light grey with a slight textured finish. There’s a noticeable angled section, suggesting a vaulted or slightly raised ceiling. Several recessed lights are also visible.
*   **Lighting:** The most striking feature is a set of four pendant lights hanging from the ceiling. They have a modern, geometric design with a


In [26]:
# Reload the exported CSV and discard its first column.
data = pd.read_csv('vermont_merged_text_and_or_review.csv')
data = data.drop(columns=data.columns[0])

# Column labels first, then a preview of the first rows.
for preview in (data.columns, data.head()):
    print(preview)

Index(['user_id', 'time', 'rating', 'text', 'gmap_id', 'pics_collapsed',
       'name', 'description', 'category', 'url'],
      dtype='object')
                 user_id           time  rating  \
0  118026874392842649478  1620085852324       5   
1  101532740754036204131  1580309946474       5   
2  115404122636203550540  1605195974445       5   
3  104789336434407408181  1593005848256       5   
4  108980665975608069965  1582059996120       5   

                                                text  \
0      Always done right from wood stove to screens!   
1  A great company to work with. Their sales and ...   
2  Great place to do business with staff was grea...   
3  Awesome Customer service, quick response, and ...   
4  If you need a top quality job, by a group of p...   

                                 gmap_id pics_collapsed         name  \
0  0x89e02445cb9db457:0x37f42bff4edf7a43             []  Royal Group   
1  0x89e02445cb9db457:0x37f42bff4edf7a43             []  Royal Grou