In [20]:
import pandas as pd
import requests
import gzip
import json
import io
import numpy as np
import re

Read the initial json files

Read Review Data

In [2]:
url = 'https://mcauleylab.ucsd.edu/public_datasets/gdrive/googlelocal/review-Vermont_10.json.gz'

response = requests.get(url, stream = True)
response.raise_for_status() 

with gzip.GzipFile(fileobj = io.BytesIO(response.content), mode = 'rb') as gz_file:
    data_list = [json.loads(line) for line in gz_file]

vt = pd.DataFrame(data_list)

In [3]:
print(vt.shape)
print(vt.columns)
# print(vt.head())

(324725, 8)
Index(['user_id', 'name', 'time', 'rating', 'text', 'pics', 'resp', 'gmap_id'], dtype='object')


Read Metadata

In [4]:
url_metadata = 'https://mcauleylab.ucsd.edu/public_datasets/gdrive/googlelocal/meta-Vermont.json.gz'

response_metadata = requests.get(url_metadata, stream = True)
response_metadata.raise_for_status() 

with gzip.GzipFile(fileobj = io.BytesIO(response_metadata.content), mode = 'rb') as gz_file:
    data_list1 = [json.loads(line) for line in gz_file]

vt_metadata = pd.DataFrame(data_list1)

In [5]:
print(vt_metadata.shape)
print(vt_metadata.columns)
# print(vt_metadata.head())

(11291, 15)
Index(['name', 'address', 'gmap_id', 'description', 'latitude', 'longitude',
       'category', 'avg_rating', 'num_of_reviews', 'price', 'hours', 'MISC',
       'state', 'relative_results', 'url'],
      dtype='object')


Data Cleaning

Keep columns:
Data; 'user_id', 'time', 'rating', 'text', 'pics', 'resp', 'gmap_id'
Metadata; 'name', 'gmap_id', 'description', 'category', 'url'

Cleaning:
Lower case
Drop duplicates
.str.replace(r'\s+', ' ', regex=True)

Review Data

In [None]:
vt = vt.drop('name', axis = 1)
vt.columns = vt.columns.str.lower() 
vt['text'] = vt['text'].str.replace(r'\s+', ' ', regex = True)
vt = vt.drop_duplicates(subset = ['user_id', 'text', 'gmap_id', 'time'])

In [7]:
def collapse_pics(pic_list):
    if not pic_list:
        return []  
    urls = []
    for pic_dict in pic_list:
        urls.extend(pic_dict.get('url', []))
    return urls

vt['pics_collapsed'] = vt['pics'].apply(collapse_pics)
vt = vt.drop('pics', axis = 1)

In [None]:
def extract_texts(resp_entry):
    if isinstance(resp_entry, dict):
        # single response dict
        return resp_entry.get("text", "")
    elif isinstance(resp_entry, list):
        # list of response dicts
        return " ".join([d.get("text", "") for d in resp_entry if isinstance(d, dict)])
    elif isinstance(resp_entry, str):
        # fallback: extract with regex if it's a string
        texts = re.findall(r'"text":\s*"([^"]*)"', resp_entry)
        return " ".join(texts)
    else:
        return ""
    
vt["resp_collapsed"] = vt["resp"].apply(extract_texts)
vt = vt.drop("resp", axis = 1)

In [9]:
print(vt.head())

                 user_id           time  rating  \
0  118026874392842649478  1620085852324       5   
1  101532740754036204131  1580309946474       5   
2  115404122636203550540  1605195974445       5   
3  104789336434407408181  1593005848256       5   
4  108980665975608069965  1582059996120       5   

                                                text  \
0      Always done right from wood stove to screens!   
1  A great company to work with. Their sales and ...   
2  Great place to do business with staff was grea...   
3  Awesome Customer service, quick response, and ...   
4  If you need a top quality job, by a group of p...   

                                 gmap_id pics_collapsed  \
0  0x89e02445cb9db457:0x37f42bff4edf7a43             []   
1  0x89e02445cb9db457:0x37f42bff4edf7a43             []   
2  0x89e02445cb9db457:0x37f42bff4edf7a43             []   
3  0x89e02445cb9db457:0x37f42bff4edf7a43             []   
4  0x89e02445cb9db457:0x37f42bff4edf7a43             []   

 

Metadata 

In [None]:
vt_metadata = vt_metadata.drop(['address', 'latitude', 'longitude', 'avg_rating', 'num_of_reviews', 'price', 'hours', 'MISC', 'state', 'relative_results'], axis = 1)
vt_metadata.columns = vt_metadata.columns.str.lower() 
vt_metadata['description'] = vt_metadata['description'].str.replace(r'\s+', ' ', regex = True)
vt_metadata['category'] = vt_metadata['category'].str.replace(r'\s+', ' ', regex = True)
vt_metadata = vt_metadata.drop_duplicates(subset = ['name','gmap_id', 'gmap_id'])

In [11]:
print(vt_metadata.head())

                       name                                gmap_id  \
0               Royal Group  0x89e02445cb9db457:0x37f42bff4edf7a43   
1  Foxglove Farm and Forest  0x4cb549e8877cf0d7:0xe8f003e6d73392ae   
2              Carr's Gifts  0x4cb54a301f3518f7:0x39af4eda1efb9117   
3                     Midas  0x89e024446398691f:0x4011d3a0f8636036   
4             Keyser Energy  0x89e0247d160d7263:0x32e4f01896e33f3b   

  description  category                                                url  
0        None       NaN  https://www.google.com/maps/place//data=!4m2!3...  
1        None       NaN  https://www.google.com/maps/place//data=!4m2!3...  
2        None       NaN  https://www.google.com/maps/place//data=!4m2!3...  
3        None       NaN  https://www.google.com/maps/place//data=!4m2!3...  
4        None       NaN  https://www.google.com/maps/place//data=!4m2!3...  


Convert Metadata into csv for scraping: 
- scrape our own data to fill in blanks for description
- scrape our own data to fill in blanks for category

In [12]:
vt_metadata.to_csv('vt_metadata.csv')

Scraping Process ...

Merge the Review data and metadata by gmap_id

In [None]:
vt_merged = pd.merge(vt, vt_metadata, on = 'gmap_id', how = 'inner')
vt_merged = vt_merged.drop('gmap_id', axis = 1)
vt_merged['review_id'] = range(len(vt_merged))
column = vt_merged.pop('review_id')
vt_merged.insert(0, 'review_id', column)


   review_id                user_id           time  rating  \
0          0  118026874392842649478  1620085852324       5   
1          1  101532740754036204131  1580309946474       5   

                                                text pics_collapsed  \
0      Always done right from wood stove to screens!             []   
1  A great company to work with. Their sales and ...             []   

                                      resp_collapsed         name description  \
0  Good Evening, Rebecca! Thanks SO much for the ...  Royal Group        None   
1  Good Afternoon, Peter - Really appreciate the ...  Royal Group        None   

   category                                                url  
0       NaN  https://www.google.com/maps/place//data=!4m2!3...  
1       NaN  https://www.google.com/maps/place//data=!4m2!3...  


In [None]:
vt_merged['image_Keywords'] = np.nan
vt_merged['is_image_ad'] = np.nan
vt_merged['is_image_irrelevant'] = np.nan
vt_merged['text_keywords'] = np.nan
vt_merged['is_text_ad'] = np.nan
vt_merged['is_text_irrelevant'] = np.nan
vt_merged['is_text_rant'] = np.nan

print(vt_merged.columns)

   review_id                user_id           time  rating  \
0          0  118026874392842649478  1620085852324       5   
1          1  101532740754036204131  1580309946474       5   
2          2  115404122636203550540  1605195974445       5   
3          3  104789336434407408181  1593005848256       5   

                                                text pics_collapsed  \
0      Always done right from wood stove to screens!             []   
1  A great company to work with. Their sales and ...             []   
2  Great place to do business with staff was grea...             []   
3  Awesome Customer service, quick response, and ...             []   

                                      resp_collapsed         name description  \
0  Good Evening, Rebecca! Thanks SO much for the ...  Royal Group        None   
1  Good Afternoon, Peter - Really appreciate the ...  Royal Group        None   
2  Hi Chad!\n\nThank you so much for the 5-Star r...  Royal Group        None   
3  Mark, 

In [21]:
vt_merged.to_csv('vt_merged.csv')

Sample dataset (add labelling columns)

In [2]:
!pip3 install -q -U transformers
!pip3 install -q -U torch
!pip3 install -q -U huggingface_hub
!pip3 install -q -U dotenv

In [3]:
from huggingface_hub import login
import os
from dotenv import load_dotenv

hf_token = os.getenv("HF_TOKEN")
login(token=hf_token)

HTTPError: Invalid user token. The token from HF_TOKEN environment variable is invalid. Note that HF_TOKEN takes precedence over `hf auth login`.

In [None]:
from transformers import pipeline
import torch

device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")

model_id = "google/gemma-3-4b-it"

pipe = pipeline(
    "image-text-to-text",
    model=model_id,
    device=device,          
    torch_dtype=torch.bfloat16
)

For is_image_ad column