In [1]:
import gzip
import io
import json
import re

import pandas as pd
import requests

Read the initial JSON files

Read Review Data

In [2]:
# Source of the review records: a gzip-compressed file holding one JSON document per line.
url = 'https://mcauleylab.ucsd.edu/public_datasets/gdrive/googlelocal/review-Vermont_10.json.gz'

# (connect, read) timeouts so a stalled server cannot hang the kernel forever.
# The whole body is buffered via `.content` below, so `stream=True` is not needed.
response = requests.get(url, timeout=(10, 120))
response.raise_for_status()

# Decompress in memory and parse each non-blank line as one JSON record.
with gzip.GzipFile(fileobj=io.BytesIO(response.content), mode='rb') as gz_file:
    data_list = [json.loads(line) for line in gz_file if line.strip()]

vt = pd.DataFrame(data_list)

In [3]:
# Sanity check on the loaded reviews: (rows, columns) first, then the column index.
for summary in (vt.shape, vt.columns):
    print(summary)
# print(vt.head())

(324725, 8)
Index(['user_id', 'name', 'time', 'rating', 'text', 'pics', 'resp', 'gmap_id'], dtype='object')


Read Metadata

In [4]:
# Source of the business metadata: a gzip-compressed file holding one JSON document per line.
url_metadata = 'https://mcauleylab.ucsd.edu/public_datasets/gdrive/googlelocal/meta-Vermont.json.gz'

# (connect, read) timeouts so a stalled server cannot hang the kernel forever.
# The whole body is buffered via `.content` below, so `stream=True` is not needed.
response_metadata = requests.get(url_metadata, timeout=(10, 120))
response_metadata.raise_for_status()

# Decompress in memory and parse each non-blank line as one JSON record.
with gzip.GzipFile(fileobj=io.BytesIO(response_metadata.content), mode='rb') as gz_file:
    data_list1 = [json.loads(line) for line in gz_file if line.strip()]

vt_metadata = pd.DataFrame(data_list1)

In [5]:
# Sanity check on the loaded metadata: (rows, columns) on one line, column index on the next.
print(vt_metadata.shape, vt_metadata.columns, sep='\n')
# print(vt_metadata.head())

(11291, 15)
Index(['name', 'address', 'gmap_id', 'description', 'latitude', 'longitude',
       'category', 'avg_rating', 'num_of_reviews', 'price', 'hours', 'MISC',
       'state', 'relative_results', 'url'],
      dtype='object')


Data Cleaning

Columns to keep:
Review data: 'user_id', 'time', 'rating', 'text', 'pics', 'resp', 'gmap_id'
Metadata: 'name', 'gmap_id', 'description', 'category', 'url'

Cleaning steps:
Lowercase
Drop duplicates
Collapse runs of whitespace into a single space: .str.replace(r'\s+', ' ', regex=True)

Review Data

In [6]:
# Clean the review frame in a single chain so the cell can be re-run safely:
# errors='ignore' stops the drop from raising once 'name' has already been removed.
vt = (
    vt
    .drop(columns=['name'], errors='ignore')
    # Normalise column labels to lower case.
    .rename(columns=str.lower)
    # Collapse every run of whitespace (spaces, tabs, newlines) into one space.
    .assign(text=lambda frame: frame['text'].str.replace(r'\s+', ' ', regex=True))
    # Rows count as duplicates when user, text, place and timestamp all match.
    .drop_duplicates(subset=['user_id', 'text', 'gmap_id', 'time'])
)

In [7]:
def collapse_pics(pic_list):
    """Flatten one review's `pics` value into a flat list of URLs.

    Parameters
    ----------
    pic_list : list or tuple of dict, or a missing value
        Each dict may carry a 'url' entry holding either a list of URLs or
        a single URL string. Anything that is not a list/tuple (None, NaN)
        is treated as "no pictures".

    Returns
    -------
    list
        Every URL found, in input order; an empty list when there are none.
    """
    # Missing values can arrive as None or float NaN. NaN is truthy, so a plain
    # `not pic_list` check would let it through and fail on iteration.
    if not isinstance(pic_list, (list, tuple)):
        return []

    urls = []
    for pic_dict in pic_list:
        if not isinstance(pic_dict, dict):
            continue
        url = pic_dict.get('url') or []
        if isinstance(url, str):
            # A bare string must be appended whole; extend() would split it into characters.
            urls.append(url)
        else:
            urls.extend(url)
    return urls

# Replace the nested 'pics' column with a flat list of URLs per review.
# Guarded on the column's presence so re-running the cell is a no-op instead of a KeyError.
if 'pics' in vt.columns:
    vt = (
        vt
        .assign(pics_collapsed=vt['pics'].apply(collapse_pics))
        .drop(columns='pics')
    )

In [None]:
def extract_texts(resp_str):
    """Return the reply text(s) contained in one `resp` value.

    Parameters
    ----------
    resp_str : dict, str, list, tuple, or a missing value
        * dict: the value under its 'text' key is returned.
        * str: every `"text": "..."` pair is extracted and the matches are
          joined with single spaces.
        * list/tuple: each element is processed by these rules and the
          non-empty results are joined with single spaces.
        * anything else (None, NaN): treated as "no reply".

    Returns
    -------
    str
        The extracted text, or an empty string when nothing is found.
    """
    if isinstance(resp_str, dict):
        text = resp_str.get('text')
        return text if isinstance(text, str) else ""
    if isinstance(resp_str, str):
        return " ".join(re.findall(r'"text":\s*"([^"]*)"', resp_str))
    if isinstance(resp_str, (list, tuple)):
        parts = (extract_texts(item) for item in resp_str)
        return " ".join(part for part in parts if part)
    # None / NaN and any other type: no reply to extract.
    return ""

# 'resp' is a column of the reviews frame `vt`; no `df` is defined in this notebook.
vt["resp_text"] = vt["resp"].apply(extract_texts)

In [8]:
# print(vt.head())

                 user_id           time  rating  \
0  118026874392842649478  1620085852324       5   
1  101532740754036204131  1580309946474       5   
2  115404122636203550540  1605195974445       5   
3  104789336434407408181  1593005848256       5   
4  108980665975608069965  1582059996120       5   

                                                text  \
0      Always done right from wood stove to screens!   
1  A great company to work with. Their sales and ...   
2  Great place to do business with staff was grea...   
3  Awesome Customer service, quick response, and ...   
4  If you need a top quality job, by a group of p...   

                                                resp  \
0  {'time': 1620087641504, 'text': 'Good Evening,...   
1  {'time': 1580320228721, 'text': 'Good Afternoo...   
2  {'time': 1605195166792, 'text': 'Hi Chad!

Tha...   
3  {'time': 1593376422014, 'text': 'Mark, thank y...   
4  {'time': 1582063833737, 'text': 'Good Afternoo...   

                   