In [12]:
import pandas as pd
import requests
import gzip
import json
import io

Read in the gzipped JSON-lines review file (one JSON object per line)

In [13]:
url = 'https://mcauleylab.ucsd.edu/public_datasets/gdrive/googlelocal/review-Vermont_10.json.gz'

# Download the whole gzipped archive into memory. `stream=True` is not used
# because `.content` reads the full body anyway. The (connect, read) timeout
# stops a stalled server from hanging the kernel indefinitely.
response = requests.get(url, timeout=(10, 120))
response.raise_for_status()

# The decompressed payload is parsed line by line, one JSON object per line.
# Blank lines are skipped so they cannot crash json.loads.
with gzip.GzipFile(fileobj=io.BytesIO(response.content), mode='rb') as gz_file:
    data_list = [json.loads(line) for line in gz_file if line.strip()]

# One DataFrame row per parsed JSON object.
vt = pd.DataFrame(data_list)

In [14]:
# Sanity check of the loaded reviews: dimensions, column labels, then the
# first five rows, each printed on its own line(s).
print(vt.shape, vt.columns, vt.head(), sep='\n')

(324725, 8)
Index(['user_id', 'name', 'time', 'rating', 'text', 'pics', 'resp', 'gmap_id'], dtype='object')
                 user_id             name           time  rating  \
0  118026874392842649478    rebecca kerns  1620085852324       5   
1  101532740754036204131    Peter DeForge  1580309946474       5   
2  115404122636203550540    Chad Goulette  1605195974445       5   
3  104789336434407408181  Mark LaFountain  1593005848256       5   
4  108980665975608069965           Jeff R  1582059996120       5   

                                                text  pics  \
0      Always done right from wood stove to screens!  None   
1  A great company to work with.  Their sales and...  None   
2  Great place to do business with staff was grea...  None   
3  Awesome Customer service, quick response, and ...  None   
4  If you need a top quality job, by a group of p...  None   

                                                resp  \
0  {'time': 1620087641504, 'text': 'Good Evening,...  

In [45]:
url_metadata = 'https://mcauleylab.ucsd.edu/public_datasets/gdrive/googlelocal/meta-Vermont.json.gz'

# Download the whole gzipped archive into memory. `stream=True` is not used
# because `.content` reads the full body anyway. The (connect, read) timeout
# stops a stalled server from hanging the kernel indefinitely.
response_meta = requests.get(url_metadata, timeout=(10, 120))
response_meta.raise_for_status()

# The decompressed payload is parsed line by line, one JSON object per line.
# Blank lines are skipped so they cannot crash json.loads.
with gzip.GzipFile(fileobj=io.BytesIO(response_meta.content), mode='rb') as gz_file:
    data_list1 = [json.loads(line) for line in gz_file if line.strip()]

# One DataFrame row per parsed JSON object.
vt_business = pd.DataFrame(data_list1)

In [46]:
# Preview the first five rows of the business metadata.
print(vt_business.head(n=5))

                       name  \
0               Royal Group   
1  Foxglove Farm and Forest   
2              Carr's Gifts   
3                     Midas   
4             Keyser Energy   

                                             address  \
0  Royal Group, 150 Woodstock Ave, Rutland, VT 05701   
1  Foxglove Farm and Forest, 777 Delorm Rd, Leice...   
2      Carr's Gifts, 21 Center St, Brandon, VT 05733   
3  Midas, 207 US Route 4 E, Woodstock Ave, Rutlan...   
4  Keyser Energy, 77 Grove St Suite G102, Rutland...   

                                 gmap_id description   latitude  longitude  \
0  0x89e02445cb9db457:0x37f42bff4edf7a43        None  43.615537 -72.960497   
1  0x4cb549e8877cf0d7:0xe8f003e6d73392ae        None  43.855743 -73.088180   
2  0x4cb54a301f3518f7:0x39af4eda1efb9117        None  43.799002 -73.088269   
3  0x89e024446398691f:0x4011d3a0f8636036        None  43.623157 -72.953303   
4  0x89e0247d160d7263:0x32e4f01896e33f3b        None  43.611359 -72.981580   

       

For business data
Drop the 'address', 'latitude', 'longitude', 'avg_rating', 'num_of_reviews', 'price', 'hours', 'MISC', 'state', and 'relative_results' columns, convert the column names to lower case, and drop duplicates (around 50 duplicates)

In [47]:
# Discard the columns that are not needed downstream.
vt_business = vt_business.drop(
    columns=[
        'address', 'latitude', 'longitude', 'avg_rating', 'num_of_reviews',
        'price', 'hours', 'MISC', 'state', 'relative_results',
    ]
)
# Lower-case the column labels (the cell values are left untouched).
vt_business.columns = vt_business.columns.str.lower()
print(vt_business.head(), vt_business.shape, sep='\n')

                       name                                gmap_id  \
0               Royal Group  0x89e02445cb9db457:0x37f42bff4edf7a43   
1  Foxglove Farm and Forest  0x4cb549e8877cf0d7:0xe8f003e6d73392ae   
2              Carr's Gifts  0x4cb54a301f3518f7:0x39af4eda1efb9117   
3                     Midas  0x89e024446398691f:0x4011d3a0f8636036   
4             Keyser Energy  0x89e0247d160d7263:0x32e4f01896e33f3b   

  description                                           category  \
0        None  [Security system supplier, Fire protection equ...   
1        None                  [Indoor lodging, Farm, Gift shop]   
2        None                                        [Gift shop]   
3        None  [Auto repair shop, Brake shop, Muffler shop, O...   
4        None  [Heating oil supplier, Air conditioning contra...   

                                                 url  
0  https://www.google.com/maps/place//data=!4m2!3...  
1  https://www.google.com/maps/place//data=!4m2!3...  
2  ht

In [48]:
# Rows count as duplicates when both 'name' and 'gmap_id' repeat; the first
# occurrence of each pair is the one retained.
vt_business = vt_business.drop_duplicates(subset=['name', 'gmap_id'], keep='first')
print(vt_business.shape)

(11243, 5)


For review data
Drop the 'name' column, convert the column names to lower case, and drop duplicates (~3,000 duplicates)

In [17]:
# Remove the 'name' column and lower-case the remaining column labels.
vt = vt.drop(columns=['name'])
vt.columns = vt.columns.str.lower()
# Rows sharing the same user, text and business are duplicates; the first
# occurrence is kept.
vt = vt.drop_duplicates(subset=['user_id', 'text', 'gmap_id'], keep='first')
print(vt.shape)

(321483, 7)


Change the format of the 'pics' column: merge each review's list of picture dictionaries into a single flat list of picture URLs

In [55]:
# vt_pics = vt[vt['pics'].notna()]
# print(vt_pics.head())
# print(vt_pics['pics'])
# import pprint
# pprint.pprint(vt_pics.loc[17, 'pics'])

In [58]:
def collapse_pics(pic_list):
    """Flatten one ``pics`` entry into a single flat list of URLs.

    Parameters
    ----------
    pic_list : iterable of dict, None, or NaN
        Each dict is looked up under the ``'url'`` key. The value may be a
        list of URLs, a single URL string, or missing/``None``.

    Returns
    -------
    list
        All URLs in the order encountered; an empty list when the entry is
        missing (``None``, NaN) or empty.
    """
    # A float NaN is truthy, so it must be caught explicitly; otherwise the
    # loop below would fail trying to iterate over a float.
    is_nan = isinstance(pic_list, float) and pic_list != pic_list
    if is_nan or not pic_list:
        return []

    urls = []
    for pic_dict in pic_list:
        url_value = pic_dict.get('url') or []
        if isinstance(url_value, str):
            # A bare string is one URL; extend() would split it into characters.
            urls.append(url_value)
        else:
            urls.extend(url_value)
    return urls

# Apply collapse_pics to every 'pics' entry and store the result in a new column.
vt['pics_collapsed'] = vt['pics'].map(collapse_pics)

# print(vt['pics_collapsed'].iloc[17])

Average rating based on rating-only file

In [None]:
# Mean of 'rating' within each 'gmap_id' group, computed over all rows of vt.
average_ratings = vt['rating'].groupby(vt['gmap_id']).mean()
# Look the group mean up for every row so each review carries its group's mean.
vt['average_rating'] = vt['gmap_id'].map(average_ratings)
print(vt.head(n=5))

rating                                  1   2   3   4    5   average
gmap_id                                                             
0x4054251f453dde0d:0x6fb60c932a35686    0   0   1   6   12  4.578947
0x405426dbc2683d73:0x5540daa9269c9598   1   1   1   6   18  4.444444
0x4056961885a0f575:0x9a02c2199f695127   0   2   0   4   17  4.565217
0x40569854833a734b:0xa0aa4d4cc55b5ac   12  10  38  99  113  4.069853
0x405726108eb74aa1:0x7acdaadaff59d35c   5   0   1   5   34  4.400000


Separate dataset into vt1 (rating only) and vt2 (everything else)

In [None]:
# Rating-only reviews: rows where both 'pics' and 'text' are missing.
vt1 = vt[vt[['pics', 'text']].isna().all(axis=1)]
# Everything else: the rows of vt whose index labels do not appear in vt1.
vt2 = vt.drop(index=vt1.index)

print(vt1.shape, vt2.shape, sep='\n')

(145565, 8)
(175918, 8)


Data Exploration