In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import os
import time
import random

# Read scraped data

## Listings

In [7]:
# Load the scraped listings table from the CSV next to this notebook
# and display it as the cell output.
data = pd.read_csv(filepath_or_buffer='data.csv')
data

Unnamed: 0,locationId,name,url,types,reviews,City,State
0,2556749,Cafe Bella Coffee,/Restaurant_Review-g47177-d2556749-Reviews-Caf...,"['Coffee & Tea', 'Quick Bites']",37,Rio Rancho,New Mexico
1,12361344,Coffee Central,/Restaurant_Review-g44028-d12361344-Reviews-Co...,['Coffee & Tea'],2,Southaven,Mississippi
2,498652,Perugino,/Restaurant_Review-g51862-d498652-Reviews-Peru...,['Coffee & Tea'],67,Eugene,Oregon
3,2343459,Vero Espresso,/Restaurant_Review-g51862-d2343459-Reviews-Ver...,"['Coffee & Tea', 'Quick Bites']",54,Eugene,Oregon
4,1014118,Espresso Roma,/Restaurant_Review-g51862-d1014118-Reviews-Esp...,"['Coffee & Tea', 'Quick Bites']",21,Eugene,Oregon
...,...,...,...,...,...,...,...
4382,27113108,Grizzly Bean Coffee House,/Restaurant_Review-g29948-d27113108-Reviews-Gr...,['Coffee & Tea'],1,Youngstown,Ohio
4383,10507418,Classic Rock Coffee & Kitchen,/Restaurant_Review-g49882-d10507418-Reviews-Cl...,['Coffee & Tea'],3,Mandan,North Dakota
4384,4824290,Cappuccino On Collins,/Restaurant_Review-g49882-d4824290-Reviews-Cap...,['Coffee & Tea'],3,Mandan,North Dakota
4385,7359734,The Bean & Bakery,/Restaurant_Review-g46052-d7359734-Reviews-The...,"['Coffee & Tea', 'Quick Bites']",21,Concord,New Hampshire


## Shops

In [8]:

# Empty frame that will hold one row of rating counts per shop.
ratings_breakdowns = pd.DataFrame(
    columns=[
        'locationId',
        'Excellent',
        'Very_Good',
        'Average',
        'Poor',
        'Terrible',
    ]
)

# Names of every JSON file found directly inside `folder_path`.
# NOTE(review): `folder_path` is not defined in any cell visible here --
# confirm it is set (e.g. in a configuration cell) before this cell runs.
json_files = [
    entry
    for entry in os.listdir(folder_path)
    if entry.endswith('.json')
]

In [9]:
# Build the per-shop rating breakdown from the scraped JSON files.
#
# One plain dict is collected per file and the DataFrame is constructed once
# at the end. `DataFrame.append` was removed in pandas 2.0, and appending row
# by row copies the whole frame on every iteration; building from a list of
# records avoids both problems and yields proper integer columns.
breakdown_columns = ['locationId', 'Excellent', 'Very_Good',
                     'Average', 'Poor', 'Terrible']

breakdown_records = []
for file in json_files:
    with open(os.path.join(folder_path, file), 'r') as json_file:
        json_data = json.load(json_file)

    # Each file holds a list whose first element describes a single location.
    location = json_data[0]['data']['locations'][0]
    rating_counts = location['reviewAggregations']['ratingCounts']

    # `ratingCounts` is read from worst to best: index 0 is the count of
    # "Terrible" reviews and index 4 the count of "Excellent" reviews.
    breakdown_records.append({
        'locationId': location['locationId'],
        'Excellent': rating_counts[4],
        'Very_Good': rating_counts[3],
        'Average': rating_counts[2],
        'Poor': rating_counts[1],
        'Terrible': rating_counts[0],
    })

# Passing `columns` keeps the expected schema even when no JSON file was found.
ratings_breakdowns = pd.DataFrame(breakdown_records, columns=breakdown_columns)

In [10]:
# Inspect the per-shop rating counts collected from the JSON files.
ratings_breakdowns

Unnamed: 0,locationId,Excellent,Very_Good,Average,Poor,Terrible
0,870194,23,0,0,0,0
1,24961448,1,1,1,0,0
2,7255824,8,11,1,0,0
3,18936591,7,3,1,1,0
4,17463543,7,0,0,0,0
...,...,...,...,...,...,...
4374,24108785,1,0,0,0,0
4375,4892729,24,21,11,3,2
4376,4947101,8,1,0,0,0
4377,14789744,5,2,0,0,0


In [11]:
# Attach each shop's rating counts to its listing row. The left join keeps
# every listing; listings without a matching breakdown get NaN in the rating
# columns. Re-running this cell without reloading `data` merges a second time
# and produces suffixed duplicate columns.
data = pd.merge(
    left=data,
    right=ratings_breakdowns,
    how='left',
    on=['locationId'],
)

data

Unnamed: 0,locationId,name,url,types,reviews,City,State,Excellent,Very_Good,Average,Poor,Terrible
0,2556749,Cafe Bella Coffee,/Restaurant_Review-g47177-d2556749-Reviews-Caf...,"['Coffee & Tea', 'Quick Bites']",37,Rio Rancho,New Mexico,18,7,3,5,4
1,12361344,Coffee Central,/Restaurant_Review-g44028-d12361344-Reviews-Co...,['Coffee & Tea'],2,Southaven,Mississippi,1,0,0,1,0
2,498652,Perugino,/Restaurant_Review-g51862-d498652-Reviews-Peru...,['Coffee & Tea'],67,Eugene,Oregon,41,22,2,2,0
3,2343459,Vero Espresso,/Restaurant_Review-g51862-d2343459-Reviews-Ver...,"['Coffee & Tea', 'Quick Bites']",54,Eugene,Oregon,38,12,4,0,0
4,1014118,Espresso Roma,/Restaurant_Review-g51862-d1014118-Reviews-Esp...,"['Coffee & Tea', 'Quick Bites']",21,Eugene,Oregon,12,7,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
4382,27113108,Grizzly Bean Coffee House,/Restaurant_Review-g29948-d27113108-Reviews-Gr...,['Coffee & Tea'],1,Youngstown,Ohio,1,0,0,0,0
4383,10507418,Classic Rock Coffee & Kitchen,/Restaurant_Review-g49882-d10507418-Reviews-Cl...,['Coffee & Tea'],3,Mandan,North Dakota,2,1,0,0,0
4384,4824290,Cappuccino On Collins,/Restaurant_Review-g49882-d4824290-Reviews-Cap...,['Coffee & Tea'],3,Mandan,North Dakota,1,1,1,0,0
4385,7359734,The Bean & Bakery,/Restaurant_Review-g46052-d7359734-Reviews-The...,"['Coffee & Tea', 'Quick Bites']",21,Concord,New Hampshire,10,10,0,1,0


In [22]:
# Number of distinct values in the City column (missing values, if any, count
# once). This looks at city names only: the same name appearing in two
# different states is counted a single time here.
data['City'].nunique(dropna=False)

422

# Calculate the weighted average score across all independent coffee shops in each city

In [12]:
# Map each rating label to its numeric score (5 = best, 1 = worst).
ratings = dict(zip(
    ('Excellent', 'Very_Good', 'Average', 'Poor', 'Terrible'),
    (5, 4, 3, 2, 1),
))

In [13]:
# Total rating points per shop: every review count multiplied by the numeric
# score of its label, summed over the five labels in `ratings`.
data['total_score'] = sum(
    data[label] * score
    for label, score in ratings.items()
)

In [14]:
# Per-shop average rating: total rating points divided by the shop's
# `reviews` count from the listings table.
data['weighted_average'] = data['total_score'].div(data['reviews'])

In [15]:
# Review-weighted average rating per (City, State), plus the number of coffee
# shops that contributed to it.
#
# Shops without a weighted average (no rating breakdown was merged in) are
# excluded first. The aggregation is vectorised instead of using
# `groupby(...).apply` with a lambda returning a Series: that form upcasts the
# shop count to float (2.0 instead of 2), runs a Python function per group and
# triggers a deprecation warning on recent pandas versions.
_rated_shops = data[data['weighted_average'].notna()]

_city_totals = (
    _rated_shops
    .assign(
        # Rating points per shop, coerced to a numeric dtype so the grouped
        # sum works even when the rating columns were stored as objects.
        _weighted_points=pd.to_numeric(
            _rated_shops['weighted_average'] * _rated_shops['reviews']
        )
    )
    .groupby(['City', 'State'])
    .agg(
        _weighted_points=('_weighted_points', 'sum'),
        _review_count=('reviews', 'sum'),
        coffee_shops_number=('reviews', 'size'),  # rows per group, as integers
    )
)

city_state_weighted_avg = (
    _city_totals
    .assign(
        city_state_weighted_average=(
            _city_totals['_weighted_points'] / _city_totals['_review_count']
        )
    )
    [['city_state_weighted_average', 'coffee_shops_number']]
    .reset_index()
)

In [16]:
# Show the city/state groups ordered from highest to lowest average rating.
city_state_weighted_avg.sort_values(
    ['city_state_weighted_average'],
    ascending=False,
)

Unnamed: 0,City,State,city_state_weighted_average,coffee_shops_number
167,Grand Island,Nebraska,5.000000,2.0
342,Rexburg,Idaho,5.000000,1.0
291,Newton,Massachusetts,5.000000,2.0
304,Olathe,Kansas,5.000000,3.0
399,Springfield,Oregon,5.000000,2.0
...,...,...,...,...
351,Rockville Montgomery County,Maryland,3.730769,3.0
219,Lafayette,Indiana,3.585366,5.0
112,Dickinson,North Dakota,3.529412,2.0
390,Southaven,Mississippi,3.500000,1.0


In [23]:
# Summary statistics of how many coffee shops each city/state group contains.
city_state_weighted_avg.loc[:, 'coffee_shops_number'].describe()

count    447.000000
mean       9.794183
std       18.874917
min        1.000000
25%        2.000000
50%        5.000000
75%       10.000000
max      232.000000
Name: coffee_shops_number, dtype: float64

In [18]:
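# Optional filter (currently disabled): keep only the city/state groups with at least 10 coffee shops.
#df = city_state_weighted_avg[city_state_weighted_avg['coffee_shops_number'] >=10] 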

In [19]:
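# Display of the filtered frame (disabled together with the filter).
# NOTE(review): any table shown under this cell presumably comes from an earlier run with the filter enabled.
#df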

Unnamed: 0,City,State,city_state_weighted_average,coffee_shops_number
3,Albuquerque,New Mexico,4.300518,30.0
4,Alexandria,Virginia,4.132841,12.0
7,Anchorage,Alaska,4.500000,25.0
9,Ann Arbor,Michigan,4.400000,11.0
13,Arlington,Virginia,4.399340,10.0
...,...,...,...,...
419,Tulsa,Oklahoma,4.269231,11.0
423,Vancouver,Washington,4.328904,19.0
424,Virginia Beach,Virginia,4.417559,15.0
426,Washington DC,District of Columbia,4.415318,45.0


In [24]:
# Persist both results as CSV files without writing the row index:
# the enriched per-shop table and the per-city/state summary.
data.to_csv(path_or_buf='data_after_parsing.csv', index=False)
city_state_weighted_avg.to_csv(path_or_buf='city_state_weighted_avg.csv', index=False)