# Assessing the performance of guessing the grand mean across observations

In [178]:
import pandas as pd
import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.metrics import r2_score
from random import sample
import geopy.distance
from geopy import distance
import itertools

# Load the noise dataset; its `db` column is used as the regression target below.
# NOTE(review): absolute, machine-specific path — this cell will not run on
# another machine as written.
noise = pd.read_csv('/Users/Ben/Dropbox/Insight/noisemap/data-merge/noise-merged.csv')

In [23]:
# Baseline "model": always predicts the mean of the db values it was fitted on,
# regardless of the feature columns it is given.
features = noise.drop(columns = 'db')
lreg = DummyRegressor(strategy = 'mean').fit(features, noise.db)

In [30]:
# In-sample R^2 of the mean-only baseline. A constant prediction equal to the
# training mean explains none of the variance, so this is 0 by construction.
baseline_pred = lreg.predict(noise.drop(columns = 'db'))
r2_score(noise.db, baseline_pred)

0.0

# Assessing the benefit of finding the quietest park

In [112]:
# Park locations: the code below reads the `name`, `lat` and `lng` columns.
# NOTE(review): absolute, machine-specific paths — not portable as written.
park_locs = pd.read_csv('/Users/Ben/Dropbox/Insight/noisemap/dash/parks_clean.csv')
# Per-park predictions: the code below reads `parkname`, `hr_bkt`, `wkday` and `db`.
park_preds = pd.read_csv('/Users/Ben/Dropbox/Insight/noisemap/dash/parks_out.csv')

def calc_park_dist(user_coord):
    '''
    Calculate the distance between the user's coordinates and every park.

    Parameters
    ----------
    user_coord : dict
        Mapping with 'lat' and 'lng' keys giving the user's position.

    Returns
    -------
    pandas.DataFrame
        A new frame with every column of the module-level `park_locs`
        plus a `dists` column holding the distance in metres from
        `user_coord` to each park (same row order as `park_locs`).
    '''
    # the origin is the same for every park, so build the tuple once
    origin = (user_coord['lat'], user_coord['lng'])
    # `.m` converts geopy's distance object to metres
    all_dists = [
        geopy.distance.distance(origin, (p_lat, p_lng)).m
        for p_lat, p_lng in zip(park_locs.lat, park_locs.lng)
    ]
    return park_locs.assign(dists = all_dists)

# Bounding box used for sampling user locations, laid out as
# [[lat_low, lng_low], [lat_high, lng_high]] (see how sample_points indexes it).
# NOTE(review): the coordinates look like the Boston area — confirm.
bounds = [[42.23, -71.20], [42.419, -70.95]]

def sample_points(bounds, n):
    '''
    Draw n points uniformly at random from inside the bounding box.

    bounds is indexed as [[lat_low, lng_low], [lat_high, lng_high]].
    Returns a list of n dicts, each with a 'lat' and a 'lng' key.
    '''
    lower_corner = bounds[0]
    upper_corner = bounds[1]

    # all latitudes are drawn before any longitude, so the global numpy
    # random stream is consumed in a fixed order
    latitudes = np.random.uniform(lower_corner[0], upper_corner[0], n)
    longitudes = np.random.uniform(lower_corner[1], upper_corner[1], n)

    user_points = []
    for lat, lng in zip(latitudes, longitudes):
        user_points.append({'lat': lat, 'lng': lng})
    return user_points

def compare_parks(user_point, hour_bkt, wkday, max_dist_m = 3218):
    '''
    Find the predicted db level of the quietest park within range of the
    user and of the park nearest to the user, for one time slot.

    Parameters
    ----------
    user_point : dict
        Mapping with 'lat' and 'lng' keys (passed to calc_park_dist).
    hour_bkt : str
        Hour-bucket label, matched against `park_preds.hr_bkt`.
    wkday : int
        Value matched against `park_preds.wkday`.
    max_dist_m : float, optional
        Search radius in metres. The default of 3218 is 2 miles
        (3218.688 m) truncated to whole metres.

    Returns
    -------
    dict or None
        {'closest': db of the nearest park, 'quietest': lowest db among
        parks within range}, or None when the comparison cannot be made:
        no park within range, no prediction rows for the requested time
        slot, or no prediction row for the nearest park.
    '''
    # get park dists
    parkdists = calc_park_dist(user_point)
    # keep only parks inside the search radius
    close_parks = parkdists[parkdists.dists < max_dist_m]
    if close_parks.empty:
        return None

    # predictions for the nearby parks, restricted to the requested time slot
    in_slot = (
        park_preds.parkname.isin(close_parks.name)
        & (park_preds.hr_bkt == hour_bkt)
        & (park_preds.wkday == wkday)
    )
    close_park_preds = park_preds[in_slot]
    if close_park_preds.empty:
        # previously this fell through to `[0]` on an empty list (IndexError)
        return None

    # find the closest park; because close_parks is non-empty, the overall
    # nearest park is guaranteed to be inside the search radius
    closest_park_name = parkdists.loc[
        parkdists.dists == parkdists.dists.min(), 'name'
    ].values[0]
    closest_rows = close_park_preds[close_park_preds.parkname == closest_park_name]
    if closest_rows.empty:
        # the nearest park has no prediction for this slot, so nothing to compare
        return None

    return {
        'closest': closest_rows.db.iloc[0],
        # the quietest park's db is, by definition, the minimum db in range
        'quietest': close_park_preds.db.min()
    }

In [152]:
wkday = [0, 1]
# maps each clock hour (0-23) to the label of the hour bucket it falls in
hour_bkt = {
    0: '12 am - 2 am',
    1: '12 am - 2 am',
    2: '12 am - 2 am',
    3: '3 am - 7 am',
    4: '3 am - 7 am',
    5: '3 am - 7 am',
    6: '3 am - 7 am',
    7: '3 am - 7 am',
    8: '8 am - 10 am',
    9: '8 am - 10 am',
    10: '8 am - 10 am',
    11: '11 am - 1 pm',
    12: '11 am - 1 pm',
    13: '11 am - 1 pm',
    14: '2 pm - 4 pm',
    15: '2 pm - 4 pm',
    16: '2 pm - 4 pm',
    17: '5 pm - 8 pm',
    18: '5 pm - 8 pm',
    19: '5 pm - 8 pm',
    20: '5 pm - 8 pm',
    21: '9 pm - 11 pm',
    22: '9 pm - 11 pm',
    23: '9 pm - 11 pm'
}

# hour_bkt.values() yields each bucket label once per clock hour it covers,
# so every (day, bucket) pair is evaluated -- and its rows appended -- that
# many times. Pooled averages over db_df are therefore weighted by bucket
# width; per-bucket averages are unaffected.
# NOTE(review): confirm this weighting is intended; iterating over the unique
# labels instead would avoid the repeated work.
day_hour = itertools.product(wkday, hour_bkt.values())

# NOTE(review): no random seed is set, so the sampled points (and every
# number derived from them) change from run to run.
points = sample_points(bounds, 1000)

# Collect one record per (day, hour, point) that has a usable comparison and
# build the frame once at the end; concatenating inside the loop re-copies
# the whole frame on every iteration.
records = []
# loop variable is `day` so the `wkday` list above is not overwritten
for day, hour in day_hour:
    print(str(day) + hour)
    for point in points:
        point_db = compare_parks(point, hour, day)
        # compare_parks yields nothing when no park is within range
        if point_db:
            records.append({
                'wkday': day,
                'hour': hour,
                'closest_db': point_db['closest'],
                'quietest_db': point_db['quietest']
            })

# explicit columns keep the schema stable even if no record was collected
db_df = pd.DataFrame(
    records,
    columns = ['wkday', 'hour', 'closest_db', 'quietest_db']
)

012 am - 2 am
012 am - 2 am
012 am - 2 am
03 am - 7 am
03 am - 7 am
03 am - 7 am
03 am - 7 am
03 am - 7 am
08 am - 10 am
08 am - 10 am
08 am - 10 am
011 am - 1 pm
011 am - 1 pm
011 am - 1 pm
02 pm - 4 pm
02 pm - 4 pm
02 pm - 4 pm
05 pm - 8 pm
05 pm - 8 pm
05 pm - 8 pm
05 pm - 8 pm
09 pm - 11 pm
09 pm - 11 pm
09 pm - 11 pm
112 am - 2 am
112 am - 2 am
112 am - 2 am
13 am - 7 am
13 am - 7 am
13 am - 7 am
13 am - 7 am
13 am - 7 am
18 am - 10 am
18 am - 10 am
18 am - 10 am
111 am - 1 pm
111 am - 1 pm
111 am - 1 pm
12 pm - 4 pm
12 pm - 4 pm
12 pm - 4 pm
15 pm - 8 pm
15 pm - 8 pm
15 pm - 8 pm
15 pm - 8 pm
19 pm - 11 pm
19 pm - 11 pm
19 pm - 11 pm


In [153]:
# Derived columns:
#   diff_yn - 0 when the closest park's db equals the quietest park's db, else 1
#   diff_db - how much louder the closest park is than the quietest one
closest, quietest = db_df.closest_db, db_df.quietest_db
db_df = db_df.assign(
    diff_yn = np.where(closest == quietest, 0, 1),
    diff_db = closest - quietest
)

In [154]:
# Share of rows where the closest park's db differs from the quietest park's db
db_df.diff_yn.mean()

0.6905689964157706

In [155]:
# Among rows where the closest park is louder than the quietest nearby park:
# [mean db of the closest park, mean db of the quietest park]
louder_closest = db_df[db_df.diff_db > 0]
[louder_closest.closest_db.mean(), louder_closest.quietest_db.mean()]

[62.52914685178681, 53.09223375612168]

In [177]:
# Mean db gap between the closest and the quietest park, over the rows where
# the closest park is the louder of the two
gap_rows = db_df[db_df.diff_db > 0]
gap_rows.closest_db.mean() - gap_rows.quietest_db.mean()

9.436913095665133

In [156]:
# Peek at the first rows to sanity-check the derived diff_yn / diff_db columns
db_df.head()

Unnamed: 0,wkday,hour,closest_db,quietest_db,diff_yn,diff_db
0,0,12 am - 2 am,60.7,60.7,0,0.0
1,0,12 am - 2 am,63.16,60.7,1,2.46
2,0,12 am - 2 am,59.43,54.58,1,4.85
3,0,12 am - 2 am,60.81,60.81,0,0.0
4,0,12 am - 2 am,58.62,58.62,0,0.0


In [175]:
# Column means per (wkday, hour bucket); the diff_yn mean is the share of rows
# in that group where the closest park's db differs from the quietest park's db
db_df.groupby(['wkday', 'hour']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,closest_db,quietest_db,diff_yn,diff_db
wkday,hour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,11 am - 1 pm,61.325097,54.114634,0.680645,7.210462
0,12 am - 2 am,61.447688,54.326699,0.680645,7.120989
0,2 pm - 4 pm,62.813323,56.542957,0.691398,6.270366
0,3 am - 7 am,58.518946,51.738258,0.696774,6.780688
0,5 pm - 8 pm,63.099538,57.454849,0.689247,5.644688
0,8 am - 10 am,60.123151,52.814043,0.675269,7.309108
0,9 pm - 11 pm,62.697624,56.823753,0.702151,5.873871
1,11 am - 1 pm,61.12728,53.998699,0.690323,7.128581
1,12 am - 2 am,61.398376,54.460398,0.682796,6.937978
1,2 pm - 4 pm,62.676161,56.542742,0.693548,6.133419


In [179]:
# Persist the simulation results. With pandas' default index=True the row index
# is written as an unnamed first column.
# NOTE(review): absolute, machine-specific path — not portable as written.
db_df.to_csv('/Users/Ben/Dropbox/Insight/noisemap/assessment/assessment.csv')