# Impute missing skims for the Bay Area
Raw BEAM skims have missing values for some OD pairs. This notebook imputes the missing skims based on the MTC skims.

In [9]:
# Approaches:
# 1. I could do what Max did.
# 2. I could copy the missing values from the skims.OMX
# It seems that the two approaches are the same.
# Let's do both and ask tomorrow in the meeting.

In [3]:
import pandas as pd 
import numpy as np

In [4]:
# Load the raw BEAM skims from S3 into a DataFrame (the file is a gzipped CSV).
# NOTE(review): reading an s3:// URL presumably needs s3fs and AWS credentials
# available in the kernel environment -- confirm before a fresh Run All.
beam_raw_skims = pd.read_csv('s3://baus-data/spring_2019/30.skims-smart-23April2019-baseline.csv.gz')

In [8]:
# Load the MTC skims from S3, using the first CSV column as the DataFrame index.
mtc_skims = pd.read_csv('s3://baus-data/spring_2019/mtc_skims.csv', index_col = 0)

  mask |= (ar1 == a)


In [11]:
# Preview the first rows of the MTC skims.
mtc_skims.head()

Unnamed: 0,orig,dest,da_distance_EA,daToll_distance_EA,s2_distance_EA,s2Toll_distance_EA,s3_distance_EA,s3Toll_distance_EA,walk_distance_EA,bike_distance_EA,...,daToll_Time_EV,s2_Time_EV,s2Toll_Time_EV,s3_Time_EV,s3Toll_Time_EV,walk_Time_EV,bike_Time_EV,wTrnW_Time_EV,dTrnW_Time_EV,wTrnD_Time_EV
0,1,1,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.12,...,5.8,5.8,5.8,5.8,5.8,2.4,0.6,9999.0,9999.0,9999.0
1,1,2,0.24,0.24,0.24,0.24,0.24,0.24,0.24,0.24,...,6.13,6.13,6.13,6.13,6.13,4.8,1.2,8.67,6.97,4.93
2,1,3,0.44,0.44,0.44,0.44,0.44,0.44,0.44,0.44,...,6.58,6.58,6.58,6.58,6.58,8.8,2.2,12.7,9.02,5.53
3,1,4,0.41,0.41,0.41,0.41,0.41,0.41,0.41,0.41,...,6.62,6.62,6.62,6.62,6.62,8.2,2.05,7.1,5.4,5.4
4,1,5,0.68,0.68,0.68,0.68,0.68,0.68,0.68,0.68,...,7.27,7.27,7.27,7.27,7.27,13.6,3.4,9.11,7.13,5.85


In [None]:
def impute_missing_skims(mtc_skims, beam_skims_raw,
                         expected_num_od_pairs=2114116):
    """Build a complete AM-peak OD impedance table from sparse BEAM skims.

    For every OD pair in ``mtc_skims`` the result holds the MTC drive-alone
    AM distance (converted to meters) plus, per BEAM mode, the
    observation-weighted mean generalized travel time (minutes) and
    generalized cost observed by BEAM during hours 7, 8 and 9. OD/mode
    combinations that BEAM did not observe are filled with
    ``dist * <mode-average impedance per meter>``.

    Neither input frame is modified.

    Parameters
    ----------
    mtc_skims : pandas.DataFrame
        Must contain the columns ``orig``, ``dest`` and ``da_distance_AM``.
        ``da_distance_AM`` is treated as miles. Each (orig, dest) pair is
        expected to appear once.
    beam_skims_raw : pandas.DataFrame
        Must contain the columns ``from_zone_id``, ``to_zone_id``, ``hour``,
        ``mode``, ``generalizedTimeInS``, ``gen_cost``, ``distanceInM`` and
        ``numObservations`` (integer counts used as weights).
        NOTE(review): raw BEAM output presumably has to be renamed to these
        column names before calling -- confirm against the caller.
    expected_num_od_pairs : int or None, default 2114116
        Number of rows the result must have. The default is the value that
        was previously hard-coded. Pass ``None`` to skip the check, or
        another number when working with a different set of zones.

    Returns
    -------
    pandas.DataFrame
        Indexed by (``from_zone_id``, ``to_zone_id``) with a ``dist`` column
        (meters) and ``gen_tt_<mode>`` / ``gen_cost_<mode>`` columns for each
        mode that has at least one usable AM observation.

    Raises
    ------
    KeyError
        If a required column is missing, or if a BEAM OD pair that needs an
        MTC distance is absent from ``mtc_skims``.
    AssertionError
        If ``expected_num_od_pairs`` is not ``None`` and does not match the
        number of rows in the result.
    """
    meters_per_mile = 1609.34
    am_hours = [7, 8, 9]
    od_cols = ['from_zone_id', 'to_zone_id']
    beam_cols = od_cols + [
        'hour', 'mode', 'generalizedTimeInS', 'gen_cost', 'distanceInM',
        'numObservations']

    # Only AM-peak rows feed the output, so filter first and work on a copy
    # of the needed columns; this keeps the caller's frame untouched.
    is_am = beam_skims_raw['hour'].isin(am_hours)
    df_am = beam_skims_raw.loc[is_am, beam_cols].copy()

    # MTC distances are floats; make sure they can be written into the column.
    df_am['distanceInM'] = df_am['distanceInM'].astype(float)

    # seconds to minutes (better to keep seconds, per the original note)
    df_am['gen_tt'] = df_am['generalizedTimeInS'] / 60

    # Column subset needs a list selector; a bare tuple raises KeyError.
    mtc = (
        mtc_skims[['orig', 'dest', 'da_distance_AM']]
        .rename(columns={'orig': 'from_zone_id', 'dest': 'to_zone_id'})
        .set_index(od_cols))

    # miles to meters
    mtc['dist'] = mtc['da_distance_AM'] * meters_per_mile

    # Use the MTC distance wherever BEAM reports zero distance and for all
    # intra-zone trips. Both cases receive the same value, so one pass over
    # the union of the two masks is equivalent to doing them in sequence.
    needs_mtc_dist = (
        (df_am['distanceInM'] == 0) |
        (df_am['from_zone_id'] == df_am['to_zone_id']))
    if needs_mtc_dist.any():
        od_pairs = pd.MultiIndex.from_frame(
            df_am.loc[needs_mtc_dist, od_cols])
        df_am.loc[needs_mtc_dist, 'distanceInM'] = (
            mtc['dist'].loc[od_pairs].to_numpy())

    # Impedance per meter; a zero distance yields +/-inf, treated as missing.
    df_am['gen_time_per_m'] = df_am['gen_tt'] / df_am['distanceInM']
    df_am['gen_cost_per_m'] = df_am['gen_cost'] / df_am['distanceInM']
    numeric_cols = ['gen_tt', 'gen_cost', 'gen_time_per_m', 'gen_cost_per_m']
    df_am[numeric_cols] = df_am[numeric_cols].replace(
        [np.inf, -np.inf], np.nan)

    # Weight by observation count: repeat each row numObservations times.
    # Positional repetition works even if the incoming index has duplicates.
    repeats = df_am['numObservations'].to_numpy()
    df_am = df_am.iloc[np.repeat(np.arange(len(df_am)), repeats)]

    # Mode-level average impedance per meter, indexed by mode.
    am_lookup = df_am[[
        'mode', 'gen_time_per_m', 'gen_cost_per_m']].dropna().groupby(
            'mode').mean()

    # Observed AM averages per OD pair and mode.
    df_am_avg = df_am[od_cols + ['mode', 'gen_tt', 'gen_cost']].groupby(
        od_cols + ['mode']).mean().reset_index()

    # long to wide: one gen_tt_<mode> / gen_cost_<mode> column per mode
    df_am_pivot = df_am_avg.pivot_table(index=od_cols, columns='mode')
    df_am_pivot.columns = ['_'.join(col) for col in df_am_pivot.columns.values]

    # Left-join onto the MTC OD universe so every OD pair gets a row.
    merged = pd.merge(
        mtc[['dist']], df_am_pivot, left_index=True, right_index=True,
        how='left')

    # Fill unobserved OD/mode cells with distance times the mode's average rate.
    rate_col_for = {'gen_tt': 'gen_time_per_m', 'gen_cost': 'gen_cost_per_m'}
    for mode in am_lookup.index:
        for impedance, rate_col in rate_col_for.items():
            colname = impedance + '_' + mode
            rate = am_lookup.at[mode, rate_col]
            missing = merged[colname].isnull()
            merged.loc[missing, colname] = merged.loc[missing, 'dist'] * rate

    if expected_num_od_pairs is not None:
        assert len(merged) == expected_num_od_pairs, (
            'expected {} OD pairs, got {}'.format(
                expected_num_od_pairs, len(merged)))

    return merged