- Produce CSV summaries of DaySim results for use in Tableau and other tools

In [448]:
import os
import pandas as pd
import h5py
% matplotlib inline

In [447]:
def h5_to_df(h5file, table_list, name=False):
    """
    Read a set of tables from an h5 container into pandas DataFrames.

    Every entry of ``table_list`` is looked up as a group of ``h5file``; each
    member of that group is read in full and becomes one column of the
    table's DataFrame (so all members of a group must have the same length).

    Returns a dict keyed by table name. When ``name`` is truthy it is stored
    in the same dict under the extra key 'name'.
    """
    tables = {}

    for table_name in table_list:
        group = h5file[table_name]
        field_names = list(group.keys())
        # Read every field once, then hand pandas the whole table in one call
        columns = dict((field, group[field][:]) for field in field_names)
        tables[table_name] = pd.DataFrame(columns, columns=field_names)

    if name:
        tables['name'] = name

    return tables

In [491]:
def add_row(df, row_name, description, value):
    """
    Set the 'description' and 'value' cells of the row labelled ``row_name``.

    The row (and either column) is created if it does not exist yet, so the
    function can be used to grow an initially empty DataFrame one measure at
    a time. ``df`` is modified in place and also returned.
    """
    # .loc replaces the original .ix indexer, which was deprecated and later
    # removed from pandas; for label-based assignment they behave the same.
    df.loc[row_name, 'description'] = description
    df.loc[row_name, 'value'] = value

    return df

In [197]:
# Lookup tables shared by the functions in this notebook (paths are relative
# to the working directory).
# labels: apply_lables reads its 'table', 'field', 'value' and 'text' columns.
# districts: district_summary joins on its 'taz' column and groups by
# 'district_name'.
labels = pd.read_csv(r'variable_labels.csv')
districts = pd.read_csv(r'data/district_lookup.csv')

In [458]:
# TODO: the list of runs to add to the analysis should eventually come from a script argument

In [545]:
# Driver cell: summarize every model run (plus the survey, once) to CSV.
# process_dataset, network_results and write_csv are defined in other cells
# of this notebook and must have been run before this cell.
model_runs = [r'R:\SoundCast\releases\TransportationFutures2010',
              r'R:\SoundCast\releases\soundcast_release_c1']

# model_runs = [r'R:\SoundCast\releases\TransportationFutures2010']

# Assume standard path for survey, which can be overridden as arg
survey_dir = r'R:\SoundCast\Inputs\2014\etc\survey.h5'

# Read by write_csv as the destination folder for every summary file
output_dir = r'J:\projects\soundcast\soundcast_dashboard\model_output'

# The survey is processed only once, right after the first model run
survey_added = False

for model_dir in model_runs:

    # name of scenario is last level of directory
    scenario_name = os.path.basename(model_dir)

    print('processing ' + scenario_name)

    # Process daysim results: perform calculations and export to csv.
    # The file is opened read-only so the inputs cannot be created or
    # modified by accident, and the with-block closes it as soon as the
    # scenario is done, freeing it before the next comparison.
    with h5py.File(os.path.join(model_dir, r'outputs/daysim_outputs.h5'), 'r') as daysim_h5:
        process_dataset(h5file=daysim_h5, scenario_name=scenario_name)

    if not survey_added:
        print('processing: survey')

        with h5py.File(survey_dir, 'r') as survey_h5:
            process_dataset(h5file=survey_h5, scenario_name='survey')

        survey_added = True

    # Process network results
    network_df = network_results(model_dir, dataset_name=scenario_name)
    write_csv(network_df, fname='network_summary.csv')

processing TransportationFutures2010
processing: survey
processing soundcast_release_c1


In [486]:
def apply_lables(h5data):
    '''
    Swap daysim's coded values for their human readable labels.

    The module-level ``labels`` frame supplies the lookup: for every
    (table, field) pair it lists, its 'value' -> 'text' pairs are mapped onto
    h5data[table][field]. The columns are overwritten in place and the same
    ``h5data`` dict is returned.
    '''
    for table_name in labels['table'].unique():
        table_labels = labels[labels['table'] == table_name]

        for field_name in table_labels['field'].unique():
            field_labels = table_labels[table_labels['field'] == field_name]
            # Series of label text indexed by the coded value
            code_to_text = field_labels.set_index('value')['text']

            coded_column = h5data[table_name][field_name]
            h5data[table_name][field_name] = coded_column.map(code_to_text)

    return h5data

In [518]:
def process_dataset(h5file, scenario_name):
    """
    Summarize one daysim-formatted h5 file and write the summaries to CSV.

    Loads the tables, replaces coded values with labels, then builds each
    summary in turn and hands it to write_csv. Nothing is returned.
    """
    # FIXME (original author's note): "ONLY for surveys first FIX THIS"
    table_names = ['Household', 'Trip', 'Tour', 'Person', 'HouseholdDay']

    # Load h5 data as dataframes and apply the human readable labels
    dataset = apply_lables(h5_to_df(h5file, table_list=table_names, name=scenario_name))

    # (summary builder, output file) pairs, processed in this order
    reports = [
        (agg_measures, 'agg_measures.csv'),
        (purpose_summary, 'purpose.csv'),
        (mode_summary, 'mode.csv'),
        (district_summary, 'district.csv'),
    ]

    for build_summary, fname in reports:
        write_csv(build_summary(dataset), fname=fname)
    

In [540]:
def write_csv(df,fname):
    '''
    Write df to fname inside the module-level output_dir.

    A file that does not exist yet is created with a header row; an existing
    file gets the rows appended without a header. Re-running the notebook
    therefore appends the same rows again unless the file is removed first.
    '''
    target = os.path.join(output_dir, fname)

    if os.path.isfile(target):
        # append without writing the header
        df.to_csv(target, mode='a', header=False)
    else:
        df.to_csv(target)

In [504]:
def agg_measures(dataset):
    """
    Build the table of aggregate measures for one dataset.

    Reads the 'Person', 'Household', 'Trip' and 'Tour' tables of ``dataset``
    and returns a DataFrame indexed by measure name with 'description' and
    'value' columns, plus a 'source' column holding dataset['name'].
    """
    persons = dataset['Person']
    households = dataset['Household']
    trips = dataset['Trip']
    tours = dataset['Tour']

    def mean_tour_dist(purpose):
        # 'toexpfac'-weighted mean of 'tautodist' over tours whose 'pdpurp' equals purpose
        subset = tours[tours['pdpurp'] == purpose]
        return (subset['tautodist']*subset['toexpfac']).sum()/subset['toexpfac'].sum()

    person_total = persons['psexpfac'].sum()
    household_total = households['hhexpfac'].sum()
    trip_total = trips['trexpfac'].sum()

    # Only trips labelled 'Driver' count towards VMT
    driver_trips = trips[trips['dorp'] == 'Driver']

    # (row name, description, value) in output order
    measures = [
        ('total_persons', 'Total Persons', person_total),
        ('total_hhs', 'Total Households', household_total),
        ('avg_hh_size', 'Average Household Size',
         (households['hhsize']*households['hhexpfac']).sum()/household_total),
        ('trips_per_person', 'Average Trips per Person', trip_total/person_total),
        ('trip_len', 'Average Trips Length',
         (trips['travdist']*trips['trexpfac']).sum()/trip_total),
        ('vmt_per_cap', 'VMT per Person',
         (driver_trips['travdist']*driver_trips['trexpfac']).sum()/person_total),
        ('dist_to_work', 'Avg Distance to Work', mean_tour_dist('Work')),
        ('dist_to_school', 'Avg Distance to School', mean_tour_dist('School')),
    ]

    df = pd.DataFrame()
    for row_name, description, value in measures:
        df = add_row(df, row_name=row_name, description=description, value=value)

    # add datasource field
    df['source'] = dataset['name']

    return df

In [542]:
def purpose_summary(dataset):
    """
    Summarize tours and trips by purpose for one dataset.

    Reads dataset['Tour'] (columns 'pdpurp', 'tmodetp', 'toexpfac',
    'tautodist'), dataset['Trip'] (columns 'dpurp', 'mode', 'trexpfac',
    'travdist') and dataset['name'].

    Returns one row per purpose present in every intermediate table (inner
    merges), with columns in this order: 'dist_by_tour_purp', 'purpose',
    'dist_by_trip_purp', 'total_tours', one '<mode>_tours' column per tour
    mode, one '<mode>_trips' column per trip mode, and 'source'.

    The input tables are left untouched (no helper columns are added).
    """
    # reduce is a builtin only on Python 2; functools provides it on 2 and 3
    from functools import reduce

    tour = dataset['Tour']
    trip = dataset['Trip']

    # Weighted totals are computed on the needed columns only, instead of
    # summing every column of the table and then picking one.
    tours_by_purp = tour.groupby('pdpurp')['toexpfac'].sum()
    trips_by_purp = trip.groupby('dpurp')['trexpfac'].sum()

    # Tour distance by purpose (weighted by toexpfac)
    tour_dist = (tour['toexpfac']*tour['tautodist']).groupby(tour['pdpurp']).sum()
    dist_by_tour_purp = (tour_dist/tours_by_purp).to_frame('dist_by_tour_purp')
    dist_by_tour_purp['purpose'] = dist_by_tour_purp.index
    dist_by_tour_purp = dist_by_tour_purp.reset_index(drop=True)

    # Trip distance by purpose (weighted by trexpfac)
    trip_dist = (trip['trexpfac']*trip['travdist']).groupby(trip['dpurp']).sum()
    dist_by_trip_purp = (trip_dist/trips_by_purp).to_frame('dist_by_trip_purp')
    dist_by_trip_purp['purpose'] = dist_by_trip_purp.index
    dist_by_trip_purp = dist_by_trip_purp.reset_index(drop=True)

    # number of total tours generated by purpose
    total_tours = tours_by_purp.to_frame('total_tours')
    total_tours['purpose'] = total_tours.index
    total_tours = total_tours.reset_index(drop=True)

    # trip mode share
    trips_by_mode = pd.pivot_table(trip,values='trexpfac',index='dpurp',columns='mode',aggfunc='sum')
    trips_by_mode.columns = [i+'_trips' for i in trips_by_mode.columns]
    trips_by_mode['purpose'] = trips_by_mode.index
    trips_by_mode = trips_by_mode.reset_index(drop=True)

    # tour mode share
    tours_by_mode = pd.pivot_table(tour,values='toexpfac',index='pdpurp',columns='tmodetp',aggfunc='sum')
    tours_by_mode.columns = [i+'_tours' for i in tours_by_mode.columns]
    tours_by_mode['purpose'] = tours_by_mode.index
    tours_by_mode = tours_by_mode.reset_index(drop=True)

    # merge all dataframes with data by purpose
    df_list = [dist_by_tour_purp, dist_by_trip_purp, total_tours, tours_by_mode, trips_by_mode]
    purpose_df = reduce(lambda left,right: pd.merge(left,right,on='purpose'), df_list)

    # If data source is survey, drop "Other mode" columns. Only the columns
    # actually present are dropped, so a survey without an 'Other' mode no
    # longer raises.
    if dataset['name'] == 'survey':
        other_cols = [col for col in ('Other_tours', 'Other_trips') if col in purpose_df.columns]
        purpose_df = purpose_df.drop(other_cols, axis=1)

    # add datasource field
    purpose_df['source'] = dataset['name']

    return purpose_df

In [507]:
def mode_summary(dataset):
    """
    Summarize weighted mean tour and trip distance by mode for one dataset.

    Reads dataset['Tour'] (columns 'tmodetp', 'toexpfac', 'tautodist'),
    dataset['Trip'] (columns 'mode', 'trexpfac', 'travdist') and
    dataset['name'].

    Returns one row per mode found in both tables (inner merge) with the
    columns 'dist_by_tour_mode', 'mode', 'dist_by_trip_mode' and 'source'.

    The input tables are left untouched (no helper columns are added).
    """
    # reduce is a builtin only on Python 2; functools provides it on 2 and 3
    from functools import reduce

    tour = dataset['Tour']
    trip = dataset['Trip']

    # Tour distance by mode (weighted by toexpfac). Only the needed columns
    # are summed rather than every column of the table.
    tour_dist = (tour['toexpfac']*tour['tautodist']).groupby(tour['tmodetp']).sum()
    tour_weight = tour.groupby('tmodetp')['toexpfac'].sum()
    dist_by_tour_mode = (tour_dist/tour_weight).to_frame('dist_by_tour_mode')
    dist_by_tour_mode['mode'] = dist_by_tour_mode.index
    dist_by_tour_mode = dist_by_tour_mode.reset_index(drop=True)

    # Trip distance by mode (weighted by trexpfac). The index is dropped
    # after copying it into the 'mode' column: a frame whose index and column
    # are both called 'mode' makes the merge key ambiguous in newer pandas.
    trip_dist = (trip['trexpfac']*trip['travdist']).groupby(trip['mode']).sum()
    trip_weight = trip.groupby('mode')['trexpfac'].sum()
    dist_by_trip_mode = (trip_dist/trip_weight).to_frame('dist_by_trip_mode')
    dist_by_trip_mode['mode'] = dist_by_trip_mode.index
    dist_by_trip_mode = dist_by_trip_mode.reset_index(drop=True)

    # merge all dataframes with data by mode
    df_list = [dist_by_tour_mode, dist_by_trip_mode]
    mode_df = reduce(lambda left,right: pd.merge(left,right,on='mode'), df_list)

    # add dataset name
    mode_df['source'] = dataset['name']

    return mode_df

In [509]:
def district_summary(dataset, district_lookup=None):
    """
    Summarize trips, tours and residents by district for one dataset.

    Parameters
    ----------
    dataset : dict
        Must hold 'Trip' (columns 'dtaz', 'trexpfac'), 'Tour' (columns
        'tdtaz', 'toexpfac'), 'Household' (columns 'hhtaz', 'hhsize',
        'hhexpfac') and 'name'.
    district_lookup : DataFrame, optional
        Table with 'taz' and 'district_name' columns. Defaults to the
        module-level ``districts`` frame.

    Returns a DataFrame with the columns 'district_name', 'trips', 'tours',
    'residents' and 'source'; only districts present in all three summaries
    are kept (inner merges).
    """
    # reduce is a builtin only on Python 2; functools provides it on 2 and 3
    from functools import reduce

    if district_lookup is None:
        district_lookup = districts

    # Take the tables from the dataset that was passed in. The original body
    # read notebook-level `trip`, `tour` and `hh` variables, so every call
    # summarized whatever dataset those globals happened to point at.
    trip = dataset['Trip']
    tour = dataset['Tour']
    hh = dataset['Household']

    # Trips by Destination District
    trip_district = pd.merge(trip[['dtaz','trexpfac']], district_lookup, left_on='dtaz', right_on='taz')
    trip_district_df = trip_district.groupby('district_name')['trexpfac'].sum().to_frame('trips').reset_index()

    # Tours by Destination District
    tour_district = pd.merge(tour[['tdtaz','toexpfac']], district_lookup, left_on='tdtaz', right_on='taz')
    tour_district_df = tour_district.groupby('district_name')['toexpfac'].sum().to_frame('tours').reset_index()

    # Residents by District: household size weighted by hhexpfac
    hh_per_district = pd.merge(hh[['hhtaz','hhsize','hhexpfac']], district_lookup, left_on='hhtaz', right_on='taz')
    wt_hhsize = hh_per_district['hhsize']*hh_per_district['hhexpfac']
    residents_df = wt_hhsize.groupby(hh_per_district['district_name']).sum().to_frame('residents').reset_index()

    # Merge these all together
    df_list = [trip_district_df, tour_district_df, residents_df]
    district_df = reduce(lambda left,right: pd.merge(left,right,on='district_name'), df_list)

    # add dataset name
    district_df['source'] = dataset['name']

    return district_df

In [544]:
def network_results(model_dir, dataset_name):
    '''
    Load the network summary of one model run in long format.

    Reads the 'Network Summary' sheet of
    <model_dir>/outputs/network_summary_detailed.xlsx and stacks it so that
    every (row, column) cell of the sheet becomes one output row.

    Returns a DataFrame with the columns 'model_value' (the cell),
    'tod' (the sheet's row label), 'fieldname' (the sheet's column label),
    'facility_type' and 'metric' (first and last '_'-separated part of
    'fieldname'), and 'source' (``dataset_name``). Rows coming from the
    'TP_4k' column are removed. Writing the result to csv is left to the
    caller.
    '''
    workbook = os.path.join(model_dir, r'outputs/network_summary_detailed.xlsx')

    # The sheet is passed positionally because the keyword was renamed from
    # `sheetname` to `sheet_name` across pandas releases.
    df = pd.read_excel(workbook, 'Network Summary')
    df = pd.DataFrame(df.stack())
    df['tod'] = df.index.get_level_values(0)
    df['fieldname'] = df.index.get_level_values(1)
    df = df.rename(columns={0:'model_value'})
    df = df.reset_index(drop=True)

    # Drop the rows with TP_4k column headers (the remaining rows keep their
    # index labels, as with the original in-place drop)
    df = df[df['fieldname'] != 'TP_4k'].copy()

    # Split the fields by vmt, vht, delay
    df['facility_type'] = df.fieldname.apply(lambda row: row.split('_')[0])
    df['metric'] = df.fieldname.apply(lambda row: row.split('_')[-1])

    # add dataset name
    df['source'] = dataset_name

    return df


In [529]:
# Non-null counts of every column of the survey trip table, grouped by 'dpurp'.
# NOTE(review): `survey` is not assigned in any visible cell -- presumably left
# over from an earlier kernel session; confirm before a fresh run.
survey['Trip'].groupby('dpurp').count()

Unnamed: 0_level_0,arrtm,dadtyp,day,deptm,dorp,dpcl,dtaz,endacttm,half,hhno,...,pathtype,pno,tour,travcost,travdist,travtime,trexpfac,tseg,tsvid,travdist_wt
dpurp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Change Mode Inserted Purpose,271,271,271,271,271,271,271,271,271,271,...,271,271,271,271,271,271,271,271,271,271
Escort,2930,2930,2930,2930,2706,2930,2930,2930,2930,2930,...,2930,2930,2930,2930,2930,2930,2930,2930,2930,2930
Meal,3272,3272,3272,3272,1736,3272,3272,3272,3272,3272,...,3272,3272,3272,3272,3272,3272,3272,3272,3272,3272
None/Home,18364,18364,18364,18364,11674,18364,18364,18364,18364,18364,...,18364,18364,18364,18364,18364,18364,18364,18364,18364,18364
Personal Business,5392,5392,5392,5392,4014,5392,5392,5392,5392,5392,...,5392,5392,5392,5392,5392,5392,5392,5392,5392,5392
School,4356,4356,4356,4356,1042,4356,4356,4356,4356,4356,...,4356,4356,4356,4356,4356,4356,4356,4356,4356,4356
Shop,6152,6152,6152,6152,4547,6152,6152,6152,6152,6152,...,6152,6152,6152,6152,6152,6152,6152,6152,6152,6152
Social,5320,5320,5320,5320,2835,5320,5320,5320,5320,5320,...,5320,5320,5320,5320,5320,5320,5320,5320,5320,5320
Work,8043,8043,8043,8043,5213,8043,8043,8043,8043,8043,...,8043,8043,8043,8043,8043,8043,8043,8043,8043,8043


In [528]:
# Non-null count of the 'id' column of the daysim trip table, grouped by 'dpurp'.
# NOTE(review): `daysim` is not assigned in any visible cell -- presumably left
# over from an earlier kernel session; confirm before a fresh run.
daysim['Trip'].groupby('dpurp')['id'].count()

dpurp
Change Mode Inserted Purpose      78026
Escort                          1724201
Meal                            1036412
None/Home                       5225334
Personal Business               2078084
School                           953771
Shop                             815752
Social                          1326037
Work                            2160159
Name: id, dtype: int64

In [200]:
# # Add unique person ID 
# survey['Trip']['unique_id'] = survey['Trip']['hhno'].astype('str') + '_' + survey['Trip']['pno'].astype('str')
# survey['Person']['unique_id'] = survey['Person']['hhno'].astype('str') + '_' + survey['Person']['pno'].astype('str')

# daysim['Trip']['unique_id'] = daysim['Trip']['hhno'].astype('str') + '_' + daysim['Trip']['pno'].astype('str')
# daysim['Person']['unique_id'] = daysim['Person']['hhno'].astype('str') + '_' + daysim['Person']['pno'].astype('str')

## Add Labels

## Define dataset
- model or observed

In [439]:
# Pick the dataset the scratch cells below operate on; switch by swapping
# which of the two lines is commented out.
# NOTE(review): neither `survey` nor `daysim` is assigned in a visible cell.
# dataset = daysim
dataset = survey


## Aggregate Measures
 - total persons, households, avg household size, avg trips/person, vmt per person, avg distance to work, avg distance to school

In [203]:
# Scratch version of the calculations in agg_measures(), run directly on the
# notebook-level `dataset`; unlike the function it does not add a 'source'
# column. The result is left in the notebook-level `df`.
df = pd.DataFrame()

# Total Persons
df = add_row(df, row_name='total_persons', description='Total Persons', value=dataset['Person']['psexpfac'].sum())

# Total Households
df = add_row(df, row_name='total_hhs', description='Total Households', value=dataset['Household']['hhexpfac'].sum())

# Average Household Size
avg_hh_size = (dataset['Household']['hhsize']*dataset['Household']['hhexpfac']).sum()/dataset['Household']['hhexpfac'].sum()
df = add_row(df, row_name='avg_hh_size', description='Average Household Size', value=avg_hh_size)

# Average Trips per Person
trips_per_person = dataset['Trip']['trexpfac'].sum()/dataset['Person']['psexpfac'].sum()
df = add_row(df, row_name='trips_per_person', description='Average Trips per Person', value=trips_per_person)

# Average Trip Length
trip_len = (dataset['Trip']['travdist']*dataset['Trip']['trexpfac']).sum()/dataset['Trip']['trexpfac'].sum()
df = add_row(df, row_name='trip_len', description='Average Trips Length', value=trip_len)

# VMT per capita
driver_trips = dataset['Trip'][dataset['Trip']['dorp'] == 'Driver']
vmt_per_cap = (driver_trips['travdist']*driver_trips['trexpfac']).sum()/dataset['Person']['psexpfac'].sum()
df = add_row(df, row_name='vmt_per_cap', description='VMT per Person', value=vmt_per_cap)

# Average distance to work
to_work_tours = dataset['Tour'][dataset['Tour']['pdpurp'] == 'Work']
dist_to_work = (to_work_tours['tautodist']*to_work_tours['toexpfac']).sum()/to_work_tours['toexpfac'].sum()
df = add_row(df, row_name='dist_to_work', description='Avg Distance to Work', value=dist_to_work)

# Average distance to school
to_school_tours = dataset['Tour'][dataset['Tour']['pdpurp'] == 'School']
dist_to_school = (to_school_tours['tautodist']*to_school_tours['toexpfac']).sum()/to_school_tours['toexpfac'].sum()
df = add_row(df, row_name='dist_to_school', description='Avg Distance to School', value=dist_to_school)

# TODO: measures not implemented yet
# Transit Pass Ownership
# Total Boardings
# Total Workers
# % workers working at home
# work-at-home workers

## Destination Choice Report



In [440]:
# Notebook-level aliases for the tables of the currently selected `dataset`;
# the scratch cells below read (and add helper columns to) these frames.
tour = dataset['Tour']
trip = dataset['Trip']
hh = dataset['Household']
person = dataset['Person']

## Results by Trip/Tour Purpose
- distance by tour purpose
- distance by trip purpose
- trips per tour by tour purpose
- number of tours by purpose
- tour mode share
- trip mode share

In [None]:
# Scratch version of the calculation in purpose_summary(), run on the
# notebook-level `tour` / `trip` frames. Differences from the function: the
# 'Other_*' columns are never dropped, and the result is written to
# '<dataset name>_purpose_df.csv' in the working directory.
# Note: adds the helper columns 'tautodist_wt' / 'travdist_wt' to `tour` / `trip`.

# Tour distance by purpose
tour['tautodist_wt'] = tour['toexpfac']*tour['tautodist']
dist_by_tour_purp = pd.DataFrame(tour.groupby('pdpurp').sum()['tautodist_wt']/tour.groupby('pdpurp').sum()['toexpfac'],
                                columns=['dist_by_tour_purp'])
dist_by_tour_purp['purpose'] = dist_by_tour_purp.index

# Trip distance by purpose
trip['travdist_wt'] = trip['trexpfac']*trip['travdist']
dist_by_trip_purp = pd.DataFrame(trip.groupby('dpurp').sum()['travdist_wt']/trip.groupby('dpurp').sum()['trexpfac'],
                                columns=['dist_by_trip_purp'])
dist_by_trip_purp['purpose'] = dist_by_trip_purp.index

# number of total tours generated by purpose
total_tours = pd.DataFrame(tour.groupby('pdpurp').sum()['toexpfac'])
total_tours.rename(columns={'toexpfac':'total_tours'},inplace=True)
total_tours['purpose'] = total_tours.index
total_tours.reset_index(inplace=True, drop=True)

# trip mode share
trips_by_mode = pd.pivot_table(trip,values='trexpfac',index='dpurp',columns='mode',aggfunc='sum')
trips_by_mode.columns = [i+'_trips' for i in trips_by_mode.columns]
trips_by_mode['purpose'] = trips_by_mode.index
trips_by_mode.reset_index(inplace=True, drop=True)

# tour mode share
tours_by_mode = pd.pivot_table(tour,values='toexpfac',index='pdpurp',columns='tmodetp',aggfunc='sum')
tours_by_mode.columns = [i+'_tours' for i in tours_by_mode.columns]
tours_by_mode['purpose'] = tours_by_mode.index
tours_by_mode.reset_index(inplace=True, drop=True)

# merge all dataframes with data by purpose
df_list = [dist_by_tour_purp, dist_by_trip_purp, total_tours, tours_by_mode, trips_by_mode]
purpose_df = reduce(lambda left,right: pd.merge(left,right,on='purpose'), df_list)

# add datasource field, then write the table next to the notebook
purpose_df['source'] = dataset['name']
purpose_df.to_csv(dataset['name'] + '_' + 'purpose_df.csv')

## Results By Mode
- distance by tour mode
- trips per tour by tour mode
- trip mode by tour mode
- number of tours by mode
- travel time by mode

In [441]:
# Scratch version of the calculation in mode_summary(), run on the
# notebook-level `tour` / `trip` frames; additionally writes the result to
# '<dataset name>_mode_df.csv' in the working directory.

# Tour distance by mode
tour['tautodist_wt'] = tour['toexpfac']*tour['tautodist']
dist_by_tour_mode = pd.DataFrame(tour.groupby('tmodetp').sum()['tautodist_wt']/tour.groupby('tmodetp').sum()['toexpfac'],
                                columns=['dist_by_tour_mode'])
dist_by_tour_mode['mode'] = dist_by_tour_mode.index

# Trip distance by mode
trip['travdist_wt'] = trip['trexpfac']*trip['travdist']
dist_by_trip_mode = pd.DataFrame(trip.groupby('mode').sum()['travdist_wt']/trip.groupby('mode').sum()['trexpfac'],
                                columns=['dist_by_trip_mode'])
dist_by_trip_mode['mode'] = dist_by_trip_mode.index

# merge all dataframes with data by mode
df_list = [dist_by_tour_mode, dist_by_trip_mode]
mode_df = reduce(lambda left,right: pd.merge(left,right,on='mode'), df_list)

# add datasource field, then write the table next to the notebook
mode_df['source'] = dataset['name']
mode_df.to_csv(dataset['name'] + '_' + 'mode_df.csv')

## Results By District
- tours by destination district
- trips by destination district
- residents by district

In [424]:
# Scratch version of the calculation in district_summary(), run on the
# notebook-level `trip`, `tour` and `hh` frames and the `districts` lookup.
# The result stays in `district_df`; nothing is written to disk here.

# Trips by Destination District
trip_district = pd.merge(trip[['dtaz','trexpfac']],districts,left_on='dtaz',right_on='taz')
trip_district_df = pd.DataFrame(trip_district.groupby('district_name')['trexpfac'].sum())
trip_district_df.rename(columns={'trexpfac':'trips'},inplace=True)
trip_district_df.reset_index(inplace=True)

# Tours by Destination District
tour_district = pd.merge(tour[['tdtaz','toexpfac']],districts,left_on='tdtaz',right_on='taz')
tour_district_df = pd.DataFrame(tour_district.groupby('district_name')['toexpfac'].sum())
tour_district_df.rename(columns={'toexpfac':'tours'},inplace=True)
tour_district_df.reset_index(inplace=True)

# Residents by District
hh_per_district = pd.merge(hh[['hhtaz','hhsize','hhexpfac','hhno']], districts, left_on='hhtaz', right_on='taz')

# household size weighted by hhexpfac, summed per district
hh_per_district['wt_hhsize'] = hh_per_district['hhsize']*hh_per_district['hhexpfac']
residents_df = pd.DataFrame(hh_per_district.groupby('district_name').sum()['wt_hhsize'])
residents_df.rename(columns={'wt_hhsize':'residents'},inplace=True)
residents_df.reset_index(inplace=True)

# Merge these all together
df_list = [trip_district_df, tour_district_df, residents_df]
district_df = reduce(lambda left,right: pd.merge(left,right,on='district_name'), df_list)

# add dataset name
district_df['source'] = dataset['name']

# Network Summary

In [425]:
# Scratch version of the calculation in network_results(), reading one
# hard-coded workbook instead of a model run's outputs folder and adding no
# 'source' column. Overwrites the notebook-level `df`.
# NOTE(review): the `sheetname` keyword was renamed `sheet_name` in later
# pandas releases -- confirm against the installed version.
net_summary_df = pd.read_excel(r'J:\Projects\Soundcast\network_summary_detailed.xlsx', sheetname='Network Summary')
# Long format: one row per (sheet row, sheet column) cell
df = pd.DataFrame(net_summary_df.stack())
df['tod']= df.index.get_level_values(0)
df['fieldname'] = df.index.get_level_values(1)
df.rename(columns={0:'model_value'},inplace=True)
df.reset_index(inplace=True, drop=True)
# Drop the rows with TP_4k column headers
df.drop(df[df['fieldname'] == 'TP_4k'].index, inplace=True)
# Split the fields by vmt, vht, delay
df['facility_type'] = df.fieldname.apply(lambda row: row.split('_')[0])
df['metric'] = df.fieldname.apply(lambda row: row.split('_')[-1])

## Time of Day