In [1]:
import pandas as pd
import numpy as np
import sys 
import datetime
import os
import matplotlib as plt
import geopandas as gp

%matplotlib inline

In [None]:
# Column name -> aggregation function name, grouped by aggregation:
# monetary columns are averaged, distance and trip-count columns are summed.
# NOTE(review): not used in the visible cells; presumably passed to a pandas
# groupby(...).agg(...) elsewhere -- confirm against the caller.
agg = {
    **dict.fromkeys(['Fare', 'Trip Total'], 'mean'),
    **dict.fromkeys(
        ['Trip Miles', 'Trips Pooled', 'PRIVATE_TRIPS', 'SHARED_TRIPS', 'TRIPS'],
        'sum',
    ),
}

In [None]:
def clean_float_cols(x):
    """
    Convert a single value to float, ignoring thousands separators.

    x - a string such as '1,234.50', or a value float() already accepts
        (int, float, numeric string without commas)

    Returns the value as a float.
    Raises ValueError if a string does not hold a number once the commas are
    removed, and TypeError if x is of a type float() cannot convert.
    """
    # Only strings can carry a ',' thousands separator. Values that are
    # already numeric have no .replace() method, so they go straight to
    # float() instead of raising AttributeError.
    if isinstance(x, str):
        x = x.replace(',', '')

    return float(x)

In [None]:
def processRawData(infile, outfile, chunksize=25000):
    """
    Read a raw trip CSV in chunks, clean it, and append it to an HDF5 file.

    For every chunk the numeric columns are converted to float (thousands
    separators are stripped first) and YEAR, MONTH, DOW and HOUR columns are
    derived from 'Trip Start Timestamp'. The rows are then appended to one
    table per day type and time period: 'Weekday_1' .. 'Weekday_8' and
    'Weekend_1' .. 'Weekend_8'.

    infile    - input file name, raw CSV format
    outfile   - output file name, h5 format
    chunksize - number of CSV rows to read and write at a time

    Returns nothing. The store is opened with pandas' default mode, so tables
    that already exist in outfile are appended to rather than replaced;
    running this twice against the same outfile duplicates the rows.
    """

    print(datetime.datetime.now().ctime(), 'Converting raw data in file: ', infile)

    # columns that can arrive as strings with ',' thousands separators
    float_cols = ['Trip Seconds', 'Trip Miles', 'Pickup Census Tract', 'Dropoff Census Tract', 'Fare',
                  'Tip', 'Additional Charges', 'Trip Total', 'Trips Pooled']

    # day type -> values of Series.dt.weekday (Monday = 0 ... Sunday = 6)
    day_types = (
        ('Weekday', [0, 1, 2, 3, 4]),
        ('Weekend', [5, 6]),
    )

    # time period number -> start hours that belong to it.
    # Period 1 is the overnight period and wraps around midnight.
    # Series.dt.hour yields 0-23, so midnight is hour 0: listing 24 (which
    # never occurs) instead of 0 would drop every trip that starts between
    # 00:00 and 00:59 from all tables.
    period_hours = (
        (1, [20, 21, 22, 23, 0, 1, 2, 3, 4, 5]),
        (2, [6]),
        (3, [7, 8]),
        (4, [9]),
        (5, [10, 11, 12, 13]),
        (6, [14, 15]),
        (7, [16, 17]),
        (8, [18, 19]),
    )

    # Set up the reader. chunksize alone makes read_csv return an iterator.
    # infer_datetime_format is intentionally not passed: it is deprecated as
    # of pandas 2.0, where consistent format inference is the default.
    reader = pd.read_csv(infile,
                         sep=',',
                         parse_dates=['Trip Start Timestamp', 'Trip End Timestamp'],
                         chunksize=chunksize)

    rowsRead = 0
    rowsWritten = {'Weekday': 0, 'Weekend': 0}

    try:
        # the context manager closes the HDF5 file even if a chunk fails
        with pd.HDFStore(outfile) as store:

            # iterate chunk by chunk so the computer doesn't run out of memory
            for chunk in reader:

                rowsRead += len(chunk)

                # strip thousands separators, then convert to float
                for column in float_cols:
                    chunk[column] = (chunk[column]
                                     .astype(str)
                                     .str.replace(',', '', regex=False)
                                     .astype(float))

                trip_start = chunk['Trip Start Timestamp']
                chunk['YEAR'] = trip_start.dt.year
                chunk['MONTH'] = trip_start.dt.month
                chunk['DOW'] = trip_start.dt.weekday
                chunk['HOUR'] = trip_start.dt.hour

                # write the data: all weekday periods, then all weekend periods
                for day_type, days in day_types:
                    day_rows = chunk[chunk['DOW'].isin(days)]

                    for period, hours in period_hours:
                        period_rows = day_rows[day_rows['HOUR'].isin(hours)]
                        store.append('%s_%i' % (day_type, period), period_rows, data_columns=True)

                        # count what was actually handed to the store
                        rowsWritten[day_type] += len(period_rows)

                print ('Read %i rows and kept %i rows in weekday TNC table' % (rowsRead, rowsWritten['Weekday']))
                print ('kept ' + str(rowsWritten['Weekend']) + ' rows in weekend TNC table')
    finally:
        # release the CSV file handle even when processing stops early
        reader.close()

    print('Complete!!!')

In [None]:
# Convert the raw Chicago TNC trips CSV into the HDF5 store; processRawData
# writes one table per day type and time period (Weekday_1..8, Weekend_1..8).
processRawData(
    'C:/Workspace/TNC-Demand-Model/Inputs/Chicago Ride-Hailing/Transportation_Network_Providers_-_Trips.csv',
    'C:/Workspace/TNC-Demand-Model/Inputs/Chicago Ride-Hailing/Chicago_TNC_Trips_20.H5',
)