In [11]:
import pandas as pd
import numpy as np
import sys 
import datetime
import os
import matplotlib as plt

%matplotlib inline

In [12]:
# Column name -> aggregation function name: 'mean' for the fare amounts,
# 'sum' for miles and the trip counts.
# NOTE(review): not referenced by the visible cells; presumably passed to a
# pandas groupby/agg call elsewhere -- confirm.
agg = {
    'Fare': 'mean',
    'Trip Total': 'mean',
    'Trip Miles': 'sum',
    'Trips Pooled': 'sum',
    'PRIVATE_TRIPS': 'sum',
    'SHARED_TRIPS': 'sum',
    'TRIPS': 'sum',
}

In [13]:
def clean_float_cols(x):
    """
    Converts a value that may be formatted with thousands separators to a float.

    x - a string such as '1,234.50', or a value float() already accepts
        (e.g. an int or float)

    Returns the value as a float. Raises ValueError if the text left after
    removing the commas is not a valid number.
    """
    # Only strings can carry thousands separators; values that are already
    # numeric have no .replace method, so they go straight to float().
    if isinstance(x, str):
        x = x.replace(',', '')

    return float(x)

In [14]:
def processRawData(infile, outfile):
    """
    Reads raw trip data chunk by chunk, cleans it, splits it by day type and
    time-of-day period, and appends each piece to a table in an HDF5 file.

    infile  - input file name, raw CSV format
    outfile - output file name, h5 format

    Ten tables are written: 'Weekday_1'..'Weekday_5' (DOW 0-4) and
    'Weekend_1'..'Weekend_5' (DOW 5-6). The numeric suffix is the period of
    the trip start hour:
        1 = 22-23 and 0-5, 2 = 6-8, 3 = 9-15, 4 = 16-18, 5 = 19-21

    The output file is opened with the pandas default mode, so rows are
    appended to any tables already present in outfile; nothing is cleared.
    Running this twice on the same outfile therefore duplicates the rows.
    """

    print(datetime.datetime.now().ctime(), 'Converting raw data in file: ', infile)

    # columns that can arrive as text with thousands separators and must be floats
    convert_cols = ['Trip Seconds', 'Trip Miles', 'Pickup Census Tract', 'Dropoff Census Tract', 'Fare',
                    'Tip', 'Additional Charges', 'Trip Total', 'Trips Pooled']

    # Trip start hours belonging to each time-of-day period. dt.hour runs
    # 0-23, so midnight is hour 0 -- there is no hour 24.
    period_hours = {
        1: [22, 23, 0, 1, 2, 3, 4, 5],
        2: [6, 7, 8],
        3: [9, 10, 11, 12, 13, 14, 15],
        4: [16, 17, 18],
        5: [19, 20, 21],
    }

    # day-of-week codes (from dt.weekday) for each table-name prefix
    day_types = {'Weekday': [0, 1, 2, 3, 4], 'Weekend': [5, 6]}

    # Set up the chunked reader. infer_datetime_format is not passed: it is
    # deprecated since pandas 2.0, where inferring one consistent format is
    # already the default, and omitting it does not change the parsed values.
    reader = pd.read_csv(infile,
                         iterator=True,
                         sep=',',
                         parse_dates=['Trip Start Timestamp', 'Trip End Timestamp'],
                         chunksize=25000)

    # establish the writer (default mode: existing tables are kept and appended to)
    store = pd.HDFStore(outfile)

    rowsRead = 0
    rowsWritten = {'Weekday': 0, 'Weekend': 0}

    try:
        # iterate through chunk by chunk so the computer doesn't run out of memory
        for chunk in reader:

            rowsRead += len(chunk)

            # strip thousands separators and convert to float
            for column in convert_cols:
                as_text = chunk[column].astype(str).str.replace(',', '', regex=False)
                chunk[column] = as_text.astype(float)

            trip_start = chunk['Trip Start Timestamp']
            chunk['YEAR'] = trip_start.dt.year
            chunk['MONTH'] = trip_start.dt.month
            chunk['DOW'] = trip_start.dt.weekday
            chunk['HOUR'] = trip_start.dt.hour

            # write the data: one table per (day type, period) combination
            for day_type, dows in day_types.items():
                day_chunk = chunk[chunk['DOW'].isin(dows)]

                for period, hours in period_hours.items():
                    store.append('%s_%i' % (day_type, period),
                                 day_chunk[day_chunk['HOUR'].isin(hours)],
                                 data_columns=True)

                rowsWritten[day_type] += len(day_chunk)

            print ('Read %i rows and kept %i rows in weekday TNC table' % (rowsRead, rowsWritten['Weekday']))
            print ('kept ' + str(rowsWritten['Weekend']) + ' rows in weekend TNC table')
    finally:
        # release the HDF5 file handle even if a chunk fails part-way through
        store.close()

    print('Complete!!!')

In [None]:
# Build the shared TNC trip h5 tables from the raw Chicago ride-hailing trips CSV
processRawData(
    infile='D:/TNC-Demand-Model/Inputs/Chicago Ride-Hailing/Transportation_Network_Providers_-_Trips.csv',
    outfile='D:/TNC-Demand-Model/Inputs/Chicago Ride-Hailing/Chicago_TNC_Trips_20.H5',
)

Thu Jan 14 09:33:02 2021 Converting raw data in file:  D:/TNC-Demand-Model/Inputs/Chicago Ride-Hailing/Transportation_Network_Providers_-_Trips.csv


  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attrib

Read 25000 rows and kept 16883 rows in weekday TNC table
kept 8117 rows in weekend TNC table
Read 50000 rows and kept 33610 rows in weekday TNC table
kept 16390 rows in weekend TNC table
Read 75000 rows and kept 50406 rows in weekday TNC table
kept 24594 rows in weekend TNC table
Read 100000 rows and kept 67205 rows in weekday TNC table
kept 32795 rows in weekend TNC table
Read 125000 rows and kept 84109 rows in weekday TNC table
kept 40891 rows in weekend TNC table
Read 150000 rows and kept 100880 rows in weekday TNC table
kept 49120 rows in weekend TNC table
Read 175000 rows and kept 117666 rows in weekday TNC table
kept 57334 rows in weekend TNC table
Read 200000 rows and kept 134464 rows in weekday TNC table
kept 65536 rows in weekend TNC table
Read 225000 rows and kept 151296 rows in weekday TNC table
kept 73704 rows in weekend TNC table
Read 250000 rows and kept 167975 rows in weekday TNC table
kept 82025 rows in weekend TNC table
Read 275000 rows and kept 184714 rows in weekday 

Read 2125000 rows and kept 1428618 rows in weekday TNC table
kept 696382 rows in weekend TNC table
Read 2150000 rows and kept 1445488 rows in weekday TNC table
kept 704512 rows in weekend TNC table
Read 2175000 rows and kept 1462290 rows in weekday TNC table
kept 712710 rows in weekend TNC table
Read 2200000 rows and kept 1479172 rows in weekday TNC table
kept 720828 rows in weekend TNC table
Read 2225000 rows and kept 1495794 rows in weekday TNC table
kept 729206 rows in weekend TNC table
Read 2250000 rows and kept 1512524 rows in weekday TNC table
kept 737476 rows in weekend TNC table
Read 2275000 rows and kept 1529356 rows in weekday TNC table
kept 745644 rows in weekend TNC table
Read 2300000 rows and kept 1546230 rows in weekday TNC table
kept 753770 rows in weekend TNC table
Read 2325000 rows and kept 1562905 rows in weekday TNC table
kept 762095 rows in weekend TNC table
Read 2350000 rows and kept 1579800 rows in weekday TNC table
kept 770200 rows in weekend TNC table
Read 23750

Read 4200000 rows and kept 2823754 rows in weekday TNC table
kept 1376246 rows in weekend TNC table
Read 4225000 rows and kept 2840487 rows in weekday TNC table
kept 1384513 rows in weekend TNC table
Read 4250000 rows and kept 2857319 rows in weekday TNC table
kept 1392681 rows in weekend TNC table
Read 4275000 rows and kept 2874228 rows in weekday TNC table
kept 1400772 rows in weekend TNC table
Read 4300000 rows and kept 2890913 rows in weekday TNC table
kept 1409087 rows in weekend TNC table
Read 4325000 rows and kept 2907671 rows in weekday TNC table
kept 1417329 rows in weekend TNC table
Read 4350000 rows and kept 2924417 rows in weekday TNC table
kept 1425583 rows in weekend TNC table
Read 4375000 rows and kept 2941286 rows in weekday TNC table
kept 1433714 rows in weekend TNC table
Read 4400000 rows and kept 2958236 rows in weekday TNC table
kept 1441764 rows in weekend TNC table
Read 4425000 rows and kept 2974949 rows in weekday TNC table
kept 1450051 rows in weekend TNC table


Read 6250000 rows and kept 4202346 rows in weekday TNC table
kept 2047654 rows in weekend TNC table
Read 6275000 rows and kept 4219023 rows in weekday TNC table
kept 2055977 rows in weekend TNC table
Read 6300000 rows and kept 4235947 rows in weekday TNC table
kept 2064053 rows in weekend TNC table
Read 6325000 rows and kept 4252762 rows in weekday TNC table
kept 2072238 rows in weekend TNC table
Read 6350000 rows and kept 4269569 rows in weekday TNC table
kept 2080431 rows in weekend TNC table
Read 6375000 rows and kept 4286478 rows in weekday TNC table
kept 2088522 rows in weekend TNC table
Read 6400000 rows and kept 4303259 rows in weekday TNC table
kept 2096741 rows in weekend TNC table
Read 6425000 rows and kept 4320138 rows in weekday TNC table
kept 2104862 rows in weekend TNC table
Read 6450000 rows and kept 4336963 rows in weekday TNC table
kept 2113037 rows in weekend TNC table
Read 6475000 rows and kept 4353849 rows in weekday TNC table
kept 2121151 rows in weekend TNC table


Read 8300000 rows and kept 5577292 rows in weekday TNC table
kept 2722708 rows in weekend TNC table
Read 8325000 rows and kept 5594044 rows in weekday TNC table
kept 2730956 rows in weekend TNC table
Read 8350000 rows and kept 5610962 rows in weekday TNC table
kept 2739038 rows in weekend TNC table
Read 8375000 rows and kept 5627666 rows in weekday TNC table
kept 2747334 rows in weekend TNC table
Read 8400000 rows and kept 5644324 rows in weekday TNC table
kept 2755676 rows in weekend TNC table
Read 8425000 rows and kept 5661029 rows in weekday TNC table
kept 2763971 rows in weekend TNC table
Read 8450000 rows and kept 5677652 rows in weekday TNC table
kept 2772348 rows in weekend TNC table
Read 8475000 rows and kept 5694451 rows in weekday TNC table
kept 2780549 rows in weekend TNC table
Read 8500000 rows and kept 5711188 rows in weekday TNC table
kept 2788812 rows in weekend TNC table
Read 8525000 rows and kept 5727944 rows in weekday TNC table
kept 2797056 rows in weekend TNC table


Read 10350000 rows and kept 6946479 rows in weekday TNC table
kept 3403521 rows in weekend TNC table
Read 10375000 rows and kept 6963239 rows in weekday TNC table
kept 3411761 rows in weekend TNC table
Read 10400000 rows and kept 6979972 rows in weekday TNC table
kept 3420028 rows in weekend TNC table
Read 10425000 rows and kept 6996599 rows in weekday TNC table
kept 3428401 rows in weekend TNC table
Read 10450000 rows and kept 7013259 rows in weekday TNC table
kept 3436741 rows in weekend TNC table
Read 10475000 rows and kept 7029912 rows in weekday TNC table
kept 3445088 rows in weekend TNC table
Read 10500000 rows and kept 7046560 rows in weekday TNC table
kept 3453440 rows in weekend TNC table
Read 10525000 rows and kept 7063262 rows in weekday TNC table
kept 3461738 rows in weekend TNC table
Read 10550000 rows and kept 7079989 rows in weekday TNC table
kept 3470011 rows in weekend TNC table
Read 10575000 rows and kept 7096754 rows in weekday TNC table
kept 3478246 rows in weekend 

Read 12400000 rows and kept 8316518 rows in weekday TNC table
kept 4083482 rows in weekend TNC table
Read 12425000 rows and kept 8333162 rows in weekday TNC table
kept 4091838 rows in weekend TNC table
Read 12450000 rows and kept 8349753 rows in weekday TNC table
kept 4100247 rows in weekend TNC table
Read 12475000 rows and kept 8366365 rows in weekday TNC table
kept 4108635 rows in weekend TNC table
Read 12500000 rows and kept 8382932 rows in weekday TNC table
kept 4117068 rows in weekend TNC table
Read 12525000 rows and kept 8399574 rows in weekday TNC table
kept 4125426 rows in weekend TNC table
Read 12550000 rows and kept 8416162 rows in weekday TNC table
kept 4133838 rows in weekend TNC table
Read 12575000 rows and kept 8432765 rows in weekday TNC table
kept 4142235 rows in weekend TNC table
Read 12600000 rows and kept 8449381 rows in weekday TNC table
kept 4150619 rows in weekend TNC table
Read 12625000 rows and kept 8465970 rows in weekday TNC table
kept 4159030 rows in weekend 

Read 14450000 rows and kept 9685504 rows in weekday TNC table
kept 4764496 rows in weekend TNC table
Read 14475000 rows and kept 9702195 rows in weekday TNC table
kept 4772805 rows in weekend TNC table
Read 14500000 rows and kept 9718945 rows in weekday TNC table
kept 4781055 rows in weekend TNC table
Read 14525000 rows and kept 9735573 rows in weekday TNC table
kept 4789427 rows in weekend TNC table
Read 14550000 rows and kept 9752149 rows in weekday TNC table
kept 4797851 rows in weekend TNC table
Read 14575000 rows and kept 9768866 rows in weekday TNC table
kept 4806134 rows in weekend TNC table
Read 14600000 rows and kept 9785602 rows in weekday TNC table
kept 4814398 rows in weekend TNC table
Read 14625000 rows and kept 9802328 rows in weekday TNC table
kept 4822672 rows in weekend TNC table
Read 14650000 rows and kept 9819002 rows in weekday TNC table
kept 4830998 rows in weekend TNC table
Read 14675000 rows and kept 9835751 rows in weekday TNC table
kept 4839249 rows in weekend 

Read 16475000 rows and kept 11038819 rows in weekday TNC table
kept 5436181 rows in weekend TNC table
Read 16500000 rows and kept 11055452 rows in weekday TNC table
kept 5444548 rows in weekend TNC table
Read 16525000 rows and kept 11072279 rows in weekday TNC table
kept 5452721 rows in weekend TNC table
Read 16550000 rows and kept 11089010 rows in weekday TNC table
kept 5460990 rows in weekend TNC table
Read 16575000 rows and kept 11105628 rows in weekday TNC table
kept 5469372 rows in weekend TNC table
Read 16600000 rows and kept 11122210 rows in weekday TNC table
kept 5477790 rows in weekend TNC table
Read 16625000 rows and kept 11138745 rows in weekday TNC table
kept 5486255 rows in weekend TNC table
Read 16650000 rows and kept 11155412 rows in weekday TNC table
kept 5494588 rows in weekend TNC table
Read 16675000 rows and kept 11172071 rows in weekday TNC table
kept 5502929 rows in weekend TNC table
Read 16700000 rows and kept 11188813 rows in weekday TNC table
kept 5511187 rows i

Read 18500000 rows and kept 12619913 rows in weekday TNC table
kept 5880087 rows in weekend TNC table
Read 18525000 rows and kept 12644913 rows in weekday TNC table
kept 5880087 rows in weekend TNC table
Read 18550000 rows and kept 12669913 rows in weekday TNC table
kept 5880087 rows in weekend TNC table
Read 18575000 rows and kept 12694909 rows in weekday TNC table
kept 5880091 rows in weekend TNC table
Read 18600000 rows and kept 12719909 rows in weekday TNC table
kept 5880091 rows in weekend TNC table
Read 18625000 rows and kept 12744909 rows in weekday TNC table
kept 5880091 rows in weekend TNC table
Read 18650000 rows and kept 12769909 rows in weekday TNC table
kept 5880091 rows in weekend TNC table
Read 18675000 rows and kept 12794909 rows in weekday TNC table
kept 5880091 rows in weekend TNC table
Read 18700000 rows and kept 12819909 rows in weekday TNC table
kept 5880091 rows in weekend TNC table
Read 18725000 rows and kept 12844909 rows in weekday TNC table
kept 5880091 rows i