In [1]:
import pandas as pd
import numpy as np
import sys 
import datetime
import os
import matplotlib as plt

%matplotlib inline

In [2]:
# Column name -> aggregation function name.
# NOTE(review): presumably passed to a pandas groupby(...).agg(...) call elsewhere -- confirm.
agg = {
    'Fare': 'mean',
    'Trip Total': 'mean',
    'Trip Miles': 'sum',
    'Trips Pooled': 'sum',
    'PRIVATE_TRIPS': 'sum',
    'SHARED_TRIPS': 'sum',
    'TRIPS': 'sum',
}

In [3]:
def clean_float_cols(x):
    """
    Converts a single value to float, ignoring thousands-separator commas.

    x - a string such as '1,234.50', or any value float() already accepts
        (int, float, numpy scalar)

    Returns the value as a float.  float() raises ValueError when the text
    is still not numeric after the commas are removed.
    """
    # Only strings can carry commas; numeric values have no .replace(), so
    # they are handed to float() untouched instead of raising AttributeError.
    if isinstance(x, str):
        x = x.replace(',', '')

    return float(x)

In [4]:
def processRawData(infile, outfile):
    """
    Reads data, cleans it, processes it, and writes it to an HDF5 file.

    The CSV is read in chunks of 25000 rows.  Each chunk gets YEAR, MONTH,
    DOW and HOUR columns derived from 'Trip Start Timestamp' and is then
    split by day type (Weekday = DOW 0-4, Weekend = DOW 5-6) and by time
    period into ten tables named Weekday_1..Weekday_5 and
    Weekend_1..Weekend_5:

        1 - hours 22-23 and 0-5
        2 - hours 6-8
        3 - hours 9-15
        4 - hours 16-18
        5 - hours 19-21

    infile  - infile name, raw CSV format
    outfile - output file name, h5 format.  Rows are appended, so tables
              already present in outfile are extended, not replaced.
    """

    print(datetime.datetime.now().ctime(), 'Converting raw data in file: ', infile)

    # columns that can arrive as text with thousands separators (e.g. "1,234")
    convert_cols = ['Trip Seconds', 'Trip Miles', 'Pickup Census Tract', 'Dropoff Census Tract', 'Fare',
                    'Tip', 'Additional Charges', 'Trip Total', 'Trips Pooled']

    # table-name prefix -> values of Series.dt.weekday (Monday = 0)
    day_types = (
        ('Weekday', [0, 1, 2, 3, 4]),
        ('Weekend', [5, 6]),
    )

    # table-name suffix -> values of Series.dt.hour.  dt.hour runs 0-23, so
    # the overnight period must list 0 (there is no hour 24); otherwise every
    # trip starting between 00:00 and 00:59 is silently dropped.
    time_periods = (
        (1, [22, 23, 0, 1, 2, 3, 4, 5]),
        (2, [6, 7, 8]),
        (3, [9, 10, 11, 12, 13, 14, 15]),
        (4, [16, 17, 18]),
        (5, [19, 20, 21]),
    )

    # set up the reader; chunksize alone makes read_csv return an iterator.
    # infer_datetime_format is left out: it is deprecated since pandas 2.0
    # and the parsed timestamps are the same without it.
    reader = pd.read_csv(infile,
                         sep=',',
                         parse_dates=['Trip Start Timestamp', 'Trip End Timestamp'],
                         chunksize=25000)

    # establish the writer (opened in pandas' default append mode)
    store = pd.HDFStore(outfile)

    rowsRead = 0
    rowsWritten = {'Weekday': 0, 'Weekend': 0}

    try:
        # iterate through chunk by chunk so the computer doesn't run out of memory
        for chunk in reader:

            rowsRead += len(chunk)

            # go through str so numeric and text columns take the same path,
            # strip the separators, then convert to float
            for column in convert_cols:
                chunk[column] = (chunk[column]
                                 .astype(str)
                                 .str.replace(',', '', regex=False)
                                 .astype(float))

            start = chunk['Trip Start Timestamp']
            chunk['YEAR'] = start.dt.year
            chunk['MONTH'] = start.dt.month
            chunk['DOW'] = start.dt.weekday
            chunk['HOUR'] = start.dt.hour

            # write the data, one table per (day type, time period)
            for day_label, days in day_types:
                day_rows = chunk[chunk['DOW'].isin(days)]

                for period, hours in time_periods:
                    period_rows = day_rows[day_rows['HOUR'].isin(hours)]
                    store.append('%s_%i' % (day_label, period), period_rows, data_columns=True)
                    # count what was actually written so the log stays truthful
                    rowsWritten[day_label] += len(period_rows)

            print ('Read %i rows and kept %i rows in weekday TNC table' % (rowsRead, rowsWritten['Weekday']))
            print ('kept ' + str(rowsWritten['Weekend']) + ' rows in weekend TNC table')
    finally:
        # close the file even when the run is interrupted or a chunk fails
        store.close()

    print('Complete!!!')

In [5]:
# Build the Chicago TNC trips H5 file from the raw trips CSV.
processRawData(
    infile='D:/TNC-Demand-Model/Inputs/Chicago Ride-Hailing/Transportation_Network_Providers_-_Trips.csv',
    outfile='D:/TNC-Demand-Model/Inputs/Chicago Ride-Hailing/Chicago_TNC_Trips_20.H5',
)

Fri Jan 15 12:56:23 2021 Converting raw data in file:  D:/TNC-Demand-Model/Inputs/Chicago Ride-Hailing/Transportation_Network_Providers_-_Trips.csv


  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attrib

Read 25000 rows and kept 16883 rows in weekday TNC table
kept 8117 rows in weekend TNC table
Read 50000 rows and kept 33610 rows in weekday TNC table
kept 16390 rows in weekend TNC table
Read 75000 rows and kept 50406 rows in weekday TNC table
kept 24594 rows in weekend TNC table
Read 100000 rows and kept 67205 rows in weekday TNC table
kept 32795 rows in weekend TNC table
Read 125000 rows and kept 84109 rows in weekday TNC table
kept 40891 rows in weekend TNC table
Read 150000 rows and kept 100880 rows in weekday TNC table
kept 49120 rows in weekend TNC table
Read 175000 rows and kept 117666 rows in weekday TNC table
kept 57334 rows in weekend TNC table
Read 200000 rows and kept 134464 rows in weekday TNC table
kept 65536 rows in weekend TNC table
Read 225000 rows and kept 151296 rows in weekday TNC table
kept 73704 rows in weekend TNC table
Read 250000 rows and kept 167975 rows in weekday TNC table
kept 82025 rows in weekend TNC table
Read 275000 rows and kept 184714 rows in weekday 

KeyboardInterrupt: 