# Merging of Weather, Holidays and Taxi Dataframes

In [1]:
import pandas as pd
import os
import numpy as np
import time

In [2]:
if os.name == 'nt':
    sep = '\\'
elif os.name == 'posix':
    sep = '/'
else:
    print(f'What is this OS? {os.name}')

path = os.getcwd()
path_datasets = path[:-len(f'Code{sep}src{sep}project_CSP_MATH_571')] + f'DataSets{sep}'

## Loading Holidays data

In [16]:
def loadHolidaysData(verbose = True):
    if verbose == True:
        time_start = time.time()
    
    df = pd.read_csv(path_datasets + f'holidays{sep}Holidays_data.csv', delimiter=',')
    df = df[['date', 'daytype']].drop_duplicates()
    df_2016_2019 = df[df['date'].str.contains('2019|2018|2017|2016')]
    assert(len(df_2016_2019) == ((365*3)+366)) # Check there are no missing values
    
    if verbose == True:
        print('time taken to load Data:', time.time() - time_start, 'seconds')
        
    return df_2016_2019

In [17]:
df_holidays = loadWeatherData()

time taken to load Data: 0.34799790382385254 seconds


## Loading Taxi Data

In [12]:
def loadTaxiData(verbose = True):
    if verbose == True:
        time_start = time.time()
        
    df_taxi = pd.read_csv(path_datasets + 'jan-ap19.csv', delimiter=',')
    df_taxi_1 = pd.read_csv(path_datasets + 'jan-may17.csv', delimiter=',')
    df_taxi_2 = pd.read_csv(path_datasets + 'june17-feb18.csv', delimiter=',')
    df_taxi_3 = pd.read_csv(path_datasets + 'mar-aug18.csv', delimiter=',')
    df_taxi_4 = pd.read_csv(path_datasets + 'may-nov19.csv', delimiter=',')
    df_taxi_5 = pd.read_csv(path_datasets + 'sep-dec18.csv', delimiter=',')

    df_taxi_total = pd.concat([df_taxi, df_taxi_1, df_taxi_2, df_taxi_3, df_taxi_4, df_taxi_5])
    
    df_date = df_taxi_total['trip_start_timestamp'].str.split(' ').apply(lambda x : x[0])
    
    dates = pd.to_datetime(df_date)
    df_taxi_total['date'] = dates.dt.strftime('%m/%d/%Y')
    
    if verbose == True:
        print('time taken to load Data:', time.time() - time_start, 'seconds')
    
    return df_taxi_total

In [13]:
df_taxi = loadTaxiData()

time taken to load Data: 107.59803366661072 seconds


## Loading Weather Data

In [28]:
from datetime import datetime, timedelta
import time

In [29]:
def createSpaciatedTime():

    def datetime_range(start, end, delta):
        current = start
        while current < end:
            yield current
            current += delta

    dts = [dt.strftime('%Y-%m-%d %H:%M:%S+00:00') for dt in 
          datetime_range(datetime(2017,1,1,1), datetime(2020,1,1,0), 
                         timedelta(minutes = 15))]
    return dts

In [30]:
def buildNewWeather(dts):
    
    df_weather_new = pd.DataFrame(dts, columns = ['DateTime'])
    df_weather_new['DateTime'] = pd.to_datetime(df_weather_new['DateTime'])
    df_weather_new['tmpf'] = np.nan
    df_weather_new['relh'] = np.nan
    df_weather_new['drct'] = np.nan
    df_weather_new['sknt'] = np.nan
    df_weather_new['p01m'] = np.nan
    df_weather_new['skyc1'] = np.nan
    
    return df_weather_new

In [49]:
def fillNewWeather(df_weather_new, df_weather, verbose = True):
    
    if verbose == True:
        print('----------------- Program to fill the weather -----------------')
        print()
        counter = 0
        time_start = time.time()
        
    position = 0

    for i in range(0, len(df_weather_new)):

        datetime = df_weather_new['DateTime'].iloc[i]

        found = False
        while (found == False):
            if position == len(df_weather) - 1: # in case df_weather_new has newer dates than df_weather
                found = True
            elif ((datetime < df_weather['DateTime'].iloc[position + 1]) &  (datetime >= df_weather['DateTime'].iloc[position])):
                df_weather_new['tmpf'].iloc[i] = df_weather['tmpf'].iloc[position]
                df_weather_new['relh'].iloc[i] = df_weather['relh'].iloc[position]
                df_weather_new['drct'].iloc[i] = df_weather['drct'].iloc[position]
                df_weather_new['sknt'].iloc[i] = df_weather['sknt'].iloc[position]
                df_weather_new['p01m'].iloc[i] = df_weather['p01m'].iloc[position]
                df_weather_new['skyc1'].iloc[i] = df_weather['skyc1'].iloc[position]
                found = True
            else:
                position += 1
                
        if verbose == True:
            if i % 5000 == 0:
                print('The program has done', counter *5000, 'iterations')
                print('time taken till now: ', time.time() - time_start, 'seconds')
                print()
                counter +=1
            
    return df_weather_new
    

In [50]:
def loadAndConvertWeather():
    df_weather = pd.read_csv(path_datasets + f'weather{sep}ORD weather.txt', delimiter=',')
    del df_weather['station']
    
    # In order to have a column with localization. Needed for merging.
    df_weather['DateTime'] = pd.to_datetime(df_weather['valid'])
    df_weather = df_weather.set_index('DateTime')
    df_weather = df_weather.tz_localize('UTC')
    df_weather = df_weather.reset_index()
    
    dts = createSpaciatedTime()
    
    [datetime.strptime(x, '%Y-%m-%d %H:%M:%S+00:00') for x in dts]
    
    df_weather_new = buildNewWeather(dts)
    
    df_weather_new = fillNewWeather(df_weather_new, df_weather, verbose = True)
    
    
    return df_weather_new.dropna()

In [51]:
df_weather = loadAndConvertWeather()

----------------- Program to fill the weather -----------------

The program has done 0 iterations
time taken till now:  0.08999991416931152 seconds

The program has done 5000 iterations
time taken till now:  59.7050416469574 seconds

The program has done 10000 iterations
time taken till now:  122.88649582862854 seconds

The program has done 15000 iterations
time taken till now:  198.4596905708313 seconds

The program has done 20000 iterations
time taken till now:  273.89329743385315 seconds

The program has done 25000 iterations
time taken till now:  346.2832896709442 seconds

The program has done 30000 iterations
time taken till now:  416.86929082870483 seconds

The program has done 35000 iterations
time taken till now:  488.60733914375305 seconds

The program has done 40000 iterations
time taken till now:  569.1946635246277 seconds

The program has done 45000 iterations
time taken till now:  642.3226225376129 seconds

The program has done 50000 iterations
time taken till now:  779.3

# Merge dataframes

In [52]:
def mergeDataframes(df_taxi, df_weather, df_holidays, verbose = True):
    
    if verbose == True:
        time_start = time.time()
    
    df_taxi['DateTime'] = pd.to_datetime(df_taxi['trip_start_timestamp'])

    df_taxi_holidays = df_taxi.merge(df_holidays, how = 'left', on = 'date')

    df_taxi_holidays_weather = df_taxi_holidays.merge(df_weather, how = 'left', on = 'DateTime')
    
    if verbose == True:
        print('time taken to merge Data:', time.time() - time_start, 'seconds')
    
    return df_taxi_holidays_weather
    
    
    
    

In [53]:
df_merged = mergeDataframes(df_taxi, df_weather, df_holidays, verbose = True)

hello
hello
time taken to merge Data: 8.67296028137207 seconds


In [54]:
df_merged

Unnamed: 0,unique_key,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,pickup_community_area,dropoff_community_area,...,dropoff_location,date,DateTime,daytype,tmpf,relh,drct,sknt,p01m,skyc1
0,ececa29f7a1392f7d937f34a2040f409744d0354,8d1222551aa0783ed24c0941a29222b21fe66e643a801a...,2019-01-17 11:30:00 UTC,2019-01-17 12:00:00 UTC,1363.0,15.50,,,41.0,,...,,01/17/2019,2019-01-17 11:30:00+00:00,W,30.90,92.18,100.00,3.00,0.00,OVC
1,1fbf17a48aca428fefc0a79d0218ed729ea7e8e3,c0efb2f0d92d8721d64fcd6628a9f8e78b7693cb5f36ee...,2019-03-02 09:45:00 UTC,2019-03-02 10:45:00 UTC,2754.0,13.73,,,46.0,,...,,03/02/2019,2019-03-02 09:45:00+00:00,A,28.00,77.57,320.00,10.00,T,BKN
2,73eab95221395356ad45ce8b7a8716bcddb4c034,5e00ac77728ae1790a517495300680a2ff69fe9a569218...,2019-02-12 11:15:00 UTC,2019-02-12 12:15:00 UTC,3422.0,31.77,,,8.0,,...,,02/12/2019,2019-02-12 11:15:00+00:00,W,33.10,91.51,0.00,0.00,0.00,OVC
3,701fcdd02b0dd0dc01cbffcd7c2c9572386f5966,e5e1bb9c3329c0f9bd1f291cb9bbbb016731c148fefca8...,2019-02-15 03:30:00 UTC,2019-02-15 03:45:00 UTC,1302.0,10.16,,,8.0,,...,,02/15/2019,2019-02-15 03:30:00+00:00,W,19.00,80.52,280.00,17.00,0.00,OVC
4,2984246ff88a9e42cb7d80d5a056a91592cd2103,0574d247700e50d1fb996084c8b1c649bf57effe419a6d...,2019-02-27 07:00:00 UTC,2019-02-27 07:45:00 UTC,2308.0,14.26,,,8.0,,...,,02/27/2019,2019-02-27 07:00:00+00:00,W,25.00,77.62,0.00,0.00,0.00,BKN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8498877,018840e82a1a353684d1d6c61009a663d283e5b8,992641c74ae28a22b5d72e1f9fcc89ebda221b88c7bb03...,2018-12-05 18:30:00 UTC,2018-12-05 18:45:00 UTC,555.0,0.90,1.703106e+10,1.703106e+10,6.0,6.0,...,POINT (-87.6385749205 41.938391257700005),12/05/2018,2018-12-05 18:30:00+00:00,W,32.00,72.30,210.00,9.00,0.00,OVC
8498878,50ca7e823090231097cab491f6befb21037a234e,6b00c6e07523bf0964046af1bf69575f25c6ea393f288d...,2018-11-04 10:15:00 UTC,2018-11-04 10:15:00 UTC,8.0,0.00,1.703106e+10,1.703106e+10,6.0,6.0,...,POINT (-87.6385749205 41.938391257700005),11/04/2018,2018-11-04 10:15:00+00:00,U,48.90,79.98,140.00,11.00,1.27,BKN
8498879,9b6efafd86cb388167067fb7e8b0aa8e0a51375c,78577da1f3a925bb4133a9b773f5998cec07219c6112dc...,2018-11-03 15:00:00 UTC,2018-11-03 15:00:00 UTC,132.0,0.30,1.703106e+10,1.703106e+10,6.0,6.0,...,POINT (-87.6385749205 41.938391257700005),11/03/2018,2018-11-03 15:00:00+00:00,A,48.00,47.32,90.00,6.00,0.00,FEW
8498880,70e6ced9f17d37c8e57b00fd918934af5c107c07,621c0e55f444cd5e3a30b64e1f7831974e6ef025e6e3ed...,2018-10-19 12:30:00 UTC,2018-10-19 12:30:00 UTC,10.0,0.00,1.703123e+10,1.703123e+10,23.0,23.0,...,POINT (-87.7186707951 41.891754202600005),10/19/2018,2018-10-19 12:30:00+00:00,W,50.00,71.07,200.00,12.00,T,SCT
