# Merging of Weather, Holidays and Taxi Dataframes

In [125]:
import pandas as pd
import os
import numpy as np
import time
from datetime import datetime

In [126]:
if os.name == 'nt':
    sep = '\\'
elif os.name == 'posix':
    sep = '/'
else:
    print(f'What is this OS? {os.name}')

path = os.getcwd()
path_datasets = path[:-len(f'Code{sep}src{sep}project_CSP_MATH_571')] + f'DataSets{sep}'

## Loading Holidays data

In [128]:
def loadHolidaysData(verbose = True):
    if verbose == True:
        time_start = time.time()
    
    df = pd.read_csv('holiday_df.csv', delimiter=',')
    df = df[['date', 'daytype']].drop_duplicates()
    df_2016_2019 = df[df['date'].str.contains('2019|2018|2017|2016')]
    df_date = df_2016_2019['date'].str.split(' ').apply(lambda x : x[0])
    
    dates = pd.to_datetime(df_date)
    df_2016_2019['date'] = dates.dt.strftime('%m/%d/%Y')
    #assert(len(df_2016_2019) == ((365*3)+366)) # Check there are no missing values
    
    if verbose == True:
        print('time taken to load Data:', time.time() - time_start, 'seconds')
        
    return df_2016_2019

In [129]:
df_holidays = loadHolidaysData()

time taken to load Data: 0.03300118446350098 seconds


In [130]:
df_holidays

Unnamed: 0,date,daytype
0,01/02/2017,U
1,01/03/2017,W
2,01/04/2017,W
3,01/05/2017,W
4,01/06/2017,W
...,...,...
1088,12/26/2019,W
1089,12/27/2019,W
1090,12/28/2019,A
1091,12/29/2019,U


In [122]:
df_holidays.isnull().sum()

date       0
daytype    0
dtype: int64

## Loading Taxi Data

In [84]:
def loadTaxiData(verbose = True):
    if verbose == True:
        time_start = time.time()
        
#     df_taxi = pd.read_csv(path_datasets + 'jan-ap19.csv', delimiter=',')
#     df_taxi_1 = pd.read_csv(path_datasets + 'jan-may17.csv', delimiter=',')
#     df_taxi_2 = pd.read_csv(path_datasets + 'june17-feb18.csv', delimiter=',')
#     df_taxi_3 = pd.read_csv(path_datasets + 'mar-aug18.csv', delimiter=',')
#     df_taxi_4 = pd.read_csv(path_datasets + 'may-nov19.csv', delimiter=',')
#     df_taxi_5 = pd.read_csv(path_datasets + 'sep-dec18.csv', delimiter=',')

#     df_taxi_total = pd.concat([df_taxi, df_taxi_1, df_taxi_2, df_taxi_3, df_taxi_4, df_taxi_5])
    df_taxi_total=pd.read_csv("taxi_clean_df.csv",delimiter=',')
    
    df_date = df_taxi_total['trip_start_timestamp'].str.split(' ').apply(lambda x : x[0])
    
    dates = pd.to_datetime(df_date)
    
    df_taxi_total['date'] = dates.dt.strftime('%m/%d/%Y')
    
    
    if verbose == True:
        print('time taken to load Data:', time.time() - time_start, 'seconds')
    
    return df_taxi_total

In [85]:
df_taxi = loadTaxiData()

time taken to load Data: 70.46820855140686 seconds


In [86]:
df_taxi.head(5)

Unnamed: 0,trip_start_timestamp,unique_key,taxi_id,trip_start_timestamp.1,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,extras,trip_total,payment_type,date
0,2019-01-19 13:30:00,977e465c878cd5337c2215e291ecf091cf2ef4a6,84dab24222c24f99f6c543a5d02e5b47940df071fef3ed...,2019-01-19 13:30:00,2019-01-19 13:30:00,19.0,0.1,14,14,3.5,0.0,0.0,3.5,Cash,01/19/2019
1,2019-01-24 07:15:00,f9eb08f49f09e56188333a95bea1d9d7220a79a9,15ba3f3c77572f9fe6f7cd47e70fa31af10194449cc975...,2019-01-24 07:15:00,2019-01-24 07:30:00,328.0,1.1,14,14,6.0,0.0,0.0,6.0,Cash,01/24/2019
2,2019-02-07 21:45:00,6c40ff49c9e05b30bef35ba2c0c0639fe3506127,b259515344f28837cc7546ff185da59066c4aa0c2ad490...,2019-02-07 21:45:00,2019-02-07 21:45:00,213.0,0.9,14,14,5.5,0.0,0.0,5.5,Cash,02/07/2019
3,2019-02-20 18:00:00,4c0c28580737e54f83246154655e06344e27897c,6551ba527f916e0b8187165d7e61705fc2641c9b3afd69...,2019-02-20 18:00:00,2019-02-20 18:00:00,348.0,0.9,14,14,6.0,0.0,0.0,6.0,Cash,02/20/2019
4,2019-02-13 08:30:00,b01e270fa78b2b711a606dfd41bc7bb10ce2e460,98218edd8c1afe2aa636693c931d34e5bdd76df10e1b6a...,2019-02-13 08:30:00,2019-02-13 08:45:00,435.0,0.8,14,14,6.0,0.0,0.0,6.0,Cash,02/13/2019


In [123]:
df_taxi.isnull().sum()

trip_start_timestamp      0
unique_key                0
taxi_id                   0
trip_start_timestamp.1    0
trip_end_timestamp        0
trip_seconds              0
trip_miles                0
pickup_community_area     0
dropoff_community_area    0
fare                      0
tips                      0
extras                    0
trip_total                0
payment_type              0
date                      0
DateTime                  0
dtype: int64

## Loading Weather Data

In [87]:
from datetime import datetime, timedelta
import time

In [88]:
def createSpaciatedTime():

    def datetime_range(start, end, delta):
        current = start
        while current < end:
            yield current
            current += delta

    dts = [dt.strftime('%Y-%m-%d %H:%M:%S+00:00') for dt in 
          datetime_range(datetime(2017,1,1,1), datetime(2020,1,1,0), 
                         timedelta(minutes = 15))]
    return dts

In [106]:
def buildNewWeather(dts):
    
    df_weather_new = pd.DataFrame(dts, columns = ['DateTime'])
    df_weather_new['DateTime'] = pd.to_datetime(df_weather_new['DateTime'])
    df_weather_new['temperature'] = np.nan
    df_weather_new['relative_humidity'] = np.nan
    df_weather_new['wind_direction'] = np.nan
    df_weather_new['wind_speed'] = np.nan
    df_weather_new['precipitation'] = np.nan
    df_weather_new['sky_level'] = np.nan
    
    return df_weather_new

In [107]:
def fillNewWeather(df_weather_new, df_weather, verbose = True):
    
    if verbose == True:
        print('----------------- Program to fill the weather -----------------')
        print()
        counter = 0
        time_start = time.time()
        
    position = 0

    for i in range(0, len(df_weather_new)):

        datetime = df_weather_new['DateTime'].iloc[i]

        found = False
        while (found == False):
            if position == len(df_weather) - 1: # in case df_weather_new has newer dates than df_weather
                found = True
            elif ((datetime < df_weather['DateTime'].iloc[position + 1]) &  (datetime >= df_weather['DateTime'].iloc[position])):
                df_weather_new['temperature'].iloc[i] = df_weather['temperature'].iloc[position]
                df_weather_new['relative_humidity'].iloc[i] = df_weather['relative_humidity'].iloc[position]
                df_weather_new['wind_direction'].iloc[i] = df_weather['wind_direction'].iloc[position]
                df_weather_new['wind_speed'].iloc[i] = df_weather['wind_speed'].iloc[position]
                df_weather_new['precipitation'].iloc[i] = df_weather['precipitation'].iloc[position]
                df_weather_new['sky_level'].iloc[i] = df_weather['sky_level'].iloc[position]
                found = True
            else:
                position += 1
                
        if verbose == True:
            if i % 5000 == 0:
                print('The program has done', counter *5000, 'iterations')
                print('time taken till now: ', time.time() - time_start, 'seconds')
                print()
                counter +=1
            
    return df_weather_new
    

In [110]:
def loadAndConvertWeather():
    df_weather=pd.read_csv('weather_df.csv', delimiter=',')
    
    del df_weather['station']
    
    # In order to have a column with localization. Needed for merging.
    df_weather['DateTime'] = pd.to_datetime(df_weather['datetime'])
    df_weather = df_weather.set_index('DateTime')
    df_weather = df_weather.tz_localize('UTC')
    df_weather = df_weather.reset_index()
    
    dts = createSpaciatedTime()
    
    [datetime.strptime(x, '%Y-%m-%d %H:%M:%S+00:00') for x in dts]
    
    df_weather_new = buildNewWeather(dts)
    
    df_weather_new = fillNewWeather(df_weather_new, df_weather, verbose = True)
    
    
    return df_weather_new.dropna()

In [111]:
df_weather = loadAndConvertWeather()

----------------- Program to fill the weather -----------------

The program has done 0 iterations
time taken till now:  0.01299595832824707 seconds



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


The program has done 5000 iterations
time taken till now:  6.891345739364624 seconds

The program has done 10000 iterations
time taken till now:  13.733802556991577 seconds

The program has done 15000 iterations
time taken till now:  20.546186685562134 seconds

The program has done 20000 iterations
time taken till now:  27.441372632980347 seconds

The program has done 25000 iterations
time taken till now:  34.265355825424194 seconds

The program has done 30000 iterations
time taken till now:  41.06945013999939 seconds

The program has done 35000 iterations
time taken till now:  47.84544587135315 seconds

The program has done 40000 iterations
time taken till now:  54.589452028274536 seconds

The program has done 45000 iterations
time taken till now:  61.325480461120605 seconds

The program has done 50000 iterations
time taken till now:  68.07650089263916 seconds

The program has done 55000 iterations
time taken till now:  74.81245851516724 seconds

The program has done 60000 iterations


In [124]:
df_weather.isnull().sum()

DateTime             0
temperature          0
relative_humidity    0
wind_direction       0
wind_speed           0
precipitation        0
sky_level            0
dtype: int64

# Merge dataframes

In [115]:
def mergeDataframes(df_taxi, df_weather, df_holidays, verbose = True):
    
    if verbose == True:
        time_start = time.time()
    
    df_taxi['DateTime'] = pd.to_datetime(df_taxi['trip_start_timestamp'], utc = True)
    
    df_taxi_holidays = df_taxi.merge(df_holidays, how = 'left', on = 'date')

    df_taxi_holidays_weather = df_taxi_holidays.merge(df_weather, how = 'left', on = 'DateTime')
    
    if verbose == True:
        print('time taken to merge Data:', time.time() - time_start, 'seconds')
    
    return df_taxi_holidays_weather
    
    
    

In [116]:
df_merged = mergeDataframes(df_taxi,df_weather, df_holidays, verbose = True)

time taken to merge Data: 10.236896753311157 seconds


In [121]:
len(df_merged)

7069319

In [120]:
df_merged.to_csv(r'C:/Users/Iconsense/abhishek/taxi/final_data.csv')