In [62]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler
import dask
import dask.dataframe as dd
from sklearn.metrics import mean_absolute_percentage_error

#### Objective

- We have divided the data into 30 clusters; the goal now is to predict the demand in neighbouring regions within about 1 mile (or within the next 15 minutes).
- Now we need to prepare the dataset so that, for a given region and time interval (of 15 minutes), we have the count of pickups.
- If we just bin the dataset, there will be sudden peaks and dips (a spiky nature) in the data. This spikiness can adversely impact the model's predictions, so we need to smooth the dataset, which helps in capturing the essence/trend of the data. There are two ways to do smoothing - 
    - Moving Average (MA) - if we take a window of 3 days, it averages the current day and the two previous days, giving equal weight to all 3 days. The larger the window size, the smoother the average, and vice versa.
    - EWMA - It does not give equal weight to each data point; it gives more weight to the current observation and less to previous ones, and this can be controlled via the smoothing factor beta (passed as `alpha` to pandas `ewm` in the code below). If beta is high, more weight goes to the current observation.

In [27]:
# raw yellow-taxi trip files, one CSV per month
df_jan_path = "../data/raw/yellow_tripdata_2016-01.csv"
df_feb_path = "../data/raw/yellow_tripdata_2016-02.csv"
df_mar_path = "../data/raw/yellow_tripdata_2016-03.csv"

# only these columns are read from each monthly file
trip_columns = [
    'trip_distance', 'tpep_pickup_datetime', 'pickup_longitude',
    'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'fare_amount',
]

# load the dataframes: every month is read with identical options
df_jan, df_feb, df_mar = (
    dd.read_csv(
        csv_path,
        assume_missing=True,
        usecols=trip_columns,
        parse_dates=["tpep_pickup_datetime"],
    )
    for csv_path in (df_jan_path, df_feb_path, df_mar_path)
)

In [28]:
# stack the three monthly frames row-wise into one dask dataframe
monthly_frames = [df_jan, df_feb, df_mar]
df = dd.concat(monthly_frames, axis=0)

In [None]:
# thresholds for removing outliers and wrong entries, which we detected in Outlier_Removal.ipynb

# bounding box applied to the pickup and dropoff coordinates
min_latitude, max_latitude = 40.60, 40.85
min_longitude, max_longitude = -74.05, -73.70

# accepted fare range
min_fare, max_fare = 0.50, 125

# accepted trip-distance range
min_distance, max_distance = 0.25, 27

In [30]:
## removing outliers

# keep only trips whose pickup AND dropoff coordinates lie inside/on the bounding box
inside_bounding_box = (
    df['pickup_latitude'].between(min_latitude, max_latitude, inclusive='both')
    & df['pickup_longitude'].between(min_longitude, max_longitude, inclusive='both')
    & df['dropoff_latitude'].between(min_latitude, max_latitude, inclusive='both')
    & df['dropoff_longitude'].between(min_longitude, max_longitude, inclusive='both')
)

# keep only trips whose fare and distance fall inside the accepted ranges
plausible_trip = (
    df['fare_amount'].between(min_fare, max_fare, inclusive='both')
    & df['trip_distance'].between(min_distance, max_distance, inclusive='both')
)

df_final = df.loc[inside_bounding_box & plausible_trip, :]

In [None]:
# drop the columns that are not used after this point
columns_to_drop = ['trip_distance', 'dropoff_longitude', 'dropoff_latitude', 'fare_amount']
df_final = df_final.drop(columns=columns_to_drop)

In [None]:
# write the filtered trips to a single interim CSV (no index column)
interim_csv_path = '../data/interim/processing_data.csv'
df_final.to_csv(interim_csv_path, single_file=True, index=False)

['c:\\Users\\aksha\\OneDrive\\Desktop\\Urban Fleet Equilibrium Engine via Dynamic Geo-Clustering\\data\\interim\\processing_data.csv']

In [13]:
def read_csv():
    """Return a chunked reader over the pickup coordinates in the interim CSV.

    The file is opened with ``chunksize=100000``, so iterating over the
    returned object yields DataFrames of up to 100,000 rows that contain only
    the ``pickup_latitude`` and ``pickup_longitude`` columns. Each call opens
    a fresh reader, which makes several passes over the file possible.

    The file is addressed by the relative path
    ``../data/interim/processing_data.csv`` instead of a machine-specific
    absolute path, so the notebook is not tied to one user's directory layout.
    """
    return pd.read_csv(
        '../data/interim/processing_data.csv',
        chunksize=100000,
        usecols=['pickup_latitude', 'pickup_longitude'],
    )

In [None]:
# first pass over the file: update the scaler incrementally, one chunk at a time
scaler = StandardScaler()
for coordinate_chunk in read_csv():
    scaler.partial_fit(coordinate_chunk)

# second pass: update the 30-cluster mini-batch k-means on each scaled chunk
kmeans = MiniBatchKMeans(n_clusters=30, n_init=10, random_state=42)
for coordinate_chunk in read_csv():
    kmeans.partial_fit(scaler.transform(coordinate_chunk))

In [33]:
# take every column after the first one and materialise it as a pandas frame
# NOTE(review): assumes the first column is the pickup timestamp and the rest are the coordinates - confirm
coordinate_columns = list(df_final.columns[1:])
location_df = df_final[coordinate_columns].compute()

In [35]:
# apply the already-fitted scaler to the in-memory location frame
scaled_location_df = scaler.transform(location_df)

In [None]:
# predict the cluster of every data point from its scaled coordinates

cluster_predictions = kmeans.predict(scaled_location_df)

In [None]:
# materialise the pickup timestamps; the cluster predictions are attached
# to them as a 'region' column in a later cell

time_series_data = df_final['tpep_pickup_datetime'].compute()

In [45]:
# turn the timestamp Series into a one-column DataFrame so more columns can be added
time_series_data = time_series_data.to_frame()

In [47]:
# label every pickup with its predicted cluster
# NOTE(review): relies on cluster_predictions being in the same row order as
# time_series_data (both are computed from df_final) - confirm
time_series_data['region'] = cluster_predictions

In [None]:
# persist the (pickup timestamp, region) pairs; the index is not written
time_series_data.to_csv('../data/interim/time_series.csv', index=False)

In [50]:
# index by pickup time so the data can be resampled on time below
time_series_data.set_index('tpep_pickup_datetime', inplace=True)

In [51]:
# one group per region (cluster label)
region_grp = time_series_data.groupby('region')

In [None]:
# count the pickups of every region in 15-minute bins (15 min interval ~ 1-1.5 miles)
resampled_data = (
    region_grp['region']
    .resample('15min')
    .count()
)

resampled_data

region  tpep_pickup_datetime
0       2016-01-01 00:00:00     186
        2016-01-01 00:15:00     496
        2016-01-01 00:30:00     508
        2016-01-01 00:45:00     470
        2016-01-01 01:00:00     489
                               ... 
29      2016-03-31 22:45:00      74
        2016-03-31 23:00:00      67
        2016-03-31 23:15:00      67
        2016-03-31 23:30:00      60
        2016-03-31 23:45:00      70
Name: region, Length: 262080, dtype: int64

In [None]:
# rename the counted series from 'region' to 'total_pickups'

resampled_data.name = 'total_pickups'

In [54]:
# move the first index level (region) into a column; the time interval stays as the index
resampled_data = resampled_data.reset_index(level=0)

In [55]:
# preview the frame after moving 'region' out of the index
resampled_data

Unnamed: 0_level_0,region,total_pickups
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-01-01 00:00:00,0,186
2016-01-01 00:15:00,0,496
2016-01-01 00:30:00,0,508
2016-01-01 00:45:00,0,470
2016-01-01 01:00:00,0,489
...,...,...
2016-03-31 22:45:00,29,74
2016-03-31 23:00:00,29,67
2016-03-31 23:15:00,29,67
2016-03-31 23:30:00,29,60


In [None]:
# there are certain combinations of time interval and region where no pickups happened;
# these zeros are replaced with a value of 10 pickups in the next cell
# this is important as we want continuity in our dataset
# here we count how many such zero entries exist
(resampled_data['total_pickups'] == 0).sum()

np.int64(4489)

In [61]:
epsilon_val = 10

# swap every zero pickup count for the placeholder value; other columns are untouched
resampled_data['total_pickups'] = resampled_data['total_pickups'].replace(0, epsilon_val)

In [63]:
window = list(range(3,11,1))

def calculate_best_window(windows):
    """Print the MAPE of a per-region moving average for each window size.

    ``resampled_data`` stacks the series of all regions one after another, so
    the rolling mean is computed separately inside every ``region``. A window
    therefore never mixes the last intervals of one region with the first
    intervals of the next. Rows that have no complete window inside their own
    region (the leading ``window - 1`` rows of each region) are excluded from
    the score.

    Parameters
    ----------
    windows : iterable of int
        Rolling-window sizes, in number of rows (15-minute bins), to evaluate.

    Returns
    -------
    None
        One line per window size is printed.
    """
    y = resampled_data['total_pickups'].values
    pickups_by_region = resampled_data.groupby('region')['total_pickups']
    for window in windows:
        ypred = pickups_by_region.transform(
            lambda s: s.rolling(window=window).mean()
        )
        # incomplete windows yield NaN; score only rows with a full window
        valid = ypred.notna().values
        error = mean_absolute_percentage_error(y[valid], ypred.values[valid])
        print(f'for window value {window}, the MAPE is {round(error, 2)}')

In [None]:
# in the results printed below, the MAPE rises with the window size: a lower window value leads to a lower loss
calculate_best_window(window)

for window value 3, the MAPE is 0.22
for window value 4, the MAPE is 0.26
for window value 5, the MAPE is 0.3
for window value 6, the MAPE is 0.34
for window value 7, the MAPE is 0.38
for window value 8, the MAPE is 0.42
for window value 9, the MAPE is 0.46
for window value 10, the MAPE is 0.5


In [69]:
smoothing_values = np.arange(0.2,1,0.1)

def calculate_best_smoothing_value(values):
    """Print the MAPE of a per-region EWMA for each smoothing value.

    ``resampled_data`` stacks the series of all regions one after another, so
    the exponentially weighted mean is computed separately inside every
    ``region``. The smoothed value at the start of one region is therefore
    not influenced by the end of the previous region.

    Parameters
    ----------
    values : iterable of float
        Smoothing factors passed as ``alpha`` to ``ewm``.

    Returns
    -------
    None
        One line per smoothing value is printed.
    """
    y = resampled_data['total_pickups'].values
    pickups_by_region = resampled_data.groupby('region')['total_pickups']
    for value in values:
        ypred = pickups_by_region.transform(
            lambda s: s.ewm(alpha=value).mean()
        )
        error = mean_absolute_percentage_error(y, ypred.values)
        print(f'for smoothing value {round(value, 2)}, the MAPE is {round(error, 2)}')

In [None]:
# in the results printed below, the MAPE shrinks as the smoothing value grows, because a
# larger value keeps the smoothed series closer to the raw one; this score alone
# therefore favours the largest value and does not measure how much smoothing is gained

calculate_best_smoothing_value(smoothing_values)

for smoothing value 0.2, the MAPE is 0.44
for smoothing value 0.3, the MAPE is 0.3
for smoothing value 0.4, the MAPE is 0.22
for smoothing value 0.5, the MAPE is 0.17
for smoothing value 0.6, the MAPE is 0.13
for smoothing value 0.7, the MAPE is 0.09
for smoothing value 0.8, the MAPE is 0.06
for smoothing value 0.9, the MAPE is 0.03


In [71]:
# smooth every region's series on its own: the frame stacks all regions one
# after another, so an EWMA over the whole column would blend the last
# intervals of one region into the first intervals of the next
resampled_data["avg_pickups"] = (
    resampled_data
    .groupby('region')['total_pickups']
    .transform(lambda s: s.ewm(alpha=0.4).mean())
    .round()
)

resampled_data

Unnamed: 0_level_0,region,total_pickups,avg_pickups
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-01-01 00:00:00,0,186,186.0
2016-01-01 00:15:00,0,496,380.0
2016-01-01 00:30:00,0,508,445.0
2016-01-01 00:45:00,0,470,457.0
2016-01-01 01:00:00,0,489,471.0
...,...,...,...
2016-03-31 22:45:00,29,74,71.0
2016-03-31 23:00:00,29,67,69.0
2016-03-31 23:15:00,29,67,68.0
2016-03-31 23:30:00,29,60,65.0


In [None]:
# saving the dataset; index=True writes the time-interval index as a column as well

resampled_data.to_csv('../data/interim/final_data.csv', index=True)