In [None]:
# Import all necessary libraries

import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from math import radians, cos, sin, asin, sqrt
from pandas.tseries.holiday import USFederalHolidayCalendar

import seaborn as sns
import plotly.graph_objects as go
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.cluster import MiniBatchKMeans

In [None]:
# Read the train and test datasets
# Both CSV files are downloaded over HTTP each time this cell runs, so network access is required.
train_data = pd.read_csv('https://firebasestorage.googleapis.com/v0/b/test-bbb2d.appspot.com/o/train.csv?alt=media&token=505676d4-dad3-43ac-bc96-643776d68b06')
test_data = pd.read_csv('https://firebasestorage.googleapis.com/v0/b/test-bbb2d.appspot.com/o/test.csv?alt=media&token=4525c442-18d9-4d0a-afba-3b6fabf8db4b')

# INITIAL DATA PRE-PROCESSING 

In [None]:
# Preview the first rows of the training data
train_data.head()

In [None]:
# Summary statistics of the training data
train_data.describe()

In [None]:
# Column dtypes and non-null counts of the training data
train_data.info()

In [None]:
# Show every training row that has a missing value in at least one column
train_data[train_data.isna().any(axis=1)]

In [None]:
# Data type of each training column
train_data.dtypes

In [None]:
# Remove outliers: drop trips whose passenger_count is 0, 7, 8 or 9.
# A boolean filter (instead of an in-place drop) keeps this cell a plain
# reassignment that can be re-run safely.
outlier_passenger_counts = [0, 7, 8, 9]
train_data = train_data[~train_data['passenger_count'].isin(outlier_passenger_counts)]

In [None]:
# Remove outliers: keep only trips whose duration lies within two standard
# deviations of the mean trip duration.
mean = np.mean(train_data['trip_duration'])
print("Mean for Trip Duration is:", mean)

standard_deviation = np.std(train_data['trip_duration'])
print("Standard Deviation for Trip Duration is:", standard_deviation)

# Series.between is inclusive on both ends by default. The boolean form
# `inclusive=True` was deprecated in pandas 1.3 and removed in 2.0, so it is
# simply omitted. The .copy() gives the following cells an independent frame
# to add columns to (avoids SettingWithCopyWarning on the filtered slice).
train_data = train_data[train_data['trip_duration'].between(mean - 2*standard_deviation, mean + 2*standard_deviation)].copy()
train_data.describe()

In [None]:
# Data Formatting
# Encode store_and_fwd_flag as an integer: 'N' -> 0, anything else -> 1.
# The encoding is skipped once the column is already numeric; otherwise a
# second run of this cell would map every encoded 0 to 1 (since 0 != 'N').
for frame in (train_data, test_data):
    if not pd.api.types.is_numeric_dtype(frame['store_and_fwd_flag']):
        frame['store_and_fwd_flag'] = (frame['store_and_fwd_flag'] != 'N').astype(int)

In [None]:
# Feature Extraction
# Decompose the train pickup/dropoff timestamps into date, time and calendar features.
# Statement order is kept as-is because it determines the column order of train_data.

train_data['pickup_datetime'] = pd.to_datetime(train_data.pickup_datetime)
train_data.loc[:, 'pickup_date'] = train_data['pickup_datetime'].dt.date
train_data.loc[:, 'pickup_time'] = train_data['pickup_datetime'].dt.time
train_data['dropoff_datetime'] = pd.to_datetime(train_data.dropoff_datetime)
train_data.loc[:, 'dropoff_date'] = train_data['dropoff_datetime'].dt.date
train_data.loc[:, 'dropoff_time'] = train_data['dropoff_datetime'].dt.time
train_data.loc[:, 'dropoff_hour'] = train_data['dropoff_datetime'].dt.hour
train_data.loc[:, 'dropoff_weekday'] = train_data['dropoff_datetime'].dt.weekday
train_data.loc[:, 'dropoff_month'] = train_data['dropoff_datetime'].dt.month
train_data.loc[:, 'pickup_weekday'] = train_data['pickup_datetime'].dt.weekday
# Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
# dt.isocalendar().week is its replacement. It returns the nullable UInt32
# dtype, so cast to a plain int to keep the feature matrix purely numeric.
train_data.loc[:, 'pickup_weekofyear'] = train_data['pickup_datetime'].dt.isocalendar().week.astype(int)
train_data.loc[:, 'pickup_hour'] = train_data['pickup_datetime'].dt.hour
train_data.loc[:, 'pickup_minute'] = train_data['pickup_datetime'].dt.minute
# Seconds elapsed since the earliest pickup in the train data
train_data.loc[:, 'pickup_dt'] = (train_data['pickup_datetime'] - train_data['pickup_datetime'].min()).dt.total_seconds()
# Hour of the week: 0 (Monday 00h) .. 167 (Sunday 23h)
train_data.loc[:, 'pickup_week_hour'] = train_data['pickup_weekday'] * 24 + train_data['pickup_hour']
train_data.loc[:, 'pickup_dayofyear'] = train_data['pickup_datetime'].dt.dayofyear
train_data.loc[:, 'pickup_month'] = train_data['pickup_datetime'].dt.month

In [None]:
# Feature Extraction
# Decompose the test pickup timestamp into date, time and calendar features.
# Statement order is kept as-is because it determines the column order of test_data.

test_data['pickup_datetime'] = pd.to_datetime(test_data.pickup_datetime)
test_data.loc[:, 'pickup_date'] = test_data['pickup_datetime'].dt.date
test_data.loc[:, 'pickup_time'] = test_data['pickup_datetime'].dt.time
test_data.loc[:, 'pickup_weekday'] = test_data['pickup_datetime'].dt.weekday
# Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
# dt.isocalendar().week is its replacement. It returns the nullable UInt32
# dtype, so cast to a plain int to keep the feature matrix purely numeric.
test_data.loc[:, 'pickup_weekofyear'] = test_data['pickup_datetime'].dt.isocalendar().week.astype(int)
test_data.loc[:, 'pickup_hour'] = test_data['pickup_datetime'].dt.hour
test_data.loc[:, 'pickup_minute'] = test_data['pickup_datetime'].dt.minute
# Measure elapsed seconds from the same origin as the train feature (the
# earliest train pickup, already parsed to datetime in the train cell) so
# that pickup_dt means the same thing in both datasets.
test_data.loc[:, 'pickup_dt'] = (test_data['pickup_datetime'] - train_data['pickup_datetime'].min()).dt.total_seconds()
# Hour of the week: 0 (Monday 00h) .. 167 (Sunday 23h)
test_data.loc[:, 'pickup_week_hour'] = test_data['pickup_weekday'] * 24 + test_data['pickup_hour']
test_data.loc[:, 'pickup_dayofyear'] = test_data['pickup_datetime'].dt.dayofyear
test_data.loc[:, 'pickup_month'] = test_data['pickup_datetime'].dt.month

In [None]:
# Feature Extraction
# Identifying holidays for train data from USFederalHolidayCalendar()

calendar = USFederalHolidayCalendar()
holidays = calendar.holidays()

# Pickup timestamps truncated to midnight; computed once and reused for
# every membership test below instead of re-deriving the dates each time.
train_pickup_day = train_data['pickup_datetime'].dt.normalize()

# 1 when the pickup falls on a federal holiday, else 0
train_data['pickup_holiday'] = train_pickup_day.isin(holidays).astype(int)
# 1 when the pickup falls on the day before or the day after a federal holiday, else 0
train_data['pickup_near_holiday'] = (
    train_pickup_day.isin(holidays + timedelta(days=1))
    | train_pickup_day.isin(holidays - timedelta(days=1))
).astype(int)

In [None]:
# Identifying holidays for test data from USFederalHolidayCalendar()
# (`holidays` is the holiday index built in the train holiday cell.)

# Pickup timestamps truncated to midnight; computed once and reused below.
test_pickup_day = test_data['pickup_datetime'].dt.normalize()

# 1 when the pickup falls on a federal holiday, else 0
test_data['pickup_holiday'] = test_pickup_day.isin(holidays).astype(int)
# 1 when the pickup falls on the day before or the day after a federal holiday, else 0
test_data['pickup_near_holiday'] = (
    test_pickup_day.isin(holidays + timedelta(days=1))
    | test_pickup_day.isin(holidays - timedelta(days=1))
).astype(int)

### COMPUTE DISTANCES (Using Longitudes & Latitudes)

In order of increasing importance/accuracy:
(Haversine Distance = Bearing Distance) < Manhattan Distance < OSRM Dataset Distance < Google Distance API

In [None]:
# Compute Distances using Longitudes & Latitudes 

# 1 Haversine Distance
def haversine_distance(lat1, lng1, lat2, lng2):
    """Great-circle (haversine) distance between two points.

    Coordinates are given in degrees and may be scalars or NumPy arrays
    (all operations are element-wise). The result is expressed in the
    units of the Earth radius used here, 6371 (kilometres).
    """
    earth_radius = 6371
    phi1, lam1, phi2, lam2 = (np.radians(angle) for angle in (lat1, lng1, lat2, lng2))
    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1
    # Haversine term: squared half-chord length between the two points
    half_chord_sq = np.sin(delta_phi * 0.5) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam * 0.5) ** 2
    return 2 * earth_radius * np.arcsin(np.sqrt(half_chord_sq))

# 2 Bearing (direction of travel in degrees; this is an angle, not a distance)
def bearing_direction(lat1, lng1, lat2, lng2):
    """Initial bearing from point 1 to point 2, in degrees.

    Coordinates are given in degrees and may be scalars or NumPy arrays
    (all operations are element-wise). The result comes from arctan2, so it
    lies in [-180, 180]: 0 is due north, 90 due east, -90 due west.
    """
    lng_delta_rad = np.radians(lng2 - lng1)
    # Only the latitudes are needed in radians; the longitudes enter the
    # formula solely through their difference computed above.
    lat1, lat2 = np.radians(lat1), np.radians(lat2)
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))

# 3 Manhattan Distance
def manhattan_distance(lat1, lng1, lat2, lng2):
    """Sum of the two axis-aligned haversine legs between two points.

    The first leg changes only the longitude (at latitude lat1), the second
    changes only the latitude (at longitude lng1).
    """
    longitude_leg = haversine_distance(lat1, lng1, lat1, lng2)
    latitude_leg = haversine_distance(lat1, lng1, lat2, lng1)
    return longitude_leg + latitude_leg


In [None]:
# Direction, distance and trip-midpoint features for the train data.
# Coordinate arrays in the order (pickup lat, pickup lng, dropoff lat, dropoff lng).
train_coords = (
    train_data['pickup_latitude'].values,
    train_data['pickup_longitude'].values,
    train_data['dropoff_latitude'].values,
    train_data['dropoff_longitude'].values,
)
train_data.loc[:, 'direction'] = bearing_direction(*train_coords)
train_data.loc[:, 'distance_haversine'] = haversine_distance(*train_coords)
train_data.loc[:, 'distance_manhattan'] = manhattan_distance(*train_coords)
train_data.loc[:, 'center_latitude'] = (train_coords[0] + train_coords[2]) / 2
train_data.loc[:, 'center_longitude'] = (train_coords[1] + train_coords[3]) / 2

In [None]:
# Direction, distance and trip-midpoint features for the test data.
# Coordinate arrays in the order (pickup lat, pickup lng, dropoff lat, dropoff lng).
test_coords = (
    test_data['pickup_latitude'].values,
    test_data['pickup_longitude'].values,
    test_data['dropoff_latitude'].values,
    test_data['dropoff_longitude'].values,
)
test_data.loc[:, 'direction'] = bearing_direction(*test_coords)
test_data.loc[:, 'distance_haversine'] = haversine_distance(*test_coords)
test_data.loc[:, 'distance_manhattan'] = manhattan_distance(*test_coords)
test_data.loc[:, 'center_latitude'] = (test_coords[0] + test_coords[2]) / 2
test_data.loc[:, 'center_longitude'] = (test_coords[1] + test_coords[3]) / 2

In [None]:
# Load the pre-computed Google distance dataset (downloaded over HTTP) and preview it
google_distance_dataset = pd.read_csv('https://firebasestorage.googleapis.com/v0/b/pizzarush-40b2f.appspot.com/o/train_with_distance848k.csv?alt=media&token=48501761-2b33-4508-9589-4ceb2eabe33d')
google_distance_dataset.head()

**Google Distance API**

The distance between the pickup and drop-off coordinates of every record is calculated here using the Google Maps Distance Matrix API.
The code below is commented out so that it is not run again: the *free tier API key* has request restrictions, and the distances for all of the 1.5 million datapoints have already been scraped.

In [None]:
# # 4 Google Distance API

# import requests

# MAPS_KEY = '<YOUR_GOOGLE_MAPS_API_KEY>'  # redacted: never commit a real API key; revoke the key that was published here

# pickup_longitude = train_data['pickup_longitude'][0]
# pickup_latitude = train_data['pickup_latitude'][0]

# dropoff_longitude = train_data['dropoff_longitude'][0]
# dropoff_latitude = train_data['dropoff_latitude'][0]

# origin_str = f'{pickup_latitude},{pickup_longitude}'
# destination_str = f'{dropoff_latitude},{dropoff_longitude}'

# url ='https://maps.googleapis.com/maps/api/distancematrix/json?'

# response=requests.get(url + 'origins=' + origin_str +
#                    '&destinations=' + destination_str +
#                    '&key=' + MAPS_KEY)

# print(response.json())

In [None]:
# # Make API calls in batches of 1000 train instance records 

# start = 1 
# end = 1000

# for i in range(start, end+1):        
#     pickup_longitude = df['pickup_longitude'][i]
#     pickup_latitude = df['pickup_latitude'][i]

#     dropoff_longitude = df['dropoff_longitude'][i]
#     dropoff_latitude = df['dropoff_latitude'][i]

#     origin_str = f'{pickup_latitude},{pickup_longitude}'
#     destination_str = f'{dropoff_latitude},{dropoff_longitude}'

#     url ='https://maps.googleapis.com/maps/api/distancematrix/json?'
    
#     try:
#         response=requests.get(url + 'origins=' + origin_str +
#                        '&destinations=' + destination_str +
#                        '&key=' + MAPS_KEY)
        
#         distance = response.json()['rows'][0]['elements'][0]['distance']['value']
#     except:
#         distance = None
    
#     print(f'distance... {i} = {distance}')
#     df['google_distance'][i] = distance

In [None]:
# Keep the 'google_distance' column of (at most) the first 848000 rows and print it
google_distance = google_distance_dataset[:848000]['google_distance']
print(google_distance)

In [None]:
# Plot the bearing and the three distance measures against row position.
# The number of plotted points is derived from the data actually available,
# so x and every series always have the same length.
# NOTE(review): the series are on different scales (bearing is in degrees,
# haversine/manhattan use an Earth radius of 6371, google_distance presumably
# is in metres), and the rows of train_data have been filtered, so row i here
# is not guaranteed to be the same trip as row i of the Google dataset - confirm.
n_points = min(848000, len(train_data), len(google_distance))
x = np.arange(n_points)

fig, ax = plt.subplots()
ax.plot(x, train_data['direction'].values[:n_points], label='direction')
ax.plot(x, train_data['distance_haversine'].values[:n_points], label='distance_haversine')
ax.plot(x, train_data['distance_manhattan'].values[:n_points], label='distance_manhattan')
ax.plot(x, google_distance.values[:n_points], label='google_distance')
ax.set_xlabel('Row position')
ax.legend()
plt.show()

In [None]:
# Line plot of the direction and distance columns against the row index
train_data[['direction', 'distance_haversine', 'distance_manhattan']].plot()

In [None]:
# Preview the training data again, now including the engineered columns
train_data.head()

# UNI-VARIATE ANALYSIS

In [None]:
# Finding number of trips with 0 travelled distance which come out to be 5894.
# count() reports, for each column, how many non-null values those zero-distance rows contain.

train_data[(train_data['distance_haversine']==0)].count()

In [None]:
# Numeric Variables: Visualising using Box Plot, Distribution Plot, Violin Plot
# pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, trip_duration

numeric_data = train_data[[
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'trip_duration',
]]

colors = ["r", "g", "b", "m", "c"]
# One row of three plots per numeric variable
f, axes = plt.subplots(len(numeric_data.columns), 3, figsize=(18, 30))

for row, (var, color) in enumerate(zip(numeric_data.columns, colors)):
    # The data is passed by keyword: recent seaborn releases no longer accept
    # the plotted vector as a bare positional argument.
    sns.boxplot(x=numeric_data[var], orient="h", color=color, ax=axes[row, 0])
    # sns.distplot is deprecated; histplot with a KDE overlay on a density
    # scale is its documented replacement.
    sns.histplot(numeric_data[var], kde=True, stat="density", color=color, ax=axes[row, 1])
    sns.violinplot(x=numeric_data[var], color=color, ax=axes[row, 2])

In [None]:
# Categorical Variables: Visualising using Violin Plots
# vendor_id, passenger_count, store_and_fwd_flag

categorical_columns = ['vendor_id', 'passenger_count', 'store_and_fwd_flag']
f, axes = plt.subplots(3, 1, figsize=(16,8))

# One violin plot of trip_duration per categorical variable, stacked vertically
for axis, column in zip(axes, categorical_columns):
    sns.violinplot(x = column, y = 'trip_duration', data = train_data, ax = axis)

In [None]:
# Store and Forward Flag Distribution
# sns.distplot is deprecated; histplot with discrete=True draws one bar per
# distinct value, which suits this 0/1 flag.

sns.histplot(train_data['store_and_fwd_flag'], discrete=True)
plt.title('Distribution of Store and Forward flag Count')
plt.show()

train_data['store_and_fwd_flag'].value_counts()

In [None]:
# Passenger Count Distribution
# sns.distplot is deprecated; histplot with discrete=True draws one bar per
# passenger count.

sns.histplot(train_data['passenger_count'], discrete=True)
plt.title('Distribution of Passenger Count')
plt.show()

In [None]:
# Count of occurrences of each pickup day, pickup hour and pickup month

fig, sub = plt.subplots(1,3,figsize=(25,6))

for feat, subplot in zip(["pickup_weekday","pickup_hour", "pickup_month"], sub.flatten()):
    # Count once and reuse for both axes of the bar plot
    counts = train_data[feat].value_counts()
    sns.barplot(x=counts.index, y=counts.values, ax=subplot, palette="CMRmap")
    subplot.grid()
    subplot.set_title("Train set {}".format(feat))

fig.tight_layout()

In [None]:
# Count of occurrences of each dropoff weekday and dropoff hour
# dropoff month has been excluded since the pickup month and dropoff month would be in the same month

fig, sub = plt.subplots(1,2,figsize=(25,6))

for feat, subplot in zip(["dropoff_weekday","dropoff_hour"], sub.flatten()):
    # Count once and reuse for both axes of the bar plot
    counts = train_data[feat].value_counts()
    sns.barplot(x=counts.index, y=counts.values, ax=subplot, palette="CMRmap")
    subplot.grid()
    subplot.set_title("Train set {}".format(feat))

fig.tight_layout()

In [None]:
# Histogram of trip duration (200 bins): number of trips per duration bucket

graph = sns.histplot(train_data["trip_duration"], bins = 200)
graph.set(xlabel='Trip duration in seconds', ylabel='Trip Count')

In [None]:
# Histogram of the log-transformed trip duration.
# The new column holds log(trip_duration + 1), so the x-axis values are on a
# logarithmic scale rather than raw seconds.

train_data['trip_duration_normalised'] = np.log(train_data['trip_duration'].values + 1)
graph = sns.histplot(train_data["trip_duration_normalised"], bins = 200)
graph.set(xlabel='Trip duration in seconds (Normalised)', ylabel='Trip Count')

In [None]:
# Visualise Trips Count by Pick Up date

# Count only the 'id' column per pickup date; counting every column of the
# frame just to keep one of them is wasted work on a dataset this size.
pickup_date_count = (
    train_data.groupby('pickup_date')['id']
    .count()
    .rename_axis('Pickup Dates')
    .reset_index()
)

fig = go.Figure()
fig.add_trace(go.Scatter(x=pickup_date_count['Pickup Dates'], y=pickup_date_count['id'],
                    mode='lines+markers',
                    name='lines+markers'))

fig.update_xaxes(title_text="Pickup Months")
fig.update_yaxes(title_text="Count of Trips")

# BI-VARIATE ANALYSIS

In [None]:
# Heat Map to visualise Feature Correlation

# Restrict to numeric columns explicitly: from pandas 2.0 onwards
# DataFrame.corr() raises on non-numeric columns (dates, times, ids) instead
# of silently dropping them.
numeric_corr = train_data.select_dtypes(include='number').corr()

# 30 x 20 inches is the size the figure previously ended up with after
# its width was overridden.
f, axes = plt.subplots(1, 1, figsize=(30, 20))
sns.heatmap(numeric_corr, vmin = -1, vmax = 1, linewidths = 1,
           annot = True, fmt = ".2f", annot_kws = {"size": 18}, cmap = "RdBu", ax = axes)

In [None]:
# Average Trip Duration VS Store-and-Forward Flag and Average Trip Duration VS Vendor ID

fig, ax = plt.subplots(ncols=2, figsize=(12,4))

# Average Trip Duration VS Store-and-Forward Flag
# x/y are passed by keyword (seaborn >= 0.12 rejects them positionally) and
# each label is set on its own Axes; plt.ylabel would only touch the
# current (last created) Axes.
store_and_fwd_flag_df = train_data.groupby('store_and_fwd_flag')['trip_duration'].mean()
sns.barplot(x=store_and_fwd_flag_df.index, y=store_and_fwd_flag_df.values, ax=ax[0])
ax[0].set_ylabel('Time in Seconds')

# Average Trip Duration VS Vendor ID

vendor_id_df = train_data.groupby('vendor_id')['trip_duration'].mean()
sns.barplot(x=vendor_id_df.index, y=vendor_id_df.values, ax=ax[1])
ax[1].set_ylabel('Time in Seconds')


In [None]:
# Passenger Count VS Trip Duration time in seconds (mean per passenger count)

passenger_count_df = train_data.groupby('passenger_count')['trip_duration'].mean()
# x/y are passed by keyword: seaborn >= 0.12 rejects them positionally.
sns.barplot(x=passenger_count_df.index, y=passenger_count_df.values)
# Label after plotting so the plot call cannot overwrite it
plt.ylabel('Time in Seconds')

In [None]:
# Passenger Count VS Trip Distance (mean Manhattan distance per passenger count)

passenger_count_distance_df = train_data.groupby('passenger_count')['distance_manhattan'].mean()
# x/y are passed by keyword: seaborn >= 0.12 rejects them positionally.
sns.barplot(x=passenger_count_distance_df.index, y=passenger_count_distance_df.values)
# Label after plotting so the plot call cannot overwrite it
plt.ylabel('Trip Distance in KM')

In [None]:
# Relationship b/w Vendor ID and Trip Duration
# Strip plot: one point per trip, grouped by vendor_id

sns.catplot(x="vendor_id", y="trip_duration",kind="strip",data=train_data)

In [None]:
# Relationship b/w Trip Duration and Time of Day
# A bar-kind catplot draws the mean trip_duration for each pickup hour, so
# the title describes an average, not a distribution of pickup hours.

sns.catplot(x="pickup_hour", y="trip_duration",kind="bar",data=train_data)
plt.title('The Average Trip Duration per PickUp Hour of the Day')
plt.show()

In [None]:
# Relationship b/w Trip Duration and Day of Week
# Bar-kind catplot: one bar per pickup weekday summarising trip_duration

sns.catplot(x="pickup_weekday",y="trip_duration",kind="bar",data=train_data,height=6,aspect=1)
plt.title('The Average Trip Duration per PickUp Day of the Week')

In [None]:
# Visualise PickUp Locations Density by plotting corresponding Longitudes/Latitudes
# Only the first 625134 rows are plotted.
# NOTE(review): the origin of the 625134 cut-off is not explained anywhere in this notebook - confirm.

fig, ax = plt.subplots(ncols=1, sharex=True, sharey=True)
ax.scatter(train_data['pickup_longitude'].values[:625134], train_data['pickup_latitude'].values[:625134],
              color='red', s=1, label='train', alpha=0.5)

ax.title.set_text('Train coordinates')
ax.set_xlabel('longitude')
ax.set_ylabel('latitude')
# Restrict the view to a fixed longitude/latitude window
plt.xlim(-74.05, -73.76)
plt.ylim(40.60, 40.90)
plt.show()

In [None]:
# Visualise main Neighbourhoods of New York City
# Cluster all pickup and dropoff coordinates into 20 groups and plot a sample.

from sklearn.cluster import KMeans

# Stack pickup and dropoff coordinates with NumPy instead of concatenating
# Python lists built from multi-million-row Series.
coordinates_df = pd.DataFrame({
    'all_longitude': np.concatenate([train_data['pickup_longitude'].values, train_data['dropoff_longitude'].values]),
    'all_latitude': np.concatenate([train_data['pickup_latitude'].values, train_data['dropoff_latitude'].values]),
})

kmeans = KMeans(n_clusters=20, random_state=2, n_init = 10).fit(coordinates_df)
coordinates_df['kmeans_label'] = kmeans.labels_

# Seed the sample so the figure is the same on every run
coordinates_df = coordinates_df.sample(200000, random_state=2)
plt.figure(figsize = (10,10))
# One plot call (and therefore one colour) per cluster
for label, cluster_points in coordinates_df.groupby('kmeans_label'):
    plt.plot(cluster_points.all_longitude, cluster_points.all_latitude, '.', alpha = 0.5, markersize = 0.5)

plt.title('Neighborhoods of New York City')
plt.xlim(-74.05, -73.76)
plt.ylim(40.60, 40.90)
plt.show()

In [None]:
# Use folium to visualize pickup and dropoff points in New York

import folium
f = folium.Figure(width=1500, height=500)
mapa = folium.Map(location = (40.7679, -73.9822), zoom_start=11).add_to(f)

# (latitude column, longitude column, marker colour): pickups are blue, dropoffs red
marker_specs = (
    ("pickup_latitude", "pickup_longitude", "blue"),
    ("dropoff_latitude", "dropoff_longitude", "red"),
)

# Two markers for each of 1000 randomly sampled trips
for index, row in train_data.sample(1000).iterrows():
    for lat_column, lng_column, colour in marker_specs:
        folium.Marker([row[lat_column], row[lng_column]], icon=folium.Icon(color=colour)).add_to(mapa)


display(mapa)

## Additional Dataset: OSRM

In [None]:
# OSRM Dataset
# The same four columns are read from each of the three fastest-route files
# (two train parts and one test file).
osrm_columns = ['id', 'total_distance', 'total_travel_time', 'number_of_steps']

fr1 = pd.read_csv(
    'https://firebasestorage.googleapis.com/v0/b/craftrip-594f5.appspot.com/o/fastest_routes_train_part_1.csv?alt=media&token=9b1832f8-2654-4835-9621-265e4f1f4c58',
    usecols=osrm_columns,
)
fr2 = pd.read_csv(
    'https://firebasestorage.googleapis.com/v0/b/craftrip-594f5.appspot.com/o/fastest_routes_train_part_2.csv?alt=media&token=26cc43e2-a68f-447c-9c9b-80255bb620d1',
    usecols=osrm_columns,
)
test_street_info = pd.read_csv(
    'https://firebasestorage.googleapis.com/v0/b/craftrip-594f5.appspot.com/o/fastest_routes_test.csv?alt=media&token=dd7ff2fe-ea90-46d9-8189-e0defbdf8955',
    usecols=osrm_columns,
)

In [None]:
# Preview the first part of the OSRM train routes
fr1.head()

In [None]:
# Summary statistics for the second part of the OSRM train routes
fr2.describe()

In [None]:
# Preview the OSRM routes for the test data
test_street_info.head()

OSRM: BIVARIATE ANALYSIS

In [None]:
# Concatenate both the OSRM train datasets into a single frame

train_street_info = pd.concat((fr1, fr2))
# Numeric OSRM columns, kept for the (currently disabled) pair plot below.
# Note that this rebinds the name numeric_data used by an earlier plotting cell.
numeric_data = train_street_info[['total_distance', 'total_travel_time', 'number_of_steps']]
# sns.pairplot(data = numeric_data)

In [None]:
# Merge the train and test dataset with the OSRM Dataset
# Left joins on 'id' keep every trip, including those without an OSRM route.
# Re-running this cell merges again and produces suffixed duplicate columns,
# so run it only once per kernel session.
train_data = train_data.merge(train_street_info, how='left', on='id')
test_data = test_data.merge(test_street_info, how='left', on='id')

## Additional Dataset: Weather Dataset

In [None]:
# Read in the weather dataset (downloaded over HTTP) and preview it
weather_data = pd.read_csv('https://firebasestorage.googleapis.com/v0/b/pizzarush-40b2f.appspot.com/o/weather_data_nyc_centralpark_2016(1).csv?alt=media&token=a6fb7ce8-b4bc-4bc5-ac07-132a078bea85')
weather_data.head()

In [None]:
# Summary statistics of the weather data
weather_data.describe()

In [None]:
# Data type of each weather column
weather_data.dtypes

WEATHER DATASET: FORMAT DATA

In [None]:
# Convert the data values to float which could be plotted and convert the string values containing 'T' for 'Trace' to 0.00

import matplotlib.pyplot as plt
%matplotlib inline

weather_data.loc[weather_data['precipitation']== 'T', 'precipitation'] = '0.00'
weather_data['precipitation'] = (weather_data['precipitation']).astype(float)

weather_data.loc[weather_data['snow fall']== 'T', 'snow fall'] = '0.00'
weather_data['snow fall'] = (weather_data['snow fall']).astype(float)

weather_data.loc[weather_data['snow depth']== 'T', 'snow depth'] = '0.00'
weather_data['snow depth'] = (weather_data['snow depth']).astype(float)

In [None]:
# Parse the weather date column and derive the day of the year from it
weather_data['date'] = pd.to_datetime(weather_data.date)
weather_data['weather_dayofyear']= weather_data.date.dt.dayofyear

WEATHER DATASET: UNIVARIATE ANALYSIS

In [None]:
# Plot min (yellow), avg (orange), and max (red) temperatures over time

import plotly.express as px
import plotly.graph_objects as go

# plotly.graph_objects
colors = ['yellow', 'orange', 'red']
temperature_columns = ['minimum temperature', 'average temperature', 'maximum temperature']

fig = go.Figure()
# One line trace per temperature column, each paired with its colour
for temperature_column, line_color in zip(temperature_columns, colors):
    fig.add_traces(go.Scatter(x=weather_data['date'], y=weather_data[temperature_column],
                              mode='lines', line=dict(color=line_color)))
fig.show()

In [None]:
# Plot precipitation, snow fall, and snow depth

import plotly as py
import plotly.graph_objs as go
import plotly

x_data = weather_data['date'].values

# One marker trace per measurement, named after its column
data = [
    go.Scatter(x = x_data, y = weather_data[measurement], mode = 'markers', name = measurement)
    for measurement in ('precipitation', 'snow fall', 'snow depth')
]
plotly.offline.iplot(data, filename='scatter-mode')

In [None]:
# Merge train dataset with the weather dataset through date column
# The join key is the pickup timestamp truncated to midnight.
train_data['date'] = train_data['pickup_datetime'].dt.normalize()
weather_columns = ['date','minimum temperature', 'precipitation', 'snow fall', 'snow depth']
weather = weather_data[weather_columns]
train_data = train_data.merge(weather, how='left', on='date')

In [None]:
# Merge test dataset with the weather dataset through date column
# The join key is the pickup timestamp truncated to midnight; `weather` is
# the column subset selected in the train merge cell.
test_data['date'] = test_data['pickup_datetime'].dt.normalize()
test_data = test_data.merge(weather, how='left', on='date')

# FEATURE SELECTION

In [None]:
# PCA on the pickup and dropoff coordinates.
# PCA() is created without n_components, so both components are kept: the
# (latitude, longitude) pairs are re-expressed along their principal axes
# rather than reduced in dimension.

from sklearn.decomposition import PCA

# Stack every (latitude, longitude) pair from train and test, pickups and dropoffs
coords = np.vstack((train_data[['pickup_latitude', 'pickup_longitude']].values,
                    train_data[['dropoff_latitude', 'dropoff_longitude']].values,
                    test_data[['pickup_latitude', 'pickup_longitude']].values,
                    test_data[['dropoff_latitude', 'dropoff_longitude']].values))

pca = PCA().fit(coords) # fit the PCA model on all stacked coordinate pairs

In [None]:
# Project pickup and dropoff coordinates onto the principal axes fitted above.
# Each (latitude, longitude) pair is transformed once and both components are
# read from that result, rather than running one transform per component.
# Plain arrays are passed because the PCA was fitted on an array. pca0/pca1
# are the first/second principal components, not latitude/longitude.
for frame in (train_data, test_data):
    pickup_pca = pca.transform(frame[['pickup_latitude', 'pickup_longitude']].values)
    dropoff_pca = pca.transform(frame[['dropoff_latitude', 'dropoff_longitude']].values)

    frame['pickup_pca0'] = pickup_pca[:, 0]
    frame['pickup_pca1'] = pickup_pca[:, 1]
    frame['dropoff_pca0'] = dropoff_pca[:, 0]
    frame['dropoff_pca1'] = dropoff_pca[:, 1]

    # Manhattan (L1) distance between pickup and dropoff in PCA coordinates
    frame['pca_manhattan'] = np.abs(frame['dropoff_pca1'] - frame['pickup_pca1']) + np.abs(frame['dropoff_pca0'] - frame['pickup_pca0'])

In [None]:
# Visualising a PCA feature against a raw coordinate for the train data:
# pickup_longitude (default colour) and pickup_pca1 (red) are drawn on the same axes.
# pickup_pca1 is the second principal component of the (latitude, longitude)
# pair, not the longitude itself.

ax = plt.gca()

train_data.reset_index().plot(kind='line', x='index', y='pickup_longitude',ax=ax)
train_data.reset_index().plot(kind='line', x='index', y='pickup_pca1', color='red', ax=ax)

plt.show()

# CLUSTERING

In [None]:
# Fit the location clustering on a random sample of up to 500,000 coordinate
# pairs. Both the sampler and the estimator are seeded so that the cluster
# ids (used as model features) are the same on every run.
cluster_rng = np.random.RandomState(42)
sample_ind = cluster_rng.permutation(len(coords))[:500000]
kmeans = MiniBatchKMeans(n_clusters=100, batch_size=10000, random_state=42).fit(coords[sample_ind])

In [None]:
# Assign every pickup and dropoff location to its nearest cluster.
# The model was fitted on a plain array, so plain arrays are passed here as well.
train_data.loc[:, 'pickup_cluster'] = kmeans.predict(train_data[['pickup_latitude', 'pickup_longitude']].values)
train_data.loc[:, 'dropoff_cluster'] = kmeans.predict(train_data[['dropoff_latitude', 'dropoff_longitude']].values)
test_data.loc[:, 'pickup_cluster'] = kmeans.predict(test_data[['pickup_latitude', 'pickup_longitude']].values)
test_data.loc[:, 'dropoff_cluster'] = kmeans.predict(test_data[['dropoff_latitude', 'dropoff_longitude']].values)

# MODEL TRAINING

In [None]:
# Listing out the features that should be used for training the XGB Model.

# Excluded columns: identifiers, the target (raw and log-transformed),
# everything derived from the dropoff time, and raw date/time columns.
features_not_used = ['id', 'trip_duration_normalised', 'trip_duration', 'dropoff_datetime','dropoff_date','dropoff_hour',
                           'dropoff_month','dropoff_time','dropoff_weekday', 'pickup_date', 'pickup_datetime', 'date','pickup_time','pickup_month']
feature_names = [f for f in train_data.columns if f not in features_not_used]

# The same columns are later selected from test_data, so fail early and
# clearly if any of them is missing there.
missing_in_test = [f for f in feature_names if f not in test_data.columns]
assert not missing_in_test, "Features missing from test_data: {}".format(missing_in_test)

train_data[feature_names].count()

In [None]:
# Build the feature matrix / log-target and hold out one validation fold

X = train_data[feature_names].values
# The model is trained on log(trip_duration + 1)
y = np.log(train_data['trip_duration'].values + 1)
kf = KFold(n_splits=10)

print(kf)

# Only a single train/validation split is used by the following cells.
# Exhausting the generator leaves train_index/test_index bound to the last
# of the 10 unshuffled folds - the same split the earlier fold loop ended on,
# without printing every fold's index arrays.
for train_index, test_index in kf.split(X):
    pass

X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

In [None]:
# Setting the XGB Parameters
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_test, label=y_test)
dtest = xgb.DMatrix(test_data[feature_names].values)
# Evaluation sets reported during training
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

# 'verbosity': 0 replaces the deprecated 'silent': 1, and 'reg:squarederror'
# replaces the deprecated 'reg:linear' objective alias.
xgb_parameters = {'min_child_weight': 10, 'eta': 0.04, 'colsample_bytree': 0.8, 'max_depth': 15,
            'subsample': 0.75, 'lambda': 2, 'nthread': -1, 'booster' : 'gbtree', 'verbosity': 0, 'gamma' : 0,
            'eval_metric': 'rmse', 'objective': 'reg:squarederror'}

In [None]:
# Training the model
# Up to 750 boosting rounds, evaluated on the watchlist sets, with early
# stopping after 250 rounds without improvement and a log line every 15 rounds.

model = xgb.train(xgb_parameters, dtrain, 750, watchlist, early_stopping_rounds=250, maximize=False, verbose_eval=15)

# MODEL PREDICTION 

In [None]:
# Predict on the test data and on the validation fold.
# NOTE: y_test is rebound here - from this point on it holds the predictions
# for the test data (dtest), no longer the validation labels of the split.
# y_pred holds the predictions for the validation fold (dvalid).

y_test = model.predict(dtest)
y_pred = model.predict(dvalid)

In [None]:
# Check the number of rows of test data to be matching with predicted data.
if test_data.shape[0] == y_test.shape[0]:
    print('Number of rows for test data and predicted data are same')
else:
    print('Error in Test Data Format')

# Convert predicted dataframe into a csv file for submission
# exp(prediction) - 1 is applied to the model output before it is written out.
test_data['trip_duration'] = np.exp(y_test) - 1
test_data[['id', 'trip_duration']].to_csv('final_submission.csv', index=False)

# Plot validation and test prediction mean (raw model outputs)
print('Valid prediction mean: %.3f' % y_pred.mean())
print('Test prediction mean: %.3f' % y_test.mean())
fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement.
sns.histplot(y_pred, kde=True, stat='density', ax=ax[0], color='red', label='validation prediction')
sns.histplot(y_test, kde=True, stat='density', ax=ax[1], color='blue', label='test prediction')
ax[0].legend(loc=0)
ax[1].legend(loc=0)
plt.show()