In [1]:
import os

import seaborn as sns
import zipfile
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from geopy import distance
from datetime import datetime
%matplotlib inline

import folium
from folium.plugins import MarkerCluster


In [2]:
# Load the train and test sets directly from their zip archives.
raw_data_train = pd.read_csv(
    'data/nyc-taxi-trip-duration/train.zip',
    compression='zip',
    header=0,
    sep=',',
    quotechar='"',
)
raw_data_test = pd.read_csv(
    'data/nyc-taxi-trip-duration/test.zip',
    compression='zip',
    header=0,
    sep=',',
    quotechar='"',
)

In [3]:
# First glimpse into the data: show the first rows of the training set.
raw_data_train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [4]:
def _create_summary_table(raw_data_train):
    summary_table = pd.DataFrame(index =raw_data_train.columns)
    summary_table['types'] = raw_data_train.dtypes
    summary_table['unique_values'] = raw_data_train.apply(lambda col: len(col.unique()))
    summary_table['pct_unique_value']= summary_table['unique_values'] / raw_data_train.shape[0]
    summary_table['nan_values'] = raw_data_train.apply(lambda col: col.isna().sum())
    stats=raw_data_train.describe(include='all',datetime_is_numeric=True).T
    summary_table['min']=stats['min']
    summary_table['max']=stats['max']
    summary_table['mean']=stats['mean']
    summary_table['std']=stats['std']
    return summary_table


In [5]:
# Per-column overview (dtype, cardinality, missing values, basic stats) of the raw training set.
_create_summary_table(raw_data_train)

Unnamed: 0,types,unique_values,pct_unique_value,nan_values,min,max,mean,std
id,object,1458644,1.0,0,,,,
vendor_id,int64,2,1e-06,0,1.0,2.0,1.53495,0.498777
pickup_datetime,object,1380222,0.946236,0,,,,
dropoff_datetime,object,1380377,0.946343,0,,,,
passenger_count,int64,10,7e-06,0,0.0,9.0,1.66453,1.314242
pickup_longitude,float64,23047,0.0158,0,-121.933342,-61.335529,-73.973486,0.070902
pickup_latitude,float64,45245,0.031019,0,34.359695,51.881084,40.750921,0.032881
dropoff_longitude,float64,33821,0.023187,0,-121.933304,-61.335529,-73.973416,0.070643
dropoff_latitude,float64,62519,0.042861,0,32.181141,43.921028,40.7518,0.035891
store_and_fwd_flag,object,2,1e-06,0,,,,


In [None]:
# Same per-column overview for the raw test set.
_create_summary_table(raw_data_test)

### What do we see here?
- id -> there are no NaN rows
- vendor_id -> seems to be fairly evenly distributed
- pickup_datetime/dropoff_datetime -> some days/hours/minutes have higher traffic; the data spans 5 months
- passenger_count -> I would expect the minimum to be 1, and fitting 9 people into a taxi seems challenging
- latitude / longitude -> few unique values, so maybe we can bin them
- trip_duration (seconds) -> a 1-second trip? A 3,526,282-second trip?


In [None]:
# Work on copies so the raw frames loaded from disk stay untouched.
train = raw_data_train.copy()
test = raw_data_test.copy()

In [None]:
def get_distance(row):
    """Return the geodesic pickup-to-dropoff distance of one trip, in kilometres.

    `row` must expose `pickup_latitude`, `pickup_longitude`,
    `dropoff_latitude` and `dropoff_longitude` as attributes.
    """
    start_point = (row.pickup_latitude, row.pickup_longitude)
    end_point = (row.dropoff_latitude, row.dropoff_longitude)
    return distance.geodesic(start_point, end_point).km

# Row-wise apply: get_distance is called once per trip (axis=1 passes each row).
train['distance'] = train.apply(get_distance, axis=1)

In [None]:
# Cleaning: blank out trips whose passenger count is implausible (0, 7, 8 or 9).
# To inspect the distribution, run train['passenger_count'].value_counts() as
# the last line of its own cell; mid-cell its result was silently discarded.
non_passenger = [0, 7, 8, 9]
mask_passenger = train['passenger_count'].isin(non_passenger)
# DataFrame.mask sets every column of the flagged rows to NaN; the rows
# themselves stay in the frame.
train.mask(mask_passenger, inplace=True)

In [None]:
# Blank out trips that lasted longer than three hours (expressed in seconds).
three_hour = 3 * 60 * 60
mask_hour = train['trip_duration'].gt(three_hour)
train.mask(mask_hour, inplace=True)

In [None]:
# Blank out trips whose pickup and dropoff are the same point (distance of 0).
# The threshold has its own name on purpose: rebinding `distance` here would
# shadow the geopy `distance` module that get_distance() relies on.
zero_distance = 0
mask_distance = train['distance'] == zero_distance
train.mask(mask_distance, inplace=True)

In [None]:
# Distribution of the remaining trip durations, in 100 bins.
plt.hist(train['trip_duration'],bins=100)

In [None]:
# Trip duration divided by 3600, i.e. converted from seconds to hours.
train['trip_duration']/(60*60)

In [None]:
# Average speed: distance (km) divided by the trip duration converted to hours.
train['speed_kmh']= train['distance']/(train['trip_duration']/(60*60))

In [None]:
# Blank out trips whose average speed exceeds the chosen limit (km/h).
speed_limit = 90
mask_speed = train['speed_kmh'].gt(speed_limit)
train.mask(mask_speed, inplace=True)

In [None]:
# Distribution of the remaining average speeds, in 5 bins.
plt.hist(train['speed_kmh'],bins=5)

In [None]:
# Bounding box used to flag coordinates that fall outside the area of interest.
lat_range = [40.496748, 40.913584]     # [min latitude, max latitude]
# The upper bound was written "-73,733869": the comma split it into two list
# items, so long_range[1] silently became -73 instead of -73.733869.
long_range = [-74.258259, -73.733869]  # [min longitude, max longitude]

In [None]:
# Build the mask before displaying it: on a fresh kernel the name did not
# exist yet at this point, so the bare expression raised NameError.
mask_location_long = (
    (train['pickup_longitude'] < long_range[0])
    | (train['pickup_longitude'] > long_range[1])
)
mask_location_long

In [None]:
# Select the trips whose pickup longitude lies outside long_range.
mask_location_long = (
    train['pickup_longitude'].lt(long_range[0])
    | train['pickup_longitude'].gt(long_range[1])
)
temp = train.loc[mask_location_long]


In [None]:
# Put one marker per out-of-range pickup on an interactive map.
my_map = folium.Map(location=[41, -74.00], zoom_start=4)
# Drop the blanked (NaN) rows once; the original recomputed temp.dropna()
# several times on every loop iteration.
outlier_rows = temp.dropna()
for lat, lon, trip_id in zip(
    outlier_rows['pickup_latitude'],
    outlier_rows['pickup_longitude'],
    outlier_rows['id'],
):
    folium.Marker(location=[lat, lon], popup=trip_id).add_to(my_map)
my_map

In [None]:
# Select the trips whose dropoff longitude lies outside long_range.
mask_location_long = (
    (train['dropoff_longitude'] < long_range[0])
    | (train['dropoff_longitude'] > long_range[1])
)
temp = train[mask_location_long]

# Put one marker per out-of-range dropoff on an interactive map.
new_map = folium.Map(location=[41, -74.00], zoom_start=4)
# Drop the blanked (NaN) rows once; the original recomputed temp.dropna()
# several times on every loop iteration.
outlier_rows = temp.dropna()
for lat, lon, trip_id in zip(
    outlier_rows['dropoff_latitude'],
    outlier_rows['dropoff_longitude'],
    outlier_rows['id'],
):
    folium.Marker(location=[lat, lon], popup=trip_id).add_to(new_map)
new_map

In [None]:
# Select the trips whose dropoff latitude lies outside lat_range.
# (Named *_lat: the original stored this latitude mask in mask_location_long.)
mask_location_lat = (
    (train['dropoff_latitude'] < lat_range[0])
    | (train['dropoff_latitude'] > lat_range[1])
)
temp = train[mask_location_lat]

# Put one marker per out-of-range dropoff on an interactive map.
new_map = folium.Map(location=[41, -74.00], zoom_start=4)
# Drop the blanked (NaN) rows once; the original recomputed temp.dropna()
# several times on every loop iteration.
outlier_rows = temp.dropna()
for lat, lon, trip_id in zip(
    outlier_rows['dropoff_latitude'],
    outlier_rows['dropoff_longitude'],
    outlier_rows['id'],
):
    folium.Marker(location=[lat, lon], popup=trip_id).add_to(new_map)
new_map

In [None]:
# Select the trips whose pickup latitude lies outside lat_range.
# (Named *_lat: the original stored this latitude mask in mask_location_long.)
mask_location_lat = (
    (train['pickup_latitude'] < lat_range[0])
    | (train['pickup_latitude'] > lat_range[1])
)
temp = train[mask_location_lat]

# Put one marker per out-of-range pickup on an interactive map.
new_map = folium.Map(location=[41, -74.00], zoom_start=4)
# Drop the blanked (NaN) rows once; the original recomputed temp.dropna()
# several times on every loop iteration.
outlier_rows = temp.dropna()
for lat, lon, trip_id in zip(
    outlier_rows['pickup_latitude'],
    outlier_rows['pickup_longitude'],
    outlier_rows['id'],
):
    folium.Marker(location=[lat, lon], popup=trip_id).add_to(new_map)
new_map

In [None]:
# Scatter every pickup coordinate so geographic outliers stand out.
pickup_longitude = list(train['pickup_longitude'])
pickup_latitude = list(train['pickup_latitude'])
fig, ax = plt.subplots(figsize=(18, 6))
ax.plot(pickup_longitude, pickup_latitude, '.', alpha=1, markersize=10)
ax.set_xlabel('pickup_longitude')
ax.set_ylabel('pickup_latitude')
plt.show()

In [None]:
# Inspect pickups with longitude below -80 or latitude below 39.
# Both conditions are combined into one displayed frame: a bare expression
# that is not the last line of a cell is never shown, so the longitude filter
# used to be computed and thrown away.
# The thresholds get their own names so they do not overwrite the
# pickup_longitude / pickup_latitude coordinate lists built for plotting.
min_pickup_longitude = -80
min_pickup_latitude = 39
train[
    (train['pickup_longitude'] < min_pickup_longitude)
    | (train['pickup_latitude'] < min_pickup_latitude)
]

In [None]:
# Scatter every dropoff coordinate so geographic outliers stand out.
dropoff_longitude = list(train['dropoff_longitude'])
dropoff_latitude = list(train['dropoff_latitude'])
fig, ax = plt.subplots(figsize=(18, 6))
ax.plot(dropoff_longitude, dropoff_latitude, '.', alpha=1, markersize=10)
ax.set_xlabel('dropoff_longitude')
ax.set_ylabel('dropoff_latitude')
plt.show()

In [None]:
# Blank out pickups with longitude below -80 or latitude below 39.
pickup_longitude = -80
pickup_latitude = 39

mask_location_long = train['pickup_longitude'].lt(pickup_longitude)
mask_location_lat = train['pickup_latitude'].lt(pickup_latitude)

# Both masks are computed before either is applied, so applying their union
# in one pass blanks exactly the same rows as two consecutive passes.
train.mask(mask_location_long | mask_location_lat, inplace=True)

In [None]:
# Summarise the cleaned data; dropna() removes the rows blanked out by the mask() cleaning steps.
summary_table = _create_summary_table(train.dropna())

In [None]:
# Feature engineering (it may make more sense to do these steps inside a pipeline).

# Parse both timestamp columns once.
date_columns = ['pickup_datetime', 'dropoff_datetime']
train[date_columns] = train[date_columns].apply(pd.to_datetime)

# Encode the flag as 1 for "Y" and 0 otherwise.
# The original converted the column twice: the second pass compared the
# already-encoded 0/1 values with "Y" and therefore zeroed the whole column.
# Accepting an existing 1 as well keeps this cell safe to re-run.
train['store_and_fwd_flag'] = train['store_and_fwd_flag'].isin(['Y', 1]).astype(int)

# Blank out trips with an implausible passenger count (0, 7, 8 or 9).
non_passenger = [0, 7, 8, 9]
mask_passanger = train['passenger_count'].isin(non_passenger)
train.mask(mask_passanger, inplace=True)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb

# The cleaning cells blank rejected rows with DataFrame.mask(), which leaves
# all-NaN rows in `train`. Drop them so the target holds no missing values.
train_model = train.dropna(subset=['trip_duration'])

y_train = train_model.trip_duration
X_train = train_model.drop('trip_duration', axis=1)

# NOTE(review): `test` is an unmodified copy of the raw test set, while X_train
# also carries `distance` and `speed_kmh` (the latter derived from the target).
# Confirm the feature set before fitting a model.
X_test = test

X_train.shape, y_train.shape, X_test.shape