In [1]:
# imports
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder

# Bind the pickle implementation to the short alias ``p``.
# Python 2 ships the C-accelerated ``cPickle``; where it does not exist the
# standard ``pickle`` module (all lowercase) is used instead.  Only
# ImportError is caught so that unrelated failures are not silently hidden.
try:
    import cPickle as p
except ImportError:
    import pickle as p

In [2]:
# Read the raw training and test tables from the data directory.
trainData, testData = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [3]:
def filterData(dataFrame):
    """
    Keep only the rows whose pickup and dropoff points both lie inside a
    fixed longitude/latitude bounding box (bounds are inclusive).

    dataFrame must provide the columns 'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude' and 'dropoff_latitude'.  A filtered frame is
    returned; the original row labels are preserved.
    """
    lon_min, lon_max = -74.257159, -73.699215
    lat_min, lat_max = 40.495992, 40.915568
    # A row survives only when all four coordinates are within the box.
    inside_box = (
        dataFrame['pickup_longitude'].between(lon_min, lon_max)
        & dataFrame['pickup_latitude'].between(lat_min, lat_max)
        & dataFrame['dropoff_longitude'].between(lon_min, lon_max)
        & dataFrame['dropoff_latitude'].between(lat_min, lat_max)
    )
    return dataFrame[inside_box]

# Keep trips whose duration is at least 60 and strictly below 1939736
# (trip_duration is presumably in seconds -- TODO confirm), then drop rows
# whose coordinates fall outside the bounding box used by filterData.
duration_in_range = (trainData['trip_duration'] >= 60) & (trainData['trip_duration'] < 1939736)
trainData = filterData(trainData[duration_in_range])

In [4]:
# Show the available raw columns.  The parenthesised form prints the same
# text under Python 2 and also runs under Python 3.
print(trainData.columns.values)

['id' 'vendor_id' 'pickup_datetime' 'dropoff_datetime' 'passenger_count'
 'pickup_longitude' 'pickup_latitude' 'dropoff_longitude'
 'dropoff_latitude' 'store_and_fwd_flag' 'trip_duration']


Let's begin by copying the columns that are not going to be changed.

In [5]:
# Columns carried over unchanged into the feature tables.
passthrough_columns = ['id', 'pickup_longitude', 'pickup_latitude',
                       'dropoff_longitude', 'dropoff_latitude',
                       'passenger_count', 'vendor_id']
train = pd.DataFrame(trainData[passthrough_columns])
test = pd.DataFrame(testData[passthrough_columns])
# Target values for the training rows.
yTrain = pd.Series(trainData['trip_duration'])

In [6]:
# Encode the 'Y'/other flag as 1/0.  A plain list is assigned so the values
# are placed positionally, row for row.
train['store_and_fwd_flag'] = [1 if flag == 'Y' else 0
                               for flag in trainData['store_and_fwd_flag']]
test['store_and_fwd_flag'] = [1 if flag == 'Y' else 0
                              for flag in testData['store_and_fwd_flag']]
# Parenthesised print: identical output on Python 2, valid on Python 3.
print(train.columns.values)

['id' 'pickup_longitude' 'pickup_latitude' 'dropoff_longitude'
 'dropoff_latitude' 'passenger_count' 'vendor_id' 'store_and_fwd_flag']


Now, let's deal with the latitude and longitude features. I was unsure about the proper way to encode these features and saw that [beluga](https://www.kaggle.com/gaborfodor/from-eda-to-the-top-lb-0-367) used PCA, which makes sense because PCA turns a set of correlated features into uncorrelated ones. I'll use it as well.

In [7]:
# Stack every (longitude, latitude) pair -- pickups and dropoffs, from both
# the training and the test frame -- and fit a default PCA on that array.
coordinate_blocks = [
    train[['pickup_longitude', 'pickup_latitude']].values,
    train[['dropoff_longitude', 'dropoff_latitude']].values,
    test[['pickup_longitude', 'pickup_latitude']].values,
    test[['dropoff_longitude', 'dropoff_latitude']].values,
]
coords = np.vstack(coordinate_blocks)
pca = PCA()
pca.fit(coords)

In [8]:
# Project each pickup/dropoff coordinate pair through the fitted PCA and
# store the two resulting components as new columns.
# Each entry: (frame, source columns, first-component name, second-component name).
pca_jobs = (
    (train, ['pickup_longitude', 'pickup_latitude'], 'pickup_pca0', 'pickup_pca1'),
    (train, ['dropoff_longitude', 'dropoff_latitude'], 'dropoff_pca0', 'dropoff_pca1'),
    (test, ['pickup_longitude', 'pickup_latitude'], 'pickup_pca0', 'pickup_pca1'),
    (test, ['dropoff_longitude', 'dropoff_latitude'], 'dropoff_pca0', 'dropoff_pca1'),
)
for frame, source_columns, first_name, second_name in pca_jobs:
    projected = pca.transform(frame[source_columns])
    frame[first_name] = projected[:, 0]
    frame[second_name] = projected[:, 1]

Now, let's add the distance between the pickup and dropoff points. Note that I'll also add the bearing, as I mentioned in the EDA.

In [9]:
def haversine_np(lon1, lat1, lon2, lat2):
    """
    Great-circle (haversine) distance in kilometres between two points
    given in decimal degrees, using an earth radius of 6367 km.
    (https://stackoverflow.com/questions/29545704/fast-haversine-approximation-python-pandas)

    Works element-wise on scalars or on array-likes of equal length.
    """
    lon1_rad, lat1_rad = np.radians(lon1), np.radians(lat1)
    lon2_rad, lat2_rad = np.radians(lon2), np.radians(lat2)
    half_dlat = (lat2_rad - lat1_rad) / 2.0
    half_dlon = (lon2_rad - lon1_rad) / 2.0
    # Haversine term, then the central angle between the two points.
    hav = np.sin(half_dlat)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(half_dlon)**2
    central_angle = 2 * np.arcsin(np.sqrt(hav))
    return 6367 * central_angle

# Haversine distance from pickup to dropoff for every row of both frames.
for frame in (train, test):
    frame['dist_haversine'] = haversine_np(
        frame['pickup_longitude'], frame['pickup_latitude'],
        frame['dropoff_longitude'], frame['dropoff_latitude'],
    )



We'll also add Manhattan distances (because we've already assumed the surface to be flat enough to be considered Euclidean and used k-means on it).

In [10]:
def manhattan_dist(x1, y1, x2, y2):
    """
    Manhattan (L1) distance between the points (x1, y1) and (x2, y2):
    the sum of the absolute differences along each axis.
    """
    x_gap = abs(x1 - x2)
    y_gap = abs(y1 - y2)
    return x_gap + y_gap

# Manhattan distance between pickup and dropoff, measured in PCA space.
for frame in (train, test):
    frame['dist_pca_manhattan'] = manhattan_dist(
        frame['pickup_pca0'], frame['pickup_pca1'],
        frame['dropoff_pca0'], frame['dropoff_pca1'],
    )


Let's add the bearing now (formula from http://www.movable-type.co.uk/scripts/latlong.html)


In [11]:
def bearing(lon1,lat1,lon2,lat2):
    """
    Initial bearing, in degrees, when travelling from point 1 to point 2.

    Inputs are decimal degrees (scalars or equal-length array-likes).  The
    result comes from arctan2, so it lies in the range [-180, 180].
    """
    lon1_rad, lat1_rad = np.radians(lon1), np.radians(lat1)
    lon2_rad, lat2_rad = np.radians(lon2), np.radians(lat2)
    lon_delta = lon2_rad - lon1_rad
    east_component = np.sin(lon_delta)*np.cos(lat2_rad)
    north_component = (np.cos(lat1_rad)*np.sin(lat2_rad)
                       - np.sin(lat1_rad)*np.cos(lat2_rad)*np.cos(lon_delta))
    angle_rad = np.arctan2(east_component, north_component)
    return np.degrees(angle_rad)

# Bearing from pickup to dropoff for every row of both frames.
for frame in (train, test):
    frame['bearing'] = bearing(
        frame['pickup_longitude'], frame['pickup_latitude'],
        frame['dropoff_longitude'], frame['dropoff_latitude'],
    )

Now that we have all the latitude/longitude and distance calculations done, let's encode the pickup and dropoff clusters too.

In [12]:
# Load the previously fitted k-means model.
# The file is opened in binary mode (required for binary pickle protocols and
# on Python 3) and inside a ``with`` block so the handle is always closed.
# NOTE(review): unpickling executes code embedded in the file -- only load
# dumps that were produced by a trusted run of this project.
with open('../pickle_dumps/kmeansDump.p', 'rb') as kmeans_file:
    kmeans = p.load(kmeans_file)

In [13]:
# One-hot encode the k-means cluster of every training pickup point.
#
# The indicator columns are built directly from the predicted labels rather
# than by fitting a OneHotEncoder on them: an encoder fitted on the labels
# only emits columns for labels that actually occur, so a cluster with no
# members would shift every later column (or make column 15 not exist).
# Here column k is always "row belongs to cluster k", all zeros if unused.
# The column count stays at 16 (cluster ids 0-15) to keep the output schema;
# presumably this equals kmeans.n_clusters -- TODO confirm.
n_clusters = 16
train_clusters = kmeans.predict(train[['pickup_longitude', 'pickup_latitude']])
for cluster_id in range(n_clusters):
    # float64 matches the dtype of the dense encoder output used before.
    train['pickup_cluster_%d' % cluster_id] = (train_clusters == cluster_id).astype(np.float64)
del train_clusters

In [14]:
# One-hot encode the k-means cluster of every training dropoff point.
#
# The indicator columns are built directly from the predicted labels rather
# than by fitting a OneHotEncoder on them: an encoder fitted on the labels
# only emits columns for labels that actually occur, so a cluster with no
# members would shift every later column (or make column 15 not exist).
# Here column k is always "row belongs to cluster k", all zeros if unused.
# The column count stays at 16 (cluster ids 0-15) to keep the output schema;
# presumably this equals kmeans.n_clusters -- TODO confirm.
n_clusters = 16
train_clusters = kmeans.predict(train[['dropoff_longitude', 'dropoff_latitude']])
for cluster_id in range(n_clusters):
    # float64 matches the dtype of the dense encoder output used before.
    train['dropoff_cluster_%d' % cluster_id] = (train_clusters == cluster_id).astype(np.float64)
del train_clusters

In [15]:
# One-hot encode the k-means cluster of every test pickup point.
#
# The indicator columns are built directly from the predicted labels rather
# than by fitting a OneHotEncoder on them: an encoder fitted on the test
# labels only emits columns for labels that occur in the test data, so a
# cluster with no test members would shift every later column (or make
# column 15 not exist) and the test columns would stop matching the
# training columns.  Here column k is always "row belongs to cluster k".
# The column count stays at 16 (cluster ids 0-15) to keep the output schema;
# presumably this equals kmeans.n_clusters -- TODO confirm.
n_clusters = 16
test_clusters = kmeans.predict(test[['pickup_longitude', 'pickup_latitude']])
for cluster_id in range(n_clusters):
    # float64 matches the dtype of the dense encoder output used before.
    test['pickup_cluster_%d' % cluster_id] = (test_clusters == cluster_id).astype(np.float64)
del test_clusters

In [16]:
# One-hot encode the k-means cluster of every test dropoff point.
#
# The indicator columns are built directly from the predicted labels rather
# than by fitting a OneHotEncoder on them: an encoder fitted on the test
# labels only emits columns for labels that occur in the test data, so a
# cluster with no test members would shift every later column (or make
# column 15 not exist) and the test columns would stop matching the
# training columns.  Here column k is always "row belongs to cluster k".
# The column count stays at 16 (cluster ids 0-15) to keep the output schema;
# presumably this equals kmeans.n_clusters -- TODO confirm.
n_clusters = 16
test_clusters = kmeans.predict(test[['dropoff_longitude', 'dropoff_latitude']])
for cluster_id in range(n_clusters):
    # float64 matches the dtype of the dense encoder output used before.
    test['dropoff_cluster_%d' % cluster_id] = (test_clusters == cluster_id).astype(np.float64)
del test_clusters
# The clustering model is not needed after this cell; release it.
del kmeans

Now, let's handle the date-time features. We'll handle cyclic features by taking their sine and cosine values, as suggested at https://datascience.stackexchange.com/questions/5990/what-is-a-good-way-to-transform-cyclic-ordinal-attributes. I will also keep their linearity in mind.

In [17]:
# Parse the pickup timestamps of both raw tables into datetime Series.
trainDT = pd.to_datetime(trainData['pickup_datetime'])
testDT = pd.to_datetime(testData['pickup_datetime'])

In [18]:
# Hour of day: keep the raw value and add its sine/cosine on a 24-step
# circle so that the last and first hours of the day end up adjacent.
for frame, stamps in ((train, trainDT), (test, testDT)):
    frame['pickup_hour'] = stamps.dt.hour
    hour_angle = (np.pi*2*frame['pickup_hour']*1.0)/24
    frame['pickup_hour_sin'] = np.sin(hour_angle)
    frame['pickup_hour_cos'] = np.cos(hour_angle)

In [19]:
# Day of week: keep the raw value and add its sine/cosine on a 7-step
# circle so that the last and first days of the week end up adjacent.
for frame, stamps in ((train, trainDT), (test, testDT)):
    frame['pickup_dow'] = stamps.dt.dayofweek
    dow_angle = (np.pi*2*frame['pickup_dow']*1.0)/7
    frame['pickup_dow_sin'] = np.sin(dow_angle)
    frame['pickup_dow_cos'] = np.cos(dow_angle)

In [20]:
# Month: keep the raw value and add its sine/cosine on a 12-step circle
# so that December and January end up adjacent.
for frame, stamps in ((train, trainDT), (test, testDT)):
    frame['pickup_month'] = stamps.dt.month
    month_angle = (np.pi*2*frame['pickup_month']*1.0)/12
    frame['pickup_month_sin'] = np.sin(month_angle)
    frame['pickup_month_cos'] = np.cos(month_angle)

In [21]:
# Week of year: keep the raw value and add its sine/cosine on a 53-step
# circle.
# NOTE(review): newer pandas releases replace `.dt.weekofyear` with
# `.dt.isocalendar().week` -- check the pandas version before upgrading.
for frame, stamps in ((train, trainDT), (test, testDT)):
    frame['pickup_week_year'] = stamps.dt.weekofyear
    week_angle = (np.pi*2*frame['pickup_week_year']*1.0)/53
    frame['pickup_week_year_sin'] = np.sin(week_angle)
    frame['pickup_week_year_cos'] = np.cos(week_angle)

In [22]:
# Flag pickups with dayofweek >= 4 (pandas counts Monday as 0, so Friday
# through Sunday) whose hour is 6 or earlier, or 21 or later.
for frame, stamps in ((train, trainDT), (test, testDT)):
    frame['party_night_traffic'] = stamps.apply(
        lambda ts: 1 if ts.dayofweek >= 4 and (ts.hour <= 6 or ts.hour >= 21) else 0
    )

In [23]:
# Flag pickups made on a working day (dayofweek <= 4; pandas counts Monday
# as 0, so Monday through Friday) during daytime hours, 08:00-19:59.
# The hour test must combine both bounds with "and": the earlier form
# "hour >= 8 or hour <= 19" holds for every hour, which reduced the feature
# to a plain weekday indicator.
for frame, stamps in ((train, trainDT), (test, testDT)):
    frame['work_day_traffic'] = stamps.apply(
        lambda ts: 1 if ts.dayofweek <= 4 and 8 <= ts.hour <= 19 else 0
    )

Now let us add features from other datasets. Let's add the OSRM routing data provided by oscarleo (https://www.kaggle.com/oscarleo/new-york-city-taxi-with-osrm)

In [24]:
# Load the OSRM fastest-route summaries (the training data is split over two
# files) and attach them to the feature frames by trip id.  A left join is
# used so every existing row is kept even when no route record matches.
osrm_columns = ['id', 'total_distance', 'total_travel_time', 'number_of_steps']
tr_osrm_parts = [
    pd.read_csv(part_path, usecols=osrm_columns)
    for part_path in ('../data/fastest_routes_train_part_1.csv',
                      '../data/fastest_routes_train_part_2.csv')
]
tr_osrm = pd.concat(tr_osrm_parts)
te_osrm = pd.read_csv('../data/fastest_routes_test.csv', usecols=osrm_columns)

train = train.merge(tr_osrm, how='left', on='id')
test = test.merge(te_osrm, how='left', on='id')

In [25]:
# The trip id was only needed as the merge key; remove it from both frames.
for frame in (train, test):
    del frame['id']

In [26]:
# Show the final feature names and check that both frames have the same
# number of columns.  Parenthesised prints give identical output on
# Python 2 and also run under Python 3.
print(train.columns.values)
print(len(train.columns.values))
print(len(test.columns.values))

['pickup_longitude' 'pickup_latitude' 'dropoff_longitude'
 'dropoff_latitude' 'passenger_count' 'vendor_id' 'store_and_fwd_flag'
 'pickup_pca0' 'pickup_pca1' 'dropoff_pca0' 'dropoff_pca1' 'dist_haversine'
 'dist_pca_manhattan' 'bearing' 'pickup_cluster_0' 'pickup_cluster_1'
 'pickup_cluster_2' 'pickup_cluster_3' 'pickup_cluster_4'
 'pickup_cluster_5' 'pickup_cluster_6' 'pickup_cluster_7'
 'pickup_cluster_8' 'pickup_cluster_9' 'pickup_cluster_10'
 'pickup_cluster_11' 'pickup_cluster_12' 'pickup_cluster_13'
 'pickup_cluster_14' 'pickup_cluster_15' 'dropoff_cluster_0'
 'dropoff_cluster_1' 'dropoff_cluster_2' 'dropoff_cluster_3'
 'dropoff_cluster_4' 'dropoff_cluster_5' 'dropoff_cluster_6'
 'dropoff_cluster_7' 'dropoff_cluster_8' 'dropoff_cluster_9'
 'dropoff_cluster_10' 'dropoff_cluster_11' 'dropoff_cluster_12'
 'dropoff_cluster_13' 'dropoff_cluster_14' 'dropoff_cluster_15'
 'pickup_hour' 'pickup_hour_sin' 'pickup_hour_cos' 'pickup_dow'
 'pickup_dow_sin' 'pickup_dow_cos' 'pickup_month' 'pi

In [30]:
# Persist the engineered feature tables without the row index.
for frame, destination in ((train, '../data/train_final.csv'),
                           (test, '../data/test_final.csv')):
    frame.to_csv(destination, index=False)