In [1]:
# imports
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Python 2 ships the C-accelerated pickler as `cPickle`; on interpreters
# where that module does not exist, fall back to the standard `pickle`
# module (all lowercase -- there is no module named `Pickle`).
try:
    import cPickle as p
except ImportError:
    # Only a missing module triggers the fallback; any other error surfaces.
    import pickle as p

In [12]:
# Read the raw train and test tables from the data directory, which is
# addressed relative to the notebook's working directory.
trainData = pd.read_csv('../data/train.csv')
testData = pd.read_csv('../data/test.csv')

625134


In [11]:
def filterData(dataFrame, long_limit=(-74.257159, -73.699215), lat_limit=(40.495992, 40.915568)):
    """
    Keep only the rows whose pickup and dropoff points both lie inside a
    longitude/latitude bounding box. Both ends of each range are inclusive.

    dataFrame  -- frame with 'pickup_longitude', 'pickup_latitude',
                  'dropoff_longitude' and 'dropoff_latitude' columns.
    long_limit -- (min, max) longitude bounds; defaults to the values that
                  were previously hard-coded in this function.
    lat_limit  -- (min, max) latitude bounds; same remark as long_limit.

    Returns the matching rows with their original index labels. The input
    frame is not modified. Rows with a missing coordinate are dropped,
    because comparisons against NaN evaluate to False.
    """
    lon_min, lon_max = long_limit
    lat_min, lat_max = lat_limit

    # One combined boolean mask instead of four successive filtered copies.
    inside = (
        (dataFrame['pickup_longitude'] >= lon_min) & (dataFrame['pickup_longitude'] <= lon_max)
        & (dataFrame['pickup_latitude'] >= lat_min) & (dataFrame['pickup_latitude'] <= lat_max)
        & (dataFrame['dropoff_longitude'] >= lon_min) & (dataFrame['dropoff_longitude'] <= lon_max)
        & (dataFrame['dropoff_latitude'] >= lat_min) & (dataFrame['dropoff_latitude'] <= lat_max)
    )
    return dataFrame[inside]

# Keep trips whose trip_duration is at least 60 and strictly below 1939736
# (units presumably seconds -- confirm against the data description), then
# restrict what is left to the coordinate bounds enforced by filterData.
trainData = trainData[(trainData['trip_duration'] >= 60) &
                      (trainData['trip_duration'] < 1939736)]
trainData = filterData(trainData)

625134


In [4]:
# Call form of print: same output on Python 2 for a single argument, and
# valid syntax on Python 3 (the bare print statement is not).
print(trainData.columns.values)

['id' 'vendor_id' 'pickup_datetime' 'dropoff_datetime' 'passenger_count'
 'pickup_longitude' 'pickup_latitude' 'dropoff_longitude'
 'dropoff_latitude' 'store_and_fwd_flag' 'trip_duration']


Let's begin by copying the columns that are not going to be changed.

In [5]:
# Start the feature frames from the columns that are carried over as-is.
train = pd.DataFrame(trainData[[
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'passenger_count', 'vendor_id',
]])
test = pd.DataFrame(testData[[
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'passenger_count', 'vendor_id',
]])

# The training target is kept separately from the features.
yTrain = pd.Series(trainData['trip_duration'])

In [6]:
# Encode the store-and-forward flag as an integer: 'Y' -> 1, anything else -> 0.
# np.where evaluates the whole column at once and returns a plain array, so
# the values are assigned by position (as a Python list would be) rather
# than aligned on index labels.
train['store_and_fwd_flag'] = np.where(trainData['store_and_fwd_flag'] == 'Y', 1, 0)
test['store_and_fwd_flag'] = np.where(testData['store_and_fwd_flag'] == 'Y', 1, 0)

# Call form of print: identical output on Python 2, valid on Python 3.
print(train.columns.values)

['pickup_longitude' 'pickup_latitude' 'dropoff_longitude'
 'dropoff_latitude' 'passenger_count' 'vendor_id' 'store_and_fwd_flag']


Now, let's deal with the latitude and longitude features. I was unsure about the proper way to encode these features and saw that [beluga](https://www.kaggle.com/gaborfodor/from-eda-to-the-top-lb-0-367) used PCA, which makes sense because PCA turns a set of correlated features into uncorrelated ones. I'll use it as well.

In [7]:
# Stack every pickup and dropoff (longitude, latitude) pair from both data
# sets into a single two-column array, then fit one PCA on all of them.
coords = np.vstack([
    train[['pickup_longitude', 'pickup_latitude']].values,
    train[['dropoff_longitude', 'dropoff_latitude']].values,
    test[['pickup_longitude', 'pickup_latitude']].values,
    test[['dropoff_longitude', 'dropoff_latitude']].values,
])

pca = PCA()
pca.fit(coords)

In [8]:
def _add_pca_columns(frame, prefix, pca_model):
    """
    Project the '<prefix>_longitude' / '<prefix>_latitude' columns of `frame`
    with `pca_model` and store the two resulting components on `frame` as
    '<prefix>_pca0' and '<prefix>_pca1' (the frame is modified in place).

    Returns the transformed array so callers can keep using it.
    """
    # Pass a bare array: the PCA was fitted on stacked `.values` arrays, so
    # handing it a DataFrame with column names would be inconsistent with
    # the fit (recent scikit-learn versions warn about that mismatch).
    projected = pca_model.transform(
        frame[[prefix + '_longitude', prefix + '_latitude']].values)
    frame[prefix + '_pca0'] = projected[:, 0]
    frame[prefix + '_pca1'] = projected[:, 1]
    return projected


# Same four projections and the same result names as before; column order
# on each frame stays pickup_pca0, pickup_pca1, dropoff_pca0, dropoff_pca1.
train_pickup_pca = _add_pca_columns(train, 'pickup', pca)
train_dropoff_pca = _add_pca_columns(train, 'dropoff', pca)
test_pickup_pca = _add_pca_columns(test, 'pickup', pca)
test_dropoff_pca = _add_pca_columns(test, 'dropoff', pca)

Now, let's add the distance between the pickup and dropoff points. Note that I'll also add the bearing, as I mentioned in the EDA.

In [13]:
def haversine_np(lon1, lat1, lon2, lat2):
    """
    Great-circle (haversine) distance in kilometres between two points, or
    element-wise between two sets of points, given in decimal degrees.

    Adapted from
    https://stackoverflow.com/questions/29545704/fast-haversine-approximation-python-pandas

    lon1, lat1 -- longitude / latitude of the first point(s).
    lon2, lat2 -- longitude / latitude of the second point(s).

    Inputs may be scalars or numpy/pandas vectors of matching length; the
    result has the same form as the inputs.
    """
    # The trigonometric functions below expect radians.
    lon1_rad = np.radians(lon1)
    lat1_rad = np.radians(lat1)
    lon2_rad = np.radians(lon2)
    lat2_rad = np.radians(lat2)

    delta_lon = lon2_rad - lon1_rad
    delta_lat = lat2_rad - lat1_rad

    # Haversine term, then the central angle between the two points.
    hav = (np.sin(delta_lat / 2.0) ** 2
           + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(delta_lon / 2.0) ** 2)
    central_angle = 2 * np.arcsin(np.sqrt(hav))

    # 6367 is the sphere radius (km) that scales the angle into a distance.
    return 6367 * central_angle

# Pickup-to-dropoff haversine distance for every row of each data set.
train['dist_haversine'] = haversine_np(
    lon1=train['pickup_longitude'], lat1=train['pickup_latitude'],
    lon2=train['dropoff_longitude'], lat2=train['dropoff_latitude'])
test['dist_haversine'] = haversine_np(
    lon1=test['pickup_longitude'], lat1=test['pickup_latitude'],
    lon2=test['dropoff_longitude'], lat2=test['dropoff_latitude'])



We'll also add Manhattan distances (because we've already assumed the surface to be flat enough to be considered Euclidean and used k-means on it).

In [14]:
def manhattan_dist(x1, y1, x2, y2):
    """
    Manhattan (L1) distance between the points (x1, y1) and (x2, y2): the
    absolute difference along each axis, added together.

    Works on scalars and, element-wise, on numpy arrays or pandas Series.
    """
    x_gap = abs(x1 - x2)
    y_gap = abs(y1 - y2)
    return x_gap + y_gap

# Manhattan distance between pickup and dropoff, measured in PCA space.
train['dist_pca_manhattan'] = manhattan_dist(
    x1=train['pickup_pca0'], y1=train['pickup_pca1'],
    x2=train['dropoff_pca0'], y2=train['dropoff_pca1'])
test['dist_pca_manhattan'] = manhattan_dist(
    x1=test['pickup_pca0'], y1=test['pickup_pca1'],
    x2=test['dropoff_pca0'], y2=test['dropoff_pca1'])


Let's add bearing now