In [1]:
# Importing libraries
from fastai.imports import *
from fastai.structured import *

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

from IPython.display import display

import os

In [2]:
PATH = 'data'

types = {'fare_amount':'float32',
        'pickup_longitude':'float32',
        'pickup_latitude':'float32',
        'dropoff_longitude':'float32',
        'dropoff_latitude':'float32',
        'passenger_count':'int32',
         'pickup_datetime':'object'
        }
cols_to_use = ['fare_amount','pickup_datetime','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude','passenger_count']

In [3]:
# Importing data

%time df_raw = pd.read_csv(f'{PATH}/train.csv', nrows=12000000,usecols=cols_to_use, dtype = types)

Wall time: 22.8 s


In [4]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000000 entries, 0 to 11999999
Data columns (total 7 columns):
fare_amount          float32
pickup_datetime      object
pickup_longitude     float32
pickup_latitude      float32
dropoff_longitude    float32
dropoff_latitude     float32
passenger_count      int32
dtypes: float32(5), int32(1), object(1)
memory usage: 366.2+ MB


In [5]:
# Function to set display options
def display_all(df):
    with pd.option_context('display.max_rows',1000):
        with pd.option_context('display.max_columns',1000):
            display(df)

## Exploration

In [6]:
df_raw.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,12000000.0,12000000.0,12000000.0,11999920.0,11999920.0,12000000.0
mean,11.33842,-72.51044,39.91973,-72.51036,39.92025,1.684948
std,9.797611,12.91242,9.262699,12.77134,9.347701,1.325028
min,-107.75,-3439.245,-3492.264,-3426.601,-3488.08,0.0
25%,6.0,-73.99207,40.73491,-73.99139,40.73404,1.0
50%,8.5,-73.98181,40.75264,-73.98015,40.75316,1.0
75%,12.5,-73.96709,40.76712,-73.96368,40.7681,2.0
max,1273.31,3457.626,3344.459,3457.622,3400.392,208.0


Remove null values.

In [7]:
df_raw.isnull().sum()

fare_amount           0
pickup_datetime       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude    79
dropoff_latitude     79
passenger_count       0
dtype: int64

In [8]:
df_raw.dropna(axis=0, how='any', inplace=True)

Remove rows with no fare_amount

In [9]:
df_raw = df_raw[df_raw.fare_amount>=0]

Let's look at the passenger_count feature.

In [10]:
df_raw.passenger_count.unique()

array([  1,   2,   3,   6,   5,   4,   0, 208,   9, 129,   7,  51,  49,   8], dtype=int64)

In [11]:
df_raw[df_raw.passenger_count>10].passenger_count.value_counts()

208    12
49      1
51      1
129     1
Name: passenger_count, dtype: int64

In [12]:
df_raw.groupby(['passenger_count']).mean()

Unnamed: 0_level_0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
passenger_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,8.995379,-72.725739,40.06295,-72.679443,40.038246
1,11.211963,-72.49688,39.916065,-72.494614,39.914429
2,11.818702,-72.589211,39.961147,-72.601791,39.971985
3,11.537989,-72.568428,39.952335,-72.594856,39.96928
4,11.772282,-72.603424,39.965912,-72.629303,39.983978
5,11.214808,-72.403519,39.840759,-72.389893,39.839211
6,12.112725,-72.526314,39.883556,-72.469566,39.841652
7,9.833333,-73.991821,40.739483,-73.985275,40.727016
8,31.915001,-73.893517,40.708809,-73.99794,40.740963
9,57.290001,-73.953796,40.747158,-73.99778,40.751263


In [13]:
df_raw.passenger_count.value_counts()

1      8300324
2      1771399
5       849612
3       526940
4       254719
6       254131
0        42279
208         12
9            3
7            3
8            2
129          1
51           1
49           1
Name: passenger_count, dtype: int64

With so many 0 passenger_values and a non-null fare_amount mean for them, we should not remove the rides with 0 values.

In [14]:
df_raw = df_raw[df_raw.passenger_count<7]

## Feature Engineering

In [15]:
%time add_datepart(df_raw,'pickup_datetime',drop=True, time=True)

Wall time: 1min 10s


In [16]:
display_all(df_raw.head(5).T)

Unnamed: 0,0,1,2,3,4
fare_amount,4.5,16.9,5.7,7.7,5.3
pickup_longitude,-73.8443,-74.016,-73.9827,-73.9871,-73.9681
pickup_latitude,40.7213,40.7113,40.7613,40.7331,40.768
dropoff_longitude,-73.8416,-73.9793,-73.9912,-73.9916,-73.9567
dropoff_latitude,40.7123,40.782,40.7506,40.7581,40.7838
passenger_count,1,1,2,1,1
pickup_datetimeYear,2009,2010,2011,2012,2010
pickup_datetimeMonth,6,1,8,4,3
pickup_datetimeWeek,25,1,33,16,10
pickup_datetimeDay,15,5,18,21,9


In [17]:
def distance_traversed(lat1,lon1,lat2,lon2):
    '''Harvesine formula'''
    radius = 6371 # km
    dlat = math.radians(lat2-lat1)
    dlon = math.radians(lon2-lon1)
    a = math.sin(dlat/2) * math.sin(dlat/2) + math.cos(math.radians(lat1)) \
        * math.cos(math.radians(lat2)) * math.sin(dlon/2) * math.sin(dlon/2)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    d = radius * c
    return d

In [18]:
def airport(name,lat1,lon1,lat2,lon2):
    '''Function to determine if either pickup or dropoff is near an airport'''
    if name == 'nyc':
        long, lat = -74.0063889, 40.7141667
    elif name == 'jfk':    
        long, lat = -73.7822222222, 40.6441666667
    elif name == 'ewr':    
        long, lat = -74.175, 40.69
    elif name == 'lgr':    
        long, lat = -73.87, 40.77
    return min(distance_traversed(lat,long,lat2,lon2),distance_traversed(lat1,lon1,lat,long))<1.5

In [19]:
def features(data):
    data['longitutde_traversed'] = (data.dropoff_longitude - data.pickup_longitude).abs()
    data['latitude_traversed'] = (data.dropoff_latitude - data.pickup_latitude).abs()
    data['distance_traversed'] = np.vectorize(distance_traversed)(data.pickup_latitude,data.pickup_longitude,data.dropoff_latitude,data.dropoff_longitude)
    data['near_nyc'] = np.vectorize(airport)('nyc',data.pickup_latitude,data.pickup_longitude,data.dropoff_latitude,data.dropoff_longitude)
    data['near_jfk'] = np.vectorize(airport)('jfk',data.pickup_latitude,data.pickup_longitude,data.dropoff_latitude,data.dropoff_longitude)
    data['near_ewr'] = np.vectorize(airport)('ewr',data.pickup_latitude,data.pickup_longitude,data.dropoff_latitude,data.dropoff_longitude)
    data['near_lgr'] = np.vectorize(airport)('lgr',data.pickup_latitude,data.pickup_longitude,data.dropoff_latitude,data.dropoff_longitude)

In [20]:
%time features(df_raw)

Wall time: 5min 7s


In [21]:
display_all(df_raw.head(2).T)

Unnamed: 0,0,1
fare_amount,4.5,16.9
pickup_longitude,-73.8443,-74.016
pickup_latitude,40.7213,40.7113
dropoff_longitude,-73.8416,-73.9793
dropoff_latitude,40.7123,40.782
passenger_count,1,1
pickup_datetimeYear,2009,2010
pickup_datetimeMonth,6,1
pickup_datetimeWeek,25,1
pickup_datetimeDay,15,5


In [22]:
#os.makedirs('tmp',exist_ok=True)
#df_raw.to_feather('tmp/raw')

## Validation Set

Let's create a validation set which is representative of our data. Let's split on the passenger_count variable.

In [22]:
df_raw.passenger_count.value_counts()/len(df_raw)

1    0.691728
2    0.147624
5    0.070805
3    0.043914
4    0.021228
6    0.021179
0    0.003523
Name: passenger_count, dtype: float64

In [23]:
%%time
split = StratifiedShuffleSplit(n_splits=1, test_size=10000, random_state=42)
for train_index, test_index in split.split(df_raw, df_raw.passenger_count):
    train_set = df_raw.loc[df_raw.index.intersection(train_index)]
    test_set = df_raw.loc[df_raw.index.intersection(test_index)]

Wall time: 10 s


In [24]:
train_set.passenger_count.value_counts()/len(train_set)

1    0.691728
2    0.147623
5    0.070806
3    0.043916
4    0.021226
6    0.021180
0    0.003523
Name: passenger_count, dtype: float64

In [25]:
test_set.passenger_count.value_counts()/len(test_set)

1    0.6919
2    0.1507
5    0.0693
3    0.0417
4    0.0234
6    0.0196
0    0.0034
Name: passenger_count, dtype: float64

## Data Visualization

In [26]:
#% matplotlib inline
#train_set.plot(kind='scatter', x='pickup_latitude',y='pickup_longitude',alpha=0.1)

## Outlier Detection

In [27]:
train_set.tail(5)

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetimeYear,pickup_datetimeMonth,pickup_datetimeWeek,pickup_datetimeDay,...,pickup_datetimeMinute,pickup_datetimeSecond,pickup_datetimeElapsed,longitutde_traversed,latitude_traversed,distance_traversed,near_nyc,near_jfk,near_ewr,near_lgr
11999399,14.9,-73.974144,40.757156,-73.991753,40.726082,5,2010,9,39,27,...,34,0,1285623240,0.017609,0.031075,3.760326,False,False,False,False
11999400,13.3,-73.981979,40.74078,-74.008232,40.704372,1,2010,11,45,9,...,27,0,1289316420,0.026253,0.036407,4.613409,True,False,False,False
11999401,8.5,-73.987808,40.749763,-73.984261,40.738449,1,2012,5,21,22,...,55,28,1337673328,0.003548,0.011314,1.293116,False,False,False,False
11999402,65.0,-73.966301,40.801571,-73.966301,40.801571,2,2012,11,44,2,...,38,33,1351852713,0.0,0.0,0.0,False,False,False,False
11999403,11.5,-73.976822,40.741741,-73.989296,40.718735,1,2013,2,8,19,...,37,9,1361309829,0.012474,0.023006,2.765716,False,False,False,False


In [28]:
train_set = train_set.reset_index().drop('index', axis=1)

In [29]:
train_set.tail(2)

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetimeYear,pickup_datetimeMonth,pickup_datetimeWeek,pickup_datetimeDay,...,pickup_datetimeMinute,pickup_datetimeSecond,pickup_datetimeElapsed,longitutde_traversed,latitude_traversed,distance_traversed,near_nyc,near_jfk,near_ewr,near_lgr
11988806,65.0,-73.966301,40.801571,-73.966301,40.801571,2,2012,11,44,2,...,38,33,1351852713,0.0,0.0,0.0,False,False,False,False
11988807,11.5,-73.976822,40.741741,-73.989296,40.718735,1,2013,2,8,19,...,37,9,1361309829,0.012474,0.023006,2.765716,False,False,False,False


In [30]:
outliers = []

# For each feature find the data points with extreme high or low values
for feature in ['longitutde_traversed','latitude_traversed','distance_traversed']:
    Q1 = np.percentile(train_set[feature],25,axis=0)
    Q3 = np.percentile(train_set[feature],75,axis=0)
    step = 10*(Q3-Q1)
    feature_outlier = train_set[~((train_set[feature] >= Q1 - step) & (train_set[feature] <= Q3 + step))]
    outliers += feature_outlier.index.tolist()


In [31]:
len(outliers)/len(train_set)

0.013389571340203296

In [32]:
train_set = train_set.drop(train_set.index[outliers]).reset_index(drop = True)
len(train_set)

11883603

In [33]:
y_train = train_set.fare_amount
X_train = train_set.drop('fare_amount', axis = 1)
y_test = test_set.fare_amount
X_test = test_set.drop('fare_amount', axis = 1)

Next, let's create a scorer function.

In [34]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_test), y_test),
                m.score(X_train, y_train), m.score(X_test, y_test)]
    print(res)

## Model Creation

In [35]:
set_rf_samples(40000)

In [36]:
m = RandomForestRegressor(n_estimators=100, min_samples_leaf=3, n_jobs=-1, max_features=0.8)
%time m.fit(X_train, y_train)
print_score(m)

Wall time: 1min 16s
[4.132484983145871, 5.334562497581804, 0.790694861399391, 0.720448022693494]


## Predictions on test set

In [37]:
test_set = pd.read_csv(f'{PATH}/test.csv')

In [38]:
test_key = test_set.key
test_set.drop('key', axis=1, inplace=True)

In [39]:
add_datepart(test_set,'pickup_datetime',drop=True, time=True)
features(test_set)

In [40]:
test_predictions = m.predict(test_set)

In [41]:
submission = pd.DataFrame({'key': test_key,
                          'fare_amount': test_predictions})
submission.to_csv('submissions.csv',index=False)