In [1]:
# Importing libraries
from fastai.imports import *
from fastai.structured import *

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

from IPython.display import display

import os

In [2]:
PATH = 'data'

types = {'fare_amount':'float32',
        'pickup_longitude':'float32',
        'pickup_latitude':'float32',
        'dropoff_longitude':'float32',
        'dropoff_latitude':'float32',
        'passenger_count':'int32',
         'pickup_datetime':'object'
        }
cols_to_use = ['fare_amount','pickup_datetime','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude','passenger_count']

In [3]:
# Importing data

%time df_raw = pd.read_csv(f'{PATH}/train.csv', nrows=20000000,usecols=cols_to_use, dtype = types)

Wall time: 39.6 s


In [4]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000000 entries, 0 to 19999999
Data columns (total 7 columns):
fare_amount          float32
pickup_datetime      object
pickup_longitude     float32
pickup_latitude      float32
dropoff_longitude    float32
dropoff_latitude     float32
passenger_count      int32
dtypes: float32(5), int32(1), object(1)
memory usage: 610.4+ MB


In [5]:
# Function to set display options
def display_all(df):
    with pd.option_context('display.max_rows',1000):
        with pd.option_context('display.max_columns',1000):
            display(df)

## Exploration

In [6]:
df_raw.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,20000000.0,20000000.0,20000000.0,19999860.0,19999860.0,20000000.0
mean,11.34297,-72.51132,39.92074,-72.5106,39.91951,1.685312
std,16.89916,12.98955,9.388927,12.90617,9.570038,1.321177
min,-107.75,-3439.245,-3492.264,-3442.025,-3493.652,0.0
25%,6.0,-73.99207,40.73491,-73.9914,40.73403,1.0
50%,8.5,-73.98181,40.75263,-73.98016,40.75315,1.0
75%,12.5,-73.96709,40.76712,-73.96368,40.76809,2.0
max,61550.86,3457.626,3406.008,3457.622,3400.392,208.0


Remove null values.

In [7]:
df_raw.isnull().sum()

fare_amount            0
pickup_datetime        0
pickup_longitude       0
pickup_latitude        0
dropoff_longitude    139
dropoff_latitude     139
passenger_count        0
dtype: int64

In [8]:
df_raw.dropna(axis=0, how='any', inplace=True)

Remove rows with no fare_amount

In [9]:
df_raw = df_raw[df_raw.fare_amount>=0]

Let's look at the passenger_count feature.

In [10]:
df_raw.passenger_count.unique()

array([  1,   2,   3,   6,   5,   4,   0, 208,   9, 129,   7,  51,  49,   8,  34], dtype=int64)

In [11]:
df_raw[df_raw.passenger_count>10].passenger_count.value_counts()

208    15
51      1
49      1
34      1
129     1
Name: passenger_count, dtype: int64

In [12]:
df_raw.groupby(['passenger_count']).mean()

Unnamed: 0_level_0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
passenger_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,8.992668,-72.672318,40.033203,-72.637901,40.015118
1,11.217378,-72.496201,39.916771,-72.494148,39.913269
2,11.824029,-72.594322,39.965149,-72.606537,39.973408
3,11.544141,-72.570137,39.954163,-72.589783,39.965214
4,11.779797,-72.596725,39.964191,-72.623253,39.978653
5,11.207222,-72.409363,39.838177,-72.388542,39.839916
6,12.119719,-72.547691,39.889427,-72.495483,39.849689
7,29.375,-74.038147,40.728302,-74.033234,40.718956
8,26.110001,-73.92984,40.719902,-73.986511,40.730785
9,37.473999,-73.964371,40.758514,-73.993561,40.757141


In [13]:
df_raw.passenger_count.value_counts()

1      13832062
2       2951606
5       1417620
3        878562
4        425043
6        423651
0         70454
208          15
9             5
7             4
8             3
129           1
51            1
49            1
34            1
Name: passenger_count, dtype: int64

With so many 0 passenger_values and a non-null fare_amount mean for them, we should not remove the rides with 0 values.

In [14]:
df_raw = df_raw[df_raw.passenger_count<7]

## Feature Engineering

In [15]:
%time add_datepart(df_raw,'pickup_datetime',drop=True, time=True)

Wall time: 1min 52s


In [16]:
display_all(df_raw.head(5).T)

Unnamed: 0,0,1,2,3,4
fare_amount,4.5,16.9,5.7,7.7,5.3
pickup_longitude,-73.8443,-74.016,-73.9827,-73.9871,-73.9681
pickup_latitude,40.7213,40.7113,40.7613,40.7331,40.768
dropoff_longitude,-73.8416,-73.9793,-73.9912,-73.9916,-73.9567
dropoff_latitude,40.7123,40.782,40.7506,40.7581,40.7838
passenger_count,1,1,2,1,1
pickup_datetimeYear,2009,2010,2011,2012,2010
pickup_datetimeMonth,6,1,8,4,3
pickup_datetimeWeek,25,1,33,16,10
pickup_datetimeDay,15,5,18,21,9


In [17]:
def distance_traversed(lat1,lon1,lat2,lon2):
    '''Harvesine formula'''
    radius = 6371 # km
    dlat = math.radians(lat2-lat1)
    dlon = math.radians(lon2-lon1)
    a = math.sin(dlat/2) * math.sin(dlat/2) + math.cos(math.radians(lat1)) \
        * math.cos(math.radians(lat2)) * math.sin(dlon/2) * math.sin(dlon/2)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    d = radius * c
    return d

In [18]:
def airport(name,lat1,lon1,lat2,lon2):
    '''Function to determine if either pickup or dropoff is near an airport'''
    if name == 'nyc':
        long, lat = -74.0063889, 40.7141667
    elif name == 'jfk':    
        long, lat = -73.7822222222, 40.6441666667
    elif name == 'ewr':    
        long, lat = -74.175, 40.69
    elif name == 'lgr':    
        long, lat = -73.87, 40.77
    return min(distance_traversed(lat,long,lat2,lon2),distance_traversed(lat1,lon1,lat,long))<1.5

In [19]:
def features(data):
    data['longitutde_traversed'] = (data.dropoff_longitude - data.pickup_longitude).abs()
    data['latitude_traversed'] = (data.dropoff_latitude - data.pickup_latitude).abs()
    data['distance_traversed'] = np.vectorize(distance_traversed)(data.pickup_latitude,data.pickup_longitude,data.dropoff_latitude,data.dropoff_longitude)
    data['near_nyc'] = np.vectorize(airport)('nyc',data.pickup_latitude,data.pickup_longitude,data.dropoff_latitude,data.dropoff_longitude)
    data['near_jfk'] = np.vectorize(airport)('jfk',data.pickup_latitude,data.pickup_longitude,data.dropoff_latitude,data.dropoff_longitude)
    data['near_ewr'] = np.vectorize(airport)('ewr',data.pickup_latitude,data.pickup_longitude,data.dropoff_latitude,data.dropoff_longitude)
    data['near_lgr'] = np.vectorize(airport)('lgr',data.pickup_latitude,data.pickup_longitude,data.dropoff_latitude,data.dropoff_longitude)

In [20]:
%time features(df_raw)

Wall time: 7min 59s


In [21]:
display_all(df_raw.head(2).T)

Unnamed: 0,0,1
fare_amount,4.5,16.9
pickup_longitude,-73.8443,-74.016
pickup_latitude,40.7213,40.7113
dropoff_longitude,-73.8416,-73.9793
dropoff_latitude,40.7123,40.782
passenger_count,1,1
pickup_datetimeYear,2009,2010
pickup_datetimeMonth,6,1
pickup_datetimeWeek,25,1
pickup_datetimeDay,15,5


In [23]:
#os.makedirs('tmp',exist_ok=True)
#df_raw.to_feather('tmp/raw')

## Validation Set

Let's create a validation set which is representative of our data. Let's split on the passenger_count variable.

In [24]:
df_raw.passenger_count.value_counts()/len(df_raw)

1    0.691638
2    0.147588
5    0.070885
3    0.043930
4    0.021253
6    0.021184
0    0.003523
Name: passenger_count, dtype: float64

In [39]:
split = StratifiedShuffleSplit(n_splits=1, test_size=10000, random_state=42)
for train_index, test_index in split.split(df_raw, df_raw.passenger_count):
    train_set = df_raw.loc[df_raw.index.intersection(train_index)]
    test_set = df_raw.loc[df_raw.index.intersection(test_index)]

MemoryError: 

In [26]:
train_set.passenger_count.value_counts()/len(train_set)

1.0    0.691604
2.0    0.147580
5.0    0.070881
3.0    0.043929
4.0    0.021252
6.0    0.021182
0.0    0.003522
Name: passenger_count, dtype: float64

In [27]:
test_set.passenger_count.value_counts()/len(test_set)

1    0.6895
2    0.1504
5    0.0702
3    0.0442
4    0.0214
6    0.0205
0    0.0038
Name: passenger_count, dtype: float64

## Data Visualization

In [28]:
#% matplotlib inline
#train_set.plot(kind='scatter', x='pickup_latitude',y='pickup_longitude',alpha=0.1)

## Outlier Detection

In [35]:
outliers = []

# For each feature find the data points with extreme high or low values
for feature in ['longitutde_traversed','latitude_traversed','distance_traversed']:
    Q1 = np.percentile(train_set[feature],25,axis=0)
    Q3 = np.percentile(train_set[feature],75,axis=0)
    step = 1.5*(Q3-Q1)
    feature_outlier = train_set[~((train_set[feature] >= Q1 - step) & (train_set[feature] <= Q3 + step))]
    outliers += feature_outlier.index.tolist()


  interpolation=interpolation)


In [36]:
len(outliers)/len(train_set)

3.0

In [None]:
df = df_raw.drop(df_raw.index[outliers]).reset_index(drop = True)
len(df)