In [1]:
import pandas as pd


In [2]:
# Load the training data. The two timestamp columns are selected by name
# (they sit at positions 2 and 3 in this file) so that a change in column
# order cannot silently parse the wrong fields.
df = pd.read_csv(
    "./data/train.csv",
    parse_dates=["pickup_datetime", "dropoff_datetime"],
)

In [3]:
# Check column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
id                    1458644 non-null object
vendor_id             1458644 non-null int64
pickup_datetime       1458644 non-null datetime64[ns]
dropoff_datetime      1458644 non-null datetime64[ns]
passenger_count       1458644 non-null int64
pickup_longitude      1458644 non-null float64
pickup_latitude       1458644 non-null float64
dropoff_longitude     1458644 non-null float64
dropoff_latitude      1458644 non-null float64
store_and_fwd_flag    1458644 non-null object
trip_duration         1458644 non-null int64
dtypes: datetime64[ns](2), float64(4), int64(3), object(2)
memory usage: 122.4+ MB


In [4]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [5]:
# Obtain distance between two points on earth
# https://en.wikipedia.org/wiki/Great-circle_distance

# The function below gives the distance in km between two points on the earth's surface identified by
# their longitudes and latitudes (from_long, from_lat) & (to_long, to_lat)
# 0.009 degrees of difference in x co-ordinate (longitude) = 1.002 km at the equator (less at higher latitudes)
# 0.009 degrees of difference in y co-ordinate (latitude) = 1.002 km

from math import sin, cos, acos

def get_distance(from_long, from_lat, to_long, to_lat):
    """Return the great-circle distance in kilometres between two points.

    The distance is computed with the spherical law of cosines on a sphere
    of radius 6378 km.

    Parameters
    ----------
    from_long, from_lat : float
        Longitude and latitude of the start point, in decimal degrees.
    to_long, to_lat : float
        Longitude and latitude of the end point, in decimal degrees.

    Returns
    -------
    float
        Distance in kilometres; 0.0 when both points are identical.
    """
    if from_long == to_long and from_lat == to_lat:
        return 0.0

    v_pi = 3.141592653589793  # pi at full double precision
    v_earth_radius = 6378     # Radius of the Earth in km
    v_deg_to_rad = v_pi / 180

    v_from_x_radians = v_deg_to_rad * from_long
    v_from_y_radians = v_deg_to_rad * from_lat
    v_to_x_radians = v_deg_to_rad * to_long
    v_to_y_radians = v_deg_to_rad * to_lat

    # Cosine of the central angle between the two points.
    v_cos_angle = (sin(v_from_y_radians) * sin(v_to_y_radians)
                   + cos(v_from_y_radians) * cos(v_to_y_radians)
                   * cos(v_from_x_radians - v_to_x_radians))

    # Floating-point rounding can push the cosine a hair outside [-1, 1] for
    # (nearly) identical or antipodal points, which would make acos raise
    # "ValueError: math domain error". Explicit comparisons are used so that
    # a NaN input still propagates as NaN instead of being clamped.
    if v_cos_angle > 1.0:
        v_cos_angle = 1.0
    elif v_cos_angle < -1.0:
        v_cos_angle = -1.0

    return acos(v_cos_angle) * v_earth_radius  # Unit in kilometer

In [6]:
# Great-circle distance (km) of every trip. Iterating over the four
# coordinate columns in lock-step avoids materialising a Series per row,
# which is what df.apply(..., axis=1) does.
df['trip_distance'] = [
    get_distance(from_long, from_lat, to_long, to_lat)
    for from_long, from_lat, to_long, to_lat in zip(
        df['pickup_longitude'], df['pickup_latitude'],
        df['dropoff_longitude'], df['dropoff_latitude']
    )
]

In [7]:
# Preview the frame again; it now carries the computed trip_distance column.
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,trip_distance
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,1.500167
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,1.807491
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,6.392114
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,1.487131
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,1.189894


#### The distance can be verified using https://www.distancefromto.net/

#### Bin the trip_distance

In [13]:
# Frequency of trips per whole-kilometre distance, most common first.
df['trip_distance'].round(0).value_counts()

1.0       444335
2.0       356495
3.0       195111
4.0       111663
5.0        71053
6.0        49005
0.0        48782
7.0        32726
9.0        24931
8.0        23229
10.0       21328
11.0       11911
21.0       10357
20.0       10260
12.0        7567
14.0        6531
13.0        6374
19.0        4346
15.0        4096
22.0        4045
16.0        3973
18.0        3401
17.0        3318
23.0        1708
24.0         553
25.0         274
26.0         251
27.0         191
28.0         134
29.0          92
           ...  
576.0          1
320.0          1
55.0           1
892.0          1
173.0          1
117.0          1
115.0          1
113.0          1
215.0          1
105.0          1
102.0          1
98.0           1
255.0          1
94.0           1
91.0           1
86.0           1
84.0           1
83.0           1
315.0          1
1242.0         1
76.0           1
75.0           1
72.0           1
71.0           1
70.0           1
68.0           1
192.0          1
135.0         

#### We can bin the trip_distance into the ranges < 0.5 km, 0.5 - 1, 1 - 1.5, 1.5 - 2, 2 - 2.5, 2.5 - 3, 3 - 3.5, 3.5 - 4, 4 - 4.5, 4.5 - 5, 5 - 6, 6 - 7, 7 - 8, 8 - 9, 9 - 10, 10 - 15, 15 - 20 and >= 20 km (lower bound inclusive, upper bound exclusive)

In [18]:
# td0 flags trips shorter than 0.5 km (1 = in bin, 0 = otherwise).
# The comparison is vectorised over the whole column instead of a per-row apply.
df['td0'] = (df['trip_distance'] < 0.5).astype('int64')

In [20]:
# Preview the frame with the new td0 indicator column.
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,trip_distance,td0
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,1.500167,0
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,1.807491,0
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,6.392114,0
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,1.487131,0
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,1.189894,0


In [21]:
# td1 flags trips with 0.5 km <= distance < 1.0 km (1 = in bin, 0 = otherwise).
# The comparison is vectorised over the whole column instead of a per-row apply.
df['td1'] = (
    (df['trip_distance'] >= 0.5) & (df['trip_distance'] < 1.0)
).astype('int64')

In [23]:
# Spot-check: show the first rows that were flagged as belonging to the td1 bin.
df[df['td1'] > 0].head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,trip_distance,td0,td1
13,id0799785,2,2016-06-01 20:58:29,2016-06-01 21:02:49,1,-73.956306,40.767941,-73.96611,40.763,N,260,0.992774,0,1
15,id3319787,1,2016-05-16 15:29:02,2016-05-16 15:32:33,1,-73.955513,40.768593,-73.948761,40.771545,N,211,0.657299,0,1
37,id2403238,1,2016-06-23 23:00:09,2016-06-23 23:04:46,1,-73.988068,40.728081,-73.980751,40.72147,N,277,0.96049,0,1
68,id1680350,1,2016-03-18 20:56:16,2016-03-18 20:59:51,1,-74.002983,40.723312,-74.001556,40.728825,N,215,0.625299,0,1
83,id1205949,1,2016-06-05 18:43:14,2016-06-05 18:46:52,1,-73.962608,40.758827,-73.954193,40.764053,N,218,0.91754,0,1


In [30]:
# Create columns td2 to td9: one 0/1 indicator per 0.5 km wide bin covering
# 1.0 km <= distance < 5.0 km. Each bin is computed with a vectorised mask
# over the whole column rather than a per-row apply.
# d and cn keep their final values (5.0 and 10) after the loop, exactly as
# before, in case later cells read them.
d = 1
cn = 2
while d < 5:
    print(d)
    col_name = 'td' + str(cn)
    print(col_name)

    in_bin = (df['trip_distance'] >= d) & (df['trip_distance'] < d + 0.5)
    df[col_name] = in_bin.astype('int64')

    d += 0.5
    cn += 1



1
td2
1.5
td3
2.0
td4
2.5
td5
3.0
td6
3.5
td7
4.0
td8
4.5
td9


In [31]:
# 5 to 10 in steps of 1 km: indicator columns td10 to td14.
# d and cn are set explicitly (to the values the 0.5 km binning loop ends
# with) so this cell gives the same result when re-run on its own instead of
# depending on leftover kernel state. The mask is vectorised over the column.
d = 5.0
cn = 10
while d < 10:
    print(d)
    col_name = 'td' + str(cn)
    print(col_name)

    in_bin = (df['trip_distance'] >= d) & (df['trip_distance'] < d + 1.0)
    df[col_name] = in_bin.astype('int64')

    d += 1.0
    cn += 1

5.0
td10
6.0
td11
7.0
td12
8.0
td13
9.0
td14


In [32]:
# 10 to 20 in steps of 5 km: indicator columns td15 and td16.
# d and cn are set explicitly (to the values the 1 km binning loop ends with)
# so this cell gives the same result when re-run on its own instead of
# depending on leftover kernel state. The mask is vectorised over the column.
d = 10.0
cn = 15
while d < 20:
    print(d)
    col_name = 'td' + str(cn)
    print(col_name)

    in_bin = (df['trip_distance'] >= d) & (df['trip_distance'] < d + 5.0)
    df[col_name] = in_bin.astype('int64')

    d += 5.0
    cn += 1

10.0
td15
15.0
td16


In [33]:
# Re-check the schema after adding the distance-bin indicator columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 29 columns):
id                    1458644 non-null object
vendor_id             1458644 non-null int64
pickup_datetime       1458644 non-null datetime64[ns]
dropoff_datetime      1458644 non-null datetime64[ns]
passenger_count       1458644 non-null int64
pickup_longitude      1458644 non-null float64
pickup_latitude       1458644 non-null float64
dropoff_longitude     1458644 non-null float64
dropoff_latitude      1458644 non-null float64
store_and_fwd_flag    1458644 non-null object
trip_duration         1458644 non-null int64
trip_distance         1458644 non-null float64
td0                   1458644 non-null int64
td1                   1458644 non-null int64
td2                   1458644 non-null int64
td3                   1458644 non-null int64
td4                   1458644 non-null int64
td5                   1458644 non-null int64
td6                   1458644 non-null int64
td

In [34]:
# td17 flags trips of 20 km or more (1 = in bin, 0 = otherwise).
# The comparison is vectorised over the whole column instead of a per-row apply.
df['td17'] = (df['trip_distance'] >= 20).astype('int64')

In [35]:
# Preview the frame with all distance-bin columns up to td17.
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,td8,td9,td10,td11,td12,td13,td14,td15,td16,td17
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,...,0,0,0,0,0,0,0,0,0,0
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,...,0,0,0,0,0,0,0,0,0,0
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,...,0,0,0,1,0,0,0,0,0,0
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,...,0,0,0,0,0,0,0,0,0,0
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# Coarsen the drop-off coordinates to two decimal places.
df = df.assign(
    drp_lon=df['dropoff_longitude'].round(2),
    drp_lat=df['dropoff_latitude'].round(2),
)

In [37]:
# Preview the frame with the rounded drop-off coordinates drp_lon / drp_lat.
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,td10,td11,td12,td13,td14,td15,td16,td17,drp_lon,drp_lat
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,...,0,0,0,0,0,0,0,0,-73.96,40.77
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,...,0,0,0,0,0,0,0,0,-74.0,40.73
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,...,0,1,0,0,0,0,0,0,-74.01,40.71
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,...,0,0,0,0,0,0,0,0,-74.01,40.71
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,...,0,0,0,0,0,0,0,0,-73.97,40.78


In [38]:
# Function to round a given datetime down to the nearest multiple of p_i_min minutes within its hour (15 is used below)
from datetime import datetime
def dtm_round_to_mins(dtm, p_i_min):
    """Floor ``dtm`` to the start of its ``p_i_min``-minute slot within the hour.

    Parameters
    ----------
    dtm : datetime.datetime
        Timestamp to floor; only its year, month, day, hour and minute
        attributes are read.
    p_i_min : int
        Slot width in minutes (e.g. 15 gives minutes 0, 15, 30 or 45).

    Returns
    -------
    datetime.datetime
        Naive datetime with the minute rounded down to a multiple of
        ``p_i_min`` and seconds/microseconds set to zero.
    """
    floored_minute = (dtm.minute // p_i_min) * p_i_min
    # Build the result directly instead of formatting to a string and parsing
    # it back: faster, and it does not rely on '%Y' being zero-padded to four
    # digits (which strftime does not guarantee for years below 1000).
    return datetime(dtm.year, dtm.month, dtm.day, dtm.hour, floored_minute)

In [39]:
# Drop-off time floored to the start of its 15-minute slot. The datetime
# accessor floors the whole column at once instead of a per-row apply.
df['drp_dtm'] = df['dropoff_datetime'].dt.floor('15min')

In [40]:
# Preview the frame with the 15-minute drop-off slot column drp_dtm.
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,td11,td12,td13,td14,td15,td16,td17,drp_lon,drp_lat,drp_dtm
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,...,0,0,0,0,0,0,0,-73.96,40.77,2016-03-14 17:30:00
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,...,0,0,0,0,0,0,0,-74.0,40.73,2016-06-12 00:45:00
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,...,1,0,0,0,0,0,0,-74.01,40.71,2016-01-19 12:00:00
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,...,0,0,0,0,0,0,0,-74.01,40.71,2016-04-06 19:30:00
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,...,0,0,0,0,0,0,0,-73.97,40.78,2016-03-26 13:30:00


In [41]:
# Re-check the schema before writing the processed frame to disk.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 33 columns):
id                    1458644 non-null object
vendor_id             1458644 non-null int64
pickup_datetime       1458644 non-null datetime64[ns]
dropoff_datetime      1458644 non-null datetime64[ns]
passenger_count       1458644 non-null int64
pickup_longitude      1458644 non-null float64
pickup_latitude       1458644 non-null float64
dropoff_longitude     1458644 non-null float64
dropoff_latitude      1458644 non-null float64
store_and_fwd_flag    1458644 non-null object
trip_duration         1458644 non-null int64
trip_distance         1458644 non-null float64
td0                   1458644 non-null int64
td1                   1458644 non-null int64
td2                   1458644 non-null int64
td3                   1458644 non-null int64
td4                   1458644 non-null int64
td5                   1458644 non-null int64
td6                   1458644 non-null int64
td

In [42]:
# Persist the frame (distance bins, rounded drop-off location and time slot)
# as CSV without writing the row index.
df.to_csv('./data/lp_processed.csv', index=False)

#### Create feature sets based on trip duration in minutes: 0-15, 15-30, 30-45, 45-60, 60-75, 75-90, 90-105, 105-120 and >= 120

In [49]:
# Cross every distance-bin indicator td0..td17 with a trip-duration window:
# eight 15-minute windows covering [0, 120) minutes plus one window for
# 120 minutes and longer. A feature keeps the indicator's value when the
# trip's duration falls inside the window and is 0 otherwise.
#
# The duration masks do not depend on the distance bin, so they are computed
# once up front and applied column-wise instead of running a per-row apply
# for each of the 162 new columns.
trip_secs = df['trip_duration']  # compared against minutes * 60 below
duration_windows = [
    (str(j), (trip_secs >= j * 60) & (trip_secs < (j + 15) * 60))
    for j in range(0, 120, 15)
]
duration_windows.append(('gte_120', trip_secs >= 120 * 60))

for i in range(18):
    td_col_name = 'td' + str(i)
    for suffix, in_window in duration_windows:
        col_name = td_col_name + '_' + suffix
        print(col_name)
        # Keep the indicator where the duration matches, else 0.
        df[col_name] = df[td_col_name].where(in_window, 0)
        

td0_0
td0_15
td0_30
td0_45
td0_60
td0_75
td0_90
td0_105
td0_gte_120
td1_0
td1_15
td1_30
td1_45
td1_60
td1_75
td1_90
td1_105
td1_gte_120
td2_0
td2_15
td2_30
td2_45
td2_60
td2_75
td2_90
td2_105
td2_gte_120
td3_0
td3_15
td3_30
td3_45
td3_60
td3_75
td3_90
td3_105
td3_gte_120
td4_0
td4_15
td4_30
td4_45
td4_60
td4_75
td4_90
td4_105
td4_gte_120
td5_0
td5_15
td5_30
td5_45
td5_60
td5_75
td5_90
td5_105
td5_gte_120
td6_0
td6_15
td6_30
td6_45
td6_60
td6_75
td6_90
td6_105
td6_gte_120
td7_0
td7_15
td7_30
td7_45
td7_60
td7_75
td7_90
td7_105
td7_gte_120
td8_0
td8_15
td8_30
td8_45
td8_60
td8_75
td8_90
td8_105
td8_gte_120
td9_0
td9_15
td9_30
td9_45
td9_60
td9_75
td9_90
td9_105
td9_gte_120
td10_0
td10_15
td10_30
td10_45
td10_60
td10_75
td10_90
td10_105
td10_gte_120
td11_0
td11_15
td11_30
td11_45
td11_60
td11_75
td11_90
td11_105
td11_gte_120
td12_0
td12_15
td12_30
td12_45
td12_60
td12_75
td12_90
td12_105
td12_gte_120
td13_0
td13_15
td13_30
td13_45
td13_60
td13_75
td13_90
td13_105
td13_gte_120
td14_0
td14_

In [50]:
# Persist the frame including the distance x duration feature columns as CSV
# without writing the row index.
df.to_csv('./data/lp_processed_2.csv', index=False)

In [51]:
# Preview the frame with the distance x duration feature columns.
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,td16_gte_120,td17_0,td17_15,td17_30,td17_45,td17_60,td17_75,td17_90,td17_105,td17_gte_120
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,...,0,0,0,0,0,0,0,0,0,0
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,...,0,0,0,0,0,0,0,0,0,0
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,...,0,0,0,0,0,0,0,0,0,0
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,...,0,0,0,0,0,0,0,0,0,0
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,...,0,0,0,0,0,0,0,0,0,0


In [52]:
# Final schema and memory footprint after all feature columns were added.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Columns: 195 entries, id to td17_gte_120
dtypes: datetime64[ns](3), float64(7), int64(183), object(2)
memory usage: 2.1+ GB
