In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, roc_curve, precision_score, recall_score, precision_recall_curve
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [3]:
df = pd.read_csv('nyc_taxi_trip_duration.csv')
df

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id1080784,2,2016-02-29 16:40:21,2016-02-29 16:47:01,1,-73.953918,40.778873,-73.963875,40.771164,N,400
1,id0889885,1,2016-03-11 23:35:37,2016-03-11 23:53:57,2,-73.988312,40.731743,-73.994751,40.694931,N,1100
2,id0857912,2,2016-02-21 17:59:33,2016-02-21 18:26:48,2,-73.997314,40.721458,-73.948029,40.774918,N,1635
3,id3744273,2,2016-01-05 09:44:31,2016-01-05 10:03:32,6,-73.961670,40.759720,-73.956779,40.780628,N,1141
4,id0232939,1,2016-02-17 06:42:23,2016-02-17 06:56:31,1,-74.017120,40.708469,-73.988182,40.740631,N,848
5,id1918069,2,2016-02-14 18:31:42,2016-02-14 18:55:57,2,-73.993614,40.751884,-73.995422,40.723862,N,1455
6,id2429028,1,2016-04-20 20:30:14,2016-04-20 20:36:51,1,-73.965080,40.758915,-73.976807,40.764107,N,397
7,id1663798,2,2016-06-19 16:48:14,2016-06-19 17:06:35,1,-73.963890,40.765434,-73.872429,40.774200,N,1101
8,id2436943,2,2016-03-28 19:17:03,2016-03-28 19:48:29,2,-73.872887,40.774281,-73.979019,40.761879,N,1886
9,id2933909,1,2016-04-10 22:01:41,2016-04-10 22:25:30,1,-73.987823,40.740982,-73.999153,40.686451,N,1429


In [4]:
#Now that the file has been read, it is time to have a look at the missing
#values
pd.isnull(df).sum()

id                    0
vendor_id             0
pickup_datetime       0
dropoff_datetime      0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
dtype: int64

In [4]:
#Since there are no missing values, we can go onto the next part of the 
#process which is preprocessing
#we know that the date time pickup can be subdivided into weekdays and time
#so let us do that in order to get a better understanding
dfDay = df

from datetime import date
import calendar
my_date = date.today()
import datetime
arr = []
def generateDay(feature):
    for i in range(0,len(df[feature])):
        date = df[feature][i].split()[0]
        year, month, day = (int(x) for x in date.split('-'))
        ans = datetime.date(year, month, day)
        arr.append(ans.strftime("%A"))
generateDay('pickup_datetime')
arr
dfDay['weekday'] = arr

ar = []
def generateDay(feature):
    for i in range(0,len(dfDay[feature])):
        time = dfDay[feature][i].split()[1]
        hour, minute, sec = (int(x) for x in time.split(':'))
        if (hour > 21):
            ar.append('Night: 9 - 6 am')
        elif(hour <= 21 and hour > 19):
            ar.append('Evening Office Hours: 7 - 9 pm')
        elif(hour <= 19 and hour > 16):
            ar.append('Evening  Hours: 4 - 7 pm')
        elif(hour <= 16 and hour > 12):
            ar.append('Afternoon: 12 - 4 pm')
        elif(hour <= 12 and hour > 9):
            ar.append('Morning Office Hours: 9 - 12 pm')
        else:
            ar.append('Morning: 6 - 9 pm')
generateDay('pickup_datetime')
dfDay['Time_Of_Day'] = ar
dfDay

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,weekday,Time_Of_Day
0,id1080784,2,2016-02-29 16:40:21,2016-02-29 16:47:01,1,-73.953918,40.778873,-73.963875,40.771164,N,400,Monday,Afternoon: 12 - 4 pm
1,id0889885,1,2016-03-11 23:35:37,2016-03-11 23:53:57,2,-73.988312,40.731743,-73.994751,40.694931,N,1100,Friday,Night: 9 - 6 am
2,id0857912,2,2016-02-21 17:59:33,2016-02-21 18:26:48,2,-73.997314,40.721458,-73.948029,40.774918,N,1635,Sunday,Evening Hours: 4 - 7 pm
3,id3744273,2,2016-01-05 09:44:31,2016-01-05 10:03:32,6,-73.961670,40.759720,-73.956779,40.780628,N,1141,Tuesday,Morning: 6 - 9 pm
4,id0232939,1,2016-02-17 06:42:23,2016-02-17 06:56:31,1,-74.017120,40.708469,-73.988182,40.740631,N,848,Wednesday,Morning: 6 - 9 pm
5,id1918069,2,2016-02-14 18:31:42,2016-02-14 18:55:57,2,-73.993614,40.751884,-73.995422,40.723862,N,1455,Sunday,Evening Hours: 4 - 7 pm
6,id2429028,1,2016-04-20 20:30:14,2016-04-20 20:36:51,1,-73.965080,40.758915,-73.976807,40.764107,N,397,Wednesday,Evening Office Hours: 7 - 9 pm
7,id1663798,2,2016-06-19 16:48:14,2016-06-19 17:06:35,1,-73.963890,40.765434,-73.872429,40.774200,N,1101,Sunday,Afternoon: 12 - 4 pm
8,id2436943,2,2016-03-28 19:17:03,2016-03-28 19:48:29,2,-73.872887,40.774281,-73.979019,40.761879,N,1886,Monday,Evening Hours: 4 - 7 pm
9,id2933909,1,2016-04-10 22:01:41,2016-04-10 22:25:30,1,-73.987823,40.740982,-73.999153,40.686451,N,1429,Sunday,Night: 9 - 6 am


In [5]:
#now that we have extracted the date and time, we can categorize it
dfDay = pd.concat([dfDay,pd.get_dummies(dfDay['weekday'],prefix = str('day'),prefix_sep='_')],axis = 1)
dfDay = pd.concat([dfDay,pd.get_dummies(dfDay['Time_Of_Day'],prefix = str('Time'),prefix_sep='_') ],axis = 1)
dfDay

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday,Time_Afternoon: 12 - 4 pm,Time_Evening Hours: 4 - 7 pm,Time_Evening Office Hours: 7 - 9 pm,Time_Morning Office Hours: 9 - 12 pm,Time_Morning: 6 - 9 pm,Time_Night: 9 - 6 am
0,id1080784,2,2016-02-29 16:40:21,2016-02-29 16:47:01,1,-73.953918,40.778873,-73.963875,40.771164,N,...,0,0,0,0,1,0,0,0,0,0
1,id0889885,1,2016-03-11 23:35:37,2016-03-11 23:53:57,2,-73.988312,40.731743,-73.994751,40.694931,N,...,0,0,0,0,0,0,0,0,0,1
2,id0857912,2,2016-02-21 17:59:33,2016-02-21 18:26:48,2,-73.997314,40.721458,-73.948029,40.774918,N,...,1,0,0,0,0,1,0,0,0,0
3,id3744273,2,2016-01-05 09:44:31,2016-01-05 10:03:32,6,-73.961670,40.759720,-73.956779,40.780628,N,...,0,0,1,0,0,0,0,0,1,0
4,id0232939,1,2016-02-17 06:42:23,2016-02-17 06:56:31,1,-74.017120,40.708469,-73.988182,40.740631,N,...,0,0,0,1,0,0,0,0,1,0
5,id1918069,2,2016-02-14 18:31:42,2016-02-14 18:55:57,2,-73.993614,40.751884,-73.995422,40.723862,N,...,1,0,0,0,0,1,0,0,0,0
6,id2429028,1,2016-04-20 20:30:14,2016-04-20 20:36:51,1,-73.965080,40.758915,-73.976807,40.764107,N,...,0,0,0,1,0,0,1,0,0,0
7,id1663798,2,2016-06-19 16:48:14,2016-06-19 17:06:35,1,-73.963890,40.765434,-73.872429,40.774200,N,...,1,0,0,0,1,0,0,0,0,0
8,id2436943,2,2016-03-28 19:17:03,2016-03-28 19:48:29,2,-73.872887,40.774281,-73.979019,40.761879,N,...,0,0,0,0,0,1,0,0,0,0
9,id2933909,1,2016-04-10 22:01:41,2016-04-10 22:25:30,1,-73.987823,40.740982,-73.999153,40.686451,N,...,1,0,0,0,0,0,0,0,0,1


In [6]:
#We don't have to scale anything as there aren't any outliers persay
#adding a new column distance, because we can't predict anything just
#based on longitudes and latitudes in raw form
pickup = []
from math import sin, cos, sqrt, atan2, radians

# approximate radius of earth in km
R = 6373.0
#Calculating the distance between the pickup and drop off location

for i in range(0, len(df)):
    lat1 = radians(dfDay['pickup_latitude'][i])
    lon1 = radians(dfDay['pickup_longitude'][i])
    lat2 = radians(dfDay['dropoff_latitude'][i])
    lon2 = radians(dfDay['dropoff_longitude'][i])
    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = (sin(dlat / 2)**2) + cos(lat1) * cos(lat2) * (sin(dlon / 2)**2)
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c
    pickup.append(distance)

dfDay["Distance"] = pickup
dfDay


Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,day_Thursday,day_Tuesday,day_Wednesday,Time_Afternoon: 12 - 4 pm,Time_Evening Hours: 4 - 7 pm,Time_Evening Office Hours: 7 - 9 pm,Time_Morning Office Hours: 9 - 12 pm,Time_Morning: 6 - 9 pm,Time_Night: 9 - 6 am,Distance
0,id1080784,2,2016-02-29 16:40:21,2016-02-29 16:47:01,1,-73.953918,40.778873,-73.963875,40.771164,N,...,0,0,0,1,0,0,0,0,0,1.199449
1,id0889885,1,2016-03-11 23:35:37,2016-03-11 23:53:57,2,-73.988312,40.731743,-73.994751,40.694931,N,...,0,0,0,0,0,0,0,0,1,4.130407
2,id0857912,2,2016-02-21 17:59:33,2016-02-21 18:26:48,2,-73.997314,40.721458,-73.948029,40.774918,N,...,0,0,0,0,1,0,0,0,0,7.253029
3,id3744273,2,2016-01-05 09:44:31,2016-01-05 10:03:32,6,-73.961670,40.759720,-73.956779,40.780628,N,...,0,1,0,0,0,0,0,1,0,2.361839
4,id0232939,1,2016-02-17 06:42:23,2016-02-17 06:56:31,1,-74.017120,40.708469,-73.988182,40.740631,N,...,0,0,1,0,0,0,0,1,0,4.329893
5,id1918069,2,2016-02-14 18:31:42,2016-02-14 18:55:57,2,-73.993614,40.751884,-73.995422,40.723862,N,...,0,0,0,0,1,0,0,0,0,3.120691
6,id2429028,1,2016-04-20 20:30:14,2016-04-20 20:36:51,1,-73.965080,40.758915,-73.976807,40.764107,N,...,0,0,1,0,0,1,0,0,0,1.144338
7,id1663798,2,2016-06-19 16:48:14,2016-06-19 17:06:35,1,-73.963890,40.765434,-73.872429,40.774200,N,...,0,0,0,1,0,0,0,0,0,7.766026
8,id2436943,2,2016-03-28 19:17:03,2016-03-28 19:48:29,2,-73.872887,40.774281,-73.979019,40.761879,N,...,0,0,0,0,1,0,0,0,0,9.046486
9,id2933909,1,2016-04-10 22:01:41,2016-04-10 22:25:30,1,-73.987823,40.740982,-73.999153,40.686451,N,...,0,0,0,0,0,0,0,0,1,6.140237


In [7]:
#Now its time to choose the baseline column which I feel play the most 
#important role in affecting the duration of the time
baseline_cols = ['day_Friday','day_Monday', 'day_Saturday', 'day_Sunday', u'day_Thursday',
       'day_Tuesday', 'day_Wednesday', 'Time_Afternoon: 12 - 4 pm',
       'Time_Evening  Hours: 4 - 7 pm',
       'Time_Evening Office Hours: 7 - 9 pm',
       'Time_Morning Office Hours: 9 - 12 pm', 'Time_Morning: 6 - 9 pm',
       'Time_Night: 9 - 6 am', 'Distance']

In [8]:
df_baseline = dfDay[baseline_cols]
df_baseline
y_all = df.trip_duration
y_all

0           400
1          1100
2          1635
3          1141
4           848
5          1455
6           397
7          1101
8          1886
9          1429
10          527
11          911
12          311
13         1070
14          661
15        86164
16          891
17         1547
18          717
19         1375
20          440
21         1937
22         1905
23          325
24          186
25          405
26         2350
27          283
28         1277
29          320
          ...  
729292     1628
729293      399
729294      701
729295      576
729296      141
729297      687
729298      400
729299      982
729300      352
729301     1110
729302      530
729303     1041
729304      458
729305      594
729306     3306
729307      642
729308     1497
729309      636
729310      110
729311      444
729312     1526
729313     1325
729314     1338
729315      110
729316      449
729317      296
729318      315
729319      673
729320      447
729321     1224
Name: trip_duration, Len

In [9]:
#Now it is time to start training our baseline data-Set
xtrain, xtest, ytrain, ytest = train_test_split(df_baseline,y_all,test_size=0.33, random_state=42)

In [11]:
ytrain

533076    2552
139721     421
216358    1876
220721    2400
683398     405
606112    1084
94491      549
136565     819
511550     122
291103    1388
541817    1371
390764     527
286512     235
245005     581
520917    2843
423665     296
401955     354
338082     844
472509     407
348212     354
575550    1033
288897    1350
284130     998
155293     275
516731    1933
336050     481
599314    1114
723769     736
216659     516
470901     413
          ... 
500186    1850
486232    1179
258795     781
421909     421
347449    1661
184779     836
214176     714
235796    1004
103355     360
199041    2005
327069     805
718315     295
321879     168
64820       48
329365     894
41090      550
278167     280
191335     481
175203     123
87498      773
521430     638
137337     148
54886     1375
110268    1555
644167     668
259178     501
365838     298
131932     132
671155     860
121958     751
Name: trip_duration, Length: 488645, dtype: int64

In [None]:
model = LogisticRegression()
model.fit(xtrain,ytrain)
pred = model.predict_proba(xtest)[:,1]

In [None]:
pred

In [None]:
from sklearn.metrics import roc_curve
fpr, tpr, _ = roc_curve(ytest,pred) 
auc = roc_auc_score(ytest, pred) 
plt.figure(figsize=(12,8)) 
plt.plot(fpr,tpr,label="Validation AUC-ROC="+str(auc)) 
x = np.linspace(0, 1, 1000)
plt.plot(x, x, linestyle='-')
plt.xlabel('False Positive Rate') 
plt.ylabel('True Positive Rate') 
plt.legend(loc=4) 
plt.show()

In [None]:
def cv_score(ml_model, rstate = 12, thres = 0.5, cols = df.columns):
    i = 1
    cv_scores = []
    df1 = df.copy()
    df1 = df[cols]
    
    # 5 Fold cross validation stratified on the basis of target
    kf = StratifiedKFold(n_splits=5,random_state=rstate,shuffle=True)
    for df_index,test_index in kf.split(df1,y_all):
        print('\n{} of kfold {}'.format(i,kf.n_splits))
        xtr,xvl = df1.loc[df_index],df1.loc[test_index]
        ytr,yvl = y_all.loc[df_index],y_all.loc[test_index]
            
        # Define model for fitting on the training set for each fold
        model = ml_model
        model.fit(xtr, ytr)
        pred_probs = model.predict_proba(xvl)
        pp = []
         
        # Use threshold to define the classes based on probability values
        for j in pred_probs[:,1]:
            if j>thres:
                pp.append(1)
            else:
                pp.append(0)
         
        # Calculate scores for each fold and print
        pred_val = pp
        roc_score = roc_auc_score(yvl,pred_probs[:,1])
        recall = recall_score(yvl,pred_val)
        precision = precision_score(yvl,pred_val)
        sufix = ""
        msg = ""
        msg += "ROC AUC Score: {}, Recall Score: {:.4f}, Precision Score: {:.4f} ".format(roc_score, recall,precision)
        print("{}".format(msg))
         
         # Save scores
        cv_scores.append(roc_score)
        i+=1
    return cv_scores

baseline_scores = cv_score(LogisticRegression(), cols = baseline_cols)

In [None]:
all_feat_scores = cv_score(LogisticRegression())

In [None]:
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt

# Create the RFE object and rank each feature
model = LogisticRegression()
rfe = RFE(estimator=model, n_features_to_select=1, step=1)
rfe.fit(dfDay, y_all)

In [None]:
ranking_df = pd.DataFrame()
ranking_df['Feature_name'] = df.columns
ranking_df['Rank'] = rfe.ranking_
ranked = ranking_df.sort_values(by=['Rank'])

In [None]:
rfe_top_10_scores = cv_score(LogisticRegression(), cols = ranked['Feature_name'][:10].values)
results_df = pd.DataFrame({'baseline':baseline_scores, 'all_feats': all_feat_scores, 'rfe_top_10': rfe_top_10_scores})
results_df.plot(y=["baseline", "all_feats", "rfe_top_10"], kind="bar")