# **NYC TAXI FARE CLASSIFICATION**

In this notebook, the features of the cleaned dataset are transformed into a form suitable for classification (prediction of fare classes).

## Data preparation

In [None]:
 import pandas as pd

# Load the cleaned trip table from the Colab working directory.
data = pd.read_csv('/content/final_df.csv')

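To make use of the geographical pickup and dropoff locations, each location is assigned to one of the orientations north, south, east, west, centre, or a combination of them (e.g. north-west). The boundaries between the orientations are derived from the mean and the standard deviation of the coordinates.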

In [None]:
# Summary statistics of the numeric columns. Rows and columns are addressed
# by label rather than by position, so the thresholds stay correct even if
# the column order of the input file changes.
stat = data.describe()
std_lat = (stat.loc['std', 'pickup_latitude'] + stat.loc['std', 'dropoff_latitude']) / 2
std_lon = (stat.loc['std', 'pickup_longitude'] + stat.loc['std', 'dropoff_longitude']) / 2
mean_lat = (stat.loc['mean', 'pickup_latitude'] + stat.loc['mean', 'dropoff_latitude']) / 2
mean_lon = (stat.loc['mean', 'pickup_longitude'] + stat.loc['mean', 'dropoff_longitude']) / 2

# A point counts as "north" when its latitude lies at least one standard
# deviation above the mean latitude, and as "south" when it lies at least
# one standard deviation below it.
trsh_n = mean_lat + std_lat
trsh_s = mean_lat - std_lat
# NOTE(review): the longitudes in this dataset are negative, so
# mean_lon + std_lon is the *eastern* boundary. The cells below label
# longitudes >= trsh_w as "W", which therefore marks the eastern points (and
# "E" the western ones). Names and values are kept unchanged here because the
# orientation cells depend on them; only the label meaning is affected.
trsh_w = mean_lon + std_lon
trsh_e = mean_lon - std_lon

In [None]:
## Pickup orientation: combine the latitude band (north / middle / south)
## with the longitude band (west / middle / east) into one of nine labels.
pu_lat = data['pickup_latitude']
pu_lon = data['pickup_longitude']

# Latitude bands
pu_north = pu_lat >= trsh_n
pu_south = pu_lat <= trsh_s
pu_mid_lat = (pu_lat < trsh_n) & (pu_lat > trsh_s)

# Longitude bands
pu_west = pu_lon >= trsh_w
pu_east = pu_lon <= trsh_e
pu_mid_lon = (pu_lon < trsh_w) & (pu_lon > trsh_e)

# The labels are written in a fixed order so that, should two masks ever
# overlap, the later assignment wins.
for pu_label, pu_mask in [
    ('NW_pickup', pu_north & pu_west),
    ('NE_pickup', pu_north & pu_east),
    ('N_pickup', pu_north & pu_mid_lon),
    ('SW_pickup', pu_south & pu_west),
    ('SE_pickup', pu_south & pu_east),
    ('S_pickup', pu_south & pu_mid_lon),
    ('C_pickup', pu_mid_lat & pu_mid_lon),
    ('W_pickup', pu_mid_lat & pu_west),
    ('E_pickup', pu_mid_lat & pu_east),
]:
    data.loc[pu_mask, 'pickup_orientation'] = pu_label

In [None]:
## Dropoff orientation: same nine-way split as for the pickup location,
## applied to the dropoff coordinates.
do_lat = data['dropoff_latitude']
do_lon = data['dropoff_longitude']

# Latitude bands
do_north = do_lat >= trsh_n
do_south = do_lat <= trsh_s
do_mid_lat = (do_lat < trsh_n) & (do_lat > trsh_s)

# Longitude bands
do_west = do_lon >= trsh_w
do_east = do_lon <= trsh_e
do_mid_lon = (do_lon < trsh_w) & (do_lon > trsh_e)

# The labels are written in a fixed order so that, should two masks ever
# overlap, the later assignment wins.
for do_label, do_mask in [
    ('NW_dropoff', do_north & do_west),
    ('NE_dropoff', do_north & do_east),
    ('N_dropoff', do_north & do_mid_lon),
    ('SW_dropoff', do_south & do_west),
    ('SE_dropoff', do_south & do_east),
    ('S_dropoff', do_south & do_mid_lon),
    ('C_dropoff', do_mid_lat & do_mid_lon),
    ('W_dropoff', do_mid_lat & do_west),
    ('E_dropoff', do_mid_lat & do_east),
]:
    data.loc[do_mask, 'dropoff_orientation'] = do_label

In [None]:
## Partition of dataseries in classes with same length
import numpy as np
import math as m

def Partition(data, feat_index, partitions):
  """Split the value range of one numeric column into equal-width classes.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame holding the column to partition.
  feat_index : int or str
      Position of the column within ``data.describe()`` (i.e. among the
      columns describe() reports), or alternatively the column name.
  partitions : int
      Number of classes to create; must be at least 1.

  Returns
  -------
  ranges : numpy.ndarray of shape (partitions, 2)
      Integer ``[lower, upper]`` bounds of each class. The first lower bound
      is the minimum rounded down; every other bound is ``min + k * width``
      rounded up, so consecutive classes share their boundary and the last
      upper bound is never below the maximum.
  minimum, maximum
      Smallest and largest value of the column.

  Raises
  ------
  ValueError
      If ``partitions`` is smaller than 1.
  """
  # Local import: the function must not depend on a notebook-level alias for
  # the math module, which other cells may rebind.
  import math

  if partitions < 1:
    raise ValueError("partitions must be a positive integer")

  # describe() is comparatively expensive on large frames, so call it once.
  stats = data.describe()
  column = stats[feat_index] if isinstance(feat_index, str) else stats.iloc[:, feat_index]
  minimum = column['min']
  maximum = column['max']
  width = (maximum - minimum) / partitions

  # Edges are offset by the minimum; without the offset the classes would
  # not have equal width and the top class would end below the maximum.
  edges = [math.floor(minimum)]
  edges.extend(math.ceil(minimum + width * k) for k in range(1, partitions + 1))

  ranges = np.column_stack((edges[:-1], edges[1:]))
  return ranges, minimum, maximum

In [None]:
## Assign distance classes: three bins over the trip distance D
import math as m
R = Partition(data, 6, 3)   # column 6 of describe() is the trip distance 'D'
R_arr = R[0]                # rows: one [lower, upper] pair per class

trip_dist = data['D']
is_near = (trip_dist >= R_arr[0,0]) & (trip_dist < R_arr[0,1])
is_medium = (trip_dist >= R_arr[1,0]) & (trip_dist < R_arr[1,1])
is_far = trip_dist >= R_arr[2,0]   # open-ended: everything from the last lower bound upwards

data.loc[is_near, 'D_class'] = 'near dist'
data.loc[is_medium, 'D_class'] = 'medium dist'
data.loc[is_far, 'D_class'] = 'far dist'

In [None]:
## Assign fare classes: three bins over fare_amount (this is the target)
import math as m
R = Partition(data, 0, 3)   # column 0 of describe() is 'fare_amount'
R_arr = R[0]                # rows: one [lower, upper] pair per class

fare = data['fare_amount']
data.loc[(fare >= R_arr[0,0]) & (fare < R_arr[0,1]), 'fare_class'] = 'low fare'
data.loc[(fare >= R_arr[1,0]) & (fare < R_arr[1,1]), 'fare_class'] = 'medium fare'
# The top class is open-ended so that the maximum fare is always included.
data.loc[fare >= R_arr[2,0], 'fare_class'] = 'high fare'

In [None]:
# Check that the four new class columns exist and contain no missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 909157 entries, 0 to 909156
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   fare_amount          909157 non-null  float64
 1   pickup_longitude     909157 non-null  float64
 2   pickup_latitude      909157 non-null  float64
 3   dropoff_longitude    909157 non-null  float64
 4   dropoff_latitude     909157 non-null  float64
 5   passenger_count      909157 non-null  int64  
 6   D                    909157 non-null  float64
 7   wd                   909157 non-null  int64  
 8   month                909157 non-null  int64  
 9   year                 909157 non-null  int64  
 10  hour                 909157 non-null  int64  
 11  pickup_orientation   909157 non-null  object 
 12  dropoff_orientation  909157 non-null  object 
 13  D_class              909157 non-null  object 
 14  fare_class           909157 non-null  object 
dtypes: float64(6), in

In [None]:
# Drop the raw fare, coordinates and distance; what remains are the integer
# time/passenger columns and the class columns derived above.
classes = data.drop(columns=['fare_amount', 'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude', 'D'])
# NOTE(review): this binds a second name to the same DataFrame object, it does
# not copy it. Columns added to `classes` afterwards are therefore also
# visible through `df`; use `classes.copy()` if an independent frame is wanted.
df = classes # New dataframe containing only nominal values and integers


Numerical values must be assigned to the classes.

In [None]:
## Encode the pickup orientation labels as integer codes
from sklearn.preprocessing import LabelEncoder

pl = LabelEncoder()
pickup_labels = pl.fit_transform(classes['pickup_orientation'])

# Lookup table code -> label; the position of a label in classes_ is the
# integer code the encoder assigned to it.
pickup_mappings = dict(enumerate(pl.classes_))

pickup_mappings

{0: 'C_pickup',
 1: 'E_pickup',
 2: 'NE_pickup',
 3: 'NW_pickup',
 4: 'N_pickup',
 5: 'SE_pickup',
 6: 'SW_pickup',
 7: 'S_pickup',
 8: 'W_pickup'}

In [None]:
# Encode the dropoff orientation labels as integer codes.
dl = LabelEncoder()
dropoff_labels = dl.fit_transform(classes['dropoff_orientation'])

# Lookup table code -> label; the position of a label in classes_ is the
# integer code the encoder assigned to it.
dropoff_mappings = dict(enumerate(dl.classes_))

dropoff_mappings

{0: 'C_dropoff',
 1: 'E_dropoff',
 2: 'NE_dropoff',
 3: 'NW_dropoff',
 4: 'N_dropoff',
 5: 'SE_dropoff',
 6: 'SW_dropoff',
 7: 'S_dropoff',
 8: 'W_dropoff'}

In [None]:
# Store the integer orientation codes next to the original string labels;
# the one-hot encoding below is built from these two code columns.
classes['pickup'] = pickup_labels
classes['dropoff'] = dropoff_labels
classes

Unnamed: 0,passenger_count,wd,month,year,hour,pickup_orientation,dropoff_orientation,D_class,fare_class,pickup,dropoff
0,1,0,5,2009,17,SW_pickup,SW_dropoff,near dist,low fare,6,6
1,2,3,7,2011,0,C_pickup,C_dropoff,near dist,low fare,0,0
2,1,5,3,2012,4,C_pickup,C_dropoff,near dist,medium fare,0,0
3,1,1,2,2010,7,C_pickup,NW_dropoff,near dist,low fare,0,3
4,1,3,0,2011,9,C_pickup,C_dropoff,medium dist,medium fare,0,0
...,...,...,...,...,...,...,...,...,...,...,...
909152,4,5,8,2014,17,N_pickup,C_dropoff,near dist,medium fare,4,0
909153,4,5,1,2012,18,C_pickup,C_dropoff,near dist,medium fare,0,0
909154,1,4,5,2012,19,C_pickup,C_dropoff,near dist,low fare,0,0
909155,1,4,0,2014,19,C_pickup,C_dropoff,near dist,medium fare,0,0


For each orientation, a binary feature is created (1 in the column of the trip's actual orientation, 0 in all other orientation columns), so that all orientations carry equal weight.

In [None]:
# One-hot encode the integer orientation codes. The resulting columns are
# named after the original string labels, taken from the label encoders in
# code order.
from sklearn.preprocessing import OneHotEncoder

# dropoff
do_feature_labels = list(map(str, dl.classes_))
do_ohe = OneHotEncoder()
do_feature_arr = do_ohe.fit_transform(classes[['dropoff']]).toarray()   # dense 0/1 matrix
do_features = pd.DataFrame(do_feature_arr, columns=do_feature_labels)

# pickup
pu_feature_labels = list(map(str, pl.classes_))
pu_ohe = OneHotEncoder()
pu_feature_arr = pu_ohe.fit_transform(classes[['pickup']]).toarray()    # dense 0/1 matrix
pu_features = pd.DataFrame(pu_feature_arr, columns=pu_feature_labels)

In [None]:
# Preview the one-hot encoded dropoff columns.
do_features

Unnamed: 0,C_dropoff,E_dropoff,NE_dropoff,NW_dropoff,N_dropoff,SE_dropoff,SW_dropoff,S_dropoff,W_dropoff
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
909152,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
909153,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
909154,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
909155,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


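Distance and fare classes are encoded as the integers 0, 1 and 2 with a `LabelEncoder`. Note that the encoder assigns the codes in alphabetical order of the class names, so the codes do not follow the natural low-to-high order: for distance, far = 0, medium = 1, near = 2; for fare, high = 0, low = 1, medium = 2.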

In [None]:
# LabelEncoder numbers the classes in alphabetical order of their names:
#   D_class    -> 'far dist' = 0,  'medium dist' = 1, 'near dist' = 2
#   fare_class -> 'high fare' = 0, 'low fare' = 1,    'medium fare' = 2
dist_le, fare_le = LabelEncoder(), LabelEncoder()

dist_labels = dist_le.fit_transform(classes['D_class'])
fare_labels = fare_le.fit_transform(classes['fare_class'])

classes['Dist_Label'] = dist_labels
classes['Fare_Label'] = fare_labels   # classification target

classes

Unnamed: 0,passenger_count,wd,month,year,hour,pickup_orientation,dropoff_orientation,D_class,fare_class,pickup,dropoff,Dist_Label,Fare_Label
0,1,0,5,2009,17,SW_pickup,SW_dropoff,near dist,low fare,6,6,2,1
1,2,3,7,2011,0,C_pickup,C_dropoff,near dist,low fare,0,0,2,1
2,1,5,3,2012,4,C_pickup,C_dropoff,near dist,medium fare,0,0,2,2
3,1,1,2,2010,7,C_pickup,NW_dropoff,near dist,low fare,0,3,2,1
4,1,3,0,2011,9,C_pickup,C_dropoff,medium dist,medium fare,0,0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
909152,4,5,8,2014,17,N_pickup,C_dropoff,near dist,medium fare,4,0,2,2
909153,4,5,1,2012,18,C_pickup,C_dropoff,near dist,medium fare,0,0,2,2
909154,1,4,5,2012,19,C_pickup,C_dropoff,near dist,low fare,0,0,2,1
909155,1,4,0,2014,19,C_pickup,C_dropoff,near dist,medium fare,0,0,2,2


Dataset containing only features relevant to classification is created.

In [None]:
# Columns that are now represented by their encoded counterparts: the string
# labels and the intermediate integer orientation codes.
encoded_away = ['pickup_orientation', 'dropoff_orientation', 'D_class', 'fare_class', 'pickup', 'dropoff']

# Remaining integer columns + one-hot pickup + one-hot dropoff, all as int
# (the one-hot frames are float 0.0/1.0 before the cast).
class_df = pd.concat(
    [classes.drop(columns=encoded_away), pu_features, do_features],
    axis=1,
).astype(int)
class_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 909157 entries, 0 to 909156
Data columns (total 25 columns):
 #   Column           Non-Null Count   Dtype
---  ------           --------------   -----
 0   passenger_count  909157 non-null  int64
 1   wd               909157 non-null  int64
 2   month            909157 non-null  int64
 3   year             909157 non-null  int64
 4   hour             909157 non-null  int64
 5   Dist_Label       909157 non-null  int64
 6   Fare_Label       909157 non-null  int64
 7   C_pickup         909157 non-null  int64
 8   E_pickup         909157 non-null  int64
 9   NE_pickup        909157 non-null  int64
 10  NW_pickup        909157 non-null  int64
 11  N_pickup         909157 non-null  int64
 12  SE_pickup        909157 non-null  int64
 13  SW_pickup        909157 non-null  int64
 14  S_pickup         909157 non-null  int64
 15  W_pickup         909157 non-null  int64
 16  C_dropoff        909157 non-null  int64
 17  E_dropoff        909157 non-n

In [None]:
# Feature matrix: every column except the target, as a plain 2-D array.
# (No temporary `m, n` shape variables are created: `m` is the notebook's
# alias for the math module and must not be overwritten.)
X = class_df.drop(columns=['Fare_Label']).values
# Target as a column vector of shape (n_samples, 1), as before.
y = class_df['Fare_Label'].values.reshape(-1, 1)

In [None]:
import sklearn
from sklearn import model_selection
# Hold out 10 % of the trips for testing. The seed is fixed so that the split,
# and with it every metric reported below, is reproducible across runs.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.1, random_state=0)

## Classification

Decision Tree Classification using all features

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fixed seed so that the fitted tree is the same on every run.
clf = DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_train, y_train)   # fit() returns the estimator itself


In [None]:
# Predicted fare-class codes for the held-out trips.
y_pred = clf.predict(X_test)

Print evaluation metrics for classification

In [None]:
from sklearn.metrics import multilabel_confusion_matrix

# One 2x2 one-vs-rest matrix [[tn, fp], [fn, tp]] per fare class.
mcm = multilabel_confusion_matrix(y_test, y_pred)
tn, fp = mcm[:, 0, 0], mcm[:, 0, 1]
fn, tp = mcm[:, 1, 0], mcm[:, 1, 1]

## Evaluation metrics, one entry per fare class in the order of the encoded
## Fare_Label. With the alphabetical LabelEncoder coding used in this notebook
## that order is: 0 = high fare, 1 = low fare, 2 = medium fare.
total = tp + tn + fn + fp
acc_class = (tp + tn) / total
precision_class = tp / (tp + fp)   # high precision <=> few false positives
recall_class = tp / (tp + fn)      # high recall <=> few false negatives
F1_measure = (2 * precision_class * recall_class) / (recall_class + precision_class)   # high only if both are high
acc_class, precision_class, recall_class, F1_measure

(array([0.88125302, 0.70351753, 0.61143253]),
 array([0.54005503, 0.60498889, 0.60961963]),
 array([0.56626506, 0.69086499, 0.53138094]),
 array([0.55284957, 0.64508144, 0.56781786]))

Some well-known places in New York may have fixed fares (for example JFK Airport or Times Square). To take this into account, additional features are created: the distance between the dropoff location and each of these places, which is then divided into distance classes.

In [None]:
import geopy.distance
def _dropoff_distance_km(trip, place_lat, place_long):
    """Return the distance in km between a trip's dropoff point and a place.

    `trip` is one row of the trip table and must provide 'dropoff_latitude'
    and 'dropoff_longitude'. `place_lat` / `place_long` are the coordinates
    of the place in decimal degrees.
    """
    # `vincenty` was removed in geopy 2.0; `geodesic` is its replacement.
    # Fall back to `vincenty` only on old geopy versions without `geodesic`.
    measure = getattr(geopy.distance, 'geodesic', None) or geopy.distance.vincenty
    dropoff = (trip['dropoff_latitude'], trip['dropoff_longitude'])
    return measure(dropoff, (place_lat, place_long)).km


def jfk_dist(trip):
    """Distance in km from the dropoff point to JFK Airport."""
    return _dropoff_distance_km(trip, 40.6413, -73.7781)


def lga_dist(trip):
    """Distance in km from the dropoff point to 'lga' (presumably LaGuardia Airport)."""
    return _dropoff_distance_km(trip, 40.7769, -73.8740)


def ewr_dist(trip):
    """Distance in km from the dropoff point to 'ewr' (presumably Newark Airport)."""
    return _dropoff_distance_km(trip, 40.6895, -74.1745)


def tsq_dist(trip):
    """Distance in km from the dropoff point to Times Square."""
    return _dropoff_distance_km(trip, 40.7580, -73.9855)


def cpk_dist(trip):
    """Distance in km from the dropoff point to 'cpk' (presumably Central Park)."""
    return _dropoff_distance_km(trip, 40.7812, -73.9665)


def lib_dist(trip):
    """Distance in km from the dropoff point to 'lib' (presumably the Statue of Liberty)."""
    return _dropoff_distance_km(trip, 40.6892, -74.0445)


def gct_dist(trip):
    """Distance in km from the dropoff point to 'gct' (presumably Grand Central Terminal)."""
    return _dropoff_distance_km(trip, 40.7527, -73.9772)


def met_dist(trip):
    """Distance in km from the dropoff point to 'met' (presumably the Metropolitan Museum)."""
    return _dropoff_distance_km(trip, 40.7794, -73.9632)


def wtc_dist(trip):
    """Distance in km from the dropoff point to 'wtc' (presumably the World Trade Center)."""
    return _dropoff_distance_km(trip, 40.7126, -74.0099)

In [None]:
# Add one dropoff-to-place distance column per place. The order matters: the
# columns are appended to `data` in exactly this sequence.
for place_col, place_fn in [
    ('jfk', jfk_dist),
    ('lga', lga_dist),
    ('ewr', ewr_dist),
    ('tsq', tsq_dist),
    ('cpk', cpk_dist),
    ('lib', lib_dist),
    ('gct', gct_dist),
    ('met', met_dist),
    ('wtc', wtc_dist),
]:
    data[place_col] = data.apply(place_fn, axis=1)   # row-wise: each row is passed as `trip`
data.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,D,wd,month,year,hour,jfk,lga,ewr,tsq,cpk,lib,gct,met,wtc
count,909157.0,909157.0,909157.0,909157.0,909157.0,909157.0,909157.0,909157.0,909157.0,909157.0,909157.0,909157.0,909157.0,909157.0,909157.0,909157.0,909157.0,909157.0,909157.0,909157.0
mean,8.862969,-73.980402,40.752161,-73.978825,40.752365,1.688241,2.462914,3.039539,5.253983,2011.717349,13.574978,21.186889,9.648411,18.107748,2.777785,4.051245,9.136442,2.718326,3.988597,5.490098
std,4.001827,0.02112,0.023561,0.022389,0.025693,1.304325,1.656021,1.945607,3.442174,1.864384,6.481661,1.668214,2.094774,2.423625,2.169955,2.576606,2.869309,2.084988,2.561296,2.822475
min,2.5,-74.489632,40.06092,-74.481633,40.065162,1.0,0.100037,0.0,0.0,2009.0,0.0,0.174493,0.102793,0.318467,0.001672,0.056302,0.049113,0.002989,0.024971,0.002452
25%,5.7,-73.992573,40.737464,-73.991896,40.737335,1.0,1.22282,1.0,2.0,2010.0,9.0,20.649883,8.39268,16.488343,1.234403,2.083675,7.099706,1.260161,2.043153,3.475701
50%,8.0,-73.982503,40.753329,-73.981472,40.753984,1.0,1.992605,3.0,5.0,2012.0,14.0,21.257228,9.541102,17.90488,2.335722,3.53012,9.016702,2.330979,3.495834,5.348377
75%,11.0,-73.970515,40.76681,-73.96857,40.767712,2.0,3.270613,5.0,8.0,2013.0,19.0,21.944881,10.916606,19.469653,3.697951,5.560892,10.922675,3.581107,5.513608,7.252404
max,22.2,-73.036247,40.999287,-73.02977,40.997905,6.0,8.443222,6.0,11.0,2015.0,23.0,73.616677,88.644546,106.004672,95.313015,95.341245,96.004662,94.407916,94.99866,94.587633


In [None]:
# Confirm that the nine place-distance columns were appended to the table.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 909157 entries, 0 to 909156
Data columns (total 24 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   fare_amount          909157 non-null  float64
 1   pickup_longitude     909157 non-null  float64
 2   pickup_latitude      909157 non-null  float64
 3   dropoff_longitude    909157 non-null  float64
 4   dropoff_latitude     909157 non-null  float64
 5   passenger_count      909157 non-null  int64  
 6   D                    909157 non-null  float64
 7   wd                   909157 non-null  int64  
 8   month                909157 non-null  int64  
 9   year                 909157 non-null  int64  
 10  hour                 909157 non-null  int64  
 11  pickup_orientation   909157 non-null  object 
 12  dropoff_orientation  909157 non-null  object 
 13  D_class              909157 non-null  object 
 14  fare_class           909157 non-null  object 
 15  jfk              

In [None]:
# Select the nine place-distance columns by name (robust against changes in
# column position) and copy them, because new columns are added to this frame
# below and that must not operate on a slice of `data`.
nyc_places = data[['jfk', 'lga', 'ewr', 'tsq', 'cpk', 'lib', 'gct', 'met', 'wtc']].copy()
nyc_places.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 909157 entries, 0 to 909156
Data columns (total 9 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   jfk     909157 non-null  float64
 1   lga     909157 non-null  float64
 2   ewr     909157 non-null  float64
 3   tsq     909157 non-null  float64
 4   cpk     909157 non-null  float64
 5   lib     909157 non-null  float64
 6   gct     909157 non-null  float64
 7   met     909157 non-null  float64
 8   wtc     909157 non-null  float64
dtypes: float64(9)
memory usage: 62.4 MB


In [None]:
# Build three distance classes (near / medium / far) per place from the
# distance between the dropoff point and that place.
import math as m

# The position of a place in this list is also its column position in
# nyc_places.describe(), which is what Partition expects as second argument.
for place_idx, place in enumerate(['jfk', 'lga', 'ewr', 'tsq', 'cpk', 'lib', 'gct', 'met', 'wtc']):
    R = Partition(nyc_places, place_idx, 3)
    R_arr = R[0]   # rows: one [lower, upper] pair per class

    place_dist = nyc_places[place]
    place_class = place + '_class'

    nyc_places.loc[(place_dist >= R_arr[0,0]) & (place_dist < R_arr[0,1]), place_class] = 'near dist'
    nyc_places.loc[(place_dist >= R_arr[1,0]) & (place_dist < R_arr[1,1]), place_class] = 'medium dist'
    # The top class is open-ended: everything from the last lower bound upwards.
    nyc_places.loc[place_dist >= R_arr[2,0], place_class] = 'far dist'

In [None]:
# Encode the place-distance classes as integers. The encoder is fitted once on
# the complete, fixed set of class names, so every column uses the same
# coding (far = 0, medium = 1, near = 2) even if one of the classes does not
# occur in a particular column. Refitting per column would silently renumber
# the classes in that case.
dist_places = LabelEncoder().fit(['far dist', 'medium dist', 'near dist'])

# NOTE: this replaces the distances (in km) in the place columns with the
# class codes; the string labels stay available in the '<place>_class' columns.
for place in ['jfk', 'lga', 'ewr', 'tsq', 'cpk', 'lib', 'gct', 'met', 'wtc']:
    nyc_places[place] = dist_places.transform(nyc_places[place + '_class'])
nyc_places

Unnamed: 0,jfk,lga,ewr,tsq,cpk,lib,gct,met,wtc,jfk_class,lga_class,ewr_class,tsq_class,cpk_class,lib_class,gct_class,met_class,wtc_class
0,2,2,2,2,2,2,2,2,2,near dist,near dist,near dist,near dist,near dist,near dist,near dist,near dist,near dist
1,2,2,2,2,2,2,2,2,2,near dist,near dist,near dist,near dist,near dist,near dist,near dist,near dist,near dist
2,2,2,2,2,2,2,2,2,2,near dist,near dist,near dist,near dist,near dist,near dist,near dist,near dist,near dist
3,2,2,2,2,2,2,2,2,2,near dist,near dist,near dist,near dist,near dist,near dist,near dist,near dist,near dist
4,2,2,2,2,2,2,2,2,2,near dist,near dist,near dist,near dist,near dist,near dist,near dist,near dist,near dist
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
909152,2,2,2,2,2,2,2,2,2,near dist,near dist,near dist,near dist,near dist,near dist,near dist,near dist,near dist
909153,2,2,2,2,2,2,2,2,2,near dist,near dist,near dist,near dist,near dist,near dist,near dist,near dist,near dist
909154,2,2,2,2,2,2,2,2,2,near dist,near dist,near dist,near dist,near dist,near dist,near dist,near dist,near dist
909155,2,2,2,2,2,2,2,2,2,near dist,near dist,near dist,near dist,near dist,near dist,near dist,near dist,near dist


In [None]:
# Extend the classification features with the first nine columns of
# nyc_places (the per-place columns 'jfk' ... 'wtc'); rows are aligned on the index.
nyc_place_classes = pd.concat([class_df, nyc_places.iloc[:,:9]], axis=1)
nyc_place_classes

Unnamed: 0,passenger_count,wd,month,year,hour,Dist_Label,Fare_Label,C_pickup,E_pickup,NE_pickup,NW_pickup,N_pickup,SE_pickup,SW_pickup,S_pickup,W_pickup,C_dropoff,E_dropoff,NE_dropoff,NW_dropoff,N_dropoff,SE_dropoff,SW_dropoff,S_dropoff,W_dropoff,jfk,lga,ewr,tsq,cpk,lib,gct,met,wtc
0,1,0,5,2009,17,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,2,2,2,2,2,2,2,2,2
1,2,3,7,2011,0,2,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2,2,2,2,2,2,2,2,2
2,1,5,3,2012,4,2,2,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2,2,2,2,2,2,2,2,2
3,1,1,2,2010,7,2,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2,2,2,2,2,2,2,2,2
4,1,3,0,2011,9,1,2,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
909152,4,5,8,2014,17,2,2,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,2,2,2,2,2,2,2,2,2
909153,4,5,1,2012,18,2,2,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2,2,2,2,2,2,2,2,2
909154,1,4,5,2012,19,2,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2,2,2,2,2,2,2,2,2
909155,1,4,0,2014,19,2,2,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2,2,2,2,2,2,2,2,2


Decision Tree Classifier is again applied, now for the new dataset

In [None]:
# Target and features of the extended dataset; the target is selected by
# name instead of by column position.
y = nyc_place_classes['Fare_Label'].values
X = nyc_place_classes.drop(columns=['Fare_Label'])

# Fixed seed: the split is reproducible, so differences between runs and
# models are not caused by a different random test set.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.1, random_state=0)
clf2 = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
y_pred = clf2.predict(X_test)

# One 2x2 one-vs-rest matrix [[tn, fp], [fn, tp]] per fare class.
mcm = multilabel_confusion_matrix(y_test, y_pred)
tn = mcm[:, 0, 0]
tp = mcm[:, 1, 1]
fn = mcm[:, 1, 0]
fp = mcm[:, 0, 1]

## Evaluation metrics, one entry per fare class in the order of the encoded
## Fare_Label. With the alphabetical LabelEncoder coding used in this notebook
## that order is: 0 = high fare, 1 = low fare, 2 = medium fare.
acc_class = (tp + tn) / (tp + tn + fn + fp)
precision_class = tp / (tp + fp)  # The higher the precision, the lower the FPs
recall_class = tp / (tp + fn)     # The higher the recall, the lower the FNs
F1_measure = (2*precision_class*recall_class)/(recall_class+precision_class)    # It is high when both p and r are high
acc_class, precision_class, recall_class, F1_measure

(array([0.88195697, 0.70237362, 0.6105306 ]),
 array([0.54242523, 0.60514308, 0.60698428]),
 array([0.56674592, 0.6866541 , 0.53299236]),
 array([0.55431894, 0.64332696, 0.56758704]))

Boosting classifier is applied to increase the performance

In [None]:
# The experimental import is required by the scikit-learn versions in which
# the estimator was still experimental.
# NOTE(review): newer releases presumably no longer need it — confirm against
# the installed version.
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

# Fixed seed so that the fitted model, and therefore the metrics below, are
# reproducible. Note that this rebinds `clf`, which held the first decision tree.
clf = HistGradientBoostingClassifier(random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# One 2x2 one-vs-rest matrix [[tn, fp], [fn, tp]] per fare class.
mcm = multilabel_confusion_matrix(y_test, y_pred)
tn = mcm[:, 0, 0]
tp = mcm[:, 1, 1]
fn = mcm[:, 1, 0]
fp = mcm[:, 0, 1]

## Evaluation metrics, one entry per fare class in the order of the encoded
## Fare_Label. With the alphabetical LabelEncoder coding used in this notebook
## that order is: 0 = high fare, 1 = low fare, 2 = medium fare.
acc_class = (tp + tn) / (tp + tn + fn + fp)
precision_class = tp / (tp + fp)  # The higher the precision, the lower the FPs
recall_class = tp / (tp + fn)     # The higher the recall, the lower the FNs
F1_measure = (2*precision_class*recall_class)/(recall_class+precision_class)    # It is high when both p and r are high
acc_class, precision_class, recall_class, F1_measure

(array([0.91945312, 0.75164987, 0.68018831]),
 array([0.74529029, 0.66310093, 0.67246907]),
 array([0.57447351, 0.74129829, 0.64945758]),
 array([0.64882751, 0.70002259, 0.66076304]))

In [None]:
# Combine `df` with the first nine columns of nyc_places (the per-place
# columns) and write the result to CSV without the row index.
# NOTE(review): `df` was bound to the same object as `classes`, so it also
# contains the columns added to `classes` afterwards — confirm this is intended.
nyc_apr = pd.concat([df,nyc_places.iloc[:,:9]], axis=1)
nyc_apr.to_csv(r'/content/sample_data/nyc_apr.csv',index=None)