In [57]:
# Import relevant packages
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn import tree
from IPython.display import Image
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline

Some brief cleaning is necessary for the remote lookup table to remove redundant observations from the subsequent analyses.

In [75]:
fares = pd.read_csv('../../data/01-modified-data/MTA-Fare-Card-Cleaned-Gathered.csv')
lookup = pd.read_csv('../../data/00-raw-data/Remote-Station-Lookup.csv')
lookup = lookup.rename(columns={'remote':'remote_station_id'})

# Drop booth column (not used in further analyses)
lookup = lookup.drop(columns=['booth'])
# Drop duplicates and NA values
lookup = lookup.drop_duplicates(subset=['remote_station_id'])
lookup.dropna(inplace=True)
df = fares.merge(lookup,on='remote_station_id')

In [77]:
df['remote_station_id'].value_counts()

R001    2310
R165    2310
R167    2310
R168    2310
R169    2310
        ... 
R401    1924
R413    1891
R094    1869
R305    1396
R454     231
Name: remote_station_id, Length: 455, dtype: int64

In [61]:
X = df[[]]
Y = df[]

x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size = 0.2)
print(type(x_train),x_train.shape)
print(type(x_test),x_test.shape)
print(type(y_train),y_train.shape)
print(type(y_test),y_test.shape)

<class 'pandas.core.frame.DataFrame'> (1493, 6)
<class 'pandas.core.frame.DataFrame'> (374, 6)
<class 'pandas.core.series.Series'> (1493,)
<class 'pandas.core.series.Series'> (374,)


In [62]:
model = tree.DecisionTreeClassifier()
model.fit(x_train,y_train)

# Save predictions for later plot use
yp_train = model.predict(x_train)
yp_test = model.predict(x_test)

In [67]:
labels = list(df['route1'].unique())

array([0.23530522, 0.07389819, 0.1191728 , 0.23838348, 0.13277509,
       0.20046522])

In [66]:
print("CLASSIFICATION REPORT FOR TRAINING SET")
print(classification_report(y_train,yp_train,target_names=labels))

CLASSIFICATION REPORT FOR TRAINING SET
              precision    recall  f1-score   support

           A       0.32      0.50      0.39       117
           F       0.60      0.17      0.27       104
           E       0.00      0.00      0.00        27
           H       0.00      0.00      0.00        43
           6       0.83      0.83      0.83         6
           J       0.23      0.36      0.28        87
           7       1.00      0.06      0.12        65
           5       0.21      0.78      0.33       220
           2       0.38      0.14      0.21       141
           M       1.00      0.13      0.23        38
           4       0.00      0.00      0.00        42
           D       0.36      0.13      0.20        60
           3       0.24      0.29      0.26       146
           B       0.00      0.00      0.00         6
           1       0.67      0.10      0.17        41
           L       0.58      0.73      0.65        15
           N       0.00      0.00      0.0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [65]:
print("CLASSIFICATION REPORT FOR TEST SET")
print(classification_report(y_test,yp_test,target_names=labels))

CLASSIFICATION REPORT FOR TEST SET
              precision    recall  f1-score   support

           A       0.18      0.21      0.19        29
           F       0.60      0.18      0.27        34
           E       0.00      0.00      0.00         5
           H       0.00      0.00      0.00        18
           6       0.33      1.00      0.50         1
           J       0.17      0.23      0.20        26
           7       0.00      0.00      0.00         5
           5       0.18      0.77      0.29        53
           2       0.38      0.06      0.10        52
           M       0.00      0.00      0.00         9
           4       0.00      0.00      0.00        10
           D       0.00      0.00      0.00        15
           3       0.20      0.26      0.23        31
           B       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         9
           L       0.00      0.00      0.00         1
           N       0.00      0.00      0.00   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
