# Models

Modeling may take time to execute and is optionally available to execute from the command line.

```bash
python ./scripts/model.py
```

In [1]:
# %load -r 1-3 ../scripts/model.py
# load pandas for data analysis
import pandas as pd  # noqa E401
import numpy as np

In [2]:
# load dataset
parking_violations = pd.read_csv('../data/final/all_cleaned.csv')

In [3]:
# head of dataset
parking_violations.head()

Unnamed: 0,ISSUE_DATE,ISSUE_TIME,ISSUING_AGENCY_CODE,ISSUING_AGENCY_SHORT,VIOLATION_CODE,DISPOSITION_CODE,FINE_AMOUNT,TOTAL_PAID,LATITUDE,LONGITUDE,FORMAT_DATE,DAY_OF_MONTH,MONTH,FORMAT_TIME,HOUR,DISPOSITION_RESULT
0,2018/02/09 05:00:00+00,09:22 AM,28,MPD-CIC,P012,0,30,30,38.894,-77.019,2018-02-09 05:00:00+00:00,9,2,09:22:00,9,0
1,2018/02/09 05:00:00+00,09:24 AM,28,MPD-CIC,P012,75,30,0,38.894,-77.019,2018-02-09 05:00:00+00:00,9,2,09:24:00,9,1
2,2018/02/09 05:00:00+00,09:26 AM,28,MPD-CIC,P012,13,30,0,38.894,-77.019,2018-02-09 05:00:00+00:00,9,2,09:26:00,9,1
3,2018/02/09 05:00:00+00,09:43 AM,28,MPD-CIC,P012,13,30,0,38.894,-77.019,2018-02-09 05:00:00+00:00,9,2,09:43:00,9,1
4,2018/02/21 05:00:00+00,10:43 AM,28,MPD-CIC,P012,0,30,30,38.894,-77.019,2018-02-21 05:00:00+00:00,21,2,10:43:00,10,0


In [4]:
# dataset info
parking_violations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1366755 entries, 0 to 1366754
Data columns (total 16 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   ISSUE_DATE            1366755 non-null  object 
 1   ISSUE_TIME            1366755 non-null  object 
 2   ISSUING_AGENCY_CODE   1366755 non-null  int64  
 3   ISSUING_AGENCY_SHORT  1366755 non-null  object 
 4   VIOLATION_CODE        1366755 non-null  object 
 5   DISPOSITION_CODE      1366755 non-null  int64  
 6   FINE_AMOUNT           1366755 non-null  int64  
 7   TOTAL_PAID            1366755 non-null  int64  
 8   LATITUDE              1366755 non-null  float64
 9   LONGITUDE             1366755 non-null  float64
 10  FORMAT_DATE           1366755 non-null  object 
 11  DAY_OF_MONTH          1366755 non-null  int64  
 12  MONTH                 1366755 non-null  int64  
 13  FORMAT_TIME           1366755 non-null  object 
 14  HOUR                  1366755 non-

In [7]:
# %load -r 7-18 ../scripts/model.py
features = ['LATITUDE', 'LONGITUDE', 'MONTH', 'DAY_OF_MONTH', 'HOUR']

# establish data for classification
X = parking_violations[features]  # noqa F821
y = parking_violations['DISPOSITION_RESULT']  # noqa F821

from sklearn.preprocessing import StandardScaler  # noqa E402
X = StandardScaler().fit_transform(X)

from sklearn.model_selection import train_test_split  # noqa E402
X_train, X_test, y_train, y_test =\
    train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [8]:
# %load -r 20-26 ../scripts/model.py
from sklearn.linear_model import LogisticRegression  # noqa E402

log_reg = LogisticRegression(penalty='none', random_state=42)
log_reg.fit(X_train, y_train)

print('Score - Training: {:f}'.format(log_reg.score(X_train, y_train)))
print('Score - Test: {:f}'.format(log_reg.score(X_test, y_test)))

Score - Training: 0.609241
Score - Test: 0.608972


In [7]:
# %load -r 28-36 ../scripts/model.py
from sklearn.linear_model import PassiveAggressiveClassifier  # noqa E402

passive_aggressive_clf = PassiveAggressiveClassifier(random_state=42)
passive_aggressive_clf.fit(X_train, y_train)

print('Score - Training: {:f}'.format(passive_aggressive_clf.score(
    X_train, y_train)))
print('Score - Test: {:f}'.format(passive_aggressive_clf.score(
    X_test, y_test)))

Score - Training: 0.462122
Score - Test: 0.461110


In [8]:
# %load -r 38-44 ../scripts/model.py
from sklearn.tree import DecisionTreeClassifier  # noqa E402

tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)

print('Score - Training: {:f}'.format(tree_clf.score(X_train, y_train)))
print('Score - Test: {:f}'.format(tree_clf.score(X_test, y_test)))

Score - Training: 0.843081
Score - Test: 0.603513


In [9]:
# %load -r 46-51 ../scripts/model.py
from sklearn.linear_model import RidgeClassifier  # noqa E402

ridge_clf = RidgeClassifier().fit(X_train, y_train)

print('Score - Training: {:f}'.format(ridge_clf.score(X_train, y_train)))
print('Score - Test: {:f}'.format(ridge_clf.score(X_test, y_test)))

Score - Training: 0.609316
Score - Test: 0.609078


In [10]:
# %load -r 53-59 ../scripts/model.py
from sklearn.ensemble import RandomForestClassifier  # noqa E402

forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(X_test, y_test)

print('Score - Training: {:f}'.format(forest_clf.score(X_train, y_train)))
print('Score - Test: {:f}'.format(forest_clf.score(X_test, y_test)))

Score - Training: 0.597416
Score - Test: 0.926904


In [11]:
# %load -r 61-67 ../scripts/model.py
from sklearn.ensemble import AdaBoostClassifier  # noqa E402

ada_clf = AdaBoostClassifier(random_state=42)
ada_clf.fit(X_test, y_test)

print('Score - Training: {:f}'.format(ada_clf.score(X_train, y_train)))
print('Score - Test: {:f}'.format(ada_clf.score(X_test, y_test)))

Score - Training: 0.615408
Score - Test: 0.615531


In [12]:
# %load -r 69-75 ../scripts/model.py
from sklearn.neural_network import MLPClassifier  # noqa E402

mlp_clf = MLPClassifier(random_state=42)
mlp_clf.fit(X_test, y_test)

print('Score - Training: {:f}'.format(mlp_clf.score(X_train, y_train)))
print('Score - Test: {:f}'.format(mlp_clf.score(X_test, y_test)))

Score - Training: 0.616754
Score - Test: 0.617419


In [13]:
# %load -r 12-15 ../scripts/utils.py
def display_scores(name, scores):
    print(name)
    print('Mean:', scores.mean())
    print('Standard deviation:', scores.std())

In [14]:
# %load -r 78 ../scripts/model.py
from sklearn.model_selection import cross_val_score  # noqa E402

In [15]:
# %load -r 80-99 ../scripts/model.py
log_reg_scores = cross_val_score(log_reg, X, y,
                                 scoring="neg_mean_squared_error")
log_reg_rmse_scores = np.sqrt(-log_reg_scores)
display_scores('Logistic Regression', log_reg_rmse_scores)

passive_aggressive_clf_scores =\
    cross_val_score(passive_aggressive_clf, X, y,
                    scoring="neg_mean_squared_error")
passive_aggressive_clf_rmse_scores = np.sqrt(-passive_aggressive_clf_scores)
display_scores('Passive Aggressive', passive_aggressive_clf_rmse_scores)

tree_clf_scores = cross_val_score(tree_clf, X, y,
                                  scoring="neg_mean_squared_error")
tree_clf_rmse_scores = np.sqrt(-tree_clf_scores)
display_scores('Decision Tree', tree_clf_rmse_scores)

ridge_clf_scores = cross_val_score(ridge_clf, X, y,
                                   scoring="neg_mean_squared_error")
ridge_clf_rmse_scores = np.sqrt(-ridge_clf_scores)
display_scores('Ridge', ridge_clf_rmse_scores)

Logistic Regression
Mean: 0.6252493107442061
Standard deviation: 0.000978938868035884
Passive Aggressive
Mean: 0.6534917702559954
Standard deviation: 0.009340231457350313
Decision Tree
Mean: 0.6671820648102167
Standard deviation: 0.00339353095454288
Ridge
Mean: 0.6251438968653231
Standard deviation: 0.0010335930615780144


In [26]:
# %load -r 101-104 ../scripts/model.py
forest_clf_scores = cross_val_score(forest_clf, X, y,
                                    scoring="neg_mean_squared_error")
forest_clf_rmse_scores = np.sqrt(-forest_clf_scores)
display_scores('Forest', forest_clf_rmse_scores)

Forest
Mean: 0.6433891489073945
Standard deviation: 0.006187039807018061


In [17]:
# %load -r 106-109 ../scripts/model.py
ada_clf_scores = cross_val_score(ada_clf, X, y,
                                 scoring="neg_mean_squared_error")
ada_clf_rmse_scores = np.sqrt(-ada_clf_scores)
display_scores('AdaBoost', ada_clf_rmse_scores)

AdaBoost
Mean: 0.6210365172593049
Standard deviation: 0.0027378322442304787


In [25]:
# %load -r 111-114 ../scripts/model.py
mlp_clf_scores = cross_val_score(mlp_clf, X, y,
                                 scoring="neg_mean_squared_error")
mpl_clf_rmse_scores = np.sqrt(-mlp_clf_scores)
display_scores('MLP', mpl_clf_rmse_scores)

MLP
Mean: 0.6214307642400999
Standard deviation: 0.0031723727714660322


In [9]:
# %load -r 120-133 ../scripts/model.py
from sklearn.model_selection import GridSearchCV  # noqa E402

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

param_grid = {'C': np.logspace(-3, 3, 7),
              'penalty': ['l2', 'none'],
              'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']}

grid_search = GridSearchCV(LogisticRegression(),
                           param_grid,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(X_train, y_train)

print('Best parameters', grid_search.best_params_)
print('Best score', grid_search.best_score_)

Best parameters {'C': 10.0, 'penalty': 'none', 'solver': 'saga'}
Best score -0.39076315813283885


In [10]:
# %load -r 135-143 ../scripts/model.py
from sklearn.metrics import mean_squared_error  # noqa E402

final_model = grid_search.best_estimator_

final_predictions = final_model.predict(X_test)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print('Final score', final_rmse)

Final score 0.6253167742436414


In [14]:
# pickle the model
import joblib  # noqa E402

joblib.dump(grid_search.best_estimator_, '../data/final/final_model.pkl')

['../data/final/final_model.pkl']