# General Overview - Machine Learning

The goal of the machine learning model is to accurately predict a tree's health based on the feature variables. This is a classification model. To measure the model's success, we use [classification reports](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html), which displays the precision, recall, and f1-score of the model.

We are building with [LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html), [KNeighborsClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html), [DecisionTreeClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html), and [RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html). To create balanced classes, we use random oversampler, which replicates samples from the minority classes till they are equal to the majority.

In [15]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import datasets
from sklearn import metrics
from collections import Counter

from sklearn.model_selection import (StratifiedKFold, cross_val_score, GridSearchCV, train_test_split)
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import imblearn
from imblearn.over_sampling import RandomOverSampler



In [None]:
np.random.seed(42)

In [None]:
# import data
data = pd.read_csv('/content/drive/My Drive/Colab Notebooks/tree_ml.csv', index_col=0)

tree = data.copy()

In [None]:
tree.head()

Unnamed: 0,tree_dbh,curb_loc,health,sidewalk,root_stone,root_grate,root_other,trunk_wire,trnk_light,trnk_other,brch_light,brch_shoe,brch_other,longitude,latitude,num_problems,1or2,3or4,4orMore,Stew_N,Guard_N,Harmful,Helpful,Unsure,Bronx,Brooklyn,Manhattan,Queens,Staten Island
0,3,1,Fair,0,0,0,0,0,0,0,0,0,0,-73.844215,40.723092,0,0,0,0,1,1,0,0,0,0,0,0,1,0
1,21,1,Fair,1,1,0,0,0,0,0,0,0,0,-73.818679,40.794111,1,0,0,0,1,1,0,0,0,0,0,0,1,0
2,3,1,Good,1,0,0,0,0,0,0,0,0,0,-73.936608,40.717581,0,1,0,0,0,1,0,0,0,0,1,0,0,0
3,10,1,Good,1,1,0,0,0,0,0,0,0,0,-73.934456,40.713537,1,0,0,0,1,1,0,0,0,0,1,0,0,0
4,21,1,Good,1,1,0,0,0,0,0,0,0,0,-73.975979,40.666778,1,0,0,0,1,1,0,0,0,0,1,0,0,0


In [None]:
tree.shape

(651535, 29)

# Modeling

In [None]:
X = tree.drop('health', axis=1)
y = tree['health']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(456074, 28) (456074,)
(195461, 28) (195461,)


## Random Over Sampler

In [None]:
rs = RandomOverSampler(random_state=42)
X_rs, y_rs = rs.fit_resample(X, y)

print('Resampled dataset:', Counter(y_rs))

X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(X_rs, y_rs, test_size=0.25, random_state=42)

print(X_train_rs.shape, y_train_rs.shape)
print(X_test_rs.shape, y_test_rs.shape)



Resampled dataset: Counter({'Fair': 528327, 'Good': 528327, 'Poor': 528327})
(1188735, 28) (1188735,)
(396246, 28) (396246,)


### Logistic Regression

In [None]:
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train_rs, y_train_rs)
logreg_pred = logreg.predict(X_test_rs)

# accuracy scores
print('Accuracy Score, Training Set: ', logreg.score(X_train_rs, y_train_rs))
print('Accuracy Score, Test Set: ', logreg.score(X_test_rs, y_test_rs))

# classification report
print('Classification Report \n')
print(classification_report(y_test_rs, logreg_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy Score, Training Set:  0.4163938977147977
Accuracy Score, Test Set:  0.41573668882461906
Classification Report 

              precision    recall  f1-score   support

        Fair       0.39      0.17      0.24    132500
        Good       0.41      0.55      0.47    131527
        Poor       0.43      0.53      0.47    132219

    accuracy                           0.42    396246
   macro avg       0.41      0.42      0.39    396246
weighted avg       0.41      0.42      0.39    396246



## KNN Classifier

In [None]:
# GridSearch - this takes a while to run
knn = KNeighborsClassifier()
parameters = {'n_neighbors': [10, 15]}

clf = GridSearchCV(knn, parameters, cv=5, verbose=1, n_jobs=-1)
clf.fit(X_rs, y_rs).best_params_

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 219.6min finished


{'n_neighbors': 10}

In [None]:
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train_rs, y_train_rs)
knn_pred = knn.predict(X_test_rs)

# accuracy scoring
print('Accuracy Score, Training Set: ', knn.score(X_train_rs, y_train_rs))
print('Accuracy Score, Test Set: ', knn.score(X_test_rs, y_test_rs))

# classification report
print('Classification Report \n\n {}'.format(classification_report(y_test_rs, knn_pred)))

Accuracy Score, Training Set:  0.8079832763399748
Accuracy Score, Test Set:  0.759207663925945
Classification Report 

               precision    recall  f1-score   support

        Fair       0.68      0.77      0.72    132500
        Good       0.78      0.51      0.62    131527
        Poor       0.82      1.00      0.90    132219

    accuracy                           0.76    396246
   macro avg       0.76      0.76      0.75    396246
weighted avg       0.76      0.76      0.75    396246



## Decision Tree Classifier

In [None]:
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train_rs, y_train_rs)
decision_tree_pred = decision_tree.predict(X_test_rs)

# accuracy scores
print('Accuracy Score, Training Set:', decision_tree.score(X_train_rs, y_train_rs))
print('Accuracy Score, Test Set:', decision_tree.score(X_test_rs, y_test_rs))

# classification report
print('Classification Report \n\n {}'.format(classification_report(y_test_rs, decision_tree_pred)))

Accuracy Score, Training Set: 0.9999882227746302
Accuracy Score, Test Set: 0.9386214624248573
Classification Report 

               precision    recall  f1-score   support

        Fair       0.88      0.99      0.93    132500
        Good       0.99      0.82      0.90    131527
        Poor       0.97      1.00      0.98    132219

    accuracy                           0.94    396246
   macro avg       0.94      0.94      0.94    396246
weighted avg       0.94      0.94      0.94    396246



## Random Forest Classifier

In [None]:
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train_rs, y_train_rs)
y_pred = forest.predict(X_test_rs)

# accuracy scores
print('Accuracy Score, Training Set:', forest.score(X_train_rs, y_train_rs))
print('Accuracy Score, Test Set:', forest.score(X_test_rs, y_test_rs))

# classification report
print('Classification Report \n\n {}'.format(classification_report(y_test_rs, y_pred)))

Accuracy Score, Training Set: 0.9999747630884932
Accuracy Score, Test Set: 0.9533597815498454
Classification Report 

               precision    recall  f1-score   support

        Fair       0.90      0.99      0.94    132500
        Good       0.99      0.87      0.93    131527
        Poor       0.98      1.00      0.99    132219

    accuracy                           0.95    396246
   macro avg       0.96      0.95      0.95    396246
weighted avg       0.96      0.95      0.95    396246



In [None]:
# stratified KFold
kf = StratifiedKFold(5, shuffle=True, random_state=42)

# cross validation
forest_score = cross_val_score(forest, X_rs, y_rs, cv=kf)

print('Scores: ', forest_score)
print("Average 5-Fold Scores: {}".format(np.mean(forest_score)))

Scores:  [0.9566305  0.95656412 0.95652311 0.95593635 0.95709725]
Average 5-Fold Scores: 0.9565502677443852
