## Supervised - Regression

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import datasets
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor, AdaBoostRegressor

In [None]:
boston = datasets.load_boston()

In [47]:
#Split and standardize data 
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target)
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [48]:
# For comparing rmse's across models. You could calculate another error metric or use sklearn's .score which outputs R^2 
def rmse(true, predicted):
    return np.sqrt(np.mean((true - predicted) ** 2))

In [49]:
# Decision Tree
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
dt_preds = dt.predict(X_test)
print('Decision Tree RMSE: {}'.format(rmse(dt_preds, y_test)))

#documentation: http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html

Decision Tree RMSE: 4.174302714179495


In [50]:
# Random Forest
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)
print('Random Forest RMSE: {}'.format(rmse(rf_preds, y_test)))

#documentation: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

Random Forest RMSE: 3.1322610132487543


In [51]:
# Bagging
bag = BaggingRegressor()
bag.fit(X_train, y_train)
bag_preds = bag.predict(X_test)
print('Bagging RMSE: {}'.format(rmse(bag_preds, y_test)))

#documentation: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html

Bagging RMSE: 3.5952272605661673


In [52]:
# K Nearest Neighbors
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
knn_preds = knn.predict(X_test)
print('KNN RMSE: {}'.format(rmse(knn_preds, y_test)))

#documentation: http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html

KNN RMSE: 4.55045243056497


In [53]:
# Gradient Boosting
boost = GradientBoostingRegressor()
boost.fit(X_train, y_train)
boost_preds = boost.predict(X_test)

print('Gradient Boosting RMSE: {}'.format(rmse(boost_preds, y_test)))
#documentation: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html

Gradient Boosting RMSE: 2.797881262738919


In [54]:
# Adaboost
ada = AdaBoostRegressor()
ada.fit(X_train, y_train)
ada_preds = ada.predict(X_test)

print('AdaBoost RMSE: {}'.format(rmse(ada_preds, y_test)))
#documentation: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html

AdaBoost RMSE: 3.6367482080683726


## Supervised - Classification

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.model_selection import KFold, GridSearchCV

In [56]:
cancer = datasets.load_breast_cancer()

In [58]:
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, stratify=cancer.target)
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [59]:
# Decision tree
dt = DecisionTreeClassifier(criterion="gini", splitter="best")
dt.fit(X_train, y_train)
dt_preds = dt.predict(X_test)

print('Decision Tree Accuracy: {}'.format(accuracy_score(dt_preds, y_test)))
print('Decision Tree Recall: {}'.format(recall_score(dt_preds, y_test)))
print('Decision Tree Precision: {}'.format(precision_score(dt_preds, y_test)))

Decision Tree Accuracy: 0.9370629370629371
Decision Tree Recall: 0.9550561797752809
Decision Tree Precision: 0.9444444444444444


In [60]:
# Random Forest
rf = RandomForestClassifier(n_estimators=10, criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features="auto", max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False, class_weight=None)

rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

print('Random Forest Accuracy: {}'.format(accuracy_score(rf_preds, y_test)))
print('Random Forest Recall: {}'.format(recall_score(rf_preds, y_test)))
print('Random Forest Precision: {}'.format(precision_score(rf_preds, y_test)))

Random Forest Accuracy: 0.986013986013986
Random Forest Recall: 1.0
Random Forest Precision: 0.9777777777777777


In [61]:
# Bagging
bag = BaggingClassifier(base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=1, random_state=None, verbose=0)

bag.fit(X_train, y_train)
bag_preds = bag.predict(X_test)

print('Bagging Accuracy: {}'.format(accuracy_score(bag_preds, y_test)))
print('Bagging Recall: {}'.format(recall_score(bag_preds, y_test)))
print('Bagging Precision: {}'.format(precision_score(bag_preds, y_test)))

Bagging Accuracy: 0.9790209790209791
Bagging Recall: 0.9887640449438202
Bagging Precision: 0.9777777777777777


In [62]:
# K Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=5, weights="uniform", algorithm="auto", leaf_size=30, p=2, metric="minkowski", metric_params=None, n_jobs=1)

knn.fit(X_train, y_train)
knn_preds = knn.predict(X_test)

print('KNN Accuracy: {}'.format(accuracy_score(knn_preds, y_test)))
print('KNN Recall: {}'.format(recall_score(knn_preds, y_test)))
print('KNN Precision: {}'.format(precision_score(knn_preds, y_test)))

KNN Accuracy: 0.9790209790209791
KNN Recall: 0.978021978021978
KNN Precision: 0.9888888888888889


In [63]:
# Gradient Boosting
boost = GradientBoostingClassifier(loss="deviance", learning_rate=0.1, n_estimators=100, subsample=1.0, criterion="friedman_mse", min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, min_impurity_split=None, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort="auto")

boost.fit(X_train, y_train)
boost_preds = boost.predict(X_test)
print('Gradient Boosting Accuracy: {}'.format(accuracy_score(boost_preds, y_test)))
print('Gradient Boosting Recall: {}'.format(recall_score(boost_preds, y_test)))
print('Gradient Boosting Precision: {}'.format(precision_score(boost_preds, y_test)))

Gradient Boosting Accuracy: 0.986013986013986
Gradient Boosting Recall: 0.9888888888888889
Gradient Boosting Precision: 0.9888888888888889


In [64]:
# Adaboost
ada = AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm="SAMME.R", random_state=None)
ada.fit(X_train, y_train)
ada_preds = ada.predict(X_test)
print('AdaBoost Accuracy: {}'.format(accuracy_score(ada_preds, y_test)))
print('AdaBoost Recall: {}'.format(recall_score(ada_preds, y_test)))
print('AdaBoost Precision: {}'.format(precision_score(ada_preds, y_test)))

AdaBoost Accuracy: 0.993006993006993
AdaBoost Recall: 1.0
AdaBoost Precision: 0.9888888888888889


In [66]:
# Decide Best model
# Grid search for best params

# ada example
ada_boost_grid = {'n_estimators': [50, 100, 150, 200],
                      'random_state': [1, None],
                      'learning_rate': [0.1, .5, 1]}

ada_gridsearch = GridSearchCV(AdaBoostClassifier(),
                             ada_boost_grid,
                             n_jobs=-1,
                             verbose=True)
ada_gridsearch.fit(X_train, y_train)

best_ada_model = ada_gridsearch.best_estimator_
best_ada_model.fit(X_train, y_train)
best_ada_preds = best_ada_model.predict(X_test)

print("Best ADA Accuracy: {}".format(accuracy_score(best_ada_preds, y_test)))
print("Best ADA Recall: {}".format(recall_score(best_ada_preds, y_test)))
print("Best ADA Precision: {}".format(precision_score(best_ada_preds, y_test)))

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   12.1s finished


Best ADA Accuracy: 0.993006993006993
Best ADA Recall: 1.0
Best ADA Precision: 0.9888888888888889


## Unsupervised

In [4]:
from sklearn.cluster import KMeans

In [5]:
iris = datasets.load_iris()
X_iris = iris.data
y_iris = iris.target

In [6]:
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target)
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [17]:
#k Means
k_means = KMeans(n_clusters=3) 
k_means.fit(X_train)
y_preds = k_means.predict(X_test)
print('Actual labels for training set: {}'.format(y_train))
print('Predicted labels for training set: {}'.format(k_means.labels_))

print('Actual labels for test set: {}'.format(y_test))
print('Predicted labels for training set: {}'.format(y_preds))

Actual labels for training set: [2 1 1 0 1 1 1 0 1 2 1 2 0 2 2 2 1 1 2 0 1 2 2 1 2 2 1 2 1 2 2 0 0 2 2 0 0
 0 2 2 1 1 1 1 2 2 1 2 0 0 2 0 1 2 1 1 0 2 1 0 0 2 1 2 2 1 1 2 1 1 1 0 2 1
 1 2 0 1 0 1 1 0 1 2 0 1 2 1 2 1 0 1 0 2 0 2 1 1 2 2 0 0 1 0 2 1 1 0 2 1 0
 0]
Predicted labels for training set: [0 1 1 2 0 0 0 2 1 1 0 1 2 1 1 0 0 0 1 2 1 1 1 1 1 1 0 1 0 1 1 2 2 1 1 2 2
 2 0 0 0 0 0 1 0 0 0 0 2 2 1 2 0 0 0 0 2 1 0 2 2 1 0 1 1 0 0 0 1 0 0 2 1 0
 0 1 2 0 2 0 0 2 1 1 2 0 0 0 1 0 2 1 2 0 2 1 0 1 1 1 2 2 1 2 0 1 0 2 1 0 2
 2]
Actual labels for test set: [1 0 0 0 0 0 0 1 0 0 0 2 0 2 2 1 2 1 2 0 0 0 0 0 0 2 0 2 0 2 2 2 1 0 0 0 2
 1]
Predicted labels for training set: [0 2 2 2 2 2 2 0 2 2 2 1 2 1 1 0 1 0 0 2 2 2 2 2 2 1 2 1 2 0 1 1 0 2 2 2 1
 0]
(38,)
(38,)
