## Supervised Learning Model Comparison

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split

In [2]:
# Load the raw dataset; the path is relative to the current working directory.
df = pd.read_csv("./401ksubs.csv")

In [3]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,e401k,inc,marr,male,age,fsize,nettfa,p401k,pira,incsq,agesq
0,0,13.17,0,0,40,1,4.575,0,1,173.4489,1600
1,1,61.23,0,1,35,1,154.0,1,0,3749.113,1225
2,0,12.858,1,0,44,2,0.0,0,0,165.3282,1936
3,0,98.88,1,1,44,2,21.8,0,0,9777.254,1936
4,0,22.614,0,0,53,1,18.45,0,0,511.393,2809


In [4]:
# Check the dimensions of the raw data as (rows, columns).
df.shape

(9275, 11)

In [7]:
# Standardize every column of `df` and keep the original column labels.
# NOTE(review): the scaler is fit on the full dataset before any train/test
# split, so held-out rows contribute to the scaling statistics, and the
# columns later used as targets are standardized too.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled_values, columns=df.columns)

In [8]:
# Preview the standardized data to confirm the scaling was applied.
scaled_df.head()

Unnamed: 0,e401k,inc,marr,male,age,fsize,nettfa,p401k,pira,incsq,agesq
0,-0.803173,-1.082858,-1.300887,-0.506898,-0.104886,-1.2355,-0.226651,-0.617776,1.712236,-0.648965,-0.216227
1,1.245062,0.912268,-1.300887,1.972784,-0.590372,-1.2355,2.109561,1.618709,-0.584032,0.542404,-0.63494
2,-0.803173,-1.09581,0.768706,-0.506898,0.283503,-0.580086,-0.298179,-0.617776,-0.584032,-0.651671,0.158941
3,-0.803173,2.475242,0.768706,1.972784,0.283503,-0.580086,0.042656,-0.617776,-0.584032,2.550909,0.158941
4,-0.803173,-0.690807,-1.300887,-0.506898,1.157377,-1.2355,-0.00972,-0.617776,-0.584032,-0.536366,1.133706


In [9]:
# Regression task: the target is the standardized `inc` column.
# `inc` and `incsq` are removed from the features, together with
# `e401k`, `p401k` and `pira`.
regression_features = scaled_df.drop(columns=['e401k', 'p401k', 'pira', 'inc', 'incsq'])
regression_target = scaled_df['inc']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(regression_features,
                                                    regression_target,
                                                    test_size=.2,
                                                    random_state=42)

In the interest of time, I'm going to use the defaults on all seven models. However, if we wanted to, we could GridSearch over the parameters of each model.

In [10]:
# Seed NumPy's global random number generator before fitting the regression models.
np.random.seed(42)

### Regression models

In [11]:
# Fit one regressor of each family on the training split, all with default
# hyperparameters. `fit` returns the fitted estimator, so construction and
# fitting are chained into a single assignment per model.
linear_reg = LinearRegression().fit(X_train, y_train)
knn_reg = KNeighborsRegressor().fit(X_train, y_train)
cart_reg = DecisionTreeRegressor().fit(X_train, y_train)
bagged_reg = BaggingRegressor().fit(X_train, y_train)
random_forest_reg = RandomForestRegressor().fit(X_train, y_train)
adaboost_reg = AdaBoostRegressor().fit(X_train, y_train)
support_vector_reg = SVR().fit(X_train, y_train)

# Show the last fitted estimator as the cell's output.
support_vector_reg

  linalg.lstsq(X, y)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [13]:
def rmse_score(model, X_train, X_test, y_train, y_test):
    """Report the root-mean-squared error of a fitted model on both splits.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_train, X_test : feature sets for the training and testing splits
    y_train, y_test : true target values for the training and testing splits

    Returns
    -------
    tuple
        ``(rmse_train, rmse_test)``. Both values are also printed, each
        prefixed with the string form of ``model``.
    """
    def _rmse(features, target):
        # RMSE is the square root of the mean squared error.
        return mean_squared_error(y_true=target,
                                  y_pred=model.predict(features)) ** 0.5

    # Score both splits before printing, training first and testing second.
    scores = (_rmse(X_train, y_train), _rmse(X_test, y_test))

    model_text = str(model)
    print("The training RMSE for " + model_text + " is: " + str(scores[0]))
    print("The testing RMSE for " + model_text + " is: " + str(scores[1]))
    return scores

In [14]:
# Training/testing RMSE for the linear regression model.
rmse_score(linear_reg, X_train, X_test, y_train, y_test)

The training RMSE for LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False) is: 0.8370830332453489
The testing RMSE for LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False) is: 0.8675193605893161


(0.8370830332453489, 0.8675193605893161)

In [15]:
# Training/testing RMSE for the k-nearest neighbors regressor.
rmse_score(knn_reg, X_train, X_test, y_train, y_test)

The training RMSE for KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform') is: 0.6860905782308746
The testing RMSE for KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform') is: 0.8373266164049329


(0.6860905782308746, 0.8373266164049329)

In [16]:
# Training/testing RMSE for the single decision tree regressor.
rmse_score(cart_reg, X_train, X_test, y_train, y_test)

The training RMSE for DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best') is: 0.09397820092399986
The testing RMSE for DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best') is: 1.1301554138974865


(0.09397820092399986, 1.1301554138974865)

In [17]:
# Training/testing RMSE for the bagging regressor.
rmse_score(bagged_reg, X_train, X_test, y_train, y_test)

The training RMSE for BaggingRegressor(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
         verbose=0, warm_start=False) is: 0.3655510831382053
The testing RMSE for BaggingRegressor(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
         verbose=0, warm_start=False) is: 0.8757388842131967


(0.3655510831382053, 0.8757388842131967)

In [18]:
# Training/testing RMSE for the random forest regressor.
rmse_score(random_forest_reg, X_train, X_test, y_train, y_test)

The training RMSE for RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False) is: 0.37615364861824424
The testing RMSE for RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False) is: 0.8791514381485241


(0.37615364861824424, 0.8791514381485241)

In [19]:
# Training/testing RMSE for the AdaBoost regressor.
rmse_score(adaboost_reg, X_train, X_test, y_train, y_test)

The training RMSE for AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
         n_estimators=50, random_state=None) is: 0.8782293359444954
The testing RMSE for AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
         n_estimators=50, random_state=None) is: 0.9235374302656226


(0.8782293359444954, 0.9235374302656226)

In [20]:
# Training/testing RMSE for the support vector regressor.
rmse_score(support_vector_reg, X_train, X_test, y_train, y_test)

The training RMSE for SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False) is: 0.7858841962924686
The testing RMSE for SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False) is: 0.8206550384261724


(0.7858841962924686, 0.8206550384261724)

Putting the training and testing RMSE in a table so we can directly compare them all:

|           Model          | Training RMSE | Testing RMSE |
|:------------------------:|:-------------:|:------------:|
|     Linear Regression    |     0.837     |     0.868    |
|    k-Nearest Neighbors   |     0.686     |     0.837    |
|       Decision Tree      |     0.094     |     1.130    |
|   Bagged Decision Trees  |     0.366     |     0.876    |
|       Random Forest      |     0.376     |     0.879    |
|         AdaBoost         |     0.878     |     0.924    |
| Support Vector Regressor |     0.786     |     0.820    |

### Classification models

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC

In [22]:
# Classification task: predict the binary `e401k` label from every column
# except `e401k` itself and `p401k`.
# The label is recovered from the standardized column: values above zero map
# to 1 and all others to 0. A vectorized comparison replaces the per-row
# Python loop and its chained `scaled_df['e401k'][i]` label lookups.
# NOTE: this rebinds X_train/X_test/y_train/y_test, replacing the regression
# split created earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(scaled_df.drop(columns = ['e401k', 'p401k']),
                                                    (scaled_df['e401k'] > 0).astype(int),
                                                    test_size = .2,
                                                    random_state = 42)

In [23]:
# Re-seed NumPy's global random number generator before fitting the classification models.
np.random.seed(42)

In [24]:
# Fit one classifier of each family on the training split, all with default
# hyperparameters. `fit` returns the fitted estimator, so each model is
# constructed and fitted in a single assignment.
logreg_class = LogisticRegression().fit(X_train, y_train)
knn_class = KNeighborsClassifier().fit(X_train, y_train)
cart_class = DecisionTreeClassifier().fit(X_train, y_train)
bagged_class = BaggingClassifier().fit(X_train, y_train)
random_forest_class = RandomForestClassifier().fit(X_train, y_train)
adaboost_class = AdaBoostClassifier().fit(X_train, y_train)
support_vector_class = SVC().fit(X_train, y_train)

# Show the last fitted estimator as the cell's output.
support_vector_class

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [26]:
def f1_scorer(model, X_train, X_test, y_train, y_test):
    """Report the F1-score of a fitted classifier on both splits.

    Parameters
    ----------
    model : fitted classifier exposing ``predict``
    X_train, X_test : feature sets for the training and testing splits
    y_train, y_test : true labels for the training and testing splits

    Returns
    -------
    tuple
        ``(f1_train, f1_test)``. Both values are also printed, each
        prefixed with the string form of ``model``.
    """
    # Score the training split first, then the testing split.
    train_predictions = model.predict(X_train)
    f1_train = f1_score(y_true=y_train, y_pred=train_predictions)

    test_predictions = model.predict(X_test)
    f1_test = f1_score(y_true=y_test, y_pred=test_predictions)

    model_text = str(model)
    print("The training F1-score for " + model_text + " is: " + str(f1_train))
    print("The testing F1-score for " + model_text + " is: " + str(f1_test))
    return (f1_train, f1_test)

In [27]:
# Score every fitted classifier in turn. Each call prints its own
# training/testing lines, and the returned (train, test) tuple is printed
# after them. A blank line separates consecutive models.
fitted_classifiers = (
    logreg_class,
    knn_class,
    cart_class,
    bagged_class,
    random_forest_class,
    adaboost_class,
    support_vector_class,
)

for position, classifier in enumerate(fitted_classifiers):
    if position > 0:
        print()
    print(f1_scorer(classifier, X_train, X_test, y_train, y_test))

The training F1-score for LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False) is: 0.4727870199219552
The testing F1-score for LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False) is: 0.4773869346733668
(0.4727870199219552, 0.4773869346733668)

The training F1-score for KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform') is: 0.653122648607976
The testing F1-score for KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_par