In [70]:
#Lab 8 - Charles Voigt
#Housekeeping: every import used by the lab lives in this one cell.
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import sklearn.model_selection as skm
from ISLP import load_data, confusion_table
from ISLP.models import ModelSpec as MS
from sklearn.tree import (DecisionTreeClassifier as DTC,
                          DecisionTreeRegressor as DTR,
                          plot_tree,
                          export_text) #DTC for classification, DTR for regression, plot_tree to visualize tree, export_text to get rules
from sklearn.metrics import (accuracy_score,
                             log_loss) #accuracy_score for classification accuracy, log_loss for cross-entropy loss
from sklearn.ensemble import \
     (RandomForestRegressor as RF,
      GradientBoostingRegressor as GBR,
      GradientBoostingClassifier as GBC) #RF for random forest (and bagging), GBR for boosted regression, GBC for boosted classification
from ISLP.bart import BART #Bayesian Additive Regression Trees
import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation and convergence warnings; consider narrowing the filter.
warnings.filterwarnings('ignore')


In [13]:
#Question 1
# Bagging on the Hitters data: a random forest that is allowed to consider
# every predictor at each split (max_features == number of columns in X).
hitters = load_data('Hitters').dropna()  # drop rows with missing values

# Design matrix built from every column except the response, no intercept.
model = MS(hitters.columns.drop('Salary'), intercept=False)
D = model.fit_transform(hitters)
feature_names = list(D.columns)
X = np.asarray(D)

# 70/30 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = skm.train_test_split(
    X,
    hitters['Salary'],
    test_size=0.3,
    random_state=42,
)

bag_hitters = RF(
    max_features=X_train.shape[1],
    n_estimators=300,
    random_state=42,
)
bag_hitters.fit(X_train, y_train)

# Test-set mean squared error, rounded to two decimals for display.
y_hat_bag = bag_hitters.predict(X_test)
squared_errors = (y_test - y_hat_bag) ** 2
test_mse = np.round(np.mean(squared_errors), 2)
test_mse

np.float64(120962.79)

#### Question 1 Discussion
The test MSE of the bagged model (a random forest with `max_features` set to the number of predictors) is 120962.79.

In [19]:
#Question 2
# Random forest on the Question 1 split, considering 5 predictors per split.
# Relies on X_train / X_test / y_train / feature_names from the previous cell.
RF_hitters = RF(max_features=5, n_estimators=300, random_state=42)
RF_hitters.fit(X_train, y_train)
y_hat_rf = RF_hitters.predict(X_test)

# Tabulate the fitted importances by predictor name, largest first.
importances = {'importance': RF_hitters.feature_importances_}
feature_imp = pd.DataFrame(importances, index=feature_names)
feature_imp.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
CHits,0.141982
CAtBat,0.11709
CRBI,0.113281
CRuns,0.101749
CHmRun,0.074938
CWalks,0.067593
PutOuts,0.057856
RBI,0.051779
AtBat,0.050193
Runs,0.047026


#### Question 2 Discussion
`CHits` has the highest importance score (about 0.142) in the random forest.

In [56]:
#Question 3
# Load the OJ data and build a Yes/No response marking CH purchases.
oj = load_data('OJ')
# Class balance check; not shown because it is not the cell's last expression.
oj['Purchase'].value_counts()

is_ch = oj['Purchase'] == 'CH'
CH = np.where(is_ch, "Yes", "No")

In [60]:
#Question 3 (continued)
# Design matrix from every OJ column except the response, no intercept.
predictors = oj.columns.drop('Purchase')
model = MS(predictors, intercept=False)
D = model.fit_transform(oj)
feature_names = list(D.columns)
X = np.asarray(D)
y = CH  # Yes/No labels built in the previous cell

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = skm.train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Depth-4 classification tree grown with the entropy criterion.
dtc = DTC(criterion='entropy', max_depth=4, random_state=42)
dtc.fit(X_train, y_train)

# Share of correctly classified test observations, rounded to 3 decimals.
y_pred = dtc.predict(X_test)
testaccuracy = np.round(accuracy_score(y_test, y_pred), 3)
testaccuracy


np.float64(0.799)

#### Question 3 Discussion
The test accuracy is 0.799.

In [None]:
#Question 4
# Gradient-boosted classifier on Auto: predict whether a car's mpg is above
# the sample median ("Yes") or not ("No").
auto = load_data('Auto')
auto = auto.dropna()
mpg_high = np.where(auto.mpg > auto.mpg.median(),
                    "Yes",
                    "No")
# mpg defines the response, so it is excluded from the predictors.
model = MS(auto.columns.drop(['mpg']), intercept=False)
D = model.fit_transform(auto)
feature_names = list(D.columns)
X = np.asarray(D)
y = mpg_high
# 70/30 train/test split with a fixed seed.
(X_train,
 X_test,
 y_train,
 y_test) = skm.train_test_split(X,
                                y,
                                test_size=0.3,
                                random_state=42)

# The estimator must be fit on the training data before predict() is called;
# predicting with an unfitted GradientBoostingClassifier raises NotFittedError.
boost_auto = GBC(n_estimators=200,
                   learning_rate=0.1,
                   max_depth=3,
                   random_state=42).fit(X_train, y_train)
# Share of correctly classified test observations, rounded to 3 decimals.
testaccuracy = np.round(accuracy_score(y_test, boost_auto.predict(X_test)), 3)
testaccuracy

np.float64(0.89)

#### Question 4 Discussion 
The test accuracy of the GBC is 0.89

In [78]:
#Question 5
# Regression tree on Hitters, pruned by cross-validated cost-complexity alpha.
hitters = load_data('Hitters').dropna()  # drop rows with missing values
model = MS(hitters.columns.drop('Salary'), intercept=False)
D = model.fit_transform(hitters)
feature_names = list(D.columns)
X = np.asarray(D)
y = hitters['Salary']

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = skm.train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Grow a depth-6 tree first; its pruning path supplies the candidate alphas.
reg = DTR(max_depth=6, random_state=42)
reg.fit(X_train, y_train)
ccp_path = reg.cost_complexity_pruning_path(X_train, y_train)

# 5-fold shuffled CV over the candidate alphas, scored by (negative) MSE.
kfold = skm.KFold(n_splits=5, shuffle=True, random_state=42)
alpha_grid = {'ccp_alpha': ccp_path.ccp_alphas}
grid = skm.GridSearchCV(
    estimator=reg,
    param_grid=alpha_grid,
    refit=True,
    cv=kfold,
    scoring='neg_mean_squared_error',
)
G = grid.fit(X_train, y_train)

# refit=True means best_estimator_ is already refit on the full training set.
best = G.best_estimator_
best

In [80]:
#Question 5 (continued)
# Report the size of the selected tree and its test-set MSE (2 decimals).
leaf_nodes = best.get_n_leaves()
y_hat_best = best.predict(X_test)
squared_errors = (y_test - y_hat_best) ** 2
test_mse = np.round(np.mean(squared_errors), 2)
print(
    f'Number of leaf nodes: {leaf_nodes}'
    f'\nTest MSE: {test_mse}'
)

Number of leaf nodes: 4
Test MSE: 161392.55


#### Question 5 Discussion
The tree selected by cross-validated cost-complexity pruning has 4 leaf nodes and a test MSE of 161392.55.