# Conceptual Questions

1. If a decision tree is under-fitting the training dataset, is it a good idea to try scaling
the input features?

No. Underfitting is a symptom of insufficient model complexity, whereas scaling only changes the magnitude/range of the feature values. A decision tree splits on thresholds of individual features, so rescaling a feature does not change which splits are available. Scaling the data therefore would not help with the model's complexity.

2. If a decision tree is over-fitting the training dataset, is it a good idea to try decreasing
max depth?

Yes. Reducing max_depth limits how deep the tree can grow, which lowers the complexity of the model. If a model is overfitting, a shallower tree has less capacity to fit noise in the training data, which can help reduce over-fitting.

3. Why would you use a random forest instead of a decision tree?

(f) (b) and (c)

4. Which of the following is/are TRUE about bagging trees?

(d) (a) and (c)


In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, recall_score
from sklearn.ensemble import BaggingRegressor, RandomForestClassifier, ExtraTreesClassifier
# Exercise 5: predict the number of applications (Apps) from College.csv and
# keep a running record of the model with the lowest test RMSE.
print("Exercise 5:")
best_rmse, best_model = float('inf'), None

# a) load the data set
college = pd.read_csv('College.csv')

# b) recode the Private column from Yes/No to 1/0
private_codes = {'Yes': 1, 'No': 0}
college['Private'] = college['Private'].map(private_codes)

# c) predictors, target, and an 80/20 train/test split
#    (no random_state is passed, so the split differs from run to run)
predictor_columns = [
    'Private', 'F.Undergrad', 'P.Undergrad', 'Outstate', 'Room.Board',
    'Books', 'Personal', 'S.F.Ratio', 'Grad.Rate',
]
X = college[predictor_columns]
y = college['Apps']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# d) linear regression on standardized predictors
linReg_md = make_pipeline(StandardScaler(), LinearRegression())
linReg_pred = linReg_md.fit(X_train, y_train).predict(X_test)
linReg_mse = mean_squared_error(y_test, linReg_pred)
linReg_rmse = np.sqrt(linReg_mse)
print(f"The RMSE of the Linear Regression Model is {linReg_rmse:.2f}")
# First model evaluated, so it is the best one seen so far by definition.
best_rmse, best_model = linReg_rmse, "Linear Regression"

# e) bagging: 50 copies of the linear-regression pipeline
bag_linReg_md = BaggingRegressor(estimator=linReg_md, n_estimators=50, random_state=42)
bag_linReg_pred = bag_linReg_md.fit(X_train, y_train).predict(X_test)
bag_lin_Reg_mse = mean_squared_error(y_test, bag_linReg_pred)
bag_lin_Reg_rmse = np.sqrt(bag_lin_Reg_mse)
print(f"The RMSE of the Bagged Linear Regression model is {bag_lin_Reg_rmse:.2f}")
if best_rmse > bag_lin_Reg_rmse:
    best_rmse, best_model = bag_lin_Reg_rmse, "bagged linear regression"
# f) Ridge regression with a cross-validated penalty.
# Candidate penalties (lambda): 100 evenly spaced values from 0.001 to 100.
lambdas_to_consider = np.linspace(0.001, 100, num=100)
ridge_cv = RidgeCV(alphas=lambdas_to_consider, cv=5)
# Tune lambda on *standardized* predictors. The final Ridge model below is
# fitted behind a StandardScaler, and a penalty chosen on the raw feature
# scale would not be the right one for the standardized problem.
# The pipeline fits the ridge_cv object it is given, so alpha_ is set on it.
make_pipeline(StandardScaler(), ridge_cv).fit(X_train, y_train)
optimal_lambda = ridge_cv.alpha_

# Refit a plain Ridge pipeline at the selected lambda and score it on the test set.
ridge_md = make_pipeline(StandardScaler(), Ridge(alpha=optimal_lambda))
ridge_md.fit(X_train, y_train)
ridge_pred = ridge_md.predict(X_test)
ridge_mse = mean_squared_error(y_test, ridge_pred)
ridge_rmse = np.sqrt(ridge_mse)
print(f'The RMSE of the Ridge model is {ridge_rmse:.2f}')
if ridge_rmse < best_rmse:
    best_rmse = ridge_rmse
    best_model = "Ridge regression"

# g) bagging: 50 copies of the tuned Ridge pipeline
bag_ridge_md = BaggingRegressor(estimator=ridge_md, n_estimators=50, random_state=42)
bag_ridge_md.fit(X_train, y_train)
bag_ridge_pred = bag_ridge_md.predict(X_test)
bag_ridge_mse = mean_squared_error(y_test, bag_ridge_pred)
bag_ridge_rmse = np.sqrt(bag_ridge_mse)
print(f"The RMSE of the Bagged Ridge model is {bag_ridge_rmse:.2f}")
if bag_ridge_rmse < best_rmse:
    best_rmse = bag_ridge_rmse
    best_model = "Bagged Ridge"
# h) LASSO regression with a cross-validated penalty.
lasso_cv = LassoCV(alphas=np.linspace(0.001, 100, num=100), cv=5)
# Tune lambda on *standardized* predictors. The final Lasso model below is
# fitted behind a StandardScaler, and a penalty chosen on the raw feature
# scale would not be the right one for the standardized problem.
# The pipeline fits the lasso_cv object it is given, so alpha_ is set on it.
make_pipeline(StandardScaler(), lasso_cv).fit(X_train, y_train)
optimal_lambda = lasso_cv.alpha_

# Refit a plain Lasso pipeline at the selected lambda and score it on the test set.
lasso_md = Pipeline([
    ('scaler', StandardScaler()),
    ('model', Lasso(alpha=optimal_lambda))
])
lasso_md.fit(X_train, y_train)
lasso_md_pred = lasso_md.predict(X_test)
lasso_md_mse = mean_squared_error(y_test, lasso_md_pred)
lasso_md_rmse = np.sqrt(lasso_md_mse)
print(f"The RMSE of the Lasso model is {lasso_md_rmse:.2f}")
if lasso_md_rmse < best_rmse:
    best_rmse = lasso_md_rmse
    best_model = "Lasso"

# i) bagging: 50 copies of the tuned Lasso pipeline
bag_lasso_md = BaggingRegressor(estimator=lasso_md, n_estimators=50, random_state=42)
bag_lasso_md.fit(X_train, y_train)
bag_lasso_pred = bag_lasso_md.predict(X_test)
bag_lasso_mse = mean_squared_error(y_test, bag_lasso_pred)
bag_lasso_rmse = np.sqrt(bag_lasso_mse)
print(f"The RMSE of the Bagged LASSO model is {bag_lasso_rmse:.2f}")
if bag_lasso_rmse < best_rmse:
    best_rmse = bag_lasso_rmse
    best_model = "Bagged LASSO"

# j) report whichever model ended up with the lowest test RMSE
print(f"I would use the {best_model} model with the lowest RMSE of {best_rmse:.2f}")

Exercise 5:
The RMSE of the Linear Regression Model is 1660.03
The RMSE of the Bagged Linear Regression model is 1654.50
The RMSE of the Ridge model is 1662.88
The RMSE of the Bagged Ridge model is 1657.15
The RMSE of the Lasso model is 1660.03
The RMSE of the Bagged LASSO model is 1654.50
I would use the bagged linear regression model with the lowest RMSE of 1654.50


In [52]:
# Exercise 6: rank the framingham.csv predictors of TenYearCHD by random-forest
# feature importance, averaged over repeated stratified train/test splits.
print("Exercise 6:")

# The repetition loop below wraps its range in tqdm for a progress bar, but
# tqdm is not imported anywhere in this file. Import it here, and fall back
# to a pass-through so the cell still runs when tqdm is not installed.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable):
        return iterable

# a) load the data set
heart = pd.read_csv('framingham.csv')
# b) drop every row that has a missing value
heart.dropna(inplace=True)
# ci) predictors / target and a stratified 80/20 split
X = heart.drop('TenYearCHD', axis=1)
y = heart['TenYearCHD']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
# cii) random forest with 500 trees
rf_classifier = RandomForestClassifier(n_estimators=500, random_state=42)
rf_classifier.fit(X_train, y_train)
# ciii) importance of each predictor for this single split
feature_importance = rf_classifier.feature_importances_

# Part c repetition loop: redo ci-ciii on 100 fresh splits. The forest seed is
# fixed, so the variation between iterations comes from the unseeded split.
feat_imps = []

for _ in tqdm(range(100)):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
    rf_classifier = RandomForestClassifier(n_estimators=500, random_state=42)
    rf_classifier.fit(X_train, y_train)

    # Store this split's importance vector (one entry per column of X)
    feat_imps.append(rf_classifier.feature_importances_)

# Average per predictor across the 100 runs, then take the five largest.
avg_imps = np.mean(feat_imps, axis=0)
sorted_indices = np.argsort(avg_imps)[::-1]  # argsort is ascending; reverse it
top_5_indices = sorted_indices[:5]
top_5_features = X.columns[top_5_indices]

print("Top 5 predictor variables based on average importance:")
for feature in top_5_features:
    print(feature)
# Part d (single split): compare six tree ensembles trained on the top-5
# predictors only, scored by recall on one stratified 80/20 split.
# di
X_top5 = heart[top_5_features]
y = heart['TenYearCHD']
X_train, X_test, y_train, y_test = train_test_split(X_top5, y, test_size=0.2, stratify=y)

# An observation is labelled 1 unless its predicted probability is below 10%.
chd_cutoff = 0.1

# dii-dvii, in order: (report label, ensemble class, max_depth).
# Every model uses 500 trees and random_state=42.
model_specs = [
    ("RF 1", RandomForestClassifier, 3),
    ("RF 2", RandomForestClassifier, 5),
    ("RF 4", RandomForestClassifier, 7),
    ("ET 1 (ExtraTrees)", ExtraTreesClassifier, 3),
    ("ET 2 (ExtraTrees)", ExtraTreesClassifier, 5),
    ("ET 3 (ExtraTrees)", ExtraTreesClassifier, 7),
]

single_split_recalls = {}
for label, ensemble_cls, depth in model_specs:
    classifier = ensemble_cls(n_estimators=500, max_depth=depth, random_state=42)
    classifier.fit(X_train, y_train)
    # Second predict_proba column; assumes TenYearCHD is coded 0/1 so that
    # this column is the probability of class 1 - TODO confirm.
    chd_proba = classifier.predict_proba(X_test)[:, 1]
    pred_label = np.where(chd_proba < chd_cutoff, 0, 1)
    single_split_recalls[label] = recall_score(y_test, pred_label)
    print(f"Recall of {label}:", single_split_recalls[label])
# Part d repetition loop: repeat the six-model comparison on 100 fresh
# stratified splits, average each model's recall, and name the best one.
n_repeats = 100

# (report label, name used in the final verdict, ensemble class, max_depth).
# The order matters: on a tie, the earliest entry is reported as best.
candidates = [
    ("RF 1", "Random Forest 1", RandomForestClassifier, 3),
    ("RF 2", "Random Forest 2", RandomForestClassifier, 5),
    ("RF 4", "Random Forest 4", RandomForestClassifier, 7),
    ("ET 1 (ExtraTrees)", "Extra Trees 1", ExtraTreesClassifier, 3),
    ("ET 2 (ExtraTrees)", "Extra Trees 2", ExtraTreesClassifier, 5),
    ("ET 3 (ExtraTrees)", "Extra Trees 3", ExtraTreesClassifier, 7),
]

X_top5 = heart[top_5_features]
y = heart['TenYearCHD']

# Running sum of recall per candidate, in the same order as `candidates`.
recall_totals = [0] * len(candidates)
for _ in range(n_repeats):
    X_train, X_test, y_train, y_test = train_test_split(X_top5, y, test_size=0.2, stratify=y)
    for position, (_label, _name, ensemble_cls, depth) in enumerate(candidates):
        classifier = ensemble_cls(n_estimators=500, max_depth=depth, random_state=42)
        classifier.fit(X_train, y_train)
        positive_proba = classifier.predict_proba(X_test)[:, 1]
        # Labelled 1 unless the predicted probability is below 0.1
        predicted = np.where(positive_proba < 0.1, 0, 1)
        recall_totals[position] += recall_score(y_test, predicted)

# Calculate the average recall across all iterations
avg_recalls = [total / n_repeats for total in recall_totals]

print("Average Recall values for 100 loops:")
for (label, _name, _cls, _depth), avg_recall in zip(candidates, avg_recalls):
    print(f"Average Recall of {label}:", avg_recall)

# list.index returns the first match, so ties go to the earliest candidate.
highest_avg_recall = max(avg_recalls)
best_model = candidates[avg_recalls.index(highest_avg_recall)][1]
print("The model to predict TenYearCHD with the highest average recall with 100 iterations is:", best_model)




Exercise 6:


100%|██████████| 100/100 [04:24<00:00,  2.65s/it]

Top 5 predictor variables based on average importance:
sysBP
BMI
age
totChol
glucose



