<a href="https://colab.research.google.com/github/ath0217/hello-github/blob/main/Lab_Session_07.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Apply seaborn's "darkgrid" style to the figures drawn in this notebook.
sns.set_style(style="darkgrid")

In [None]:
!mkdir data

In [None]:
import gdown

# Google Drive direct-download links, keyed by the file name written under data/.
# Shareable views of the same files:
#   heart   https://drive.google.com/file/d/1G5dtIOtywpDBgREbsf7iDVExz6LVg4DM/view?usp=sharing
#   hitters https://drive.google.com/file/d/1ULDuvWPWD3LmxpQ6FKUHe1prjdM8LmSY/view?usp=sharing
sources = {
    'heart.csv': 'https://drive.google.com/uc?export=download&id=1G5dtIOtywpDBgREbsf7iDVExz6LVg4DM',
    'hitters.csv': 'https://drive.google.com/uc?export=download&id=1ULDuvWPWD3LmxpQ6FKUHe1prjdM8LmSY',
}
urls = list(sources.values())
outputs = list(sources.keys())

# Fetch each file in turn, showing gdown's progress output.
for output, url in sources.items():
  gdown.download(url, f'data/{output}', quiet=False)

**Decision Tree**


In [None]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree

from sklearn.model_selection import KFold,LeaveOneOut
from sklearn.model_selection import train_test_split


from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report

from mlxtend.plotting import plot_decision_regions

**Decision tree for regression**

In [None]:
# Load the downloaded hitters CSV and print its column/dtype/non-null summary.
hit = pd.read_csv(filepath_or_buffer='data/hitters.csv')
hit.info()

In [None]:
# Peek at the first three rows.
hit.head(n=3)

In [None]:
# Drop, in place, every row that has at least one missing value.
hit.dropna(axis=0, how='any', inplace=True)

In [None]:
# Predictors are the Years and Hits columns; the response is Salary.
feature_cols = ['Years', 'Hits']
X = hit[feature_cols]
y = hit['Salary']

In [None]:
# Years vs. Hits, with points coloured by Salary.
sns.scatterplot(x='Years', y='Hits', hue='Salary', data=hit)

In [None]:
# Fit a depth-2 regression tree on (Years, Hits) -> Salary; fit() returns the
# estimator itself, so the last line displays the same fitted object.
dtr = DecisionTreeRegressor(random_state=714, max_depth=2).fit(X, y)
dtr

In [None]:
# Draw the fitted regression tree on a wide canvas.
# feature_names is passed as a plain list: some scikit-learn releases validate
# this parameter as a list and reject a pandas Index.
plt.figure(figsize=(20,10))
plot_tree(dtr, feature_names=list(X.columns))

In [None]:
# Show the tree's partition of the (Years, Hits) plane. The salaries are cast to
# integers because plot_decision_regions is given them as the label array
# (NOTE(review): mlxtend appears to require integer labels — confirm).
# The builtin int replaces np.integer: converting that abstract scalar class to
# a dtype is deprecated in NumPy, while int yields the same default integer dtype.
plot_decision_regions(X.values, y.astype(int).values, clf=dtr, legend=None)

**Decision tree for classification**

In [None]:
# Read columns 1-14 of the heart CSV (column 0 is skipped), keep only complete
# rows, then print the column/dtype summary.
heart = pd.read_csv('data/heart.csv', usecols=range(1, 15)).dropna()
heart.info()

In [None]:
# Peek at the first three rows.
heart.head(n=3)

In [None]:
# Predictors: every column except AHD, with ChestPain and Thal one-hot encoded.
predictors = heart.drop(columns=['AHD'])
X = pd.get_dummies(predictors, columns=['ChestPain', 'Thal'])

# Response: AHD encoded as integer codes; `levels` maps each code back to its label.
y, levels = pd.factorize(heart['AHD'])

print(levels)

In [None]:
# Hold out 20% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=714
)

In [None]:
# Fit a depth-3 classification tree on the training split; fit() returns the
# estimator itself, so the last line displays the same fitted object.
clf = DecisionTreeClassifier(random_state=714, max_depth=3).fit(X_train, y_train)
clf

In [None]:
# Draw the fitted classification tree on a wide canvas.
# feature_names is passed as a plain list: some scikit-learn releases validate
# this parameter as a list and reject a pandas Index.
plt.figure(figsize=(20,10))
plot_tree(clf, feature_names=list(X.columns))

In [None]:
# Predict the held-out rows and print the classification report.
y_pred = clf.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

**Simple bagging**

In [None]:
from sklearn.ensemble import BaggingClassifier, BaggingRegressor

In [None]:
# Back to the hitters data: same two predictors, with the natural log of Salary
# as the response.
feature_cols = ['Years', 'Hits']
X = hit[feature_cols]
salary = hit['Salary']
y = np.log(salary)

In [None]:
# Shuffled 10-fold splitter with a fixed seed; the bare name displays its repr.
cv = KFold(random_state=714, shuffle=True, n_splits=10)
cv

In [None]:
%%time
B = [5,10,25,50,75,100,200,300,400,500]
mses=[]
cv = KFold(n_splits=10, shuffle=True, random_state=714)
# this is 10-fold CV
for k, (train_index, test_index) in enumerate(cv.split(X,y)):
  mses.append([])
  X_train, X_test = X.iloc[train_index], X.iloc[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]      
  # this is running 10 different Baggings with different B
  dtr = DecisionTreeRegressor(random_state=714)
  dtr.fit(X_train,y_train)
  y_pred_dt = dtr.predict(X_test)
  mse_dt = mean_squared_error(y_test,y_pred_dt)
  mses[k].append(mse_dt)
  for b in B:                     
    bag = BaggingRegressor(n_estimators=b)
    bag.fit(X_train, y_train)
    #plotting the results
    y_pred_bag = bag.predict(X_test)
    mse_bag = mean_squared_error(y_test,y_pred_bag)
    mses[k].append(mse_bag)

In [None]:
# Shape check: (number of folds, 1 + number of ensemble sizes).
np.shape(mses)

In [None]:
# Fold-averaged test MSE against ensemble size; the single tree is drawn at x=1.
tree_counts = [1] + B
mean_mse = np.mean(mses, axis=0)
sns.lineplot(x=tree_counts, y=mean_mse, marker='o')
plt.xlabel('Trees')
plt.ylabel('$MSE_{Test}$')

In [None]:
# Fold-averaged MSE for each ensemble size, then the smallest of them.
mean_mse_per_size = np.asarray(mses).mean(axis=0)
print(mean_mse_per_size)
print(mean_mse_per_size.min())

**Random Forest**

In [None]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

In [None]:
%%time
B = [5,10,25,50,75,100,200,300,400,500]
mses_rfr=[]
cv = KFold(n_splits=10, shuffle=True, random_state=714)
# this is 10-fold CV
for k, (train_index, test_index) in enumerate(cv.split(X,y)):
  mses_rfr.append([])
  X_train, X_test = X.iloc[train_index], X.iloc[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]      
  # this is running 10 different RF with different B
  dtr = DecisionTreeRegressor(random_state=714)
  dtr.fit(X_train,y_train)
  y_pred_dt = dtr.predict(X_test)
  mse_dt = mean_squared_error(y_test,y_pred_dt)
  mses_rfr[k].append(mse_dt)
  for b in B:                     
    rfr = RandomForestRegressor(n_estimators=b)
    rfr.fit(X_train, y_train)
    #plotting the results
    y_pred_rfr = rfr.predict(X_test)
    mse_rfr = mean_squared_error(y_test,y_pred_rfr)
    mses_rfr[k].append(mse_rfr)

In [None]:
# Fold-averaged test MSE against forest size; the single tree is drawn at x=1.
tree_counts = [1] + B
mean_mse_rfr = np.mean(mses_rfr, axis=0)
sns.lineplot(x=tree_counts, y=mean_mse_rfr, marker='o')
plt.xlabel('Trees')
plt.ylabel('$MSE_{Test}$')

In [None]:
# Fold-averaged MSE for each forest size, then the smallest of them.
mean_mse_rfr_per_size = np.asarray(mses_rfr).mean(axis=0)
print(mean_mse_rfr_per_size)
print(mean_mse_rfr_per_size.min())

**Classification with RF**

In [None]:
# Rebuild the heart design matrix: X and y were last bound to the hitters data.
# All columns except AHD are predictors, with ChestPain and Thal one-hot encoded.
heart_features = heart.drop(columns=['AHD'])
X = pd.get_dummies(heart_features, columns=['ChestPain', 'Thal'])

# AHD as integer codes; `levels` lists the label behind each code.
y, levels = pd.factorize(heart['AHD'])

print(levels)

In [None]:
%%time
B = [5,50,100,200,300,400,500]
accuracy_rfp=[]
accuracy_rfp2=[]
accuracy_rfsqrtp=[]
accuracy_rflogp=[]
cv = KFold(n_splits=10, shuffle=True, random_state=714)
# this is 10-fold CV
for k, (train_index, test_index) in enumerate(cv.split(X,y)):
  accuracy_rfp.append([])
  accuracy_rfp2.append([])
  accuracy_rfsqrtp.append([])
  accuracy_rflogp.append([])
  X_train, X_test = X.iloc[train_index], X.iloc[test_index]
  y_train, y_test = y[train_index], y[test_index]      
  # this is running 10 different RF with different max_features
  for b in B:                     
    rfp = RandomForestClassifier(n_estimators=b, max_features=None,random_state=714)
    rfp2 = RandomForestClassifier(n_estimators=b, max_features=0.5,random_state=714)
    rfsqrtp = RandomForestClassifier(n_estimators=b, max_features='sqrt',random_state=714)
    rflogp = RandomForestClassifier(n_estimators=b, max_features='log2',random_state=7)
    rfp.fit(X_train, y_train)
    rfp2.fit(X_train, y_train)
    rfsqrtp.fit(X_train, y_train)
    rflogp.fit(X_train, y_train)
    #plotting the results
    y_pred_rfp = rfp.predict(X_test)
    y_pred_rfp2 = rfp2.predict(X_test)
    y_pred_rfsqrtp = rfsqrtp.predict(X_test)
    y_pred_rflogp = rflogp.predict(X_test)
    acc_rfp = accuracy_score(y_test,y_pred_rfp)
    acc_rfp2 = accuracy_score(y_test,y_pred_rfp2)
    acc_rfsqrtp = accuracy_score(y_test,y_pred_rfsqrtp)
    acc_rflogp = accuracy_score(y_test,y_pred_rflogp)
    accuracy_rfp[k].append(acc_rfp)
    accuracy_rfp2[k].append(acc_rfp2)
    accuracy_rfsqrtp[k].append(acc_rfsqrtp)
    accuracy_rflogp[k].append(acc_rflogp)

In [None]:
# Shape check: (number of folds, number of forest sizes).
np.shape(accuracy_rflogp)

In [None]:
# Fold-averaged test accuracy against forest size, one line per max_features setting.
plt.figure(figsize=(12,6))
# Labels are raw strings so the backslash in \sqrt is not parsed as an (invalid)
# string escape, and each label is attached to its own line rather than matched
# to the lines by call order.
curves = [
    (accuracy_rfp, r'$m=p$'),
    (accuracy_rfp2, r'$m=p/2$'),
    (accuracy_rfsqrtp, r'$m=\sqrt{p}$'),
    (accuracy_rflogp, r'$m=log_2(p)$'),
]
for fold_scores, label in curves:
  sns.lineplot(x=B, y=np.array(fold_scores).mean(axis=0), marker='o', label=label)
plt.legend()
plt.xlabel('Number of Trees')
plt.ylabel('$Accuracy_{Test}$')

In [None]:

# 10-fold CV accuracy of a single decision tree (default settings, fixed seed);
# accuracy_dt collects one score per fold.
cv = KFold(n_splits=10, shuffle=True, random_state=714)
accuracy_dt = []
for train_index, test_index in cv.split(X, y):
  X_train, y_train = X.iloc[train_index], y[train_index]
  X_test, y_test = X.iloc[test_index], y[test_index]
  clf = DecisionTreeClassifier(random_state=714).fit(X_train, y_train)
  y_pred_dt = clf.predict(X_test)
  accuracy_dt.append(accuracy_score(y_test, y_pred_dt))

In [None]:
# Per-fold accuracies of the single tree, then their mean.
fold_accuracy_dt = np.array(accuracy_dt)
print(fold_accuracy_dt)
print(fold_accuracy_dt.mean())