# Decision Trees

- ***CART*** = Classification and Regression Tree

In [None]:
#Import Tree Classifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
#Import accuracy
from sklearn.metrics import accuracy_score

# Seed reused by every model in these notes so results are reproducible.
SEED = 1

# Hold out 20% of the data for testing; stratify on y so both splits keep
# the same class proportions.
# NOTE(review): X and y are not defined in this cell - presumably loaded earlier.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
    stratify=y,
)

# Decision tree model dt.
#   max_depth        = maximum depth of the tree
#   criterion        = criterion used to measure information gain,
#                      one of {"gini", "entropy", "log_loss"}, default="gini"
#   min_samples_leaf = 0.1 means each leaf must hold at least 10% of the
#                      training data
dt = DecisionTreeClassifier(
    max_depth=2,
    criterion='gini',
    min_samples_leaf=0.1,
    random_state=SEED,
)

# fit() returns the fitted estimator, so fitting and predicting can be chained.
y_pred = dt.fit(X_train, y_train).predict(X_test)

# Last expression of the cell: the notebook displays the test-set accuracy.
accuracy_score(y_test, y_pred)


## Ensemble Learning

- Train N different models with different algorithms on the same training set and combine them into a single, more robust model

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as KNN

# Initialize all base models.
# NOTE(review): SEED, DecisionTreeClassifier, accuracy_score and the
# train/test splits are assumed to come from the Decision Trees cell.
lr = LogisticRegression(random_state=SEED)
knn = KNN()
dt = DecisionTreeClassifier(random_state=SEED)

# List of ("classifier name", classifier) tuples - the format the
# VotingClassifier `estimators` argument expects.
classifiers = [('Logistic Regression', lr),
               ('KNearest Neighbors', knn),
               ('Classification Tree', dt)]

# Fit every model on its own first and print its test accuracy, so the
# ensemble score below has something to be compared against.
for clf_name, clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print(clf_name, accuracy_score(y_test, y_pred))

# Voting classifier built from the same (name, model) list.
# The keyword is `estimators`; the misspelled `esimators` raised a TypeError.
# n_jobs=-1 assigns all CPU cores to training this model.
vc = VotingClassifier(estimators=classifiers, n_jobs=-1)

vc.fit(X_train, y_train)
y_pred = vc.predict(X_test)
print(accuracy_score(y_test, y_pred))

## Bagging
- Bootstrap data from same training set, make N different models with SAME algorithm and do voting to get better overall model
- Out of Bag scoring technique gives accuracy similar to cross validation but FASTER!

In [None]:
from sklearn.ensemble import BaggingClassifier

# Baseline model: every bagged estimator is a copy of this tree.
dt = DecisionTreeClassifier(max_depth=8, min_samples_leaf=0.16, random_state=SEED)

# Bagging classifier.
#   n_estimators = number of models (trees here) to be built and used for bagging
#   oob_score    = True enables the out-of-bag scoring technique
# The base model is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (and the old name
# removed in 1.4), so the positional form works on both old and new versions.
bc = BaggingClassifier(dt, n_estimators=300, oob_score=True, n_jobs=-1)

bc.fit(X_train, y_train)
y_pred = bc.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
oob_accuracy = bc.oob_score_  # OOB score/accuracy, available because oob_score=True



## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as MSE

# Random forest regressor.
#   max_features = number of features considered when looking for the best split.
#                  Default = 1.0 (all features)
#                  Other options: {"sqrt", "log2", None}, or an int / float
rf = RandomForestRegressor(max_features='sqrt', n_estimators=400, min_samples_leaf=0.12, random_state=SEED)

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# RMSE is the square root of the mean squared error.
RMSE = MSE(y_test, y_pred)**0.5


######################### FEATURE IMPORTANCE############################

# A Series is labelled through `index`, not `columns` (passing `columns`
# raises a TypeError); index by feature name so the bar chart is readable.
# NOTE(review): pd, plt and X are assumed to be defined in an earlier cell.
importances_rf = pd.Series(rf.feature_importances_, index=X.columns)

# Ascending sort puts the most important feature at the top of the barh plot.
sorted_importances_rf = importances_rf.sort_values()

sorted_importances_rf.plot(kind='barh', color='lightgreen')
plt.show()

NameError: name 'SEED' is not defined

## Boosting
- ***Adaboost***: Adaptive Boosting
-                     Each predictor pays more attention to instances wrongly predicted by its predecessor
-                     Changes weights of instances
-                     Each predictor is assigned alpha which depends on training error
-                     Eta coefficient - learning rate. 0 < Eta < 1
-                     Model 1 = M(X,y), Model 2 = M(W2,X,y), Model 3 = M(W3,X,y)... where alpha affects Weights
                    
#  
                    
- ***Gradient Boost (GB)***:  Uses CARTs           
-                     Does not change weights of instances. Each predictor is trained using residual errors of its predecessor as labels.
-                     Residual r = y - y_hat
-                     Eta - learning rate here is defined as Shrinkage
-                     Model 1 = M(X,y), Model 2 = M(X, Eta x r1), Model 3 = M(X, Eta x r2)
-                     y_pred = y1 + Eta x r1 + Eta x r2 + ...
- ***Stochastic Gradient Boost (SGB)***: GB may lead to CARTs with same split points and features           
-                     Each CART is trained on random subset of data, sampled without replacement
-                     Features are also sampled without replacement to choose split points

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score

# Baseline model: a depth-1 tree (decision stump) used as the weak learner.
dt = DecisionTreeClassifier(max_depth=1, random_state=SEED)

# AdaBoost classifier built on the stump.
# The base model is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (and the old name
# removed in 1.4), so the positional form works on both old and new versions.
adb_clf = AdaBoostClassifier(dt, n_estimators=100)

adb_clf.fit(X_train, y_train)

# roc_auc_score needs scores rather than hard labels, hence predict_proba;
# column 1 holds the predicted probability of the positive class.
y_pred_proba = adb_clf.predict_proba(X_test)[:, 1]

adb_clf_roc_auc_score = roc_auc_score(y_test, y_pred_proba)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error as MSE

# Plain gradient boosting: 300 depth-1 trees.
gbt = GradientBoostingRegressor(n_estimators=300, max_depth=1, random_state=SEED)
gbt.fit(X_train, y_train)
y_pred = gbt.predict(X_test)

# RMSE is the square root of the mean squared error.
RMSE = MSE(y_test, y_pred)**0.5

########################## Stochastic Gradient Boosting ##############################

# Same model, but each tree sees a random subset of rows and features.
# This cell only instantiates it; it is not fitted here.
gbt = GradientBoostingRegressor(n_estimators=300,
                                subsample=0.8,      # random sampling of instances: 80%
                                max_features=0.2,   # random sampling of features: 20%
                                max_depth=1,
                                random_state=SEED)



NameError: name 'SEED' is not defined

## Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# List every parameter of the estimator to decide which hyperparameters to tune.
# `dt` is the estimator tuned below; the original `model` name was undefined.
print(dt.get_params())

# Define params_dt: candidate values for each hyperparameter to search over.
params_dt = {'max_depth': [2, 3, 4],
             'min_samples_leaf': [0.12, 0.14, 0.16, 0.18]}

# Instantiate grid_dt (class name is GridSearchCV; `GridSeachCV` raised a NameError).
grid_dt = GridSearchCV(estimator=dt,
                       param_grid=params_dt,
                       scoring='roc_auc',
                       cv=5,
                       verbose=1,     # messages printed during fitting; higher value = more messages
                       n_jobs=-1)

