# Advanced Classification - Ensemble Methods - Exercises with Answers

## Exercise 1

#### Task 1 
##### Import the required packages to perform random forest and ensemble methods.
##### Set the working directory to the data directory.

##### Print the working directory.

#### Result:


In [None]:
#Helper packages
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from textwrap import wrap
import pickle

# Model set up and tuning packages from scikit-learn.
from sklearn.model_selection import train_test_split

# Scikit-learn package for data preprocessing.
from sklearn import preprocessing

# Scikit-learn packages for evaluating model performance.
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Random forest and boosting packages
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
from pathlib import Path

# `home_dir` is the home directory of the current user.
home_dir = Path.home()

# `main_dir` is the `advanced-classification` folder, expected on the Desktop.
main_dir = home_dir.joinpath("Desktop", "advanced-classification")

# `data_dir` is the `data` subfolder inside `main_dir`.
data_dir = main_dir.joinpath("data")

In [None]:
# Make the data directory the current working directory so that the data
# files can be opened by bare file name.
os.chdir(data_dir)

# Print the working directory to confirm the change took effect.
print(os.getcwd())

#### Task 2
##### Load the dataset `bank_marketing.csv` and save it to `bank_marketing`.
##### Print the first few rows of `bank_marketing`.
##### Print the frequency of the target variable `y`.

#### Result:

In [None]:
# Read the CSV (the path is relative to the working directory) into a dataframe.
bank_marketing = pd.read_csv("bank_marketing.csv")
# Show the first rows of the dataframe.
bank_marketing.head()

In [None]:
# Count how many rows fall into each value of the target variable `y`.
print(bank_marketing['y'].value_counts())

#### Task 3
##### Check for NAs in `bank_marketing` and print the count of NAs in each column. If a missing value is in an integer-type column, fill it with the mean value of that column.
##### Now, print the count of NAs in each column again.

In [None]:
# Count the missing values in every column before imputation.
print(bank_marketing.isna().sum())

In [None]:
# Fill the missing values of each numeric column with that column's own mean.
# Only numeric columns are selected, so text columns never receive a numeric
# fill value and computing the mean cannot fail on non-numeric data.
numeric_cols = bank_marketing.select_dtypes(include = "number").columns
bank_marketing[numeric_cols] = bank_marketing[numeric_cols].fillna(bank_marketing[numeric_cols].mean())

In [None]:
# Count the missing values in every column again to verify the imputation.
print(bank_marketing.isna().sum())

#### Task 4
##### Print the datatypes of all the columns and convert the categorical columns to dummy variables.
##### The datatype of target variable needs to be binary. Check for the datatype of the target variable and convert it to binary.

In [None]:
# Show the dtype of every column to see which ones are non-numeric.
bank_marketing.dtypes

In [None]:
# Categorical columns to expand into one indicator column per category.
categorical_cols = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'day_of_week', 'poutcome',
]

# Replace the categorical columns with their dummy (indicator) columns.
bank_marketing = pd.get_dummies(bank_marketing, columns = categorical_cols)

In [None]:
# Encode the target as a boolean: True where `y` is "yes", False otherwise.
bank_marketing["y"] = bank_marketing["y"].eq("yes")

# Confirm the new dtype of the target column.
print(bank_marketing["y"].dtype)

## Exercise 2

#### Task 1
##### Select the predictors by dropping variable `y` and save the result to a dataframe `X_ex`.
##### Save the target variable `y` column to `y_ex` variable.
##### Set seed as 1.
##### Split the data into train and test sets and save respective variables to `X_train_ex`, `X_test_ex`, `y_train_ex`, `y_test_ex`.

#### Result:

In [None]:
# Select the predictors (every column except `y`) and the target.
X_ex = bank_marketing.drop(['y'], axis = 1)
y_ex = np.array(bank_marketing['y'])

# Set the seed to 1.
np.random.seed(1)

# Split into training (70%) and test (30%) sets.
# `random_state` pins the shuffle to this call, so the split stays the same
# even if the cell is re-run or other code has consumed random numbers since
# the global seed was set.
X_train_ex, X_test_ex, y_train_ex, y_test_ex = train_test_split(X_ex, y_ex,
                                                                test_size = 0.3,
                                                                random_state = 1)

#### Task 2
##### Create a random forest classifier and save it to `forest_ex` variable.
##### Set random state to 1, number of estimators to 100 and `gini` as the criterion in the model.
##### Fit the classifier to our training data.

#### Result:

In [None]:
# Random forest with 100 trees, the `gini` split criterion and a fixed seed.
forest_ex = RandomForestClassifier(
    n_estimators = 100,
    criterion = 'gini',
    random_state = 1,
)

In [None]:
# Train the random forest on the training predictors and labels.
forest_ex.fit(X = X_train_ex, y = y_train_ex)

#### Task 3
##### Predict on the test data and print the first 5 predictions on the test data.

#### Result:

In [None]:
# Let the fitted forest classify every row of the test set.
y_predict_forest_ex = forest_ex.predict(X_test_ex)

# Show the first five predictions.
print(y_predict_forest_ex[:5])

#### Task 4
##### Print the confusion matrix and save it as `conf_matrix_forest_ex`.
##### Print the accuracy score as `accuracy_forest_ex`.
##### Compute the accuracy on the training data.

#### Result:

In [None]:
# Confusion matrix of true vs. predicted labels on the test set.
conf_matrix_forest_ex = confusion_matrix(y_true = y_test_ex,
                                         y_pred = y_predict_forest_ex)
print(conf_matrix_forest_ex)

In [None]:
# Share of test rows the random forest classified correctly.
accuracy_forest_ex = accuracy_score(y_true = y_test_ex,
                                    y_pred = y_predict_forest_ex)
print("Accuracy for random forest on test data: ", accuracy_forest_ex)

In [None]:
# Score the forest on the data it was trained on, for comparison with the
# test accuracy.
acc_train_forest_ex = forest_ex.score(X = X_train_ex, y = y_train_ex)
print("Train Accuracy:", acc_train_forest_ex)

#### Task 5
##### Load the pickle `ex_model_final_logistic.sav` and save it as `ex_model_final_tree`.
##### Append the test accuracy `accuracy_forest_ex` to the dataframe `ex_model_final_tree`.
##### Print the results.

In [None]:
# Load the previously saved results dataframe. The context manager closes the
# file handle as soon as loading is done.
# NOTE: only unpickle files you created yourself; unpickling can run
# arbitrary code.
with open("ex_model_final_logistic.sav", "rb") as results_file:
    ex_model_final_tree = pickle.load(results_file)

# Build a one-row dataframe with the test accuracy of the random forest model.
forest_result = pd.DataFrame([{'metrics': "accuracy",
                               'values': round(accuracy_forest_ex, 4),
                               'model': 'random_forest'}])

# Add the row to the results. `DataFrame.append` was removed in pandas 2.0,
# so `pd.concat` is used instead.
ex_model_final_tree = pd.concat([ex_model_final_tree, forest_result],
                                ignore_index = True)

print(ex_model_final_tree)

#### Task 6
##### Print the feature importance graph and print the top 10 important features in our random forest model.

#### Result:

In [None]:
# Predictor columns only (the target `y` is removed).
bank_marketing_features = bank_marketing.drop(columns = 'y')

features_ex = bank_marketing_features.columns
importances_ex = forest_ex.feature_importances_

# Order the features from most to least important, keep the ten best and flip
# them so that the most important one ends up as the top bar.
indices_ex = np.argsort(importances_ex)[::-1]
top_indices_ex = indices_ex[:10][::-1]

# Wrap long feature names at 13 characters so they fit next to the bars.
bar_positions = range(len(top_indices_ex))
labels = ['\n'.join(wrap(name, 13)) for name in features_ex[top_indices_ex]]

plt.figure(1)
plt.barh(bar_positions, importances_ex[top_indices_ex], color = 'b', align = 'center')
plt.yticks(bar_positions, labels)
plt.title('Feature Importance of Bank marketing dataset')
plt.xlabel('Relative Importance')

## Exercise 3

#### Task 1

##### Create a gradient boosting classifier as `gbm_ex` with number of estimators set to 100, learning rate set to 1, max depth set to 1, and random state set to 1.
##### Fit the model to our training data.

#### Result:

In [None]:
# Gradient boosting classifier: 100 boosting stages of depth-1 trees,
# learning rate 1 and a fixed seed.
gbm_ex = GradientBoostingClassifier(
    random_state = 1,
    max_depth = 1,
    learning_rate = 1,
    n_estimators = 100,
)

In [None]:
# Train the gradient boosting model on the training predictors and labels.
gbm_ex.fit(X = X_train_ex, y = y_train_ex)

#### Task 2
##### Predict on the test data using our gbm classifier.
##### Print the first 5 predicted values.

#### Result:

In [None]:
# Let the fitted gbm model classify every row of the test set.
predicted_values_gbm_ex = gbm_ex.predict(X_test_ex)

# Show the first five predictions.
print(predicted_values_gbm_ex[:5])

#### Task 3
##### Print the confusion matrix and accuracy score on the test data.
##### Print the training accuracy of gbm model.

#### Result:

In [None]:
# Confusion matrix of true vs. predicted labels on the test set.
conf_matrix_boosting_ex = confusion_matrix(y_true = y_test_ex,
                                           y_pred = predicted_values_gbm_ex)
print(conf_matrix_boosting_ex)

In [None]:
# Share of test rows the gbm model classified correctly.
accuracy_gbm_ex = accuracy_score(y_true = y_test_ex,
                                 y_pred = predicted_values_gbm_ex)
print('Accuracy of gbm on test data: ', accuracy_gbm_ex)

In [None]:
# Score the gbm model on the data it was trained on, for comparison with the
# test accuracy.
train_accuracy_gbm_ex = gbm_ex.score(X = X_train_ex, y = y_train_ex)
print("Train Accuracy:", train_accuracy_gbm_ex)

#### Task 4
##### Print the feature importance graph and print top 10 important predictors.
#### Result:

In [None]:
# Predictor columns only (the target `y` is removed).
bank_marketing_features = bank_marketing.drop('y', axis = 1)

features_ex = bank_marketing_features.columns
importances_ex = gbm_ex.feature_importances_

# Order the features from most to least important, keep the ten best and flip
# them so that the most important one ends up as the top bar.
indices_ex = np.argsort(importances_ex)[::-1]
top_indices_ex = indices_ex[0:10][::-1]

# Use a figure number of its own: figure 1 already holds the random forest
# chart, and reusing it would draw these bars over that chart whenever the
# earlier figure is still open.
plt.figure(2)
plt.title('Feature Importance of Bank marketing dataset')
plt.barh(range(len(top_indices_ex)), importances_ex[top_indices_ex], color = 'b', align = 'center')

# Wrap long feature names at 13 characters so they fit next to the bars.
labels = features_ex[top_indices_ex]
labels = [ '\n'.join(wrap(l,13)) for l in labels ]
plt.yticks(range(len(top_indices_ex)), labels)
plt.xlabel('Relative Importance')

#### Task 5 

##### Append the test accuracy `accuracy_gbm_ex` to the dataframe `ex_model_final_tree`.
##### Print the results.
##### Remember to pickle the dataframe as `ex_model_final_tree.sav`.

#### Result:

In [None]:
# Build a one-row dataframe with the test accuracy of the gbm model.
gbm_result = pd.DataFrame([{'metrics': "accuracy",
                            'values': round(accuracy_gbm_ex, 4),
                            'model': 'gbm'}])

# Add the gbm row to our results dataframe. `DataFrame.append` was removed in
# pandas 2.0, so `pd.concat` is used instead.
ex_model_final_tree = pd.concat([ex_model_final_tree, gbm_result],
                                ignore_index = True)
print(ex_model_final_tree)

In [None]:
# Pickle the results dataframe. The context manager flushes and closes the
# file, so the data is fully written to disk once this cell finishes.
with open("ex_model_final_tree.sav", "wb") as results_file:
    pickle.dump(ex_model_final_tree, results_file)