# Decision Tree

# Imports

Necessary imports 

In [None]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import imblearn
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTETomek
from sklearn.metrics import confusion_matrix, classification_report

# Load dataset and select features

In [None]:
# Load the dataset from a CSV file (the path is relative to the working directory)
df = pd.read_csv('telecom_churn.csv')

# Remove the features MonthlyCharge and DataUsage from the dataset.
# drop() returns a new frame here, so the original `df` is left unchanged.
df_features = df.drop(["MonthlyCharge", "DataUsage"], axis=1)

# Split dataframe into two dataframes

In [None]:
# Split the dataframe into the target (only the Churn column) and a dataframe
# with all other features except Churn
y = df_features.Churn
X = df_features.drop("Churn", axis=1)

# Part 1

### Split in training set and test set
Splits the data into two parts: a training set and a test set. The training set is used to fit the classifier and the test set is used to evaluate it.

In [None]:
# Create training set and test set from X and y, with an 80/20 split.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state=5)

### Classifier
Trains the classifier

In [None]:
# Create decision tree classifier (default hyperparameters, fixed seed)
dtc = DecisionTreeClassifier(random_state=5)

# Train classifier with the training set
dtc.fit(X_train, y_train)

#### Classification report and confusion matrix

In [None]:
# Print the classification report for the untuned tree, evaluated on the test set
print(classification_report(y_test, dtc.predict(X_test)))

In [None]:
# The code below is taken from
# https://medium.com/@dtuk81/confusion-matrix-visualization-fc31e3f30fea
# with minor changes
def create_confusion_matrix(dtc, y_test, X_test, path):
    """Print, plot and save the confusion matrix of a fitted binary classifier.

    The matrix is computed from ``dtc.predict(X_test)`` against ``y_test``,
    printed to stdout, drawn as an annotated seaborn heatmap (each cell shows
    its name, its count and its share of all samples) and written to ``path``.

    Parameters
    ----------
    dtc : fitted estimator exposing ``predict``.
    y_test : true labels of the evaluation set.
    X_test : features of the evaluation set.
    path : file path the figure is saved to; missing parent directories
        are created.

    Returns
    -------
    None

    Notes
    -----
    The cell names and the ``reshape(2, 2)`` assume a 2x2 matrix, i.e. a
    binary problem in which both classes occur in ``y_test`` or in the
    predictions.
    """
    from pathlib import Path

    confusion_tree = confusion_matrix(y_test, dtc.predict(X_test))
    # Generic header: this helper is called for several different models, so
    # the message must not claim a particular depth or resampling step.
    print("\nconfusion matrix:\n{}".format(confusion_tree))

    # Row-major flattening of [[TN, FP], [FN, TP]] matches the name order below.
    cell_values = confusion_tree.flatten()
    group_names = ["True Negative", "False Positive", "False Negative", "True Positive"]
    group_counts = ["{0:0.0f}".format(value) for value in cell_values]
    group_percentages = ["{0:.2%}".format(value) for value in
                         cell_values / np.sum(confusion_tree)]
    labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
              zip(group_names, group_counts, group_percentages)]
    labels = np.asarray(labels).reshape(2, 2)

    # fmt="" because the annotations are pre-formatted strings, not numbers.
    sns_plot = sns.heatmap(confusion_tree, annot=labels, fmt="", cmap='Blues')
    sns_plot.set_title("Confusion matrix")
    sns_plot.set_xlabel("Predicted")
    sns_plot.set_ylabel("Actual")

    # savefig does not create directories; make sure the target folder exists.
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    sns_plot.figure.savefig(path)

In [None]:
# Plot and save the confusion matrix of the untuned tree
create_confusion_matrix(dtc, y_test, X_test, "images/dt/confusion_matrix_dt_original.jpg")

### Tuning Hyperparameters with KFold and GridSearchCV

In [None]:
# KFold cross-validation: 10 shuffled folds, with a fixed seed so the fold
# assignment is the same on every run

cv_method = KFold(n_splits=10, shuffle=True, random_state=5)

In [None]:
# This code is taken from
# https://www.featureranking.com/tutorials/machine-learning-tutorials/sk-part-3-cross-validation-and-hyperparameter-tuning/
# With minor changes
def tuning_hyperparameters(cv_method=cv_method):
    """Grid-search ``max_depth`` and ``max_features`` for a decision tree.

    The search is fitted on the module-level ``X_train`` / ``y_train`` as they
    exist when the function is called.

    Parameters
    ----------
    cv_method : cross-validation splitter passed to ``GridSearchCV``.
        The default is the module-level ``cv_method`` object that existed
        when this function was defined; rebinding that global afterwards
        does not change the default.

    Returns
    -------
    The fitted ``GridSearchCV`` instance (use ``best_estimator_`` /
    ``best_params_`` on it).
    """
    tune_tree = DecisionTreeClassifier(random_state=5)

    # A split cannot consider more features than the data has, so candidates
    # above the column count are at best redundant (and some scikit-learn
    # versions reject them outright). Cap the intended 4..18 range at the
    # actual number of columns; with 18 or more columns the grid is unchanged.
    n_features = X_train.shape[1]
    max_features_upper = min(18, n_features)
    max_features_lower = min(4, max_features_upper)

    dt_parameters = {
        'max_depth': range(1, 11),
        'max_features': range(max_features_lower, max_features_upper + 1),
    }

    dt_gridSearch = GridSearchCV(tune_tree, dt_parameters,
                                 cv=cv_method, n_jobs=-1,
                                 verbose=True)

    dt_gridSearch.fit(X_train, y_train)
    return dt_gridSearch

#### Tuning hyperparameters for classifier

In [None]:
# GridSearch for tuning hyperparameters

gridSearch_tuned = tuning_hyperparameters()
# Keep the estimator selected by the search for the evaluation cells below
best_tuned_clf = gridSearch_tuned.best_estimator_
# Bare expression so the notebook shows the selected estimator as the cell output
gridSearch_tuned.best_estimator_

In [None]:
# Print classification report for the tuned tree, evaluated on the test set

print(classification_report(y_test, best_tuned_clf.predict(X_test)))

In [None]:
# Visualize and save the confusion matrix of the tuned tree

create_confusion_matrix(best_tuned_clf, y_test, X_test, "images/dt/confusion_matrix_dt_original_tuned.jpg")

# Part 2

### SMOTETomek Resampling

In [None]:
# Methods for resampling
def resample_SMOTE(X_train, y_train, random_state):
    """Resample a training set with SMOTETomek.

    Only the data passed in is resampled, so the caller decides which split
    is balanced. Resampling data that also contains the test rows would leak
    test information into training.

    Parameters
    ----------
    X_train : feature matrix to resample.
    y_train : labels belonging to ``X_train``.
    random_state : seed forwarded to ``SMOTETomek`` for reproducible output.

    Returns
    -------
    tuple
        ``(X_smt, y_smt)``, the resampled features and labels.
    """
    # sampling_strategy=1.0 is passed as a float ratio; see the imbalanced-learn
    # documentation for its exact meaning (float ratios are for binary targets).
    smt = SMOTETomek(sampling_strategy=1.0, random_state=random_state)

    # Use the arguments, not the module-level X / y (the full dataset).
    # fit_resample replaces the deprecated, later removed, fit_sample alias.
    X_smt, y_smt = smt.fit_resample(X_train, y_train)

    return X_smt, y_smt

### Split in training set and test set and resample with SMOTETomek
Splits the data into two parts: a training set and a test set. The training set is used to fit the classifier and the test set is used to evaluate it. The training set is then resampled with SMOTETomek.

In [None]:
# Create training set and test set from X and y (same 80/20 split and seed as in Part 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state=5)

# Rebind X_train / y_train to the output of resample_SMOTE (seed 5);
# X_test / y_test are kept exactly as produced by the split above
X_train, y_train = resample_SMOTE(X_train, y_train, 5)

### Classifier
Trains the classifier

In [None]:
# Declare decision tree classifier (default hyperparameters, fixed seed)
dtc = DecisionTreeClassifier(random_state=5)

# Train classifier with the (resampled) training set
dtc.fit(X_train, y_train)

### Classification report and confusion matrix

In [None]:
# Print classification report for the untuned tree trained in Part 2,
# evaluated on the test set

print(classification_report(y_test, dtc.predict(X_test)))

In [None]:
# Plot and save the confusion matrix for the tree trained on the resampled dataset

create_confusion_matrix(dtc, y_test, X_test, "images/dt/confusion_matrix_dt.jpg")

### Tuning Hyperparameters with resampled dataset, KFold and GridSearchCV

In [None]:
# KFold cross-validation: 10 shuffled folds with a fixed seed.
# NOTE(review): tuning_hyperparameters captured its default cv_method when it
# was defined, so a no-argument call does not pick up this rebinding. The
# settings are identical to the earlier splitter, so the folds are configured
# the same way either way.

cv_method = KFold(n_splits=10, shuffle=True, random_state=5)

#### Tuning hyperparameters

In [None]:
# GridSearch for tuning hyperparameters (fits on the current X_train / y_train)

gridSearch_tuned = tuning_hyperparameters()
# Keep the estimator selected by the search for the evaluation cells below
best_tuned_clf = gridSearch_tuned.best_estimator_
# Bare expression so the notebook shows the selected estimator as the cell output
gridSearch_tuned.best_estimator_

#### Classification report and confusion matrix

In [None]:
# Print classification report for the tuned tree from Part 2, evaluated on the test set

print(classification_report(y_test, best_tuned_clf.predict(X_test)))

In [None]:
# Plot and save the confusion matrix of the tuned tree from Part 2
create_confusion_matrix(best_tuned_clf, y_test, X_test, "images/dt/confusion_matrix_dt_tuned.jpg")