# Assignment 3 - Churn Classification

In [1]:
# Import libraries and my knn class.
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score
from sklearn import preprocessing
from error_metrics import *
from knn import * 
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

# Load the data
data = pd.read_csv('churn_data.csv')
validation = pd.read_csv('churn_validation.csv')
del data['CustID']
del validation['CustID']
data.head()

Unnamed: 0,Gender,Age,Income,FamilySize,Education,Calls,Visits,Churn
0,Male,34,Lower,4,16,14,5,Yes
1,Male,20,Lower,5,14,49,1,No
2,Female,30,Lower,4,20,19,4,Yes
3,Female,46,Lower,4,14,15,4,Yes
4,Female,23,Lower,4,16,18,0,No


### What is the response variable, and what are the predictor variables?
The response variable is 'Churn'. The predictor varaibles are 'Gender', 'Age', 'Income', 'FamilySize', 'Education', 'Calls', and 'Visits'.
### What data transforms are necessary to perform on this data and why?
The column 'CustID' can be deleted because it does not make sense as a predictor variable in this case. Also, all of the predictor columns must be numerical to be able to perform a KNN algorithm so you have to one-hot encode the categorical columns. I scaled the x data so all of the columns would be on the same scale. Also I encoded the y data. 

## Getting the data ready:

In [2]:
# Select x and y data.
features = list(data)
features.remove('Churn')
data_x = data[features]
data_y = data['Churn']

In [3]:
# Convert class lables to numbers using label encoding.
le = preprocessing.LabelEncoder()
data_y = le.fit_transform(data_y)

# One-Hot Encode features (data_x).
def cat_features(dataframe):
    td = pd.DataFrame({'a':[1,2,3], 'b':[1.0, 2.0, 3.0]})
    return filter(lambda x: not(dataframe[x].dtype in [td['a'].dtype, td['b'].dtype]), list(dataframe))

data_x = pd.get_dummies(data_x, columns=list(cat_features(data_x)))

# Split into training and test sets.
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.3)

In [4]:
# Scale data.
min_max_scaler = preprocessing.MinMaxScaler() # Default scaled range is [0, 1]

# Get the preprocessed training and test data
train_x_pp = min_max_scaler.fit_transform(x_train)
test_x_pp = min_max_scaler.transform(x_test)

  return self.partial_fit(X, y)


## Exploring Different Models:

### 1. KNN (***class created by me!***)

In [5]:
from scipy.spatial.distance import euclidean
knn = KNN(5, euclidean)
knn.fit(train_x_pp, y_train)
y_hat = knn.predict(test_x_pp)

In [6]:
# Compare labels.
y_test_labs = le.inverse_transform(y_test)
y_hat_labs = le.inverse_transform(y_hat)
print('(Actual, Predicted): \n'+str(list(zip(y_test_labs, y_hat_labs))))

(Actual, Predicted): 
[('Yes', 'No'), ('Yes', 'Yes'), ('Yes', 'Yes'), ('No', 'No'), ('Yes', 'Yes'), ('No', 'Yes'), ('Yes', 'Yes'), ('Yes', 'Yes'), ('Yes', 'Yes'), ('No', 'No'), ('No', 'Yes'), ('Yes', 'No'), ('No', 'Yes'), ('Yes', 'Yes'), ('No', 'Yes'), ('Yes', 'Yes'), ('Yes', 'Yes'), ('No', 'No'), ('Yes', 'Yes'), ('No', 'No'), ('Yes', 'No'), ('Yes', 'Yes'), ('No', 'Yes'), ('No', 'No'), ('Yes', 'Yes'), ('Yes', 'Yes'), ('No', 'No'), ('Yes', 'Yes'), ('No', 'No'), ('Yes', 'No'), ('Yes', 'Yes'), ('Yes', 'Yes'), ('No', 'No'), ('No', 'No'), ('Yes', 'Yes'), ('Yes', 'Yes'), ('No', 'Yes'), ('No', 'No'), ('Yes', 'Yes')]


In [7]:
# Evaluate results at k.
print('--------- EVALUATING MODEL: k = '+str(knn.k)+ '-----------')
print('Accuracy: '+str(accuracy_score(y_test, y_hat)))
print('Precision: '+str(precision_score(y_test, y_hat)))
print('Recall: '+str(recall_score(y_test, y_hat)))
print('F1: '+str(f1_score(y_test, y_hat)))
print('ROC AUC: '+str(roc_auc_score(y_test, y_hat)))
print('Confusion Matrix: \n'+str(confusion_matrix(y_test, y_hat)))

--------- EVALUATING MODEL: k = 5-----------
Accuracy: 0.7435897435897436
Precision: 0.76
Recall: 0.8260869565217391
F1: 0.7916666666666667
ROC AUC: 0.7255434782608695
Confusion Matrix: 
[[10  6]
 [ 4 19]]


### 2. Naive Bayes

In [8]:
# Build and evaluate the model.
from sklearn import naive_bayes

nb_mod = naive_bayes.GaussianNB()
nb_mod.fit(train_x_pp, y_train)
y_hat = nb_mod.predict(test_x_pp)
print_multiclass_classif_error_report(y_test, y_hat)

Accuracy: 0.717948717948718
Avg. F1 (Micro): 0.717948717948718
Avg. F1 (Macro): 0.6980999296270232
Avg. F1 (Weighted): 0.7119940814522095
Confusion Matrix: 
[[ 9  7]
 [ 4 19]]
              precision    recall  f1-score   support

           0       0.56      0.69      0.62        13
           1       0.83      0.73      0.78        26

   micro avg       0.72      0.72      0.72        39
   macro avg       0.69      0.71      0.70        39
weighted avg       0.74      0.72      0.72        39



### 3. Decision Trees

In [9]:
from sklearn import tree

# Information Entropy approach.
dtree_ent = tree.DecisionTreeClassifier(criterion='entropy')
dtree_ent.fit(train_x_pp, y_train)
y_hat_ent = dtree_ent.predict(test_x_pp)
print_multiclass_classif_error_report(y_test, y_hat_ent)

Accuracy: 0.7948717948717948
Avg. F1 (Micro): 0.7948717948717948
Avg. F1 (Macro): 0.7771428571428571
Avg. F1 (Weighted): 0.7884249084249085
Confusion Matrix: 
[[10  6]
 [ 2 21]]
              precision    recall  f1-score   support

           0       0.62      0.83      0.71        12
           1       0.91      0.78      0.84        27

   micro avg       0.79      0.79      0.79        39
   macro avg       0.77      0.81      0.78        39
weighted avg       0.82      0.79      0.80        39



### 4. Random Forest 

In [10]:
# Build a sequence of Random Forest models for different n_est and depth values.
from sklearn import ensemble

n_ests = [50]
depths = [6]
for n in n_ests:
    for dp in depths:
        rf = ensemble.RandomForestClassifier(n_estimators=n, max_depth=dp)
        rf.fit(x_train, y_train)
        y_hat = rf.predict(x_test)
        print('--------EVALUATING MODEL: n_estimators = '+str(n)+', max_depth = '+str(dp)+'--------')
        print_multiclass_classif_error_report(y_test, y_hat)

--------EVALUATING MODEL: n_estimators = 50, max_depth = 6--------
Accuracy: 0.8205128205128205
Avg. F1 (Micro): 0.8205128205128205
Avg. F1 (Macro): 0.8078817733990148
Avg. F1 (Weighted): 0.8167235063786789
Confusion Matrix: 
[[11  5]
 [ 2 21]]
              precision    recall  f1-score   support

           0       0.69      0.85      0.76        13
           1       0.91      0.81      0.86        26

   micro avg       0.82      0.82      0.82        39
   macro avg       0.80      0.83      0.81        39
weighted avg       0.84      0.82      0.82        39



### 5. Support Vector Machine (SVM)

In [11]:
# Make a sequence of SVM classifiers for different values of error term c. **Note: c=1.0 is default.
from sklearn import svm

cs = [1.5, 2.0, 2.5]
for c in cs:
    # Create model and fit
    mod = svm.SVC(C=c)
    mod.fit(train_x_pp, y_train)
    
    # Make predictions
    y_hat = mod.predict(test_x_pp)
    print('--------- EVALUATING MODEL: C = ' + str(c) + ' ------------')
    print_multiclass_classif_error_report(y_test, y_hat)

--------- EVALUATING MODEL: C = 1.5 ------------
Accuracy: 0.6153846153846154
Avg. F1 (Micro): 0.6153846153846154
Avg. F1 (Macro): 0.5883180858550316
Avg. F1 (Weighted): 0.6072646565257401
Confusion Matrix: 
[[ 7  9]
 [ 6 17]]
              precision    recall  f1-score   support

           0       0.44      0.54      0.48        13
           1       0.74      0.65      0.69        26

   micro avg       0.62      0.62      0.62        39
   macro avg       0.59      0.60      0.59        39
weighted avg       0.64      0.62      0.62        39

--------- EVALUATING MODEL: C = 2.0 ------------
Accuracy: 0.6410256410256411
Avg. F1 (Micro): 0.6410256410256411
Avg. F1 (Macro): 0.61
Avg. F1 (Weighted): 0.6297435897435897
Confusion Matrix: 
[[ 7  9]
 [ 5 18]]
              precision    recall  f1-score   support

           0       0.44      0.58      0.50        12
           1       0.78      0.67      0.72        27

   micro avg       0.64      0.64      0.64        39
   macro avg   



## Evaluating the Models
What modeling approaches did you use and why? Describe your model development process, including the different models tried, feature selection methods, and the different transformation techniques you employed.
Which error metrics did you use to assess performance and why? What kind of performance did you obtain on the different models you built?

I tried the KNN class that I created, Naive Bayes, Decision Tree, Random Forest, and SVM. I mainly assessed the accuracy score and confusion matrix when I compared the different models. They all seemed to perform pretty well but I chose to develop the Random Forest model further because it had the best results on its own. It had an accuracy score of 0.8205128205128205 with 50 estimators and a max depth of 6. I was very happy with the KNN class that I created because it had an accuracy score of 0.7435897435897436 and a precision score of 0.7142857142857143. I thought that it would be a lot worse because the algorithm I created was not very complex.

## Validation Set:
Load the dataset “churn_validation.csv” into a new data frame and recode as necessary. Predict the outcomes for each of the customers and compare to the actual. What are the error rates you get based on your selected metrics?

In [12]:
# Transform validation set.

# Select x and y data.
features = list(validation)
features.remove('Churn')
val_x = validation[features]
val_y = validation['Churn']

# Convert class lables to numbers using label encoding.
le = preprocessing.LabelEncoder()
val_y = le.fit_transform(val_y)

# One-Hot Encode features (val_x).
val_x = pd.get_dummies(val_x, columns=list(cat_features(val_x)))

val_x.head()

Unnamed: 0,Age,FamilySize,Education,Calls,Visits,Gender_Female,Gender_Male,Income_Lower,Income_Upper
0,54,4,18,48,3,0,1,0,1
1,21,4,19,44,2,0,1,1,0
2,22,3,16,22,5,1,0,1,0
3,27,3,13,19,2,0,1,0,1
4,18,2,14,6,3,0,1,1,0


In [13]:
# Run validation set on Random Forest Model. 
y_pred = rf.predict(val_x)
print_multiclass_classif_error_report(y_pred, val_y)

Accuracy: 0.65625
Avg. F1 (Micro): 0.65625
Avg. F1 (Macro): 0.6559139784946237
Avg. F1 (Weighted): 0.6545698924731183
Confusion Matrix: 
[[11  3]
 [ 8 10]]
              precision    recall  f1-score   support

           0       0.79      0.58      0.67        19
           1       0.56      0.77      0.65        13

   micro avg       0.66      0.66      0.66        32
   macro avg       0.67      0.67      0.66        32
weighted avg       0.69      0.66      0.66        32



In [14]:
# Compare labels.
y_labels = le.inverse_transform(val_y)
y_pred_labs = le.inverse_transform(y_pred)
print('(Actual, Predicted): \n'+str(list(zip(y_labels, y_pred_labs))))

(Actual, Predicted): 
[('Yes', 'Yes'), ('Yes', 'Yes'), ('Yes', 'Yes'), ('Yes', 'Yes'), ('No', 'No'), ('Yes', 'Yes'), ('No', 'No'), ('Yes', 'No'), ('Yes', 'Yes'), ('No', 'No'), ('No', 'No'), ('No', 'Yes'), ('Yes', 'Yes'), ('No', 'No'), ('Yes', 'Yes'), ('No', 'Yes'), ('Yes', 'No'), ('No', 'No'), ('No', 'Yes'), ('No', 'No'), ('No', 'No'), ('No', 'No'), ('No', 'Yes'), ('No', 'Yes'), ('Yes', 'Yes'), ('No', 'Yes'), ('No', 'Yes'), ('Yes', 'Yes'), ('No', 'No'), ('No', 'No'), ('Yes', 'No'), ('No', 'Yes')]


### This model has an accuracy score of 0.65625 on the validation set. This isn't bad but it is not the best. 