In [1]:
%load_ext autoreload
%autoreload 2

In [10]:
# Import Libraries

# DS 
import os
from itertools import islice
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from mpl_toolkits.mplot3d import Axes3D
from scipy import stats
import seaborn as sns

# PLOTLY
import plotly.offline as po
import plotly.graph_objs as go

# SKLEARN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.ensemble import (GradientBoostingRegressor, 
                              GradientBoostingClassifier, 
                              AdaBoostClassifier,
                              AdaBoostRegressor,
                              RandomForestRegressor,
                              RandomForestClassifier)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, make_scorer, confusion_matrix, accuracy_score, plot_roc_curve
from sklearn.svm import SVC
from sklearn.ensemble.partial_dependence import partial_dependence, plot_partial_dependence

# Aesthetic Plots
%matplotlib inline

In [3]:
# Modules
from src.data_clean import *
from src.grid_search import *
from src.helpers import *

In [4]:
# GLOBAL VARS
FSIZE = (12, 8)

In [5]:
# Read and Clean CSV
churn = data_cleaner(pd.read_csv("data/churn_train.csv"))

In [6]:
# Create X, y arrays from dataframe
X = churn
y = churn.pop("target")

# Train Test Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Model Comparison

> Logistic Regression

In [15]:
#Fit the logistic Regression Model
logmodel = LogisticRegression(random_state = 50)
logmodel.fit(X_train,y_train)

#Predict the value for new, unseen data
pred = logmodel.predict(X_test)

# Find Accuracy using accuracy_score method
logmodel_accuracy = round(accuracy_score(y_test, pred) * 100, 2)


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



> KNN Classifier

In [14]:
#Fit the K-Nearest Neighbor Model
knnmodel = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2) #p=2 represents Euclidean distance, p=1 represents Manhattan Distance
knnmodel.fit(X_train, y_train) 

#Predict the value for new, unseen data
knn_pred = knnmodel.predict(X_test)

# Find Accuracy using accuracy_score method
knn_accuracy = round(accuracy_score(y_test, knn_pred) * 100, 2)

> Decision Tree Classifier

In [11]:
#Fit the Decision Tree Classification Model
dtmodel = DecisionTreeClassifier(criterion = "gini", random_state = 50)
dtmodel.fit(X_train, y_train) 
  
#Predict the value for new, unseen data
dt_pred = dtmodel.predict(X_test)

# Find Accuracy using accuracy_score method
dt_accuracy = round(accuracy_score(y_test, dt_pred) * 100, 2)

> Random Forest Classification

In [12]:
#Fit the Random Forest Classification Model
rfmodel = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0)
rfmodel.fit(X_train, y_train) 
  
#Predict the value for new, unseen data
rf_pred = rfmodel.predict(X_test)

# Find Accuracy using accuracy_score method
rf_accuracy = round(accuracy_score(y_test, rf_pred) * 100, 2)

> Gradient Boosted Classification

In [16]:
#Fit the Gradient Boosted Classification Model
gbmodel = GradientBoostingClassifier(random_state=50)
gbmodel.fit(X_train,y_train)

#Predict the value for new, unseen data
pred = gbmodel.predict(X_test)

# Find Accuracy using accuracy_score method
gbmodel_accuracy = round(accuracy_score(y_test, pred) * 100, 2)

> Gradient Boosted Classification (POST GRID)

```
'learning_rate': 0.1,
'max_depth': 6,
'max_features': 0.3,
'min_samples_leaf': 10,
'n_estimators': 100,
'random_state': 50
```

In [21]:
#Fit the Gradient Boosted Classification Model
gbmodel_grid = GradientBoostingClassifier(learning_rate=0.1,
                                     max_depth=6,
                                     max_features=0.3,
                                     min_samples_leaf=10,
                                     n_estimators=100,
                                     random_state=50)
gbmodel_grid.fit(X_train,y_train)

#Predict the value for new, unseen data
pred = gbmodel_grid.predict(X_test)

# Find Accuracy using accuracy_score method
gbmodel_grid_accuracy = round(accuracy_score(y_test, pred) * 100, 2)

In [17]:
# Compare Several models according to their Accuracies
Model_Comparison = pd.DataFrame({
    'Model': ['Logistic Regression', 'K-Nearest Neighbor', 
              'Decision Tree', 'Random Forest', 'Gradient Boosting', 'Gradient Boosting (POST GRID)'],
    'Score': [logmodel_accuracy, knn_accuracy, 
              dt_accuracy, rf_accuracy, gbmodel_accuracy, gbmodel_grid_accuracy]})
Model_Comparison_df = Model_Comparison.sort_values(by='Score', ascending=False)
Model_Comparison_df = Model_Comparison_df.set_index('Score')
Model_Comparison_df.reset_index()

Unnamed: 0,Score,Model
0,79.38,Gradient Boosting
1,76.42,Random Forest
2,74.48,K-Nearest Neighbor
3,72.5,Logistic Regression
4,71.41,Decision Tree
