Output is cleared due to confidentiality of the information.

## Import Libraries

In [None]:
# Import Necessary Libraries
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import pandas_profiling
import plotly.offline as po
import plotly.graph_objs as go
%matplotlib inline

In [None]:
from jupyterthemes import jtplot
jtplot.style()

## Explore Data

In [None]:
#Perform Exploratory Data Analysis in just one line of code
# Builds a pandas-profiling report directly from the Excel workbook, using the
# first column as the index.
# NOTE(review): the workbook is read here and again by the load cell that
# follows, both times from an absolute, machine-specific Windows path.
pandas_profiling.ProfileReport(pd.read_excel(r'C:\Users\luc57.DESKTOP-NB5DC80\AE\excel\cust_data(gender).xlsx',index_col=0))

In [None]:
#Import Customer Churn Dataset
churn_dataset = pd.read_excel(r'C:\Users\luc57.DESKTOP-NB5DC80\AE\excel\cust_data(gender).xlsx',index_col=0)

In [None]:
# Number of Columns and Rows in the Dataset
churn_dataset.shape

In [None]:
churn_dataset.head()

## Data Wrangling

In [None]:
# Exclude rows whose category is 'New Purchase' from the churn analysis.
# .copy() makes the filtered result an independent frame, so the in-place
# rename and the .loc assignments in the following cells modify this frame
# directly instead of raising SettingWithCopyWarning on a slice.
churn_dataset = churn_dataset.loc[churn_dataset['category']!='New Purchase'].copy()
len(churn_dataset)

In [None]:
churn_dataset.rename(columns={'category':'Churn'},inplace=True)

In [None]:
# Convert String values (Churned and Survived) of Churn column to 1 and 0
churn_dataset.loc[churn_dataset.Churn=='Churned','Churn'] = 1 
churn_dataset.loc[churn_dataset.Churn=='Survived','Churn'] = 0 

In [None]:
# Convert String values (Male and Female) of gender column to 1 and 0
churn_dataset.loc[churn_dataset.gender=='Male','gender'] = 1 
churn_dataset.loc[churn_dataset.gender=='Female','gender'] = 0 

In [None]:
#drop columns not useful for churn prediction
churn_dataset.drop(columns=['postal_code','earliest_transaction_date',
                           'last_transaction_date'],inplace=True)

In [None]:
churn_dataset.columns

In [None]:
churn_dataset.head()

In [None]:
churn_dataset.isnull().sum()

In [None]:
churn_dataset.order_count.unique()

In [None]:
churn_dataset[['order_count','purchase_quantity','discount_code_used']].hist()

In [None]:
churn_dataset.referrer_channel.value_counts()[:10]

In [None]:
# Collapse every referrer channel outside the ten most frequent into 'Others'
referrer_counts = churn_dataset['referrer_channel'].value_counts()
top_10_referrer_channel = referrer_counts.head(10).index.tolist()
is_minor_channel = ~churn_dataset['referrer_channel'].isin(top_10_referrer_channel)
churn_dataset.loc[is_minor_channel, 'referrer_channel'] = 'Others'

In [None]:
# Collapse every city outside the ten most frequent into 'Others'
city_counts = churn_dataset['city'].value_counts()
top_10_cities = city_counts.head(10).index.tolist()
is_minor_city = ~churn_dataset['city'].isin(top_10_cities)
churn_dataset.loc[is_minor_city, 'city'] = 'Others'

In [None]:
churn_dataset.country.value_counts()

In [None]:
# Collapse every country outside the nine most frequent into 'Others'
country_counts = churn_dataset['country'].value_counts()
top_9_countries = country_counts.head(9).index.tolist()
is_minor_country = ~churn_dataset['country'].isin(top_9_countries)
churn_dataset.loc[is_minor_country, 'country'] = 'Others'

In [None]:
churn_dataset.country.value_counts()

In [None]:
churn_dataset.dtypes

In [None]:
churn_dataset['Churn'].value_counts()

In [None]:
churn_dataset['gender'] = churn_dataset['gender'].astype(int)
churn_dataset['Churn'] = churn_dataset['Churn'].astype(int)
churn_dataset.dtypes

In [None]:
churn_dataset["Churn"].value_counts().values

In [None]:
# Class labels and their row counts for the churn pie chart (most frequent first)
churn_counts = churn_dataset["Churn"].value_counts()
plot_by_churn_labels = churn_counts.index.tolist()
plot_by_churn_values = churn_counts.tolist()

In [None]:
print(plot_by_churn_labels)
print(plot_by_churn_values)

In [None]:
churn_dataset.Churn.value_counts(normalize=True) #if we leave out the new purchase, 87% churn

In [None]:
churn_dataset

# Exploratory Data Analysis

In [None]:
# Donut chart of the churn class counts prepared in the data-wrangling section
churn_pie = go.Pie(
    labels=plot_by_churn_labels,
    values=plot_by_churn_values,
    marker={"colors": ['Teal', 'Grey'],
            "line": {"color": "white", "width": 1.5}},
    rotation=90,
    hoverinfo="label+value+text",
    hole=0.6,
)
plot_data = [churn_pie]
plot_layout = go.Layout(
    title="Customer Churn",
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)

fig = go.Figure(data=plot_data, layout=plot_layout)
po.iplot(fig)

In [None]:
# Visualize Churn Rate by Gender
plot_by_gender = churn_dataset.groupby('gender').Churn.mean().reset_index()
# Derive labels and colours from the gender codes actually present
# (0 = Female, 1 = Male) instead of assuming both groups exist in that order.
gender_labels = plot_by_gender['gender'].map({0: 'Female', 1: 'Male'}).tolist()
gender_colors = plot_by_gender['gender'].map({0: 'orange', 1: 'green'}).tolist()
plot_data = [
    go.Bar(
        x=gender_labels,
        y=plot_by_gender['Churn'],
        width=0.3,  # scalar width applies to every bar
        marker=dict(color=gender_colors)
    )
]
plot_layout = go.Layout(
        xaxis={"type": "category"},
        yaxis={"title": "Churn Rate"},
        title='Churn Rate by Gender',
        plot_bgcolor  = 'rgb(243,243,243)',
        paper_bgcolor  = 'rgb(243,243,243)',
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
po.iplot(fig)


In [None]:
# Visualize Churn Rate by Referrer Channel
plot_by_referrer_channel = churn_dataset.groupby('referrer_channel').Churn.mean().reset_index()
# Palette of 11 colours: enough for the top-10 channels plus 'Others' once the
# bucketing cell has run; trimmed to the number of bars actually drawn.
bar_colors = ['orange', 'green', 'teal','aquamarine','purple','blanchedalmond',
              'darkgreen','cornflowerblue','indigo','lightcyan','sandybrown']
plot_data = [
    go.Bar(
        x=plot_by_referrer_channel['referrer_channel'],
        y=plot_by_referrer_channel['Churn'],
        # Scalar width applies to every bar; the earlier 3-element list only
        # covered the first three categories.
        width=0.3,
        marker=dict(color=bar_colors[:len(plot_by_referrer_channel)])
    )
]
plot_layout = go.Layout(
        xaxis={"type": "category"},
        yaxis={"title": "Churn Rate"},
        title='Churn Rate by Referrer Channel',
        plot_bgcolor  = 'rgb(243,243,243)',
        paper_bgcolor  = 'rgb(243,243,243)',
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
po.iplot(fig)

In [None]:
# Visualize Churn Rate by Last Purchase in Days
plot_by_last_purchase = churn_dataset.groupby('last_purchase_in_days').Churn.mean().reset_index()
plot_data = [
    go.Bar(
        x=plot_by_last_purchase['last_purchase_in_days'],
        y=plot_by_last_purchase['Churn'],
        # One width and one colour for all bars: the number of distinct day
        # values is data-dependent, so fixed 3-element lists styled only the
        # first three bars.
        width=0.3,
        marker=dict(color='teal')
    )
]
plot_layout = go.Layout(
        xaxis={"type": "category"},
        yaxis={"title": "Churn Rate"},
        title='Churn Rate by Last Purchase in Days',
        plot_bgcolor  = 'rgb(243,243,243)',
        paper_bgcolor  = 'rgb(243,243,243)',
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
po.iplot(fig)


In [None]:
# Visualize Churn Rate by Order Count
plot_by_order_count = churn_dataset.groupby('order_count').Churn.mean().reset_index()
plot_data = [
    go.Bar(
        x=plot_by_order_count['order_count'],
        y=plot_by_order_count['Churn'],
        # Scalar width/colour style every bar regardless of how many distinct
        # order counts the data contains (the 4-element lists did not).
        width=0.3,
        marker=dict(color='teal')
    )
]
plot_layout = go.Layout(
        xaxis={"type": "category"},
        yaxis={"title": "Churn Rate"},
        title='Churn Rate by Order Count',
        plot_bgcolor  = 'rgb(243,243,243)',
        paper_bgcolor  = 'rgb(243,243,243)',
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
po.iplot(fig)


In [None]:
# Visualize Churn Rate by Purchase Quantity
plot_by_purchase_quantity = churn_dataset.groupby('purchase_quantity').Churn.mean().reset_index()
plot_data = [
    go.Bar(
        x=plot_by_purchase_quantity['purchase_quantity'],
        y=plot_by_purchase_quantity['Churn'],
        # Scalar width/colour style every bar; the 3-element lists only
        # covered the first three quantities.
        width=0.3,
        marker=dict(color='teal')
    )
]
plot_layout = go.Layout(
        xaxis={"type": "category"},
        yaxis={"title": "Churn Rate"},
        # Title corrected: purchase_quantity is a quantity, not a duration.
        title='Churn Rate by Purchase Quantity',
        plot_bgcolor  = 'rgb(243,243,243)',
        paper_bgcolor  = 'rgb(243,243,243)',
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
po.iplot(fig)


In [None]:
# Mean churn rate at each distinct discount_code_used value, as a marker-only scatter
plot_by_discount_code_used = (
    churn_dataset.groupby('discount_code_used')['Churn'].mean().reset_index()
)
discount_trace = go.Scatter(
    x=plot_by_discount_code_used['discount_code_used'],
    y=plot_by_discount_code_used['Churn'],
    mode='markers',
    name='Low',
    marker={'size': 5, 'line': {'width': 0.8}, 'color': 'green'},
)
plot_data = [discount_trace]
plot_layout = go.Layout(
    title='Relation between discount_code_used & Churn Rate',
    xaxis={'title': "discount_code_used"},
    yaxis={'title': "Churn Rate"},
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)
fig = go.Figure(data=plot_data, layout=plot_layout)
po.iplot(fig)


In [None]:
# Mean churn rate at each distinct average_order_value, as a marker-only scatter
plot_by_average_order_value = (
    churn_dataset.groupby('average_order_value')['Churn'].mean().reset_index()
)
aov_trace = go.Scatter(
    x=plot_by_average_order_value['average_order_value'],
    y=plot_by_average_order_value['Churn'],
    mode='markers',
    name='Low',
    marker={'size': 5, 'line': {'width': 0.8}, 'color': 'green'},
)
plot_data = [aov_trace]
plot_layout = go.Layout(
    title='Relation between average_order_value & Churn Rate',
    xaxis={'title': "average_order_value"},
    yaxis={'title': "Churn Rate"},
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)
fig = go.Figure(data=plot_data, layout=plot_layout)
po.iplot(fig)


In [None]:
# Visualize Relation between Net Sales and Churn
# Groups customers by exact net_sales value and plots the mean Churn of each
# group as one marker.
plot_by_net_sales = churn_dataset.groupby('net_sales').Churn.mean().reset_index()
plot_data = [
    go.Scatter(
        x=plot_by_net_sales['net_sales'],
        y=plot_by_net_sales['Churn'],
        mode='markers',
        # NOTE(review): 'Low' looks like a leftover trace name; it does not
        # describe this series.
        name='Low',
        marker= dict(size= 5,
            line= dict(width=0.8),
            color= 'green'
           ),
    )
]
plot_layout = go.Layout(
        yaxis= {'title': "Churn Rate"},
        xaxis= {'title': "net_sales"},
        title='Relation between net_sales & Churn Rate',
        plot_bgcolor  = "rgb(243,243,243)",
        paper_bgcolor  = "rgb(243,243,243)",
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
po.iplot(fig)


The customers who spend over 200€ in net sales are less likely to churn.

# Machine Learning on Churn Dataset

## Import libraries

In [None]:
# Machine Learning classification model libraries
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [None]:
churn_dataset.dtypes

In [None]:
#Perform One Hot Encoding using get_dummies method
# Replaces referrer_channel, country and city with indicator columns named
# '<column>_<value>'; drop_first=True omits the first level of each column.
# NOTE: churn_dataset is reassigned and the three source columns no longer
# exist afterwards, so this cell cannot be re-run on its own output.
churn_dataset = pd.get_dummies(churn_dataset, columns = ['referrer_channel','country','city'],
                              drop_first=True)

In [None]:
print(len(churn_dataset.columns))

In [None]:
#Perform Feature Scaling (one hot encoding was done in the previous cell)
from sklearn.preprocessing import StandardScaler

#Perform Feature Scaling on numerical columns with large range in order to bring them on same scale
standardScaler = StandardScaler()
# NOTE(review): purchase_quantity is not in this list, and last_purchase_in_days
# is scaled here although it is dropped from the features later.
columns_for_ft_scaling = ['last_purchase_in_days','order_count','discount_code_used',
                          'average_order_value','net_sales']

#Apply the feature scaling operation on dataset using fit_transform() method
# NOTE(review): the scaler is fitted on the whole dataset, before the
# train/test split, so test rows contribute to the scaling statistics.
# The columns are overwritten in place; re-running the cell rescales them again.
churn_dataset[columns_for_ft_scaling] = standardScaler.fit_transform(churn_dataset[columns_for_ft_scaling])


In [None]:
# See subset of values
churn_dataset.head(5)

In [None]:
#Number of columns increased and have suffixes attached, as a result of get_dummies method.
churn_dataset.columns

## Create features

Because the `Churn` label is derived from `last_purchase_in_days`, that column must be dropped from the features. Otherwise it leaks the target, and feature importance will show it as the most predictive feature.

In [None]:
# Target y is the Churn column; X is everything except the target, the
# customer identifier and last_purchase_in_days
non_feature_cols = ['Churn', 'customer_name', 'last_purchase_in_days']
y = churn_dataset['Churn']
X = churn_dataset.drop(columns=non_feature_cols)

## Split data into training and test set

In [None]:
#Split the data into training set (80%) and test set (20%)
from sklearn.model_selection import train_test_split
# stratify=y keeps the churned/survived ratio the same in both partitions;
# the target is heavily imbalanced, so an unstratified split can skew it.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 77,
                                                    stratify = y)

# Logistic Regression Model

In [None]:
#Fit the logistic Regression Model
logmodel = LogisticRegression(random_state=77)
logmodel.fit(X_train,y_train)

#Predict the value for new, unseen data
pred = logmodel.predict(X_test)

# Score with recall_score on the test set, as a percentage rounded to 2 decimals.
# NOTE: despite its name, logmodel_accuracy holds recall, not accuracy.
logmodel_accuracy = round(metrics.recall_score(y_test, pred) * 100, 2)
print(logmodel_accuracy)

In [None]:
#Generate confusion matrix
from sklearn.metrics import confusion_matrix
conf_mat_logmodel = confusion_matrix(y_test,pred)
conf_mat_logmodel

# SVM Model

In [None]:
#Fit the Support Vector Machine Model
# NOTE(review): probability=True enables predict_proba; no cell shown in this
# notebook calls svcmodel.predict_proba — confirm it is needed.
svcmodel = SVC(kernel='linear', random_state=77, probability=True)
svcmodel.fit(X_train,y_train)

#Predict the value for new, unseen data
svc_pred = svcmodel.predict(X_test)

# Score with recall_score on the test set, as a percentage rounded to 2 decimals.
# NOTE: despite its name, svc_accuracy holds recall, not accuracy.
svc_accuracy = round(metrics.recall_score(y_test, svc_pred) * 100, 2)
print(svc_accuracy)

In [None]:
#Generate confusion matrix for the SVM predictions
from sklearn.metrics import confusion_matrix
# Uses svc_pred (this model's predictions); the earlier version reused the
# logistic-regression predictions and overwrote conf_mat_logmodel.
conf_mat_svc = confusion_matrix(y_test,svc_pred)
conf_mat_svc

# KNN Model

In [None]:
#Fit the K-Nearest Neighbor Model
from sklearn.neighbors import KNeighborsClassifier
knnmodel = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2) #p=2 represents Euclidean distance, p=1 represents Manhattan Distance
knnmodel.fit(X_train, y_train) 
  
#Predict the value for new, unseen data
knn_pred = knnmodel.predict(X_test)

# Score with recall_score on the test set, as a percentage rounded to 2 decimals.
# NOTE: despite its name, knn_accuracy holds recall, not accuracy.
knn_accuracy = round(metrics.recall_score(y_test, knn_pred) * 100, 2)
print(knn_accuracy)

In [None]:
#Generate confusion matrix for the K-Nearest Neighbor predictions
from sklearn.metrics import confusion_matrix
# Uses knn_pred (this model's predictions); the earlier version reused the
# logistic-regression predictions and overwrote conf_mat_logmodel.
conf_mat_knn = confusion_matrix(y_test,knn_pred)
conf_mat_knn

# Decision Tree

In [None]:
#Fit the Decision Tree Classification Model
from sklearn.tree import DecisionTreeClassifier
dtmodel = DecisionTreeClassifier(criterion = "gini", random_state = 50)
dtmodel.fit(X_train, y_train) 
  
#Predict the value for new, unseen data
dt_pred = dtmodel.predict(X_test)

# Score with recall_score on the test set, as a percentage rounded to 2 decimals.
# NOTE: despite its name, dt_accuracy holds recall, not accuracy.
dt_accuracy = round(metrics.recall_score(y_test, dt_pred) * 100, 2)
print(dt_accuracy)

In [None]:
#Generate confusion matrix for the Decision Tree predictions
from sklearn.metrics import confusion_matrix
# Uses dt_pred (this model's predictions); the earlier version reused the
# logistic-regression predictions and overwrote conf_mat_logmodel.
conf_mat_dt = confusion_matrix(y_test,dt_pred)
conf_mat_dt

# Random Forest

In [None]:
#Fit the Random Forest Classification Model
from sklearn.ensemble import RandomForestClassifier
rfmodel = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0)
rfmodel.fit(X_train, y_train) 
  
#Predict the value for new, unseen data
rf_pred = rfmodel.predict(X_test)

# Score with recall_score on the test set, as a percentage rounded to 2 decimals.
# NOTE: despite its name, rf_accuracy holds recall, not accuracy.
rf_accuracy = round(metrics.recall_score(y_test, rf_pred) * 100, 2)
print(rf_accuracy)

In [None]:
#Generate confusion matrix for the Random Forest predictions
from sklearn.metrics import confusion_matrix
# Uses rf_pred (this model's predictions); the earlier version reused the
# logistic-regression predictions and overwrote conf_mat_logmodel.
conf_mat_rf = confusion_matrix(y_test,rf_pred)
conf_mat_rf

# Compare Model Accuracy

In [None]:
# Compare the models by their test-set score. The *_accuracy variables were
# computed with recall_score, so 'Score' is recall in percent.
Model_Comparison = pd.DataFrame({
    'Model': ['Logistic Regression', 'Support Vector Machine', 'K-Nearest Neighbor', 
              'Decision Tree', 'Random Forest'],
    'Score': [logmodel_accuracy, svc_accuracy, knn_accuracy, 
              dt_accuracy, rf_accuracy]})
# Sort best-first and renumber the rows. The earlier set_index('Score') followed
# by an unassigned reset_index() left the stored frame indexed by score.
Model_Comparison_df = (
    Model_Comparison
    .sort_values(by='Score', ascending=False)
    .reset_index(drop=True)
)
Model_Comparison_df

# Predict Probability of Churn

In [None]:
# Predict the probability of Churn of each customer
# Column 1 of predict_proba is the probability of class 1 (Churn == 1).
# NOTE(review): this scores every row of churn_dataset, including the rows the
# random forest was trained on, so those probabilities are optimistic.
# The new column is stored on churn_dataset itself; the later feature cells
# drop 'Probability_of_Churn' explicitly for that reason.
churn_dataset['Probability_of_Churn'] = rfmodel.predict_proba(churn_dataset[X_test.columns])[:,1]

# Create a Dataframe showcasing probability of Churn of each customer
churn_dataset[['customer_name','Probability_of_Churn']].head()

In [None]:
churn_dataset[:50]

# RFECV

## Logistic Regression

In [None]:
# Rebuild target y and feature matrix X from the full dataset, leaving out the
# target, the customer identifier, last_purchase_in_days and the stored
# Probability_of_Churn column
non_feature_cols = ['Churn', 'customer_name', 'last_purchase_in_days', 'Probability_of_Churn']
y = churn_dataset['Churn']
X = churn_dataset.drop(columns=non_feature_cols)

In [None]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()

In [None]:
# RFECV is imported here because its first use precedes the notebook's later
# import cell; without this the cell fails on a fresh kernel.
from sklearn.feature_selection import RFECV

# Recursive feature elimination with 10-fold cross-validation, removing one
# feature per step and scoring each subset by F1.
# NOTE(review): selection is fitted on the full X, y before the train/test
# split below, so held-out rows influence which features are kept.
log_reg_rfecv = RFECV(estimator=log_reg, step=1, cv=10, 
                      scoring='f1')
log_reg_rfecv = log_reg_rfecv.fit(X, y)
print('Optimal number of features :', log_reg_rfecv.n_features_)
print('Best features :', X.columns[log_reg_rfecv.support_])

In [None]:
# Mean cross-validated score per number of selected features.
# Newer scikit-learn exposes this as cv_results_['mean_test_score'] and no
# longer has grid_scores_; fall back to grid_scores_ on older versions.
(log_reg_rfecv.cv_results_['mean_test_score']
 if hasattr(log_reg_rfecv, 'cv_results_')
 else log_reg_rfecv.grid_scores_)

In [None]:
pd.DataFrame(X.columns[log_reg_rfecv.support_])

In [None]:
# Mean CV score per feature-subset size; prefer cv_results_ (newer
# scikit-learn) and fall back to the legacy grid_scores_ attribute.
rfecv_scores = (log_reg_rfecv.cv_results_['mean_test_score']
                if hasattr(log_reg_rfecv, 'cv_results_')
                else log_reg_rfecv.grid_scores_)
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score of number of selected features")
plt.plot(range(1, len(rfecv_scores) + 1), rfecv_scores)
plt.show()

In [None]:
# Positions of the columns RFECV rejected: print them, then remove those
# columns from X in place
rejected_idx = np.flatnonzero(~log_reg_rfecv.support_)
print(rejected_idx)
X.drop(columns=X.columns[rejected_idx], inplace=True)

### Evaluate Model Performance

In [None]:
# 80/20 split of the reduced feature set. stratify=y keeps the churn ratio
# the same in both partitions of this imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.20, 
                                                    random_state=8,
                                                    stratify=y)

In [None]:
X_train.shape

In [None]:
log_reg_rfecv_model = log_reg_rfecv.fit(X_train, y_train)

### Confusion Matrix

In [None]:
from sklearn import metrics
y_pred = log_reg_rfecv_model.predict(X_test)
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
plt.title('Confusion Matrix');

### Precision, Recall, F-Score, Support

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

### ROC Curve

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# Score the positive class once and use it for both the AUC and the curve.
# AUC computed from hard predict() labels collapses the ranking to a single
# threshold and does not match the plotted curve.
churn_scores = log_reg_rfecv_model.predict_proba(X_test)[:,1]
logit_roc_auc = roc_auc_score(y_test, churn_scores)
fpr, tpr, thresholds = roc_curve(y_test, churn_scores)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC_RFECV')
plt.show()

## SVC

In [None]:
# Rebuild target y and the full feature matrix X for the SVC feature selection
non_feature_cols = ['Churn', 'customer_name', 'last_purchase_in_days', 'Probability_of_Churn']
y = churn_dataset['Churn']
X = churn_dataset.drop(columns=non_feature_cols)

In [None]:
from sklearn.svm import SVC
svc = SVC(kernel="linear")

In [None]:
# Imported here because these names are used before the notebook's later
# import cell; without this the cell fails on a fresh kernel.
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

# Recursive feature elimination with stratified 10-fold CV, scored by F1.
svc_rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(10), scoring='f1')
svc_rfecv.fit(X, y) 
print('Optimal number of features :', svc_rfecv.n_features_)
print('Best features :', X.columns[svc_rfecv.support_])

In [None]:
# Mean cross-validated F1 per number of selected features; prefer cv_results_
# (newer scikit-learn) and fall back to the legacy grid_scores_ attribute.
(svc_rfecv.cv_results_['mean_test_score']
 if hasattr(svc_rfecv, 'cv_results_')
 else svc_rfecv.grid_scores_)

In [None]:
# Mean CV score per feature-subset size; prefer cv_results_ (newer
# scikit-learn) and fall back to the legacy grid_scores_ attribute.
rfecv_scores = (svc_rfecv.cv_results_['mean_test_score']
                if hasattr(svc_rfecv, 'cv_results_')
                else svc_rfecv.grid_scores_)
plt.figure(figsize=(16, 9))
plt.title('SVC - RFECV', fontsize=18, fontweight='bold', pad=20)
plt.xlabel('Number of features selected', fontsize=14, labelpad=20)
# RFECV was run with scoring='f1', so the y axis is F1, not accuracy.
plt.ylabel('Cross-validated F1 score', fontsize=14, labelpad=20)
plt.plot(range(1, len(rfecv_scores) + 1), rfecv_scores, color='#303F9F', linewidth=3)
plt.show()

In [None]:
# Positions of the columns RFECV rejected (the less contributing variables):
# print them, then remove those columns from X in place
rejected_idx = np.flatnonzero(~svc_rfecv.support_)
print(rejected_idx)
X.drop(columns=X.columns[rejected_idx], inplace=True)

### Evaluate Model Performance

In [None]:
# 80/20 split of the reduced feature set. stratify=y keeps the churn ratio
# the same in both partitions of this imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.20, 
                                                    random_state=8,
                                                    stratify=y)

In [None]:
X_train.shape

In [None]:
svc_rfecv_model = svc_rfecv.fit(X_train, y_train)

### Confusion Matrix

In [None]:
from sklearn import metrics
y_pred = svc_rfecv_model.predict(X_test)
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
plt.title('Confusion Matrix');

### Precision, Recall, F-Score, Support

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

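No ROC curve is plotted for SVC: the model was created without `probability=True`, so `predict_proba` is unavailable. (A ROC curve could still be drawn from `decision_function` scores.)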

## Decision Tree

In [None]:
# Rebuild target y and the full feature matrix X for the decision-tree feature selection
non_feature_cols = ['Churn', 'customer_name', 'last_purchase_in_days', 'Probability_of_Churn']
y = churn_dataset['Churn']
X = churn_dataset.drop(columns=non_feature_cols)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Recursive feature elimination with 10-fold CV, scored by F1.
# random_state is fixed (same seed as the earlier dtmodel) so the selected
# feature set is reproducible between runs; an unseeded tree can break ties
# differently each time.
dt_rfecv = RFECV(DecisionTreeClassifier(random_state=50), cv=10, scoring='f1')
dt_rfecv = dt_rfecv.fit(X, y)
print('Optimal number of features :', dt_rfecv.n_features_)
print('Best features :', X.columns[dt_rfecv.support_])

In [None]:
# Mean cross-validated F1 per number of selected features; prefer cv_results_
# (newer scikit-learn) and fall back to the legacy grid_scores_ attribute.
(dt_rfecv.cv_results_['mean_test_score']
 if hasattr(dt_rfecv, 'cv_results_')
 else dt_rfecv.grid_scores_)

In [None]:
# Mean CV score per feature-subset size; prefer cv_results_ (newer
# scikit-learn) and fall back to the legacy grid_scores_ attribute.
rfecv_scores = (dt_rfecv.cv_results_['mean_test_score']
                if hasattr(dt_rfecv, 'cv_results_')
                else dt_rfecv.grid_scores_)
plt.figure(figsize=(16, 9))
plt.title('Decision Tree - RFECV', fontsize=18, fontweight='bold', pad=20)
plt.xlabel('Number of features selected', fontsize=14, labelpad=20)
# RFECV was run with scoring='f1', so the y axis is F1, not accuracy.
plt.ylabel('Cross-validated F1 score', fontsize=14, labelpad=20)
plt.plot(range(1, len(rfecv_scores) + 1), rfecv_scores, color='#303F9F', linewidth=3)
plt.show()

In [None]:
# Positions of the columns RFECV rejected: print them, then remove those
# columns from X in place
rejected_idx = np.flatnonzero(~dt_rfecv.support_)
print(rejected_idx)
X.drop(columns=X.columns[rejected_idx], inplace=True)

In [None]:
dt_rfecv.estimator_.feature_importances_

In [None]:
# Pair each retained feature with its importance in the fitted tree and
# order the table from most to least important
dset = pd.DataFrame({
    'attr': X.columns,
    'importance': dt_rfecv.estimator_.feature_importances_,
}).sort_values(by='importance', ascending=False)

# Horizontal bar chart of the importances
plt.figure(figsize=(16, 10))
plt.barh(y=dset['attr'], width=dset['importance'], color='#1976D2')
plt.title('Decision Tree - RFECV - Feature importances', fontsize=20, fontweight='bold', pad=20)
plt.xlabel('Importance', fontsize=14, labelpad=20)
plt.show()

### Evaluate Model Performance

In [None]:
# 80/20 split of the reduced feature set. stratify=y keeps the churn ratio
# the same in both partitions of this imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.20, 
                                                    random_state=8,
                                                    stratify=y)

In [None]:
X_train.shape

In [None]:
dt_rfecv_model = dt_rfecv.fit(X_train, y_train)

### Confusion Matrix

In [None]:
from sklearn import metrics
y_pred = dt_rfecv_model.predict(X_test)
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
plt.title('Confusion Matrix');

### Precision, Recall, F-Score and Support

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

### ROC Curve

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# Score the positive class once and use it for both the AUC and the curve.
# AUC computed from hard predict() labels does not match the plotted curve.
churn_scores = dt_rfecv_model.predict_proba(X_test)[:,1]
logit_roc_auc = roc_auc_score(y_test, churn_scores)
fpr, tpr, thresholds = roc_curve(y_test, churn_scores)
plt.figure()
plt.plot(fpr, tpr, label='Decision Tree Classifier (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC_DT_RFE')
plt.show()

## Gradient Boosting Classifier

In [None]:
# Rebuild target y and the full feature matrix X for the gradient-boosting feature selection
non_feature_cols = ['Churn', 'customer_name', 'last_purchase_in_days', 'Probability_of_Churn']
y = churn_dataset['Churn']
X = churn_dataset.drop(columns=non_feature_cols)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(random_state=101)

In [None]:
gb_rfecv = RFECV(estimator=gbc, step=1, cv=StratifiedKFold(10), scoring='f1')
gb_rfecv.fit(X, y) 
print('Optimal number of features :', gb_rfecv.n_features_)
print('Best features :', X.columns[gb_rfecv.support_])

In [None]:
# Mean cross-validated F1 per number of selected features; prefer cv_results_
# (newer scikit-learn) and fall back to the legacy grid_scores_ attribute.
(gb_rfecv.cv_results_['mean_test_score']
 if hasattr(gb_rfecv, 'cv_results_')
 else gb_rfecv.grid_scores_)

In [None]:
# Mean CV score per feature-subset size; prefer cv_results_ (newer
# scikit-learn) and fall back to the legacy grid_scores_ attribute.
rfecv_scores = (gb_rfecv.cv_results_['mean_test_score']
                if hasattr(gb_rfecv, 'cv_results_')
                else gb_rfecv.grid_scores_)
plt.figure(figsize=(16, 9))
plt.title('Gradient Boost - RFECV', fontsize=18, fontweight='bold', pad=20)
plt.xlabel('Number of features selected', fontsize=14, labelpad=20)
# RFECV was run with scoring='f1', so the y axis is F1, not accuracy.
plt.ylabel('Cross-validated F1 score', fontsize=14, labelpad=20)
plt.plot(range(1, len(rfecv_scores) + 1), rfecv_scores, color='#303F9F', linewidth=3)
plt.show()

In [None]:
# Positions of the columns RFECV rejected: print them, then remove those
# columns from X in place
rejected_idx = np.flatnonzero(~gb_rfecv.support_)
print(rejected_idx)
X.drop(columns=X.columns[rejected_idx], inplace=True)

In [None]:
gb_rfecv.estimator_.feature_importances_

### Evaluate Model Performance

In [None]:
# 80/20 split of the reduced feature set. stratify=y keeps the churn ratio
# the same in both partitions of this imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.20, 
                                                    random_state=8,
                                                    stratify=y)

In [None]:
X_train.shape

In [None]:
gb_rfecv_model = gb_rfecv.fit(X_train, y_train)

### Confusion Matrix

In [None]:
from sklearn import metrics
y_pred = gb_rfecv_model.predict(X_test)
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
plt.title('Confusion Matrix');

### Precision, Recall, F-Score and Support

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

### ROC Curve

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# Score the positive class once and use it for both the AUC and the curve.
# AUC computed from hard predict() labels does not match the plotted curve.
churn_scores = gb_rfecv_model.predict_proba(X_test)[:,1]
logit_roc_auc = roc_auc_score(y_test, churn_scores)
fpr, tpr, thresholds = roc_curve(y_test, churn_scores)
plt.figure()
# Legend text corrected from the misspelled 'Graident'.
plt.plot(fpr, tpr, label='Gradient Boosting Classifier (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC_GB_RFE')
plt.show()

## XGBoost

In [None]:
churn_dataset.columns

In [None]:
# Rebuild target y and the full feature matrix X for the XGBoost experiments
non_feature_cols = ['Churn', 'customer_name', 'last_purchase_in_days', 'Probability_of_Churn']
y = churn_dataset['Churn']
X = churn_dataset.drop(columns=non_feature_cols)

In [None]:
#Split the data into training set (80%) and test set (20%)
from sklearn.model_selection import train_test_split
# stratify=y keeps the churned/survived ratio the same in both partitions;
# the target is heavily imbalanced, so an unstratified split can skew it.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 77,
                                                    stratify = y)

In [None]:
from xgboost.sklearn import XGBClassifier
# Baseline XGBoost classifier. The inline remarks are the author's tuning
# notes (candidate ranges tried), not library documentation.
# NOTE(review): an earlier note marked learning_rate=0.1 as optimal, but 0.01
# is the value in effect here — confirm which is intended.
xgb = XGBClassifier(verbosity=2,
                    learning_rate = 0.01,
                      max_depth=5, # tuning note: start from 3 and increase; larger values did not improve results
                      min_child_weight=0, # tuning note: range 0 to inf
                      gamma=0, # tuning note: candidates 0, 1, 5
                      colsample_bytree = 0.8, # tuning note: 0.3 to 0.8 with many columns, 0.8 to 1 with few
                      subsample = 0.8, # tuning note: 0.8 to 1
                      scale_pos_weight = 1, 
                      reg_alpha = 1, # tuning note: candidates 1e-5, 1e-2, 0.1, 0.5, 1
                      reg_lambda= 0.1, 
                      objective='binary:logistic', 
                      n_estimators=1000 
                    ) 

In [None]:
from sklearn.model_selection import GridSearchCV

# Tuning step 1: coarse grid over tree depth and minimum child weight.
param_test1 = {
    'max_depth': range(3, 10, 2),        # 3, 5, 7, 9
    'min_child_weight': range(1, 6, 2),  # 1, 3, 5
}

# The estimator supplies the fixed hyper-parameters; the grid overrides
# max_depth / min_child_weight for each candidate. Candidates are ranked by
# 5-fold cross-validated F1.
# `iid` is not passed: it was removed from GridSearchCV in scikit-learn 0.24
# (passing it raises TypeError) and False has been the behaviour since 0.22.
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=5,
        min_child_weight=1, gamma=0, subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic', scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test1, scoring='f1', cv=5,
)
gsearch1.fit(X, y)
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

In [None]:
# Tuning step 2: finer grid for depth and child weight around max_depth=9.
param_test2 = {
    'max_depth': [8, 9, 10],
    'min_child_weight': [0, 1, 2],
}

# Candidates are ranked by 5-fold cross-validated F1.
# `iid` is not passed: it was removed from GridSearchCV in scikit-learn 0.24
# (passing it raises TypeError) and False has been the behaviour since 0.22.
gsearch2 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=9,
        min_child_weight=1, gamma=0, subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic', scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test2, scoring='f1', cv=5,
)
gsearch2.fit(X, y)
gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_

In [None]:
# Tuning step 2b: wider sweep of depth (1-8) and child weight (0-10).
param_test2b = {
    'max_depth': [1, 2, 4, 6, 8],
    'min_child_weight': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
}

# Candidates are ranked by 5-fold cross-validated F1.
# `iid` is not passed: it was removed from GridSearchCV in scikit-learn 0.24
# (passing it raises TypeError) and False has been the behaviour since 0.22.
gsearch2b = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=8,
        min_child_weight=0, gamma=0, subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic', scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test2b, scoring='f1', cv=5,
)
gsearch2b.fit(X, y)
gsearch2b.cv_results_, gsearch2b.best_params_, gsearch2b.best_score_

In [None]:
# Tuning step 3: gamma from 0.0 to 0.4 in steps of 0.1.
param_test3 = {
    'gamma': [i / 10.0 for i in range(0, 5)],
}

# Candidates are ranked by 5-fold cross-validated F1.
# `iid` is not passed: it was removed from GridSearchCV in scikit-learn 0.24
# (passing it raises TypeError) and False has been the behaviour since 0.22.
gsearch3 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=8,
        min_child_weight=0, gamma=0, subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic', scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test3, scoring='f1', cv=5,
)
gsearch3.fit(X, y)
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_

In [None]:
# Tuning step 3b: gamma from 0.0 to 0.9 in steps of 0.1.
param_test3b = {
    'gamma': [i / 10.0 for i in range(0, 10)],
}

# Candidates are ranked by 5-fold cross-validated F1.
# `iid` is not passed: it was removed from GridSearchCV in scikit-learn 0.24
# (passing it raises TypeError) and False has been the behaviour since 0.22.
gsearch3b = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=8,
        min_child_weight=0, gamma=0, subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic', scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test3b, scoring='f1', cv=5,
)
gsearch3b.fit(X, y)
gsearch3b.cv_results_, gsearch3b.best_params_, gsearch3b.best_score_

In [None]:
# Tuning step 3c: integer gamma values 0 through 9.
param_test3c = {
    'gamma': [i for i in range(0, 10)],
}

# Candidates are ranked by 5-fold cross-validated F1.
# `iid` is not passed: it was removed from GridSearchCV in scikit-learn 0.24
# (passing it raises TypeError) and False has been the behaviour since 0.22.
gsearch3c = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=8,
        min_child_weight=0, gamma=0, subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic', scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test3c, scoring='f1', cv=5,
)
gsearch3c.fit(X, y)
gsearch3c.cv_results_, gsearch3c.best_params_, gsearch3c.best_score_

In [None]:
# Tuning step 3d: very coarse gamma sweep, 0 to 90 in steps of 10.
param_test3d = {
    'gamma': [i for i in range(0, 100, 10)],
}

# NOTE(review): this search uses max_depth=14, whereas the other gamma searches
# in this notebook use max_depth=8 - confirm this is intentional.
# Candidates are ranked by 5-fold cross-validated F1.
# `iid` is not passed: it was removed from GridSearchCV in scikit-learn 0.24
# (passing it raises TypeError) and False has been the behaviour since 0.22.
gsearch3d = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=14,
        min_child_weight=0, gamma=0, subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic', scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test3d, scoring='f1', cv=5,
)
gsearch3d.fit(X, y)
gsearch3d.cv_results_, gsearch3d.best_params_, gsearch3d.best_score_

In [None]:
# Tuning step 4: row and column subsampling, each from 0.6 to 0.9.
param_test4 = {
    'subsample': [i / 10.0 for i in range(6, 10)],         # 0.6, 0.7, 0.8, 0.9
    'colsample_bytree': [i / 10.0 for i in range(6, 10)],  # 0.6, 0.7, 0.8, 0.9
}

# Candidates are ranked by 5-fold cross-validated F1.
# `iid` is not passed: it was removed from GridSearchCV in scikit-learn 0.24
# (passing it raises TypeError) and False has been the behaviour since 0.22.
gsearch4 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=8,
        min_child_weight=0, gamma=0, subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic', scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test4, scoring='f1', cv=5,
)
gsearch4.fit(X, y)
gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_

In [None]:
# Tuning step 4b: subsampling rates from 0.50 to 0.70 in steps of 0.05.
param_test4b = {
    'subsample': [i / 100.0 for i in range(50, 75, 5)],
    'colsample_bytree': [i / 100.0 for i in range(50, 75, 5)],
}

# Candidates are ranked by 5-fold cross-validated F1.
# `iid` is not passed: it was removed from GridSearchCV in scikit-learn 0.24
# (passing it raises TypeError) and False has been the behaviour since 0.22.
gsearch4b = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=8,
        min_child_weight=0, gamma=0, subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic', scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test4b, scoring='f1', cv=5,
)
gsearch4b.fit(X, y)
gsearch4b.cv_results_, gsearch4b.best_params_, gsearch4b.best_score_

In [None]:
# Tuning step 4c: subsample 0.40-0.95 and colsample_bytree 0.40-0.75,
# both in steps of 0.05.
param_test4c = {
    'subsample': [i / 100.0 for i in range(40, 100, 5)],
    'colsample_bytree': [i / 100.0 for i in range(40, 80, 5)],
}

# Candidates are ranked by 5-fold cross-validated F1.
# `iid` is not passed: it was removed from GridSearchCV in scikit-learn 0.24
# (passing it raises TypeError) and False has been the behaviour since 0.22.
gsearch4c = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=8,
        min_child_weight=0, gamma=0, subsample=0.8,
        colsample_bytree=0.5,
        objective='binary:logistic', scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test4c, scoring='f1', cv=5,
)
gsearch4c.fit(X, y)
gsearch4c.cv_results_, gsearch4c.best_params_, gsearch4c.best_score_

In [None]:
# Tuning step 4d: colsample_bytree from 0.00 to 0.95 in steps of 0.05, with
# subsample fixed at 0.65.
# NOTE(review): the grid includes 0.0, which is outside the (0, 1] range given
# in the XGBoost parameter docs - confirm that candidate is meaningful.
param_test4d = {
    'colsample_bytree': [i / 100.0 for i in range(0, 100, 5)],
}

# Candidates are ranked by 5-fold cross-validated F1.
# `iid` is not passed: it was removed from GridSearchCV in scikit-learn 0.24
# (passing it raises TypeError) and False has been the behaviour since 0.22.
gsearch4d = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=8,
        min_child_weight=0, gamma=0, subsample=0.65,
        colsample_bytree=0.4,
        objective='binary:logistic', scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test4d, scoring='f1', cv=5,
)
gsearch4d.fit(X, y)
gsearch4d.cv_results_, gsearch4d.best_params_, gsearch4d.best_score_

In [None]:
# Tuning step 5: L1 regularisation strength (reg_alpha) over several orders
# of magnitude.
param_test5 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
}

# NOTE(review): colsample_bytree=0 is outside the (0, 1] range given in the
# XGBoost parameter docs - confirm the intended value.
# Candidates are ranked by 5-fold cross-validated F1.
# `iid` is not passed: it was removed from GridSearchCV in scikit-learn 0.24
# (passing it raises TypeError) and False has been the behaviour since 0.22.
gsearch5 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=8,
        min_child_weight=0, gamma=0, subsample=0.65,
        colsample_bytree=0,
        objective='binary:logistic', scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test5, scoring='f1', cv=5,
)
gsearch5.fit(X, y)
gsearch5.cv_results_, gsearch5.best_params_, gsearch5.best_score_

In [None]:
# Tuning step 5b: denser reg_alpha grid. The original list contained both
# 0.05 and 5e-2 (the same value), so the duplicate candidate is dropped to
# avoid fitting it twice.
param_test5b = {
    'reg_alpha': [1e-5, 1e-4, 1e-3, 1e-2, 10, 0.05, 5e-3],
}

# NOTE(review): colsample_bytree=0 is outside the (0, 1] range given in the
# XGBoost parameter docs - confirm the intended value.
# Candidates are ranked by 5-fold cross-validated F1.
# `iid` is not passed: it was removed from GridSearchCV in scikit-learn 0.24
# (passing it raises TypeError) and False has been the behaviour since 0.22.
gsearch5b = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=8,
        min_child_weight=0, gamma=0, subsample=0.65,
        colsample_bytree=0,
        objective='binary:logistic', scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test5b, scoring='f1', cv=5,
)
gsearch5b.fit(X, y)
gsearch5b.cv_results_, gsearch5b.best_params_, gsearch5b.best_score_

In [None]:
# Tuning step 5c: very small reg_alpha values, 1e-5 down to 1e-10.
param_test5c = {
    'reg_alpha': [1e-5, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10],
}

# NOTE(review): colsample_bytree=0 is outside the (0, 1] range given in the
# XGBoost parameter docs - confirm the intended value.
# Candidates are ranked by 5-fold cross-validated F1.
# `iid` is not passed: it was removed from GridSearchCV in scikit-learn 0.24
# (passing it raises TypeError) and False has been the behaviour since 0.22.
gsearch5c = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=8,
        min_child_weight=0, gamma=0, subsample=0.65,
        colsample_bytree=0,
        objective='binary:logistic', scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test5c, scoring='f1', cv=5,
)
gsearch5c.fit(X, y)
gsearch5c.cv_results_, gsearch5c.best_params_, gsearch5c.best_score_

In [None]:
# Tuning step 6: L2 regularisation strength (reg_lambda), with reg_alpha
# fixed at 1e-5.
param_test6 = {
    'reg_lambda': [1e-5, 0.001, 0.1, 0, 0.3, 0.8],
}

# NOTE(review): colsample_bytree=0 is outside the (0, 1] range given in the
# XGBoost parameter docs - confirm the intended value.
# Candidates are ranked by 5-fold cross-validated F1.
# `iid` is not passed: it was removed from GridSearchCV in scikit-learn 0.24
# (passing it raises TypeError) and False has been the behaviour since 0.22.
gsearch6 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=8,
        min_child_weight=0, gamma=0, subsample=0.65,
        colsample_bytree=0, reg_alpha=1e-5,
        objective='binary:logistic', scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test6, scoring='f1', cv=5,
)
gsearch6.fit(X, y)
gsearch6.cv_results_, gsearch6.best_params_, gsearch6.best_score_

Tuning `reg_lambda` performed worse, so the final model leaves it at its default value.

In [None]:
# Tuning step 7: learning rate, from 1e-2 down to 1e-7 plus 0.
# NOTE(review): the grid includes learning_rate=0 and no value above 0.01 -
# confirm these candidates are the intended ones.
param_test7 = {
    'learning_rate': [0, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7],
}

# NOTE(review): colsample_bytree=0 is outside the (0, 1] range given in the
# XGBoost parameter docs - confirm the intended value.
# Candidates are ranked by 5-fold cross-validated F1.
# `iid` is not passed: it was removed from GridSearchCV in scikit-learn 0.24
# (passing it raises TypeError) and False has been the behaviour since 0.22.
gsearch7 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=8,
        min_child_weight=0, gamma=0, subsample=0.65,
        colsample_bytree=0, reg_alpha=1e-5,
        objective='binary:logistic', scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test7, scoring='f1', cv=5,
)
gsearch7.fit(X, y)
gsearch7.cv_results_, gsearch7.best_params_, gsearch7.best_score_

### Final XGB Model

In [None]:
# Final XGBoost configuration; the values match the base estimator used in
# the last tuning searches.
# NOTE(review): colsample_bytree=0 is outside the (0, 1] range given in the
# XGBoost parameter docs - confirm the intended value.
final_xgb_params = dict(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=8,
    min_child_weight=0,
    gamma=0,
    subsample=0.65,
    colsample_bytree=0,
    reg_alpha=1e-5,
    objective='binary:logistic',
    scale_pos_weight=1,
    seed=27,
)
xgb = XGBClassifier(**final_xgb_params)

### Feature Importance

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

# Recursive feature elimination with cross-validation around the XGBoost
# model: one feature is removed per round and each subset is scored by
# 10-fold stratified cross-validated F1, using all available cores.
xgb_rfecv = RFECV(
    estimator=xgb,
    step=1,  # number of features removed at each iteration
    cv=StratifiedKFold(10),
    scoring='f1',
    n_jobs=-1,
)
xgb_rfecv.fit(X, y)

xgb_selected_features = X.columns[xgb_rfecv.support_]
print('Optimal number of features :', xgb_rfecv.n_features_)
print('Best features :', xgb_selected_features)

In [None]:
# Cross-validation score recorded for each candidate number of features.
# NOTE(review): grid_scores_ is deprecated in scikit-learn 1.0 and removed in
# 1.2 (replaced by cv_results_) - confirm the installed version.
xgb_rfecv.grid_scores_

In [None]:
# Mean cross-validation score per number of selected features. Newer
# scikit-learn exposes it as cv_results_['mean_test_score'] (grid_scores_ was
# removed in 1.2); older releases only have grid_scores_.
if hasattr(xgb_rfecv, 'cv_results_'):
    xgb_rfecv_scores = xgb_rfecv.cv_results_['mean_test_score']
else:
    xgb_rfecv_scores = xgb_rfecv.grid_scores_

plt.figure(figsize=(16, 9))
plt.title('XG Boost - RFECV', fontsize=18, fontweight='bold', pad=20)
plt.xlabel('Number of features selected', fontsize=14, labelpad=20)
# The selector was built with scoring='f1', so the axis shows F1, not accuracy.
plt.ylabel('Cross-validated F1 score', fontsize=14, labelpad=20)
plt.plot(range(1, len(xgb_rfecv_scores) + 1), xgb_rfecv_scores, color='#303F9F', linewidth=3)
plt.show()

In [None]:
# Positions of the features RFECV rejected (support_ is False there); print
# them, then remove those columns from X in place.
# Caution: the cell mutates X, so it is only valid to run once per RFECV fit.
xgb_rejected_idx = np.flatnonzero(~xgb_rfecv.support_)
print(xgb_rejected_idx)
X.drop(columns=X.columns[xgb_rejected_idx], inplace=True)

In [None]:
# Importances from the estimator RFECV refit on the selected features only.
xgb_rfecv.estimator_.feature_importances_

In [None]:
# Pair each remaining feature name with its importance from the refit
# estimator and order the rows from most to least important.
dset = (
    pd.DataFrame({
        'attr': X.columns,
        'importance': xgb_rfecv.estimator_.feature_importances_,
    })
    .sort_values(by='importance', ascending=False)
)

# Horizontal bar chart of the importances.
plt.figure(figsize=(16, 10))
plt.barh(y=dset['attr'], width=dset['importance'], color='#1976D2')
plt.xlabel('Importance', fontsize=14, labelpad=20)
plt.title('XG Boost - RFECV - Feature importances', fontsize=20, fontweight='bold', pad=20)
plt.show()

### Evaluate Model Performance

In [None]:
# Re-split the reduced feature matrix 80/20 for evaluation; random_state
# fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.20, 
                                                    random_state=8)

In [None]:
# (rows, columns) of the training split.
X_train.shape

In [None]:
# (rows, columns) of the test split.
X_test.shape

Fit model on only 6 features

In [None]:
# Refit the RFECV selector on the training split only. fit() returns the
# selector itself, so xgb_rfecv_model and xgb_rfecv are the same object.
# NOTE(review): this reruns feature elimination on the already-reduced X, so
# the final feature count may differ from the earlier fit - confirm intended.
xgb_rfecv_model = xgb_rfecv.fit(X_train, y_train)

### Confusion Matrix

In [None]:
# Predict the held-out test set and tabulate true labels (rows) against
# predicted labels (columns).
from sklearn import metrics
y_pred = xgb_rfecv_model.predict(X_test)
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Draw the confusion matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Blues_r', ax=ax)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
ax.set_title('Confusion Matrix');

### Precision, Recall, F-Score, Support

In [None]:
from sklearn.metrics import classification_report

# Store the report text under its own name. Assigning it back to
# `classification_report` would shadow the imported function, so the next call
# (for example when this cell is re-run) would fail with
# "'str' object is not callable".
xgb_report_text = classification_report(y_test, y_pred)
print(xgb_report_text)

### ROC Curve

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Score the test set once with column 1 of predict_proba and reuse it for both
# the AUC and the curve. Computing the AUC from hard predict() labels collapses
# the ranking to a single threshold, so the "area" in the legend would not
# describe the curve that is actually drawn.
xgb_churn_scores = xgb_rfecv_model.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, xgb_churn_scores)
fpr, tpr, thresholds = roc_curve(y_test, xgb_churn_scores)

plt.figure()
plt.plot(fpr, tpr, label='XGBoost (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC_XGB_RFE')  # must run before show()
plt.show()

## Random Forest Classifier

In [None]:
# Rebuild the full feature matrix for the Random Forest section: target y is
# the Churn column; X is every other column except the ones listed here.
# NOTE(review): presumably the excluded columns are identifiers or leak the
# label - confirm against the data dictionary.
rf_excluded_columns = ['Churn', 'customer_name', 'last_purchase_in_days', 'Probability_of_Churn']
y = churn_dataset['Churn']
X = churn_dataset.drop(columns=rf_excluded_columns)

In [None]:
# Imported here so the cell does not depend on an import from another cell.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

# Recursive feature elimination with 10-fold cross-validated F1 around a
# random forest. The forest is seeded so the selected feature set is the same
# on every run; unseeded, it can change between runs.
rf_rfecv = RFECV(RandomForestClassifier(random_state=27), cv=10, scoring='f1')
rf_rfecv = rf_rfecv.fit(X, y)
print('Optimal number of features :', rf_rfecv.n_features_)
print('Best features :', X.columns[rf_rfecv.support_])

In [None]:
# Cross-validation score recorded for each candidate number of features.
# NOTE(review): grid_scores_ is deprecated in scikit-learn 1.0 and removed in
# 1.2 (replaced by cv_results_) - confirm the installed version.
rf_rfecv.grid_scores_

In [None]:
# Mean cross-validation score per number of selected features. Newer
# scikit-learn exposes it as cv_results_['mean_test_score'] (grid_scores_ was
# removed in 1.2); older releases only have grid_scores_.
if hasattr(rf_rfecv, 'cv_results_'):
    rf_rfecv_scores = rf_rfecv.cv_results_['mean_test_score']
else:
    rf_rfecv_scores = rf_rfecv.grid_scores_

plt.figure(figsize=(16, 9))
# The selector wraps a random forest, so the title names it (it previously
# said "Decision Tree").
plt.title('Random Forest - RFECV', fontsize=18, fontweight='bold', pad=20)
plt.xlabel('Number of features selected', fontsize=14, labelpad=20)
# The selector was built with scoring='f1', so the axis shows F1, not accuracy.
plt.ylabel('Cross-validated F1 score', fontsize=14, labelpad=20)
plt.plot(range(1, len(rf_rfecv_scores) + 1), rf_rfecv_scores, color='#303F9F', linewidth=3)
plt.show()

In [None]:
# Positions of the features RFECV rejected (support_ is False there); print
# them, then remove those columns from X in place.
# Caution: the cell mutates X, so it is only valid to run once per RFECV fit.
rf_rejected_idx = np.flatnonzero(~rf_rfecv.support_)
print(rf_rejected_idx)
X.drop(columns=X.columns[rf_rejected_idx], inplace=True)

In [None]:
# Importances from the estimator RFECV refit on the selected features only.
rf_rfecv.estimator_.feature_importances_

### Evaluate Model Performance

In [None]:
# Re-split the reduced feature matrix 80/20 for evaluation; random_state
# fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.20, 
                                                    random_state=8)

In [None]:
# (rows, columns) of the training split.
X_train.shape

In [None]:
# Refit the RFECV selector on the training split only. fit() returns the
# selector itself, so rf_rfecv_model and rf_rfecv are the same object.
# NOTE(review): this reruns feature elimination on the already-reduced X, so
# the final feature count may differ from the earlier fit - confirm intended.
rf_rfecv_model = rf_rfecv.fit(X_train, y_train)

### Confusion Matrix

In [None]:
# Predict the held-out test set and tabulate true labels (rows) against
# predicted labels (columns).
from sklearn import metrics
y_pred = rf_rfecv_model.predict(X_test)
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Draw the confusion matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Blues_r', ax=ax)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
ax.set_title('Confusion Matrix');

### Precision, Recall, F-Score and Support

In [None]:
# Print per-class precision, recall, F1-score and support for the current
# test-set predictions (y_pred) against the true labels (y_test).
# The import is repeated here; it rebinds the name to the sklearn function.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

### ROC Curve

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Score the test set once with column 1 of predict_proba and reuse it for both
# the AUC and the curve. Computing the AUC from hard predict() labels collapses
# the ranking to a single threshold, so the "area" in the legend would not
# describe the curve that is actually drawn.
rf_churn_scores = rf_rfecv_model.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, rf_churn_scores)
fpr, tpr, thresholds = roc_curve(y_test, rf_churn_scores)

plt.figure()
plt.plot(fpr, tpr, label='Random Forest Classifier (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC_RF_RFE')  # must run before show()
plt.show()