In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import os

# Location of the churn dataset. The original absolute path stays as the default;
# set the CHURN_CSV environment variable to run the notebook on another machine.
DATA_PATH = os.environ.get('CHURN_CSV', 'e:/data/churn.csv')
df = pd.read_csv(DATA_PATH)

In [None]:
# Assuming we already have done detailed analysis on this data

In [None]:
# Class distribution of the target variable

In [None]:
# Count of rows per value of the 'Exited' column.
df.loc[:, 'Exited'].value_counts()

In [None]:
# Candidate feature columns, selected by position (0-based columns 2 through 11).
# NOTE(review): presumably this skips identifier columns and stops before the
# target -- confirm against the CSV column order, since a positional slice
# silently changes meaning if the file layout changes.
x = df.iloc[:,2:12]

In [None]:
# Target vector: the 'Exited' column.
y = df.loc[:, 'Exited']

In [None]:
# One-hot encode the two categorical columns; drop_first=True drops the first
# level of each so the remaining indicator columns are not redundant.
dummies = pd.get_dummies(
    df,
    columns=['Geography', 'Gender'],
    drop_first=True,
)

In [None]:
# Preview the first two encoded rows.
dummies.head(2)

In [None]:
# List the column labels of the encoded frame (used to pick the indicator columns).
dummies.columns

In [None]:
# Build the final feature matrix in one idempotent step.
# The original appended the indicator columns to `x` and then dropped the raw
# categorical columns in place, so re-running either cell duplicated columns or
# raised KeyError. Rebuilding from `df` gives the same columns in the same order
# no matter how many times the cell is executed.
encoded_cols = ['Geography_Germany', 'Geography_Spain', 'Gender_Male']
x = pd.concat(
    [
        df.iloc[:, 2:12].drop(columns=['Geography', 'Gender']),
        dummies[encoded_cols],
    ],
    axis=1,
)

In [None]:
# Sanity check: (rows, columns) of the feature matrix.
x.shape

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Class counts in the training labels.
ytrain.value_counts()

In [None]:
# Share of class 0 in the training labels, computed from the data instead of
# typing in counts copied from an earlier output (those go stale when the data
# or the split changes).
float((ytrain == 0).mean())

In [None]:
# Share of class 0 in the test labels.
# The original `1654/2000` mixed two splits: 1654 is the number of class-1 rows
# in the training labels (8000 - 6346), divided by the test-set size.
float((ytest == 0).mean())

In [None]:
# Generalised Linear models

In [None]:
# Logistic regression --> logit link (log-odds; its inverse is the sigmoid function) --> odds

In [None]:
# Odds of an event with probability p: odds = p / (1 - p).
# `p` was never defined before this cell, so the bare expression raised
# NameError on a fresh kernel; an illustrative probability is used instead.
p = 0.8
odds = p / (1 - p)
odds

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression estimator using the liblinear solver with a generous
# iteration cap.
glm = LogisticRegression(
    solver="liblinear",
    max_iter=10000,
)

In [None]:
# Fit on the training split; fit() returns the estimator itself, so `lrmodel`
# and `glm` refer to the same fitted object.
lrmodel = glm.fit(X=xtrain, y=ytrain)

In [None]:
from sklearn.metrics import roc_curve, auc

In [None]:
# ---- ROC curve for the logistic-regression model ----
# Scores are column 1 of predict_proba (the second class in lrmodel.classes_).
y_prob = lrmodel.predict_proba(xtest)[:, 1]
fpr, tpr, thresholds = roc_curve(ytest, y_prob)
roc_auc = auc(fpr, tpr)
print("AUC Score:", roc_auc)

# Draw on an explicit Axes rather than through the pyplot state machine.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC Curve (AUC = {roc_auc:.4f})')
# Diagonal reference line: the expected curve of a random classifier.
ax.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--', label='Random Guess')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend(loc="lower right")
ax.grid(True)
plt.show()

In [None]:
# Hard class predictions of the logistic-regression model for the test split.
lrpred = lrmodel.predict(X=xtest)

In [None]:
# Confusion table: rows are the true labels, columns the predicted labels.
pd.crosstab(index=ytest, columns=lrpred)

In [None]:
# Accuracy of the logistic-regression predictions on the test split, computed
# from the predictions instead of counts typed in from the crosstab output.
float((ytest == lrpred).mean())

In [None]:
# Precision of the logistic-regression predictions: tp / (tp + fp).
# Computed from the predictions rather than hard-coded counts.
# Assumes the target is coded with 1 as the positive class -- TODO confirm.
lr_tp = int(((ytest == 1) & (lrpred == 1)).sum())
lr_fp = int(((ytest != 1) & (lrpred == 1)).sum())
# Guard against a model that predicts no positives at all.
lr_tp / (lr_tp + lr_fp) if (lr_tp + lr_fp) else 0.0

In [None]:
# Recall of the logistic-regression predictions: tp / (tp + fn).
# Computed from the predictions rather than hard-coded counts.
# Assumes the target is coded with 1 as the positive class -- TODO confirm.
lr_tp = int(((ytest == 1) & (lrpred == 1)).sum())
lr_fn = int(((ytest == 1) & (lrpred != 1)).sum())
# Guard against a test split that contains no positives.
lr_tp / (lr_tp + lr_fn) if (lr_tp + lr_fn) else 0.0

In [None]:
# Predicted class for the first test row (kept as a one-row frame).
lrmodel.predict(xtest.iloc[:1])

In [None]:
# Per-class probabilities for the same first test row.
lrmodel.predict_proba(xtest.iloc[:1])

In [None]:
# True label of the first test row, for comparison with the prediction.
ytest.iloc[:1]

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [None]:
# Small, pre-pruned decision tree; the size limits keep it shallow and the
# seed fixes tie-breaking between equally good splits.
dtree = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=5,
    max_leaf_nodes=8,
    random_state=8,
)

In [None]:
# Fit the tree; fit() returns the estimator, so `dtreemodel` and `dtree` are
# the same fitted object.
dtreemodel = dtree.fit(X=xtrain, y=ytrain)

In [None]:
# Graphviz
# use dtreeviz, plotly, dtreeplt for advanced visualisation

In [None]:
from dtreeplt import dtreeplt

In [None]:
# Feature names, in the column order of the feature matrix.
x.columns

In [None]:
# Visualise the fitted tree.
plt.figure(figsize=(20, 10))
plot_tree(
    dtreemodel,
    filled=True,
    # Pass a plain list: some scikit-learn releases reject a pandas Index here.
    feature_names=list(x.columns),
    # class_names must follow the classes in ascending order (0, then 1).
    # The original ['pos','neg'] labelled class 0 as 'pos', the opposite of the
    # precision/recall cells, which treat class 1 as the positive class.
    class_names=['neg','pos'],
    rounded=True,
    fontsize=10
)
plt.show()

In [None]:
# Hard class predictions of the decision tree for the test split.
dtree_pred = dtreemodel.predict(X=xtest)

In [None]:
# Confusion table: rows are the true labels, columns the tree's predictions.
pd.crosstab(index=ytest, columns=dtree_pred)

In [None]:
# NOTE(review): hand-typed counts; it is not evident from the notebook which
# confusion matrix they were copied from -- confirm, or replace with a value
# computed from the predictions.
92/(92+24)

In [None]:
# Precision --> tp/(tp+fp)
# Computed from dtree_pred instead of counts typed in from the crosstab output.
# Assumes the target is coded with 1 as the positive class -- TODO confirm.
dt_tp = int(((ytest == 1) & (dtree_pred == 1)).sum())
dt_fp = int(((ytest != 1) & (dtree_pred == 1)).sum())
# Guard against a model that predicts no positives at all.
dt_tp / (dt_tp + dt_fp) if (dt_tp + dt_fp) else 0.0

In [None]:
# Recall --> tp/(tp+fn)
# Computed from dtree_pred instead of counts typed in from the crosstab output.
# Assumes the target is coded with 1 as the positive class -- TODO confirm.
dt_tp = int(((ytest == 1) & (dtree_pred == 1)).sum())
dt_fn = int(((ytest == 1) & (dtree_pred != 1)).sum())
# Guard against a test split that contains no positives.
dt_tp / (dt_tp + dt_fn) if (dt_tp + dt_fn) else 0.0

In [None]:
# Accuracy
# Fraction of test rows where the tree's prediction equals the true label,
# computed from dtree_pred instead of hard-coded counts.
float((ytest == dtree_pred).mean())

In [None]:
# Pruning

In [None]:
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

In [None]:
from sklearn import tree

In [None]:
# Hyper-parameter grid explored by the cross-validated search.
params = {'max_depth': [4,6,8,10],
         'min_samples_split': [2,5,8],
         'min_samples_leaf': [4,10]}

# Search over a fresh tree. The original built `clf` but then passed the
# already-fitted `dtreemodel`, whose max_leaf_nodes=8 is carried into every
# candidate and caps tree size regardless of the max_depth values in the grid.
# DecisionTreeClassifier is referenced directly (not via `tree.`) so the cell
# still works if the name `tree` is rebound elsewhere in the notebook.
clf = DecisionTreeClassifier(random_state=8)
gcv = GridSearchCV(estimator=clf,param_grid=params)
gcv.fit(xtrain,ytrain)

In [None]:
# cv_results_ is a dict of arrays; dumping it raw is unreadable. Show it as a
# table with the best-ranked parameter combinations first.
pd.DataFrame(gcv.cv_results_).sort_values('rank_test_score').head(10)

In [None]:
# Parameter combination with the best mean cross-validated score.
gcv.best_params_

In [None]:
# Mean cross-validated score achieved by the best parameter combination.
gcv.best_score_

In [None]:
# Cost-complexity pruning path computed on the training split, using the
# settings of `dtree`.
path = dtree.cost_complexity_pruning_path(X=xtrain, y=ytrain)

In [None]:
# Effective alphas along the pruning path (the returned Bunch also exposes its
# keys as attributes).
alpha = path.ccp_alphas

In [None]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, classification_report

In [None]:
# The original `ytrain.pred` raised AttributeError (a Series has no `pred`
# attribute) and stopped Restart & Run All; preview the training labels instead.
ytrain.head()

In [None]:
# Inspect the candidate ccp_alpha values taken from the pruning path.
alpha

In [None]:
# Sweep the candidate ccp_alpha values and record train/test accuracy and
# precision for each pruned tree (one entry per alpha in every list).
#
# Fixes relative to the original loop:
#   * precision was appended to the accuracy lists, so acc_train / acc_test
#     alternated between two different metrics; precision now has its own lists;
#   * the loop variable was called `tree`, shadowing the `sklearn.tree` module
#     imported earlier in the notebook;
#   * a fixed random_state makes the sweep repeatable.
acc_train, acc_test = [], []
prec_train, prec_test = [], []

for ccp in alpha:
    pruned_tree = DecisionTreeClassifier(ccp_alpha=ccp, random_state=8)
    pruned_tree.fit(xtrain, ytrain)
    ytrain_pred = pruned_tree.predict(xtrain)
    ytest_pred = pruned_tree.predict(xtest)

    acc_train.append(accuracy_score(ytrain, ytrain_pred))
    acc_test.append(accuracy_score(ytest, ytest_pred))
    # zero_division=0 keeps the 0.0 result but silences the warning emitted when
    # a heavily pruned tree predicts no positives at all.
    prec_train.append(precision_score(ytrain, ytrain_pred, zero_division=0))
    prec_test.append(precision_score(ytest, ytest_pred, zero_division=0))

In [None]:
# NOTE(review): hand-typed ratio 92 / 383; presumably a recall figure
# (tp / (tp + fn)) copied from a confusion matrix that is not shown here --
# confirm, or replace with a value computed from the predictions.
(92)/(92+291)

In [None]:
# Training-split scores accumulated by the ccp_alpha loop.
acc_train

In [None]:
# Test-split scores accumulated by the ccp_alpha loop.
acc_test

In [None]:
# Tree pruned with a fixed cost-complexity alpha.
# NOTE(review): the value was presumably picked from the `alpha` array -- confirm.
m1 = DecisionTreeClassifier(
    ccp_alpha=0.00522287,
)
m1.fit(X=xtrain, y=ytrain)

In [None]:
# Test-split predictions of the pruned tree.
p = m1.predict(X=xtest)

In [None]:
# Confusion table for the pruned tree: true labels by row, predictions by column.
pd.crosstab(index=ytest, columns=p)

In [None]:
# Text summary of the pruned tree's per-class metrics on the test split.
print(classification_report(y_true=ytest, y_pred=p))

In [None]:
# Precision of the pruned tree on the test split.
precision_score(y_true=ytest, y_pred=p)