# Churn Prediction
Churn Prediction is one of the most popular big data use cases in the business world, and it helps identify customers who are most likely to cancel a subscription to a service.

(If Churn = 1, customer canceled subscription. If Churn = 0, customer continues subscription.)

# Data Used

For this analysis, the open-source Cell2Cell dataset, prepared by the Teradata Center for Customer Relationship Management at Duke University, was obtained from Kaggle.

Our data consists of 51,047 samples / rows and 58 features / columns.

# Purpose of the study

The goal is to analyze customer loss (churn) and to predict it with classification algorithms.

In [None]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
train=pd.read_csv('../input/datasets-for-churn-telecom/cell2celltrain.csv')
test=pd.read_csv('../input/datasets-for-churn-telecom/cell2cellholdout.csv')

In [None]:
test.head()

In [None]:
train.describe()

In [None]:
# Print the frequency table of every object-dtype (text) column in the training data.
for col_name in train.columns:
    if train[col_name].dtype != 'object':
        continue
    frequency_table = pd.DataFrame(train[col_name].value_counts())
    print(frequency_table)

# Data Analysis


In [None]:
train.drop('CustomerID',axis=1, inplace=True)

In [None]:
# Class balance: count churned ('Yes') and retained ('No') customers and draw them as a pie chart.
churn = len(train.loc[train['Churn'].eq('Yes')])
non_churn = len(train.loc[train['Churn'].eq('No')])

labels = ['Churn', 'Non Churn']
slices = [churn, non_churn]
explode = [0.1, 0]  # pull the first ("Churn") wedge away from the centre

fig1, ax1 = plt.subplots(figsize=(6, 5))
ax1.pie(
    slices,
    explode=explode,
    labels=labels,
    autopct='%1.1f%%',
    shadow=True,
    startangle=270,
)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()


# Customer churn is 28.8%. The data is unbalanced.

In [None]:
fig = plt.subplots(figsize=(8, 5))
plt.hist(train[train['Churn']=='No']['MonthlyRevenue'],bins=[0,50,100,150,200,250,300,350],alpha=0.5, label='Non Churn customer')
plt.hist(train[train['Churn']=='Yes']['MonthlyRevenue'],bins=[0,50,100,150,200,250,300,350],alpha=0.5,label='Churn customer')
plt.ylabel("Müşteri Sayısı")
plt.xlabel("MonthlyRevenue")
plt.legend()
plt.tight_layout()
plt.title('Aylık Gelire Göre Dağılım')
plt.show()


In [None]:
# Number of customers per MonthsInService value, separately for churned and retained customers.
tenure_churn = train.MonthsInService[train.Churn == 'Yes'].value_counts()
tenure_no_churn = train.MonthsInService[train.Churn == 'No'].value_counts()

# The column names produced by value_counts().reset_index() differ between pandas
# versions, so name them explicitly: 'index' holds the MonthsInService value and
# 'MonthsInService' holds the customer count. This keeps the merge key and the
# resulting MonthsInService_x / MonthsInService_y columns stable.
churn_counts = tenure_churn.reset_index()
churn_counts.columns = ['index', 'MonthsInService']
no_churn_counts = tenure_no_churn.reset_index()
no_churn_counts.columns = ['index', 'MonthsInService']

# Inner merge: only tenures present in both groups are kept.
# MonthsInService_x = churned count, MonthsInService_y = retained count.
tenure = pd.merge(churn_counts, no_churn_counts, on='index')
tenure = tenure.sort_values(by='index')
# A column named 'index' already exists, so reset_index() stores the old row labels in 'level_0'.
tenure = tenure.reset_index().drop(columns='level_0')
tenure.columns

In [None]:
# Customers per tenure (months in service), churned vs. retained.
plt.figure(figsize=(10,10))
plt.grid(True)
# Use the 'index' column (the MonthsInService value) for the x axis. `tenure.index` is the
# DataFrame's row position (0..n-1), which does not match the "MonthsInService" axis label.
sns.pointplot(x=tenure['index'], y=tenure.MonthsInService_x, color='red',label='Churn customer')
sns.pointplot(x=tenure['index'], y=tenure.MonthsInService_y, color='blue',label='Non Churn customer')
plt.xticks(rotation=90)
plt.ylabel("# of customer ")
plt.xlabel("MonthsInService")
plt.legend()
plt.tight_layout()
plt.title('Months in Service Dist.')
plt.show()

In [None]:
fig = plt.subplots(figsize=(8, 5))
plt.hist(train[train['Churn']=='No']['MonthlyMinutes'],bins=[0,20,30,40,50],alpha=0.5, label='Non Churn customer')
plt.hist(train[train['Churn']=='Yes']['MonthlyMinutes'],bins=[0,20,30,40,50],alpha=0.5,label='Churn customer')
plt.ylabel("# of customer ")
plt.xlabel("MonthlyMinutes")
plt.legend()
plt.tight_layout()
plt.title('Monthly Minutes Dist.')
plt.show()

In [None]:
train['HandsetPrice'] = train['HandsetPrice'].replace('Unknown', np.nan)
test['HandsetPrice'] = test['HandsetPrice'].replace('Unknown', np.nan)

In [None]:
#The customer can have multiple subscriptions and can close one of the subscriptions.

train['ChurnSubs']=train['UniqueSubs']-train['ActiveSubs']


# There are no columns in the data set indicating the loss of subscription (ChurnSubs).
# This column is obtained when active subscriptions are subtracted from the number of subscriptions owned by the customer.

slices=[train['ActiveSubs'].sum(),train['ChurnSubs'].sum()] 
labels=['Active Subs','Churn Subs']
explode=[0,0.1]

fig1, ax1 = plt.subplots(figsize=(7, 6))
ax1.pie(slices, explode=explode, labels=labels,autopct='%1.1f%%',shadow=True, startangle=270)
ax1.axis('equal')  
plt.show()


# The pie chart shows the share of subscriptions that are still active versus the share that has been cancelled (churned).

In [None]:
# Outcome of retention-team calls: how many callers churned and how many stayed.
retention_df = train[train['MadeCallToRetentionTeam']=='Yes']
n_called = retention_df.shape[0]
n_lost = retention_df[retention_df['Churn']=='Yes'].shape[0]
n_kept = retention_df[retention_df['Churn']=='No'].shape[0]
# Derive the rate from the data instead of hard-coding the counts; guard against an
# empty selection so the cell does not raise ZeroDivisionError.
success_rate = round(100 * n_kept / n_called) if n_called else 0

print ("Number of customers making calls to retention team: {}".format(n_called))
print("Number of customers lost after calls : {}".format(n_lost))
print("Number of customers not lost after making calls : {}".format(n_kept))
print ("Success rate of retention calls % : {}".format(success_rate))

slices=[n_kept, n_lost]
labels=['Non Churn','Churn']
explode=[0,0.1]

fig1, ax1 = plt.subplots(figsize=(7, 6))
ax1.pie(slices, explode=explode, labels=labels,autopct='%1.1f%%',shadow=True, startangle=270)
ax1.axis('equal')
plt.show()


In [None]:
retention_offer=train[train['RetentionOffersAccepted']>0]

In [None]:
print("The number of customers who accepted the retention call offer : {}".format(retention_offer.shape[0]))
print("Number of customers lost after accepting the retention call: {}".format(retention_offer[retention_offer['Churn']=='Yes'].shape[0]))
print("Number of customers not lost after accepting the retention call : {}".format(retention_offer[retention_offer['Churn']=='No'].shape[0]))
print("Success rate of the retention call proposal: {}".format(retention_offer[retention_offer['Churn']=='No'].shape[0]/retention_offer.shape[0]))

slices=[retention_offer[retention_offer['Churn']=='No'].shape[0],retention_offer[retention_offer['Churn']=='Yes'].shape[0]]
labels=['Non Churn','Churn']
explode=[0,0.1]

fig1, ax1 = plt.subplots(figsize=(7, 6))
ax1.pie(slices, explode=explode, labels=labels,autopct='%1.1f%%',shadow=True, startangle=270)
ax1.axis('equal')  
plt.show()



In [None]:
sns.set(style="darkgrid")
sns.catplot(x="IncomeGroup", hue="Churn", kind="count", edgecolor=".6",
            data=train,height=6, aspect=2);

In [None]:
sns.set(style="darkgrid")
sns.catplot(x="CreditRating", hue="Churn", kind="count", edgecolor=".6",
            data=train,height=5, aspect=2);

In [None]:
sns.set(style="darkgrid")
sns.catplot(x="PrizmCode", hue="Churn", kind="count", edgecolor=".6",
            data=train,height=5, aspect=2);


In [None]:
sns.set(style="darkgrid")
sns.catplot(x="Occupation", hue="Churn", kind="count", edgecolor=".6",
            data=train,height=6, aspect=2);


In [None]:
print("Categorical Var. : {}".format(train.select_dtypes(exclude=['int', 'float']).columns))

print("Numeric Var. : {}".format(train.select_dtypes(include=['int', 'float']).columns))

# Missing Values

In [None]:
train.columns[train.isnull().any()]

In [None]:
test.columns[test.isnull().any()]

In [None]:
train_missings=train.filter(['MonthlyRevenue', 'MonthlyMinutes', 'TotalRecurringCharge',
       'DirectorAssistedCalls', 'OverageMinutes', 'RoamingCalls',
       'PercChangeMinutes', 'PercChangeRevenues', 'ServiceArea', 'Handsets',
       'HandsetModels', 'CurrentEquipmentDays', 'AgeHH1', 'AgeHH2',
       'HandsetPrice'], axis=1)

In [None]:
test_missings=test.filter(['MonthlyRevenue', 'MonthlyMinutes', 'TotalRecurringCharge',
       'DirectorAssistedCalls', 'OverageMinutes', 'RoamingCalls',
       'PercChangeMinutes', 'PercChangeRevenues', 'ServiceArea', 'AgeHH1',
       'AgeHH2', 'HandsetPrice'],axis=1)

In [None]:
msno.matrix(train_missings);

In [None]:
msno.heatmap(train_missings); #A value near -1 means if one variable appears then the other variable is very likely to be missing.
                    #A value near 0 means there is no dependence between the occurrence of missing values of two variables.
                    #A value near 1 means if one variable appears then the other variable is very likely to be present.


In [None]:
msno.bar(train_missings);

In [None]:
train_missings.describe()

In [None]:
def values_table(train_missings):
    """Summarise the missing values of every column in ``train_missings``.

    Parameters
    ----------
    train_missings : pandas.DataFrame
        Frame to inspect; it may mix numeric and text columns.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one missing value, sorted by the
        percentage of missing values (descending) and rounded to one decimal.
        Columns: 'Train Missing Values', '% of Train Total Values', 'Mean',
        'Median', '# of Values', '# of Zeros' and 'Data Type'. 'Mean' and
        'Median' are NaN for non-numeric columns.

    Also prints the shape of ``train_missings`` and the number of columns
    that contain missing values.
    """
    n_rows = len(train_missings)
    mis_val = train_missings.isnull().sum()
    mis_val_percent = 100 * mis_val / n_rows
    # numeric_only=True: text columns have no mean/median and would otherwise raise.
    mean = train_missings.mean(numeric_only=True)
    median = train_missings.median(numeric_only=True)
    total = train_missings.count()
    zeros = train_missings[train_missings == 0].count()
    table = pd.concat(
        [mis_val, mis_val_percent, mean, median, total, zeros],
        axis=1,
        keys=['Train Missing Values', '% of Train Total Values', 'Mean',
              'Median', '# of Values', '# of Zeros'],
    )
    table['Data Type'] = train_missings.dtypes
    # Keep only the columns that actually have missing values.
    table = table[table['% of Train Total Values'] != 0]
    table = table.sort_values('% of Train Total Values', ascending=False).round(1)
    # Report the shape of the frame that was passed in, not of a global frame.
    print("There are " + str(train_missings.shape[1]) + " columns and " + str(n_rows) + " rows in the dataset.\n"
          + str(table.shape[0]) + " of these columns have missing variables.")
    return table

values_table(train_missings)

In [None]:
train_1=train.copy()
test_1=test.copy()

In [None]:
churn=train_1[train_1['Churn']=='Yes']
non_churn= train_1[train_1['Churn']=='No']
churn.describe()

In [None]:
non_churn.describe()

In [None]:
 
# Fill missing MonthlyRevenue with the median of the row's own Churn group.
# Assign the result back: fillna(inplace=True) on a column selection is chained
# assignment and does not update train_1 under pandas Copy-on-Write.
train_1['MonthlyRevenue'] = train_1['MonthlyRevenue'].fillna(
    train_1.groupby('Churn')['MonthlyRevenue'].transform('median'))

In [None]:
# Fill missing MonthlyRevenue in the holdout set with its own column mean.
# Assign the result back: fillna(inplace=True) on a column selection is chained
# assignment and does not update test_1 under pandas Copy-on-Write.
test_1['MonthlyRevenue'] = test_1['MonthlyRevenue'].fillna(test_1['MonthlyRevenue'].mean())

In [None]:
#Too many missing variables (56.4%) so we dropped it

train_1.drop('HandsetPrice',axis=1, inplace=True)
test_1.drop('HandsetPrice',axis=1, inplace=True)

In [None]:
train_1.drop('MadeCallToRetentionTeam',axis=1, inplace=True)
test_1.drop('MadeCallToRetentionTeam',axis=1, inplace=True)

In [None]:
train_1.drop('MaritalStatus',axis=1, inplace=True)
test_1.drop('MaritalStatus',axis=1, inplace=True)

In [None]:
train_1.drop('Homeownership',axis=1, inplace=True)
test_1.drop('Homeownership',axis=1, inplace=True)

In [None]:
#Although HandsetModels and Handsets appear numeric, they are categorical variables, so it is more correct to fill them with mode.
train_1[train_1['Handsets'].isna()][['HandsetModels','Handsets','CurrentEquipmentDays']] 


In [None]:
print("Handsets:",train_1.Handsets.mode()[0])
print("HandsetModels:",train_1.HandsetModels.mode()[0])
print("CurrentEquipmentDays:",train_1.CurrentEquipmentDays.mode()[0])

In [None]:
# Fill missing Handsets with the most frequent value (mode).
# Assign the result back: fillna(inplace=True) on a column selection is chained
# assignment and does not update train_1 under pandas Copy-on-Write.
train_1['Handsets'] = train_1['Handsets'].fillna(train_1['Handsets'].mode()[0])

In [None]:
# Fill missing HandsetModels with the most frequent value (mode).
# Assign the result back: fillna(inplace=True) on a column selection is chained
# assignment and does not update train_1 under pandas Copy-on-Write.
train_1['HandsetModels'] = train_1['HandsetModels'].fillna(train_1['HandsetModels'].mode()[0])

In [None]:
train_1['CurrentEquipmentDays'].value_counts()

In [None]:
# Fill missing CurrentEquipmentDays with the most frequent value (mode).
# Assign the result back: fillna(inplace=True) on a column selection is chained
# assignment and does not update train_1 under pandas Copy-on-Write.
train_1['CurrentEquipmentDays'] = train_1['CurrentEquipmentDays'].fillna(
    train_1['CurrentEquipmentDays'].mode()[0])

In [None]:
# Fill missing ServiceArea with each frame's own most frequent value (mode).
# Assign the result back: fillna(inplace=True) on a column selection is chained
# assignment and does not update the frame under pandas Copy-on-Write.
train_1['ServiceArea'] = train_1['ServiceArea'].fillna(train_1['ServiceArea'].mode()[0])
test_1['ServiceArea'] = test_1['ServiceArea'].fillna(test_1['ServiceArea'].mode()[0])

In [None]:
# Fill missing AgeHH1 with 0.
# Assign the result back: fillna(inplace=True) on a column selection is chained
# assignment and does not update the frame under pandas Copy-on-Write.
train_1['AgeHH1'] = train_1['AgeHH1'].fillna(value=0)
test_1['AgeHH1'] = test_1['AgeHH1'].fillna(value=0)

In [None]:
# Fill missing AgeHH2 with 0.
# Assign the result back: fillna(inplace=True) on a column selection is chained
# assignment and does not update the frame under pandas Copy-on-Write.
train_1['AgeHH2'] = train_1['AgeHH2'].fillna(value=0)
test_1['AgeHH2'] = test_1['AgeHH2'].fillna(value=0)

In [None]:
display(train_1.groupby(['Churn'],as_index=False)['MonthlyMinutes'].median().style.hide_index())

In [None]:
# Fill missing MonthlyMinutes with the median of the row's own Churn group.
# Assign the result back: fillna(inplace=True) on a column selection is chained
# assignment and does not update train_1 under pandas Copy-on-Write.
train_1['MonthlyMinutes'] = train_1['MonthlyMinutes'].fillna(
    train_1.groupby('Churn')['MonthlyMinutes'].transform('median'))

In [None]:
# Fill missing MonthlyMinutes in the holdout set with its own column median.
# Assign the result back: fillna(inplace=True) on a column selection is chained
# assignment and does not update test_1 under pandas Copy-on-Write.
test_1['MonthlyMinutes'] = test_1['MonthlyMinutes'].fillna(test_1['MonthlyMinutes'].median())

In [None]:
# Fill missing TotalRecurringCharge with each frame's own column median.
# Assign the result back: fillna(inplace=True) on a column selection is chained
# assignment and does not update the frame under pandas Copy-on-Write.
train_1['TotalRecurringCharge'] = train_1['TotalRecurringCharge'].fillna(
    train_1['TotalRecurringCharge'].median())
test_1['TotalRecurringCharge'] = test_1['TotalRecurringCharge'].fillna(
    test_1['TotalRecurringCharge'].median())

In [None]:
# Fill missing PercChangeRevenues with each frame's own column median.
# Assign the result back: fillna(inplace=True) on a column selection is chained
# assignment and does not update the frame under pandas Copy-on-Write.
train_1['PercChangeRevenues'] = train_1['PercChangeRevenues'].fillna(
    train_1['PercChangeRevenues'].median())
test_1['PercChangeRevenues'] = test_1['PercChangeRevenues'].fillna(
    test_1['PercChangeRevenues'].median())

In [None]:
# Fill missing PercChangeMinutes with the training-set median; the holdout set is
# filled with the training median as well.
# Assign the result back: fillna(inplace=True) on a column selection is chained
# assignment and does not update the frame under pandas Copy-on-Write.
train_1['PercChangeMinutes'] = train_1['PercChangeMinutes'].fillna(
    train_1['PercChangeMinutes'].median())
test_1['PercChangeMinutes'] = test_1['PercChangeMinutes'].fillna(
    train_1['PercChangeMinutes'].median())

In [None]:
# Fill missing RoamingCalls with 0.
# Assign the result back: fillna(inplace=True) on a column selection is chained
# assignment and does not update the frame under pandas Copy-on-Write.
train_1['RoamingCalls'] = train_1['RoamingCalls'].fillna(value=0)
test_1['RoamingCalls'] = test_1['RoamingCalls'].fillna(value=0)

In [None]:
# Fill missing OverageMinutes with 0.
# Assign the result back: fillna(inplace=True) on a column selection is chained
# assignment and does not update the frame under pandas Copy-on-Write.
train_1['OverageMinutes'] = train_1['OverageMinutes'].fillna(value=0)
test_1['OverageMinutes'] = test_1['OverageMinutes'].fillna(value=0)

In [None]:
# Fill missing DirectorAssistedCalls with 0.
# Assign the result back: fillna(inplace=True) on a column selection is chained
# assignment and does not update the frame under pandas Copy-on-Write.
train_1['DirectorAssistedCalls'] = train_1['DirectorAssistedCalls'].fillna(value=0)
test_1['DirectorAssistedCalls'] = test_1['DirectorAssistedCalls'].fillna(value=0)

In [None]:
train_1.isnull().sum()

# Outliers Detection

In [None]:
print ("There are " + str(train_1.shape[1]) + " columns and  " + str(train_1.shape[0]) + " rows in the train data.\n")
print("There are " + str(test_1.shape[1]) + " columns and " + str(test_1.shape[0]) + " rows in the test data.\n")

In [None]:
from scipy.stats import zscore
from scipy import stats 

train_yeni=train_1.select_dtypes(include=['int', 'float'])
train_yeni.head()

In [None]:
# Count, per numeric column, the values lying outside the range spanned by the
# observations whose z-score is within [-3, 3].
train_new = train_yeni.copy()  # copied once; the frame is only read inside the loop
outliers={}
for col in train_yeni:
    z_score = zscore(train_yeni[col])
    # Largest / smallest value whose z-score is still inside the +/-3 band.
    upper_replace_value = train_yeni[z_score<=3][col].max()
    lower_replace_value = train_yeni[z_score>=-3][col].min()
    is_outlier = (train_new[col] > upper_replace_value) | (train_new[col] < lower_replace_value)
    outliers[col] = len(train_new[col][is_outlier])

In [None]:
outliers_zscore=pd.DataFrame(outliers.items(),columns=['Feature','Outliers'])
outliers_zscore

In [None]:
# Count, per numeric column, the values outside [q1 - 1.5*iqr, q3 + 1.5*iqr].
# NOTE(review): q1/q3 are the 15th/85th percentiles while stats.iqr() is called with
# its default arguments - confirm that mixing the two is intended.
train_new1 = train_yeni.copy()  # copied once; the frame is only read inside the loop
outliers_1={}
for col in train_yeni:
    q1 = train_yeni[col].quantile(q=0.15)
    q3 = train_yeni[col].quantile(q=0.85)
    iqr = stats.iqr(train_yeni[col])
    upper_limit = q3 + 1.5*iqr
    lower_limit = q1 - 1.5*iqr
    is_outlier = (train_new1[col] > upper_limit) | (train_new1[col] < lower_limit)
    outliers_1[col] = len(train_new1[col][is_outlier])

In [None]:
outliers_iqr=pd.DataFrame(outliers_1.items(),columns=['Feature','Outliers'])
outliers_iqr

In [None]:
train_new2 = train_yeni.copy()
outliers_2={}
for col in train_yeni:
    lower_lim = train_yeni[col].quantile(q=0.01)
    upper_lim = train_yeni[col].quantile(q=0.99)
    outliers_2[col] = len(train_new2[col][(train_new2[col] > upper_lim)| (train_new2[col] < lower_lim)])

In [None]:
outliers_hard=pd.DataFrame(outliers_2.items(),columns=['Feature','Outliers'])
outliers_hard

In [None]:
plt.figure(figsize=(18,25))
for num,col in enumerate(train_yeni.select_dtypes(exclude=['object']).columns,1):
    plt.subplot(8, 5, num)
    sns.boxplot(train_yeni[col])
    plt.tight_layout()

# **Label Encoding**

In [None]:
train_df=train_1.copy()
for i in train_df.columns:
      if train_df[i].dtype=='object':
            print(pd.DataFrame(train_df[i].value_counts()))

In [None]:
train_1.drop('ServiceArea',axis=1, inplace=True)
test_1.drop('ServiceArea',axis=1, inplace=True)

In [None]:
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
def FunLabelEncoder(train_df):
    """Label-encode every object-dtype column of ``train_df`` and return the frame.

    The frame is modified in place. Values are cast to ``str`` before encoding,
    and the module-level ``enc`` encoder is refitted for each column.
    """
    text_columns = [name for name in train_df.columns if train_df.dtypes[name] == object]
    for name in text_columns:
        as_text = train_df[name].astype(str)
        train_df[name] = enc.fit_transform(as_text)
    return train_df

In [None]:
train_df = FunLabelEncoder(train_df)
train_df.info()

In [None]:
test_df=test_1.copy()
enc = LabelEncoder()
def FunLabelEncoder(tes_df):
    """Label-encode every object-dtype column of ``tes_df`` and return the frame.

    The frame passed in is modified in place. The function now operates on its
    argument rather than on the global ``test_df``, so it encodes whichever frame
    the caller supplies. Values are cast to ``str`` before encoding.

    NOTE(review): the module-level ``enc`` is refitted on each column of this frame,
    so the integer codes are independent of any earlier fit on another frame -
    confirm this is acceptable for the holdout data.
    """
    for c in tes_df.columns:
        if tes_df.dtypes[c] == object:
            tes_df[c] = enc.fit_transform(tes_df[c].astype(str))
    return tes_df

In [None]:
test_df = FunLabelEncoder(test_df)
test_df.info()

In [None]:
cor = train_df.corr()
f, ax = plt.subplots(figsize = (15,15))
sns.heatmap(cor, fmt=".4f", linewidths=0.5, ax=ax);

## Feature Scaling






In [None]:
y = train_df['Churn']
X= train_df.drop(columns=['Churn'],axis=1)

X.head()

In [None]:
y

In [None]:
from sklearn.metrics import confusion_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.metrics import classification_report
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing  import StandardScaler
from sklearn.model_selection import StratifiedKFold 

from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from matplotlib import pyplot



# Train/Test Split,Feature Scaling

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9,stratify=y)

columns = X_train.columns

scaler = StandardScaler()
X_train[X_train.columns] = scaler.fit_transform(X_train[X_train.columns])
X_test[X_test.columns] = scaler.transform(X_test[X_test.columns])

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

#Undersampling:Trying to balance data 

undersample = RandomUnderSampler(sampling_strategy='majority')
X_new, y_new = undersample.fit_resample(X_train, y_train)

print(Counter(y_train))
print(Counter(y_new))

# Classification Algorithms and Optimization

In [None]:
#Algorithm trials with under sampling :

def run_exps(X_new , y_new, X_test, y_test):
    """Fit a set of default classifiers and report how each does on the test split.

    Each model is fitted on ``X_new``/``y_new``. For every model the function
    prints its short name, shows the confusion matrix on ``X_test``/``y_test``
    as a heatmap, and prints the classification report.
    """
    candidates = {
        'LogReg': LogisticRegression(),
        'RF': RandomForestClassifier(),
        'KNN': KNeighborsClassifier(),
        'GaussianNB': GaussianNB(),
        'XGB': XGBClassifier(),
        'AdaBoost': AdaBoostClassifier(),
        'GradiendtBoost': GradientBoostingClassifier(),
        'LightGB': lgb.LGBMClassifier(),
    }

    for label, estimator in candidates.items():
        fitted = estimator.fit(X_new, y_new)
        predictions = fitted.predict(X_test)
        matrix = confusion_matrix(y_test, predictions)

        print(label)
        sns.heatmap(matrix, annot=True, linewidth=0.7, linecolor='black', fmt='g', cmap="BuPu")
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.show()

        print(classification_report(y_test, predictions))
   
run_exps(X_new , y_new, X_test, y_test)

In [None]:
#Algorithm trials on training and test data

def run_exps(X_train , y_train, X_test, y_test):
    """Fit a set of default classifiers on the training split and evaluate them.

    For each model the function prints its short name, plots the confusion
    matrix obtained on ``X_test``/``y_test`` as a heatmap, and prints the
    classification report.
    """
    model_names = ['LogReg', 'RF', 'KNN', 'GaussianNB', 'XGB',
                   'AdaBoost', 'GradiendtBoost', 'LightGB']
    estimators = [
        LogisticRegression(),
        RandomForestClassifier(),
        KNeighborsClassifier(),
        GaussianNB(),
        XGBClassifier(),
        AdaBoostClassifier(),
        GradientBoostingClassifier(),
        lgb.LGBMClassifier(),
    ]

    for model_name, estimator in zip(model_names, estimators):
        trained = estimator.fit(X_train, y_train)
        y_hat = trained.predict(X_test)
        cm = confusion_matrix(y_test, y_hat)

        print(model_name)
        sns.heatmap(cm, annot=True, linewidth=0.7, linecolor='black', fmt='g', cmap="BuPu")
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.show()

        print(classification_report(y_test, y_hat))
run_exps(X_train , y_train, X_test, y_test)

## Logistic Reg.

In [None]:
LogisticRegression()

In [None]:
class_weight= {0: 0.65, 1: 1.7}
log=LogisticRegression(class_weight='balanced',penalty='l2',max_iter=150)
clf_log=log.fit(X_train, y_train)
y_pred= clf_log.predict(X_test)
conf = confusion_matrix(y_test, y_pred)
sns.heatmap(conf, annot=True, linewidth=0.7, linecolor='black', fmt='g', cmap="BuPu")
print(classification_report(y_test, y_pred))


In [None]:
clf_log=log.fit(X_train, y_train)
lr_probs =clf_log.predict_proba(X_test)
lr_probs = lr_probs[:, 1]
y_pred= clf_log.predict(X_test)
lr_precision, lr_recall, _ = precision_recall_curve(y_test, lr_probs)
lr_f1, lr_auc = f1_score(y_test, y_pred), auc(lr_recall, lr_precision)

print('Logistic: f1=%.3f auc=%.3f' % (lr_f1, lr_auc))
no_skill = len(y_test[y_test==1]) / len(y_test)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
pyplot.plot(lr_recall, lr_precision, marker='.', label='Logistic')
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')

pyplot.legend()

pyplot.show()

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve
y_pred1= clf_log.predict_proba(X_test)[:,1]

print(roc_auc_score(y_test, y_pred1 , average = 'macro'))

In [None]:
a=len(train_df)/2
a/len(non_churn),a/len(churn)

In [None]:
#Hyperparameter Tuning 

# Grid over the regularisation strength C and the penalty type.
logreg_param = {"C":np.logspace(-3,3,7),
                    "penalty": ["l1","l2"]}

skf= StratifiedKFold(n_splits=5, shuffle=True, random_state=250)
# solver='liblinear' supports both the l1 and l2 penalties in the grid; the default
# lbfgs solver rejects l1, so every l1 candidate would fail to fit.
gsearch_log = GridSearchCV(estimator = LogisticRegression(class_weight='balanced',max_iter=150,
                                                          solver='liblinear'),
                           param_grid = logreg_param,
                           scoring='roc_auc',
                           verbose=3,
                           cv=skf)

grid_log=gsearch_log.fit(X_train,y_train)

In [None]:
print("Best: %f using %s" % (grid_log.best_score_, grid_log.best_params_))

means = grid_log.cv_results_['mean_test_score']
stds = grid_log.cv_results_['std_test_score']
params = grid_log.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

## Random Forest

In [None]:
RandomForestClassifier()

In [None]:
rf_param = {"max_features": [1,3,10],
                "min_samples_split":[2,3,10],
                "min_samples_leaf":[1,3,10],
                "bootstrap":[False],
                "n_estimators":[100,300],
                }
rf=RandomForestClassifier(class_weight='balanced',max_features=3,min_samples_split=3,min_samples_leaf=3,bootstrap=False)
clf_rf=rf.fit(X_train, y_train)
y_pred= clf_rf.predict(X_test)
conf = confusion_matrix(y_test, y_pred)
sns.heatmap(conf, annot=True, linewidth=0.7, linecolor='black', fmt='g', cmap="BuPu")
print(classification_report(y_test, y_pred))

In [None]:
y_pred1= clf_rf.predict_proba(X_test)[:,1]

print(roc_auc_score(y_test, y_pred1 , average = 'macro'))

## K Neighbors

In [None]:
KNeighborsClassifier()

In [None]:
knn=KNeighborsClassifier(weights='distance',n_neighbors=2)
clf_knn=knn.fit(X_train, y_train)
y_pred= clf_knn.predict(X_test)
conf = confusion_matrix(y_test, y_pred)
sns.heatmap(conf, annot=True, linewidth=0.7, linecolor='black', fmt='g', cmap="BuPu")
print(classification_report(y_test, y_pred))

In [None]:
y_pred1= clf_knn.predict_proba(X_test)[:,1]

print(roc_auc_score(y_test, y_pred1 , average = 'macro'))

## AdaBoost

In [None]:
AdaBoostClassifier()

In [None]:
ada=AdaBoostClassifier()
clf_ada=ada.fit(X_train, y_train)
y_pred= clf_ada.predict(X_test)
conf = confusion_matrix(y_test, y_pred)
sns.heatmap(conf, annot=True, linewidth=0.7, linecolor='black', fmt='g', cmap="BuPu")
print(classification_report(y_test, y_pred))

In [None]:
y_pred1= clf_ada.predict_proba(X_test)[:,1]

print(roc_auc_score(y_test, y_pred1 , average = 'macro'))

## XGB

In [None]:
XGBClassifier()

In [None]:
scale_pos_weight = len(non_churn)/len(churn)
scale_pos_weight

In [None]:
# XGBoost model weighted for class imbalance through scale_pos_weight.
# The row-sampling parameter is spelled `subsample`; `sub_sample` is not an XGBoost
# parameter, so the 0.9 setting had no effect.
xgb=XGBClassifier(max_depth=3, min_child_weight= 1, scale_pos_weight=scale_pos_weight,
                  learning_rate=0.1, n_estimators= 400,subsample=0.9,colsample_bytree=0.9)
clf_xgb=xgb.fit(X_train, y_train)

# Confusion matrix and per-class metrics on the held-out split.
y_pred=clf_xgb.predict(X_test)
conf = confusion_matrix(y_test, y_pred)
sns.heatmap(conf, annot=True, linewidth=0.7, linecolor='black', fmt='g', cmap="BuPu")
print(classification_report(y_test, y_pred))



In [None]:
probs =clf_xgb.predict_proba(X_test)
probs = probs[:, 1]
y_pred= clf_xgb.predict(X_test)
lr_precision, lr_recall, _ = precision_recall_curve(y_test, probs)
lr_f1, lr_auc = f1_score(y_test, y_pred), auc(lr_recall, lr_precision)

print('XGB: f1=%.3f auc=%.3f' % (lr_f1, lr_auc))
no_skill = len(y_test[y_test==1]) / len(y_test)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
pyplot.plot(lr_recall, lr_precision, marker='.', label='XGB')

pyplot.xlabel('Recall')
pyplot.ylabel('Precision')

pyplot.legend()

pyplot.show()

In [None]:
y_pred1= clf_xgb.predict_proba(X_test)[:,1]

print(roc_auc_score(y_test, y_pred1 , average = 'macro'))

In [None]:
#Hyperparameter Tuning 
# Grid over tree depth, minimum child weight, number of trees and learning rate.
xgb_param = {
    'max_depth':range(1,6,2),
    'min_child_weight':range(1,6,2),
    'n_estimators':[150,200,300,400],
    'learning_rate':[0.1,0.2,0.3,0.4]
}
skf= StratifiedKFold(n_splits=5, shuffle=True, random_state=250)
# The row-sampling parameter is spelled `subsample`; `sub_sample` is not an XGBoost
# parameter, so the 0.9 setting had no effect.
gsearch_xgb = GridSearchCV(estimator = XGBClassifier(scale_pos_weight=scale_pos_weight,
                                                     subsample=0.9,colsample_bytree=0.9),
                           param_grid = xgb_param,
                           scoring='roc_auc',
                           verbose=3,
                           cv=skf)

grid_xgb=gsearch_xgb.fit(X_train,y_train)

In [None]:
print("Best: %f using %s" % (grid_xgb.best_score_, grid_xgb.best_params_))

means = grid_xgb.cv_results_['mean_test_score']
stds = grid_xgb.cv_results_['std_test_score']
params = grid_xgb.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

## LightGBM

In [None]:
lgb.LGBMClassifier()

In [None]:
lgbm=lgb.LGBMClassifier(scale_pos_weight=scale_pos_weight,learning_rate=0.3,n_estimators=70,
                   num_leaves=7,min_child_weight=0.001,min_child_samples=20)
clf_lgbm=lgbm.fit(X_train, y_train)
y_pred= clf_lgbm.predict(X_test)
conf = confusion_matrix(y_test, y_pred)
sns.heatmap(conf, annot=True, linewidth=0.7, linecolor='black', fmt='g', cmap="BuPu")
print(classification_report(y_test, y_pred))

In [None]:
y_pred1= clf_lgbm.predict_proba(X_test)[:,1]

print(roc_auc_score(y_test, y_pred1 , average = 'macro'))

In [None]:
#Hyperparameter Tuning 
lgbm_param = {
    'num_leaves': [7, 15, 31],
    'n_estimators':[100,150,200,300]
}

gsearch_lgbm = GridSearchCV(estimator = lgb.LGBMClassifier(scale_pos_weight=scale_pos_weight,
                                                           min_child_weight=0.001,min_child_samples=20), 
                            param_grid = lgbm_param, 
                            scoring='roc_auc',
                            verbose=3,
                            cv=skf)

grid_lgbm=gsearch_lgbm.fit(X_train,y_train)


In [None]:
print("Best: %f using %s" % (grid_lgbm.best_score_, grid_lgbm.best_params_))

means = grid_lgbm.cv_results_['mean_test_score']
stds = grid_lgbm.cv_results_['std_test_score']
params = grid_lgbm.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# ROC and Precision-Recall Curve





In [None]:
from sklearn.metrics import plot_roc_curve

models = [log,knn,xgb,lgbm,rf]

plt.figure(figsize = (12,6), linewidth= 1)
plt.plot([0,1], [0,1], 'k--', label = 'Random guessing: 0.5')
plt.title("ROC CURVE")
plt.legend(loc="lower right")
ax = plt.gca()

for i in models:
  plot_roc_curve(i, X_test, y_test,ax=ax)

In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve

models = [log,knn,xgb,lgbm,rf]

plt.figure(figsize = (12,6), linewidth= 1)
no_skill = len(y_test[y_test==1]) / len(y_test)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
plt.title("Precision Recall Curve")
plt.legend(loc="upper left")
ax = plt.gca()

for i in models:
  plot_precision_recall_curve(i, X_test, y_test,ax=ax)
