## GOAL: Create a model to predict whether or not a customer will churn.

In [None]:
# IMPORTING NECESSARY PACKAGES
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv('Telco-Customer-Churn.csv')

In [None]:
df.head()

In [None]:
# INSPECTING THE DATA
df.info()

In [None]:
# STATISTICAL SUMMARY OF DATA

In [None]:
df.describe().T

In [None]:
# EDA - EXPLORATORY DATA ANALYSIS

In [None]:
# CHECKING NULL VALUES

df.isnull().sum()

In [None]:
# PLOTTING THE TARGET VALUE (Churn) WITH A COUNT PLOT

sns.countplot(data=df, x='Churn');

In [None]:
# Scratch cell: upper-case a plot heading so it can be pasted in as a section
# comment. The variable is deliberately not named `str`: binding that name
# would shadow the built-in type for every later cell in the session.
heading_text = "Distrbution of TotalCharges between Churn categories with a Box Plot"
heading_text.upper()

In [None]:
## DISTRIBUTION OF TOTALCHARGES BETWEEN CHURN CATEGORIES WITH A VIOLIN PLOT

sns.violinplot(data =df, x="Churn", y="TotalCharges")

In [None]:
# BOX PLOT - DISTRIBUTION OF TOTAL CHARGES PER CONTRACT TYPE
# One group of boxes per Contract value, split by Churn via `hue`.
plt.figure(figsize=(14, 8), dpi =200)
sns.boxplot(data=df, x = "Contract", y = "TotalCharges", hue='Churn')
# A 2-tuple `loc` places the legend's lower-left corner at that point in axes
# coordinates, so (1, 1) anchors it at the top-right corner of the axes,
# outside the plotting area.
plt.legend(loc=(1, 1))

In [None]:
# BAR PLOT SHOWING THE CORRELATION

df.columns

In [None]:
# Columns whose relationship with churn is examined (target included so that
# its dummy columns appear in the correlation matrix).
corr_columns = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
    'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn',
]

# Dummy-encode the selected columns, then take the pairwise correlations of
# the encoded frame.
encoded = pd.get_dummies(df[corr_columns])
corr_df = encoded.corr()

In [None]:
corr_df['Churn_Yes'].sort_values().iloc[1:-1]

In [None]:
# Sort the correlations with Churn_Yes once and reuse the result for both axes
# instead of re-sorting for x and y separately.
# iloc[1:-1] trims the first and last entries of the sorted series
# (presumably Churn_No and Churn_Yes itself -- confirm against the cell output).
churn_corr = corr_df['Churn_Yes'].sort_values().iloc[1:-1]

plt.figure(figsize=(14, 8), dpi=200)
sns.barplot(x=churn_corr.index, y=churn_corr.values)
plt.xticks(rotation=90);

In [None]:
# CHURN ANALYSIS

In [None]:
df['Contract'].unique()

In [None]:
plt.figure(figsize=(14,6), dpi = 200)
sns.histplot(data=df, x='tenure', bins=70)

In [None]:
# Tenure histograms faceted by Contract (columns) and Churn (rows).
# displot is a figure-level function: it builds its own figure, so a preceding
# plt.figure(...) call has no effect on this plot and only leaves an empty
# extra figure behind. It is therefore omitted.
sns.displot(data=df, x="tenure",bins=70, col="Contract", row="Churn");

In [None]:
# SCATTER PLOT - TOTAL CHARGES VS MONTHLY CHARGES
plt.figure(figsize=(10,4),dpi=200)
sns.scatterplot(data=df, x="MonthlyCharges", y="TotalCharges", hue='Churn',
               linewidth=0.5, palette="Dark2",alpha=0.5)

In [None]:
# CREATING COHORTS BASED ON TENURE

In [None]:
# Count rows per (Churn, tenure) pair once and reuse the result, rather than
# running the same groupby/count/transpose twice.
# After the transpose, the top column level is the Churn value, so indexing
# with 'No' / 'Yes' yields one column per tenure value.
counts_by_churn = df.groupby(['Churn','tenure']).count().transpose()
no_churn = counts_by_churn['No']
yes_churn = counts_by_churn['Yes']

In [None]:
churn_rate = 100 * yes_churn / (no_churn+yes_churn)

In [None]:
churn_rate.transpose()['customerID']

In [None]:
# Plot churn percentage against tenure.
# The customerID row is selected by label rather than by position (iloc[0]),
# so the cell does not silently depend on the column order of df.
plt.figure(figsize=(10,4),dpi=200)
churn_rate.loc['customerID'].plot()
plt.ylabel('Churn Percentage');

In [None]:
## BROADER COHORT GROUPS

def cohort(tenure):
    """Map a tenure value to its cohort label.

    Upper bounds are exclusive: a tenure below 13 maps to '0-12 Months',
    below 25 to '12-24 Months', below 49 to '24-48 Months', and anything
    else to "Over 48 Months".
    """
    brackets = (
        (13, '0-12 Months'),
        (25, '12-24 Months'),
        (49, '24-48 Months'),
    )
    # Brackets are in ascending order, so the first bound that exceeds the
    # tenure identifies the cohort.
    for upper_bound, label in brackets:
        if tenure < upper_bound:
            return label
    return "Over 48 Months"

In [None]:
df['Tenure Cohort'] = df['tenure'].apply(cohort)

In [None]:
df.head(10)[['tenure','Tenure Cohort']]

In [None]:
## SCATTER PLOT - TOTAL CHARGES VS MONTHLY CHARGES

plt.figure(figsize=(10,4),dpi=200)
sns.scatterplot(data=df,x='MonthlyCharges',y='TotalCharges',hue='Tenure Cohort', linewidth=0.5,alpha=0.5,palette='Dark2')

In [None]:
plt.figure(figsize=(10,4),dpi=200)
sns.countplot(data=df,x='Tenure Cohort',hue='Churn')

In [None]:
# Churn counts per tenure cohort, one panel per Contract type.
# catplot is a figure-level function and creates its own figure, so no
# plt.figure(...) call is made first (it would only produce an empty figure).
sns.catplot(data=df,x='Tenure Cohort',hue='Churn',col='Contract',kind='count')

In [None]:
## CREATING THE MODEL 

In [None]:
# DECISION TREE

# Feature matrix: every column except the target and the row identifier,
# dummy-encoded with the first level of each encoded column dropped.
features = df.drop(columns=['Churn', 'customerID'])
X = pd.get_dummies(features, drop_first=True)

In [None]:
y = df['Churn']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                   random_state=42)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited decision tree. random_state is fixed (same seed as the
# train/test split) so that re-running the notebook yields the same tree;
# without it, the fitted tree can differ from run to run.
dt_model = DecisionTreeClassifier(max_depth=6, random_state=42)
dt_model.fit(X_train, y_train)



In [None]:
preds_dt = dt_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score,classification_report

In [None]:
print(classification_report(y_test, preds_dt))

In [None]:
imp_feats = pd.DataFrame(data=dt_model.feature_importances_,index=X.columns,columns=['Feature Importance']).sort_values("Feature Importance")

In [None]:
# Sort the importances once and use the same frame for both the data and the
# x labels, so bars and labels are guaranteed to line up and the sort is not
# repeated.
sorted_feats = imp_feats.sort_values('Feature Importance')

plt.figure(figsize=(14,6),dpi=200)
sns.barplot(data=sorted_feats,x=sorted_feats.index,y='Feature Importance')
plt.xticks(rotation=90)
plt.title("Feature Importance for Decision Tree");

In [None]:
from sklearn.tree import plot_tree

In [None]:
plt.figure(figsize=(12,8),dpi=500)
plot_tree(dt_model,filled=True);

In [None]:
# RANDOM FOREST

from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees. random_state is fixed (same seed as the
# train/test split) so the bootstrap samples and feature draws, and therefore
# the reported metrics, are reproducible across runs.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train,y_train)

# Predictions on the held-out test split.
preds_rf = rf.predict(X_test)

In [None]:
print(classification_report(y_test,preds_rf))

In [None]:
## BOOSTED TREES

from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier

In [None]:
# AdaBoost with default settings. random_state is fixed (same seed as the
# train/test split) so repeated runs of the notebook are reproducible.
ada_model = AdaBoostClassifier(random_state=42)

ada_model.fit(X_train,y_train)

In [None]:
preds_ab = ada_model.predict(X_test)

In [None]:
print(classification_report(y_test,preds_ab))