<center><h1>Churn Prediction</h1></center>

## 3.1 Churn Prediction Project

- Dataset: https://www.kaggle.com/datasets/blastchar/telco-customer-churn
- Raw CSV used in this notebook: https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv

## 3.2 Data Preparation

In [None]:
from urllib.request import urlretrieve

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Location of the raw Telco customer-churn CSV.
data = (
    'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/'
    'chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'
)

In [None]:
# Fetch the CSV with the standard library instead of shelling out to wget,
# which is not installed on every platform (e.g. Windows). Like `wget -O`,
# this overwrites any existing local copy. The trailing `;` hides the
# (filename, headers) tuple that urlretrieve returns.
urlretrieve(data, 'data-week-3.csv');

In [None]:
# Load the downloaded CSV into a DataFrame and preview its first rows.
df = pd.read_csv('data-week-3.csv')
df.head()

In [None]:
# Transpose the preview so every column is shown as a row (easier to read for wide frames).
df.head().transpose()

In [None]:
# Summarise column dtypes and non-null counts of the raw data.
df.info()

In [None]:
# TotalCharges needs an explicit numeric conversion: entries that cannot be
# parsed as numbers are coerced to NaN and then replaced with 0.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)

In [None]:
# Store SeniorCitizen with object dtype so it is treated as a categorical variable.
df['SeniorCitizen'] = df['SeniorCitizen'].astype(object)

In [None]:
# Normalise column names: lower-case, spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Columns currently stored with object dtype are treated as categorical.
is_object_column = df.dtypes == 'object'
categorical_columns = is_object_column[is_object_column].index.tolist()

# Apply the same normalisation to the values of every categorical column.
# (Each column was selected because it is object-typed, so no per-column
# dtype check is needed inside the loop.)
for c in categorical_columns:
    df[c] = df[c].astype(str).str.lower().str.replace(' ', '_')

In [None]:
# Encode the target as 0/1 ('yes' -> 1, anything else -> 0).
# Guarded so that re-running this cell on an already-encoded column does not
# compare integers against 'yes' and silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['churn']):
    df['churn'] = (df['churn'] == 'yes').astype(int)

In [None]:
# Re-check column names, dtypes and non-null counts after the cleaning steps.
df.info()

## 3.3 Setting up the Validation Framework

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# IPython help lookup: shows the docstring of train_test_split (interactive aid only; not needed for the analysis).
train_test_split?

In [None]:
# Hold out 20% of the rows as the test set; random_state makes the split reproducible.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [None]:
# Row counts of the full-train and test parts.
df_full_train.shape[0], df_test.shape[0]

In [None]:
# Split the full-train part again: 25% of it (0.25 * 0.8 = 20% of all rows) becomes the
# validation set, giving a 60/20/20 train/validation/test split overall.
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [None]:
# Row counts of the train, validation and test sets.
tuple(map(len, (df_train, df_val, df_test)))

In [None]:
# Discard the original row labels so that each split is indexed 0..n-1.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [None]:
# Pull the target column out of each split as a NumPy array.
y_train, y_val, y_test = (
    part['churn'].values for part in (df_train, df_val, df_test)
)

In [None]:
# Remove the target from the feature frames so it cannot be used as an input.
df_train = df_train.drop(columns='churn')
df_val = df_val.drop(columns='churn')
df_test = df_test.drop(columns='churn')

## 3.4 EDA

In [None]:
# Renumber the full-train rows from 0, then display the frame.
df_full_train = df_full_train.reset_index(drop=True)
df_full_train

In [None]:
# Number of missing values per column.
df_full_train.isna().sum()

In [None]:
# Relative frequency of each target value (0 = stayed, 1 = churned).
df_full_train['churn'].value_counts(normalize=True)

In [None]:
# Share of churned customers. The target is encoded as 0/1, so its mean is the
# churn rate. Unlike value_counts(normalize=True).values[1], this does not
# depend on churn being the less frequent class (value_counts orders its
# result by frequency, not by label).
global_churn_rate = df_full_train.churn.mean()
round(global_churn_rate, 2)

In [None]:
# Column dtypes and non-null counts of the full-train part.
df_full_train.info()

In [None]:
# Numerical variables: every int64/float64 column.
# NOTE(review): the target `churn` was cast to int earlier, so it can end up in
# this list too — confirm it is excluded wherever num_vars is used as features.
num_vars = df_full_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Categorical variables: every object column except the customer identifier.
cat_vars = [
    name
    for name in df_full_train.select_dtypes(include=['object']).columns
    if name != 'customerid'
]

print("Numerical variables:", num_vars)
print("Categorical variables:", cat_vars)

In [None]:
# Number of distinct values in each categorical variable.
df_full_train.loc[:, cat_vars].nunique()

## 3.5 Feature Importance: Churn Rate and Risk Ratio

### Churn Rate

In [None]:
# Preview the first rows of the full-train part.
df_full_train.head()

In [None]:
# Churn rate among female customers (mean of the 0/1 target within the group).
churn_female = df_full_train.loc[df_full_train['gender'] == 'female', 'churn'].mean()
churn_female

In [None]:
# Churn rate among male customers.
churn_male = df_full_train.loc[df_full_train['gender'] == 'male', 'churn'].mean()
churn_male

In [None]:
# Overall churn rate across the full-train part; the baseline for the group comparisons.
global_churn = df_full_train['churn'].mean()
global_churn

In [None]:
# Churn rate among customers who have a partner.
churn_partner = df_full_train.loc[df_full_train['partner'] == 'yes', 'churn'].mean()
churn_partner

In [None]:
# Difference to the overall rate: a positive value means this group churns less than average.
global_churn - churn_partner

In [None]:
# Churn rate among customers without a partner.
churn_no_partner = df_full_train.loc[df_full_train['partner'] == 'no', 'churn'].mean()
churn_no_partner

In [None]:
# Difference to the overall rate: a negative value means this group churns more than average.
global_churn - churn_no_partner