In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Context
"Predict behavior to retain customers. You can analyze all relevant customer data and develop focused customer retention programs." [IBM Sample Data Sets]

Content
Each row represents a customer, each column contains customer’s attributes described on the column Metadata.

The data set includes information about:

Customers who left within the last month – the column is called Churn
Services that each customer has signed up for – phone, multiple lines, internet, online security, online backup, device protection, tech support, and streaming TV and movies
Customer account information – how long they’ve been a customer, contract, payment method, paperless billing, monthly charges, and total charges
Demographic info about customers – gender, age range, and if they have partners and dependents
Inspiration
To explore this type of model and learn more about the subject.



# Steps

* Exploratory Data Analysis


In [2]:
# Load the Telco customer churn CSV and preview the first rows.
DATA_PATH = "../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(DATA_PATH)
df.head()

1. The Data Set has 20 features 
2. As per the problem case we don't need the customer ID for now

In [3]:
# Remove the customerID column from the DataFrame.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell re-runnable: running it a second time no longer raises KeyError
# because the column has already been removed.
df = df.drop(columns="customerID", errors="ignore")
df.head()

In [4]:
# Show the dtype pandas inferred for each column.
df.dtypes

In [5]:
# Summary statistics of the DataFrame's columns.
df.describe()

In [6]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes).
df.info()

In [7]:
# Inspect the raw values stored in the TotalCharges column.
df.TotalCharges.values


In [8]:
# Inspect the raw values stored in the MonthlyCharges column for comparison.
df.MonthlyCharges.values

* We can see that 'TotalCharges' is stored as strings while 'MonthlyCharges' is stored as floating point numbers
* So it is necessary to convert the 'TotalCharges' column to a numeric type

In [9]:
# Boolean mask: True where 'TotalCharges' cannot be parsed as a number.
pd.to_numeric(df['TotalCharges'], errors='coerce').isna()

In [10]:
# Show the rows whose 'TotalCharges' value cannot be parsed as a number
# (they become null when coerced).
unparsable_total = pd.to_numeric(df['TotalCharges'], errors='coerce').isna()
df[unparsable_total]

In [11]:
# Count the rows (and columns) whose 'TotalCharges' value is not numeric.
df.loc[pd.to_numeric(df['TotalCharges'], errors='coerce').isna()].shape

* There are only 11 rows which are null out of 7043 rows
* To keep things simple we can just drop these rows because they comprise a mere 0.15% of the total rows


In [12]:
# Drop the rows whose TotalCharges is not a valid number.
# The same to_numeric(..., errors='coerce') check used above is reused so that
# any unparsable value is removed, not only the exact string ' '.
# .copy() makes df1 an independent DataFrame, so later assignments to it do not
# raise SettingWithCopyWarning or depend on whether df1 is a view of df.
total_charges_numeric = pd.to_numeric(df.TotalCharges, errors='coerce')
df1 = df[total_charges_numeric.notna()].copy()
df1.shape

In [13]:
# Store the converted numeric column in df1.
# assign() returns a new DataFrame, which avoids setting a column on a filtered
# slice of df (SettingWithCopyWarning) and keeps the cell safe to re-run.
df1 = df1.assign(TotalCharges=pd.to_numeric(df1.TotalCharges))

In [14]:
# NOTE(review): this bare expression is not the last statement of the cell, so
# the notebook does not display it; only the df1.info() summary below is printed.
df1.TotalCharges
# Confirm the column dtypes after the conversion.
df1.info()

# Time for some quick Visualization

## Features to consider for Visualization

 0. gender            
 1. SeniorCitizen     
 2. Partner         
 3. Dependents        
 4. tenure           
 5. PhoneService      
 6. MultipleLines      
 7. InternetService   

### Lets Consider Tenure first

> The tenure of a customer can tell us about their loyalty towards a service provider

* So let's break down tenure into two separate groups: customers who churned and customers who did not
* Here, Churn = 'Yes' means the customer left, while Churn = 'No' means the customer stayed



In [15]:
# Tenure values split by churn outcome: 'No' rows and 'Yes' rows.
tenure_churn_no = df1.loc[df1['Churn'] == 'No', 'tenure']
tenure_churn_yes = df1.loc[df1['Churn'] == 'Yes', 'tenure']

In [16]:
# Histogram of customer tenure, with one series per churn outcome.
# Uses the explicit figure/axes interface instead of the pyplot state machine.
tenure_fig, tenure_ax = plt.subplots(figsize=(8, 8))
tenure_ax.set_xlabel('Tenure')
tenure_ax.set_ylabel('No. of customer')
tenure_ax.set_title('Histogram of Customer Churn')
tenure_ax.hist(
    [tenure_churn_yes, tenure_churn_no],
    color=['green', 'orange'],
    label=['Churn = Yes', 'Churn = No'],
)
tenure_ax.legend()

* We can observe that the number of customers appears to be highest where the tenure is more than 65 months, and that group appears to consist mostly of customers who did not churn (Churn = No)

In [17]:
# Histogram of monthly charges, with one series per churn outcome.

# Monthly charges of the customers who stayed ('No') and who left ('Yes').
Mon_Charge_no = df1[df1.Churn == 'No'].MonthlyCharges
Mon_Charge_yes = df1[df1.Churn == 'Yes'].MonthlyCharges

plt.figure(figsize=(8,8))
# The x axis shows monthly charges; the previous 'Tenure' label and the title
# were copied from the tenure histogram and mislabelled this plot.
plt.xlabel('Monthly Charges')
plt.ylabel('No. of customer')
plt.title('Histogram of Monthly Charges by Customer Churn')
plt.hist([Mon_Charge_yes,Mon_Charge_no], color = ['green','orange'], label = ['Churn = Yes', 'Churn = No'])
plt.legend()

**TODO: create more graphs**

# Data Cleaning Phase

## Let's create a function which prints the unique values of every object (string) column of a DataFrame

In [18]:
def print_unique_col_values(df):
    """Print the distinct values of every object-dtype column of ``df``.

    One line is printed per matching column, in column order, in the form
    ``<column name> : <array of unique values>``. Columns of any other dtype
    are skipped. Nothing is returned.
    """
    object_columns = [name for name in df if df[name].dtypes == 'object']
    for name in object_columns:
        distinct = df[name].unique()
        print(f'{name} : {distinct}')
    

* Passing the previously modified DataFrame df1 in this function

In [19]:
# List the unique values of every object-dtype column in df1.
print_unique_col_values(df1)

### Observation :

* We can see the categorical values in the data frame
* We observe that some features have values like **'No internet service'** or **'No phone service'**, which can be treated as a simple **'No'**
* So we replace 'No internet service' and 'No phone service' with 'No'

In [20]:
# Collapse the "no service" categories into a plain 'No'.
# replace() is called once for both values and its result is reassigned rather
# than using inplace=True: df1 was produced by filtering df, and in-place
# modification of such a slice triggers SettingWithCopyWarning.
df1 = df1.replace(['No internet service', 'No phone service'], 'No')

In [21]:
print_unique_col_values(df1) ## Call the helper again to confirm that the replacement has taken place

* Now we want to replace every 'Yes' with 1 and every 'No' with 0

In [22]:
# Columns whose values are 'Yes'/'No' and should be encoded as 1/0.
# 'gender' is deliberately not listed: it holds 'Female'/'Male', not 'Yes'/'No',
# and is encoded separately.
columns_yes_no = ['Churn', 'PaperlessBilling', 'StreamingMovies', 'StreamingTV',
                  'TechSupport', 'DeviceProtection',
                  'OnlineBackup', 'OnlineSecurity', 'MultipleLines',
                  'PhoneService', 'Dependents', 'Partner']

# The identity entries (1 -> 1, 0 -> 0) keep the cell idempotent: re-running it
# on already-encoded columns leaves them unchanged.
yes_no_to_int = {'Yes': 1, 'No': 0, 1: 1, 0: 0}

# `df1[col].replace(..., inplace=True)` modifies a temporary Series; with pandas
# Copy-on-Write that never reaches df1. Build the encoded columns explicitly
# and reassign instead. astype('int64') fails loudly if any value was not
# covered by the mapping (map() would otherwise leave a silent NaN).
df1 = df1.assign(**{
    col: df1[col].map(yes_no_to_int).astype('int64')
    for col in columns_yes_no
})

### Replacing gender with Female = 1 & Male = 0

In [23]:
# Encode gender as Female = 1, Male = 0.
# `df1['gender'].replace(..., inplace=True)` acts on a temporary Series and does
# not update df1 under pandas Copy-on-Write, so the encoded column is assigned
# back explicitly. The identity entries make the cell safe to re-run, and
# astype('int64') fails loudly on any unexpected value.
gender_to_int = {'Female': 1, 'Male': 0, 1: 1, 0: 0}
df1 = df1.assign(gender=df1['gender'].map(gender_to_int).astype('int64'))

In [24]:
# Check which object-dtype columns (and values) remain after the 1/0 encoding.
print_unique_col_values(df1)

In [25]:
# Show the distinct values now stored in the gender column.
df1.gender.unique()

In [26]:
# One-hot encode the remaining multi-category columns.
# dtype=int forces 0/1 integer indicator columns. Recent pandas versions
# default to bool, and a frame mixing bool and float columns converts to an
# object array that Keras cannot turn into a tensor.
df2 = pd.get_dummies(
    data=df1,
    columns=['InternetService', 'Contract', 'PaymentMethod'],
    dtype=int,
)
df2.columns

In [27]:
df2.sample(4) ## Look at 4 randomly chosen rows of the encoded DataFrame

In [28]:
# Check the dtype of every column after encoding.
df2.dtypes

#### We can now see that the data cleaning process is complete

### Next step is to scale the data.

* We can observe that the values in the dataframe span a wide range, so we need to scale and normalize the data for further processing

* We can see 'Tenure', 'Monthly Charges' and 'Total charges' are not represented by 1 & 0


In [29]:
# Rescale the three continuous columns with MinMaxScaler so they are on a
# comparable scale to the 0/1 columns.
# NOTE(review): the scaler is fitted on the full DataFrame before the
# train/test split below, so test rows influence the fitted min/max (mild data
# leakage). Consider fitting on the training split only.
cols_to_scale = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = MinMaxScaler()
scaled_values = scaler.fit(df2[cols_to_scale]).transform(df2[cols_to_scale])
df2[cols_to_scale] = scaled_values
df2.head()

## All the preprocessing for the DataFrame is now complete

## Lets split the data into training and test set 

In [30]:
# Target: the Churn column. Features: every other column of df2.
y = df2['Churn']
X = df2.drop(columns='Churn')

In [31]:



# Hold out 20% of the rows as a test set; random_state is fixed so the split is
# the same on every run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts

X_train.shape


## Importing TensorFlow libraries


In [32]:
import tensorflow as tf
from tensorflow import keras

# Seed TensorFlow's global random generator so that runs are more repeatable.
tf.random.set_seed(1)

# Take the input width from the training data instead of hard-coding 26, so the
# network still matches if the number of feature columns changes.
n_features = X_train.shape[1]

# Small fully connected binary classifier: one hidden ReLU layer of 20 units
# and a single sigmoid output unit.
model = keras.Sequential([
    keras.layers.Dense(20, input_shape=(n_features,), activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, y_train, epochs=100)

In [33]:
# Evaluate the trained model on the held-out test set (loss and the compiled
# accuracy metric).
model.evaluate(X_test,y_test)

In [34]:
# Model outputs for the test set (values of the sigmoid output unit);
# show the first five.
yp = model.predict(X_test)
yp[:5]

In [35]:
# Display the true test-set labels for comparison with the predictions.
y_test

In [36]:
# Turn each sigmoid output into a hard class label: 1 above 0.5, otherwise 0.
y_pred = [1 if element > 0.5 else 0 for element in yp]

In [37]:
# Preview the first predicted labels instead of dumping the entire list
# (matches the yp[:5] preview of the raw model outputs).
y_pred[:5]

## Model Metrics

In [38]:
from sklearn.metrics import confusion_matrix, classification_report

# Print scikit-learn's text classification report comparing the true test
# labels with the thresholded predictions.
print(classification_report(y_test,y_pred))

In [40]:
import seaborn as sns
# Confusion matrix of true labels vs. predicted labels, drawn as an annotated
# heatmap on an explicit figure/axes pair.
cm = tf.math.confusion_matrix(labels=y_test, predictions=y_pred)

cm_fig, cm_ax = plt.subplots(figsize=(8, 8))
sns.heatmap(cm, annot=True, fmt='d', ax=cm_ax)
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("truth")
