# Vodafone Telecommunication : Customer Churn Prediction

## Importing Libraries

In [7]:
# For loading data and related works
import numpy as np
import pandas as pd

# For controlling warnings
import warnings

# For data visualizatioon
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# For connecting to the database
import pyodbc
from dotenv import load_dotenv, dotenv_values

# For stating hypothesis
import scipy.stats as stats
from scipy.stats import chi2_contingency

# For feature encoding
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

# For data balancing
from imblearn.over_sampling import SMOTE

# For data spliting
from sklearn.model_selection import train_test_split

# For machine learning model
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier

# For evaluating model results
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score

# For hyperparameter tunning
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import RandomizedSearchCV

# For operating system
import os
import pickle

warnings.filterwarnings('ignore')

### First Dataset : Loading training data from database

In [2]:
#pip install python-dotenv

In [3]:
#%%writefile .gitignore
#env_var = dotenv_values(".env")

In [8]:
#Load environment variables from .env file
env_var = dotenv_values('.env')

load_dotenv()

False

In [6]:
# Get the credentials from .env file
server = env_var.get("SERVER")
database = env_var.get("DATABASE")
username = env_var.get("USERNAME")
password = env_var.get("PASSWORD")

connection_string = f"DRIVER={{SQL Server}};SERVER={server};DATABASE={database};UID={username};PWD={password}"

# Connecting to the DB
connection = pyodbc.connect(connection_string)

# Selecting the Table
query = "Select * from dbo.LP2_Telco_churn_first_3000"
data_from_database = pd.read_sql(query, connection)

# Preview the data
data_from_database.head()

OperationalError: ('08001', '[08001] [Microsoft][ODBC SQL Server Driver][DBNETLIB]SQL Server does not exist or access denied. (17) (SQLDriverConnect); [08001] [Microsoft][ODBC SQL Server Driver][DBNETLIB]ConnectionOpen (Connect()). (53)')

In [None]:
data_from_database.shape

In [None]:
data_from_database = pd.read_csv("C:/Users/hp/Documents/GitHub/Customer-Churn-ML-Prediction/Datasets/data_from_database.csv")

### Second Dataset : Loading testing data

In [None]:
test = pd.read_excel("C:/Users/hp/Documents/GitHub/Customer-Churn-ML-Prediction/Datasets/Telco-churn-second-2000.xlsx")

In [None]:
test.shape

In [None]:
test.head()

### Third Dataset : Loading Training data 

In [None]:
data_from_github = pd.read_csv("C:/Users/hp/Documents/GitHub/Customer-Churn-ML-Prediction/Datasets/LP2_Telco-churn-last-2000.csv")

In [None]:
data_from_github.shape

In [None]:
data_from_github.head()

# Cleaning Testing data

In [None]:
test.shape

In [None]:
test.PhoneService.dtype == "object"

#### Function to check columns

In [None]:
def column_checker(data):
    for col in test.columns:
        print(col , " => " ,  data[col].unique())
column_checker(test)

In [None]:
test.isnull().any()

In [None]:
test.info()

#### function to replace column values

In [None]:
# Replace values in multiple columns
columns_to_replace = ['MultipleLines', 'PhoneService', 'DeviceProtection', 'InternetService', 
                      'OnlineSecurity', 'OnlineBackup', 'StreamingTV', 'TechSupport', 'StreamingMovies',
                      'SeniorCitizen', 'Partner', 'Dependents', 'PaperlessBilling']

# Define the mapping of values to replace
value_mapping = {
    True: 'Yes',
    False: 'No',
    np.nan: 'No',
    '': 'No',
    'NaN': 'No',
    'Nan': 'No',
    'NaN': 'No',
    0: 'No',
    1: 'Yes',
}

In [None]:
value_mapping

In [None]:
# check the type for the monthly charges
test.MonthlyCharges.dtype == "float64"

In [None]:
# Loop through each column and replace the values
def column_replacer(data):
    for column in columns_to_replace:
        if data[column].dtype == "float64" or data[column].dtype == "int64":
            data[column] = data[column].astype(str) # changing to string
        data[column] = data[column].replace(value_mapping)
column_replacer(test)

In [None]:
column_checker(test)


Our testing data has no null values

## Handling column "TotalCharges"

In [None]:
test.TotalCharges = test.TotalCharges.astype(str)


In [None]:
# replacing values

test.TotalCharges = test.TotalCharges.apply(lambda x : str(x).replace('',"0"))
test.TotalCharges = test.TotalCharges.apply(lambda x : str(x).replace("0.0","0"))
test.TotalCharges = test.TotalCharges.apply(lambda x : str(x).replace("0 0","0"))
test.TotalCharges = test.TotalCharges.apply(lambda x : str(x).replace("000 000","0"))
test.TotalCharges = test.TotalCharges.apply(lambda x : str(x).replace('0000000 0000000',"0"))


In [None]:
# Change type to float
test.TotalCharges = test.TotalCharges.astype(float)

In [None]:
test.info()

In [None]:
test.head()

# Cleaning Training data

Before applying cleaning we need to concatenate the two datasets in to one data

In [None]:
# Merge the datasets together to form the training data
training = pd.concat([data_from_database , data_from_github])

In [None]:
training.head()

In [None]:
training.shape

In [None]:
#check for null values
training.isnull().any()

As Column 1 is not relevant to our data we drop it

In [None]:
# Check the unique values of each column
column_checker(training)

In [None]:
# Replacing Values
column_replacer(training)

In [None]:
column_checker(training)

## Handling column "TotalCharges"

In [None]:
# dropping null values 
a = training["TotalCharges"].dropna() 

In [None]:
# changing to string to replace non int values
a = a.astype(str) 

In [None]:
# replacing values

a = a.apply(lambda x : x.replace('',"0"))
a = a.apply(lambda x : x.replace("0 0","0"))

In [None]:
# Changing to float
a = a.astype(float)

# Getting the mean value
my_mean = a.mean()

# Filling the null values by mean
training["TotalCharges"] = training["TotalCharges"].fillna(my_mean)

#Checking for null values
training["TotalCharges"].isnull().sum()

In [None]:
# Replacing values
training["TotalCharges"] = training["TotalCharges"].apply(lambda x : str(x).replace(" ","0"))
training["TotalCharges"] = training["TotalCharges"].astype(float)

## Handling column "Churn"

In [None]:
training["Churn"].unique()

In [None]:
training["Churn"] = training["Churn"].astype(str)

In [None]:
#  replacing values

training["Churn"] = training["Churn"].apply(lambda x: x.replace("False","No"))
training["Churn"] = training["Churn"].apply(lambda x: x.replace('True',"Yes"))
training["Churn"] = training["Churn"].apply(lambda x: x.replace("nan","No"))
training["Churn"] = training["Churn"].apply(lambda x: x.replace("None","No"))

In [None]:
training.Churn.unique()

In [None]:
training.isnull().sum()

In [None]:
training.info()

In [None]:
training.head()

Now our training data is cleaned

# Stating hypothesis

## 1. Hypothesis

Null Hypothesis : There is no significant effect of the tenure period of customers and the churn rate of customers
    
Alternative Hypothesis : There is a significant effect of the tenure period of customers and the churn rate of customers

In [None]:
# Using the 'churn' and 'tenure' columns in our Data
df = training[['Churn', 'tenure']]

# Create a contingency table
contingency_table = pd.crosstab(df['Churn'], df['tenure'])

# Perform the chi-square test
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

# Print the results
print("Chi-square statistic:", chi2)
print("P-value:", p_value)
print("Degrees of freedom:", dof)
#print("Expected frequencies:\n", expected)

print()

# Interpret the results
alpha = 0.05  # Significance level
if p_value < alpha:
    print("There is a statistically significant relationship between churn and tenure.")
    print("We can reject the null hypothesis.")
else:
    print("There is no statistically significant relationship between churn and tenure.")
    print("We fail to reject the null hypothesis.")


## 2. Hypothesis

Null Hypothesis : There is no significant effect of the contract type of customers and the churn rate of customers
    
Alternative Hypothesis : There is a significant effect of the contract type of customers and the churn rate of customers

In [None]:
df = training[['Churn', 'Contract']]

# Create a contingency table
contingency_table = pd.crosstab(df['Churn'], df['Contract'])

# Perform the chi-square test
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

# Print the results
print("Chi-square statistic:", chi2)
print("P-value:", p_value)
print("Degrees of freedom:", dof)
#print("Expected frequencies:\n", expected)

print()

# Interpret the results
alpha = 0.05  # Significance level
if p_value < alpha:
    print("There is a statistically significant relationship between churn and Contract.")
    print("We can reject the null hypothesis.")
else:
    print("There is no statistically significant relationship between churn and Contract.")
    print("We fail to reject the null hypothesis.")

## 3. Hypothesis

Null Hypothesis : There is no significant effect of the gender of customers and the churn rate of customers
    
Alternative Hypothesis : There is a significant effect of the gender of customers and the churn rate of customers

In [None]:
# Getting the churn and gender columns
df = training[['Churn', 'gender']]

# Create a contingency table
contingency_table = pd.crosstab(df['Churn'], df['gender'])

# Perform the chi-square test
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

# Print the results
print("Chi-square statistic:", chi2)
print("P-value:", p_value)
print("Degrees of freedom:", dof)
#print("Expected frequencies:\n", expected)

print()

# Interpret the results
alpha = 0.05  # Significance level
if p_value < alpha:
    print("There is a statistically significant relationship between churn and gender.")
    print("We can reject the null hypothesis.")
else:
    print("There is no statistically significant relationship between churn and gender.")
    print("We fail to reject the null hypothesis.")

### According to our result we can conclude that there is statistically significant relationship between churn and tenure then between churn and contract but none between churn and gender

## 4. Hypothesis

Null Hypothesis : There is no significant effect of being senior citizen for a company to have more churn customers

Alternative Hypothesis : There is significant effect of being senior citizen for a company to have more churn customers

In [None]:
# Getting the Senior Citizen data 
citizen = training.SeniorCitizen

# Replace Values
citizen = citizen.apply(lambda x: str(x).replace("No", "0"))
citizen = citizen.apply(lambda x: str(x).replace("Yes", "1"))

# Convert to integer
citizen = citizen.astype(int)

# Getting the Churn data 
churn = training.Churn

# Replace Values
churn = churn.apply(lambda x: str(x).replace("No", "0"))
churn = churn.apply(lambda x: str(x).replace("Yes", "1"))

# Convert to integer
churn = churn.astype(int)

# Results
print(stats.shapiro(churn))
print(stats.levene(churn, citizen))

# Two sample t-test
t_stat, p_value = stats.ttest_ind(citizen, churn)
print("T_Stat:", t_stat)
print("P_Value:", p_value)

print()

#Interpret Result
alpha = 0.05
if p_value < alpha:
    print("Reject Null Hypothesis, P_Value is less than 0.05, so there is a significant effect")
else:
    print("Failed to reject Null Hypothesis, P_Value is greater than 0.05, so there is no significant effect")

### According to our result we can conclude that there is an effect on being a senior citizen customer or not for the company to lose its users. We recommend the organization to look for senior citiznes to be its users.

In [None]:
seniorcitizen = training[training.SeniorCitizen == "Yes"]
leave_senior = seniorcitizen[seniorcitizen.Churn == "Yes"]["Churn"].count()
not_seniorcitizen = training[training.SeniorCitizen == "No"]
not_seniorcitizen = not_seniorcitizen[not_seniorcitizen.Churn == "Yes"]["Churn"].count()
list_citizen = ["SeniorCitizen", "Not SeniorCitizen"]
list_num = [leave_senior, not_seniorcitizen]

sns.barplot(x = list_num, y= list_citizen )

# Show plot

plt.title("Number of Customers that left the Company")
plt.xlabel("Count")
plt.ylabel("Citizenship")
plt.show()

# Questions to be Answered

1. How is the gender distributed?
2. How many customers are senior citizen?
3. How many customers are using different types of services from the company?
4. How many customers churned from the company?
5. What is the Average Monthly charge?
6. What is the Average of the total charges the company gained?
7. What type of Internet service is used by customers?

### 1. How is the gender distributed?

In [None]:
# get the total of male and female
male = ( training["gender"] == "Male" ).sum()
female = ( training["gender"] == "Female" ).sum()

# data labels
data = [male, female]
labels = ['Male', "Female"]

#define Seaborn color palette to use
colors = sns.color_palette('pastel')[0:5]


#create pie chart
plt.pie(data, labels = labels, colors = colors, autopct='%.0f%%')
plt.title('Gender Distribution')
plt.show()

The gender distribution has no big difference, The data is balanced with gender

## 2. How many customers are senior citizen?

In [None]:
training.SeniorCitizen.unique()

In [None]:
# Total count of Senior Citizens
training.SeniorCitizen.count()

In [None]:
# Create Chart
plt.figure(figsize=(10, 8))
sns.countplot(x="SeniorCitizen", data=training)

plt.style.use("fivethirtyeight")
plt.title('Senior Citizen Customers')
plt.xlabel('SeniorCitizen')
plt.ylabel('Count')
plt.show()


More than 4000 customers are not senior Citizens and 800 are seniors. This shows a high variance.

## 3. How many customers are using different types of services from the company?

In [None]:
# Get the total number of customers using each service
MultipleLines = ( training["MultipleLines"] == "Yes" ).sum()
TechSupport = ( training["TechSupport"] == "Yes" ).sum()
OnlineSecurity = ( training["OnlineSecurity"] == "Yes" ).sum()
DeviceProtection = ( training["DeviceProtection"] == "Yes" ).sum()
OnlineBackup = ( training["OnlineBackup"] == "Yes" ).sum()
StreamingTV = ( training["StreamingTV"] == "Yes" ).sum()
Churn = ( training["Churn"] == "Yes" ).sum()

In [None]:
# Display the output
print("MultipleLines has",MultipleLines, "customers")
print("TechSupport has",TechSupport, "customers")
print("OnlineSecurity has",OnlineSecurity, "customers")
print("DeviceProtection has",DeviceProtection, "customers")
print("OnlineBackup has",OnlineBackup, "customers")
print("StreamingTV has",StreamingTV, "customers")
print("There are",Churn, "customers that have left the company")

In [None]:
list_item = ["MultipleLines", "TechSupport", "OnlineSecurity", "DeviceProtection", "OnlineBackup", "StreamingTV", "Churn"]
list_num = [MultipleLines, TechSupport, OnlineSecurity, DeviceProtection, OnlineBackup, StreamingTV, Churn]
list_item

In [None]:
training.StreamingTV

In [None]:
# Plot chart
plt.figure(figsize=(10, 6))
sns.barplot(x = list_num, y= list_item)
plt.title('Customers in Different Service Types')
plt.ylabel('Service Types')
plt.xlabel('Customers')
# Show plot
plt.show()


## 4. How many customers churned from the company?

In [None]:
# Total customers that left and stayed
Leave = ( training["Churn"] == "Yes" ).sum()
Stay = ( training["Churn"] == "No" ).sum()

data = [Stay, Leave]
labels = ['Stay', "Leave"]

#define Seaborn color palette to use
colors = sns.color_palette('pastel')[0:5]

#create pie chart
plt.pie(data, labels = labels, colors = colors, autopct='%.0f%%')
plt.title("Percentage of Customers in company")
plt.show()

The company's data shows that 75% of customers are leaving. More than 50% of them have no internet service, techsupport and other benefits from the organization

## 5. What is the Average Monthly charge?

In [None]:
# Average monthly charge
mean  = training["MonthlyCharges"].mean()
print("Averge monthly charge is : ", round(mean,2))

## 6. What is the Average of the total charges the company gained?

In [None]:
# Getting the average total charges
training.TotalCharges =  training.TotalCharges.astype(float)
total  = training["TotalCharges"].mean()
print("Averge Total charge is : ", round(total,2))

## 7. What type of Internet service is used by customers?

In [None]:
# Checking the Internet service column
training.InternetService

In [None]:
# Create plot
plt.figure(figsize=(10, 6))
sns.countplot(x="InternetService", data=training)

plt.style.use("fivethirtyeight")

plt.title("Internet Service Type")
plt.xlabel("Service type")
plt.ylabel("Amount")
plt.show()

# Checking for outliers

In [None]:
# Service usage - Monthly Charges
# Examining the Average monthly charges with a box plots
plt.figure(figsize=(10, 6))
sns.boxplot(x='MonthlyCharges', data=training)
plt.title('Monthly Charges')
plt.xlabel('Charges')
plt.show()

The box plot for monthly charges gives an overview of the distribution of charges among customers. It shows the range, median, quartiles, and any potential outliers in the monthly charges. With this, we can see that the average is around 70 with 20 being the lowest and 120 the highest charge per month. 

In [None]:
# Service usage - Monthly Charges
# Examining the Average monthly charges with a box plots
plt.figure(figsize=(10, 6))
sns.boxplot(x='TotalCharges', data=training)
plt.title('Total Charges')
plt.xlabel('Charges')
plt.show()

There is too much outlier in the column TotalCharge. So for the sake of good machine learning model performance it needs to be corrected

In [None]:
# Handling outliers
training.drop(training.loc[training['TotalCharges'] > 10000].index, inplace=True)  

In [None]:
# Service usage - Monthly Charges
# Examining the Average monthly charges with a box plots
plt.figure(figsize=(10, 6))
sns.boxplot(x='TotalCharges', data=training)
plt.title('Total Charges')
plt.xlabel('Charges')
plt.show()

## Univariate Analysis

#### Univariate analysis focuses on examining individual variables in isolation. For customer churn data, univariate analysis will involve exploring Churn Distribution, Customer Demographics and Service Usage

In [None]:
# Churn distribution
# Calculate the churn rate (percentage of customers who churned)
churn_counts = training['Churn'].value_counts()
churn_percentages = churn_counts / churn_counts.sum() * 100
print("Churn Distribution:")
print(churn_percentages)


The churn distribution shows that approximately 26.5% of customers have churned, while around 73.5% have not churned. This indicates a class imbalance, with the churned class being the minority.

In [None]:
# Customer demographics - Gender
# Analyzing the gender distributions
sns.set(style="darkgrid")
plt.figure(figsize=(10, 6))
sns.countplot(x='gender', data=training)
plt.title('Gender Distribution')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.show()


The gender distribution analysis reveals the count of customers by gender. It shows that the gender is balanced with male slightly higher than female.

In [None]:
# Customer demographics - Senior Citizens
# Analyzing the SeniorCitizens distributions
plt.figure(figsize=(10, 6))
training['SeniorCitizen'].value_counts().plot(kind='bar')
plt.title('Senior Citizens Distribution')
plt.xlabel('Senior Citizens')
plt.ylabel('Count')
plt.show()

The gender distribution analysis reveals the count of customers by senior citizens. It shows that there are whole lot more of non-senior citizens than senior citizens.

In [None]:
# Service usage - Monthly Charges
# Examining the Average monthly charges with a box plots
plt.figure(figsize=(10, 6))
sns.boxplot(x='MonthlyCharges', data=training)
plt.title('Monthly Charges')
plt.xlabel('Charges')
plt.show()

The box plot for monthly charges gives an overview of the distribution of charges among customers. It shows the range, median, quartiles, and any potential outliers in the monthly charges. With this, we can see that the average is around 70 with 20 being the lowest and 120 the highest charge per month. 

# Bivariate and Multivariate analysis

## Bivariate Analysis

Bivariate analysis explores the relationship between two variables. In the case of customer churn data, bivariate analysis can help uncover potential correlations or dependencies between different variables and churn

In [None]:
# Churn by gender

# Analyzing the Churn Rate by Gender to observe any patterns associated with churn
PaymentMethod = training.groupby('PaymentMethod')['Churn'].value_counts(normalize=True).unstack().reset_index()
PaymentMethod.rename(columns={'No': 'No Churn', 'Yes': 'Churn'}, inplace=True)

fig = px.bar(PaymentMethod, x='PaymentMethod', y=['No Churn', 'Churn'], barmode='stack', title='Churn by PaymentMethod')
fig.show()

The stacked bar chart demonstrates the churn rates categorized by gender. It shows that there is about 80% No Churn rate in both Male and Female and about 20% Churn rate in both too, so there's no pattern in genders.

In [None]:
# Churn by Payment Method
# Analyzing the Churn Rate by Payment Method to observe any patterns associated with churn
payment_churn = training.groupby('PaymentMethod')['Churn'].value_counts(normalize=True).unstack().reset_index()
payment_churn.rename(columns={'No': 'No Churn', 'Yes': 'Churn'}, inplace=True)

fig = px.bar(payment_churn, x='PaymentMethod', y=['No Churn', 'Churn'], barmode='stack', title='Churn by Payment Method')
fig.show()

The stacked bar chart demonstrates the churn rates categorized by payment method. It shows that Electronic Checks are definitely not the way to go and should be removed.

In [None]:
# Churn and service usage - Monthly Charges
# Analyzing the Churn Rate by Monthly Charges to observe any patterns associated with churn
fig = px.scatter(training, x='MonthlyCharges', y='Churn', color='Churn', title='Churn and Monthly Charges')
fig.show()

The scatter plot visualizes the relationship between churn and monthly charges. It allows us to observe whether there is any noticeable pattern or trend between higher charges and churn. It can help identify if customers with higher monthly charges are more likely to churn. This shows that charges between 70 to 110 have a higher chance to churn and also not churn which means no noticeable patterns

In [None]:
# Churn and contract information - Contract
# Analyzing the Churn Rate by Contract to observe any patterns associated with churn
contract_churn = training.groupby('Contract')['Churn'].value_counts(normalize=True).unstack().reset_index()
contract_churn.rename(columns={'No': 'No Churn', 'Yes': 'Churn'}, inplace=True)

fig = px.bar(contract_churn, x='Contract', y=['No Churn', 'Churn'], barmode='stack', title='Churn by Contract Type')
fig.show()


The stacked bar chart showcases the churn rates based on different contract types (Month-to-month, One year and Two years). It shows that a signicantly higher percentage of customers are likely to churn with a contract type of Month-to-month.

In [None]:
# Churn and customer tenure
# Analyzing the Churn Rate by customer tenure to observe any patterns associated with churn
plt.figure(figsize=(10, 6))
sns.histplot(training, x='tenure', hue='Churn', multiple='stack', bins=20, palette='Set1')
plt.title('Churn and Customer Tenure')
plt.xlabel('Tenure (Months)')
plt.ylabel('Count')
plt.show()

The histogram displays the distribution of customer tenure for both churned and non-churned customers over their tenure in months. This shows that customer loyalty goes a very long with the customers with the highest months(70) having the highest No Churn count compared to its churned count.

In [None]:
sns.scatterplot(x = "tenure", y = "TotalCharges", data = training)
plt.title("Customer tenure and total charges")
plt.show()

## Multivariate Analysis

Multivariate analysis is a statistical technique used to analyze data with multiple variables simultaneously. It allows you to explore relationships, patterns, and associations among multiple variables in a dataset.

In [None]:
#training_data.drop('Unnamed: 0', inplace=True, axis=1)

# Correlation matrix
correlation_matrix = training.corr()
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.title("Correlation Matrix")
plt.show()

There is a correlation between tenure and total charges

In [None]:
# Pairplot of tenure, MonthlyCharges and TotalCharges with Churn
sns.pairplot(training, vars=["tenure", "MonthlyCharges", "TotalCharges"], hue="Churn")
plt.title("Pairplot")
plt.show()

In [None]:
# Boxplot of Churn and tenure
sns.boxplot(x="Churn", y="tenure", data=training)
plt.title("Boxplot - Churn vs Tenure")
plt.show()

There is an effect of tenure on churning customers

In [None]:
# Scatterplot of MonthlyCharges and TotalCharges with Churn
sns.scatterplot(x="MonthlyCharges", y="TotalCharges", hue="Churn", data=training)
plt.title("Scatterplot - MonthlyCharges vs TotalCharges")
plt.show()

# Feature Engineering

#### As "CustomerID" column is not relevant to our model we can drop it from both train and test data

In [None]:
test.drop(["customerID"] , axis = 1 , inplace = True) 
training.drop('customerID', inplace=True, axis=1)

In [None]:
sns.heatmap( training.corr() , annot = True )

## Correlation perspectives

Customers having a time of high tenure gives a lot of total charge to the company

In [None]:
sns.scatterplot(x = "tenure" , y = "TotalCharges" , data=training)
plt.title("Customers tenure and total charges")
plt.show()

# Features Scaling

In [None]:
target_name = "Churn"
target = training["Churn"]

data = training.drop(columns=["Churn"])

In [None]:
# selecting a column by its type

numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

numerical_columns = numerical_columns_selector(data)
categorical_columns = categorical_columns_selector(data)

In [None]:
# assigning an encoder
categorical_preprocessor = OrdinalEncoder()
numerical_preprocessor = StandardScaler()

In [None]:
# collaborating the processing
preprocessor = ColumnTransformer(
    [
        ("ordinal-encoder", categorical_preprocessor, categorical_columns),
        ("standard_scaler", numerical_preprocessor, numerical_columns),
    ]
)

In [None]:
# Fitting
target_t = categorical_preprocessor.fit_transform(target.array.reshape(-1 , 1))

In [None]:
# Assign type int
target_t.astype(int)

In [None]:
# Use as dataframe
target_t = pd.DataFrame(target_t , columns = ["Churn"] )

In [None]:
target_t

In [None]:
# Count total values
target_t.value_counts()

In [None]:
# function to convert to string
def target_to_string(data):
    data = data.astype(str)
    data = data.apply(lambda x : x.replace("0.0" , "No"))
    data = data.apply(lambda x : x.replace("1.0" , "Yes"))
    
    return data

In [None]:
target_t = target_to_string(target_t)
target_t

In [None]:
# transform to fit data
data_t = preprocessor.fit_transform(data)

In [None]:
# use as dataframe
data_t = pd.DataFrame(data_t  ,columns = list(data.columns))

In [None]:
data_t.head()

# Balancing the data

In [None]:
target_t.value_counts()

In [None]:
# Plotting a graph of the target column's values
a = target_t.value_counts()

x = list(a.index)
y = list(a)

fig, ax = plt.subplots()    
width = 0.75 # the width of the bars 
ind = np.arange(len(y))  # the x locations for the groups
ax.barh(ind, y, width, color="blue")
ax.set_yticks(ind+width/2)
ax.set_yticklabels(x, minor=False)
for i, v in enumerate(y):
    ax.text(v + .25, i + .25, str(v), color='blue', fontweight='bold') #add value labels into bar
plt.title('Customer Churn')
plt.ylabel('Status')
plt.xlabel('Number of Customers')
plt.show()

### The above graph shows that the data on our target column is not balanced

In [None]:
# Balancing using Oversampling Technique (SMOTE) which stands for Synthetic Minority Oversampling Technique
nm = SMOTE()
data_balanced_t , target_balanced_t = nm.fit_resample(data_t , target_t)
target_balanced_t.value_counts()

In [None]:
# Plotting the graph to see the balanced data
a = target_balanced_t.value_counts()

x = list(a.index)
y = list(a)

fig, ax = plt.subplots()    
width = 0.75 # the width of the bars 
ind = np.arange(len(y))  # the x locations for the groups
ax.barh(ind, y, width, color="blue")
ax.set_yticks(ind+width/2)
ax.set_yticklabels(x, minor=False)
for i, v in enumerate(y):
    ax.text(v + .25, i + .25, str(v), color='blue', fontweight='bold') #add value labels into bar
plt.title('Customer Churn')
plt.ylabel('Status')
plt.xlabel('Number of Customers')
plt.show()

In [None]:
print("Independet Data is : ", data_balanced_t.shape , "Dependent Data is :" , target_balanced_t.shape)

### ... data balanced

# Machine Learning Modeling

In [None]:
# splitting data

data_train, data_test, target_train, target_test = train_test_split(
    data_balanced_t, target_balanced_t, random_state=42
)

## Model 1. Logistic Regression

In [None]:
model_trained = []

# assigning the module to a variable
log = LogisticRegression() 

# training the model by fitting a training data
log.fit(data_train , target_train ) 

In [None]:
model_trained.append(log)

In [None]:
# trying to make a prediction
target_predicted = log.predict(data_test)  

In [None]:
print("classification report for :" , log.__str__())
print(classification_report(y_true = target_test , y_pred = target_predicted ))

## Confusion matrix

In [None]:
# View confusion matrix
def view_confusion(y_true , target):
    cm = confusion_matrix(y_true , target)
    cm_display = ConfusionMatrixDisplay(cm).plot()

In [None]:
target_predicted

In [None]:
view_confusion(target_predicted, target_test)

## Model 2. HistGradientBoostingClassifier

In [None]:
#Define the model
hist = HistGradientBoostingClassifier()

hist.fit(data_train , target_train )

In [None]:
model_trained.append(hist)

target_predicted = hist.predict(data_test)

print("classification report for :" , hist.__str__())
print(classification_report(y_true = target_test , y_pred = target_predicted ))

In [None]:
view_confusion(target_predicted, target_test)

In [None]:
## Model 3. K-Nearest Neihgbors

In [None]:
# Define the model
kn = KNeighborsClassifier(n_neighbors=5)

kn.fit(data_train , target_train )

In [None]:
model_trained.append(kn)

target_predicted = kn.predict(data_test)

print("classification report for :" , kn.__str__())
print(classification_report(y_true = target_test , y_pred = target_predicted ))

In [None]:
view_confusion(target_predicted, target_test)

## Model 4. Support Vector Machine (SVM)

In [None]:
# Define the model
sv = svm.SVC()

sv.fit(data_train , target_train)

In [None]:
model_trained.append(sv)

target_predicted = sv.predict(data_test)

print("classification report for :" , sv.__str__())
print(classification_report(y_true = target_test , y_pred = target_predicted ))

In [None]:
view_confusion(target_predicted, target_test)

## Model 5. Decision Tree Classifier

In [None]:
# Define the model
ds = DecisionTreeClassifier()

ds.fit(data_train , target_train)

In [None]:
model_trained.append(ds)

target_predicted = ds.predict(data_test)

print("classification report for :" , ds.__str__())
print(classification_report(y_true = target_test , y_pred = target_predicted ))

In [None]:
view_confusion(target_predicted, target_test)

In [None]:
## Model 6. Random Forest Classifier

In [None]:
rm = RandomForestClassifier()
rm.fit(data_train , target_train)

In [None]:
model_trained.append(rm)

target_predicted = ds.predict(data_test)

print("classification report for :" , ds.__str__())
print(classification_report(y_true = target_test , y_pred = target_predicted ))

In [None]:
view_confusion(target_predicted, target_test)

# Model Comparison

In [None]:
# The f1 score metric is used
metric = f1_score

In [None]:
# Model information
info = [ { "Model_Name" : model.__str__(),
         f"metric ( {metric.__name__} ) ": metric(
             y_true = target_test,
             y_pred = model.predict(data_test),
             pos_label = "Yes"),  # pos label shows to which prediction value it is pointing
             } for model in model_trained ]

In [None]:
# Use it as a dataframe
metric = pd.DataFrame(info).sort_values(ascending=False)

metric

This shows that the RandomForest Model has the highest f1 score of 0.859914 with the histgradient model right behind it with f1 score of 0.856989

We could actually combine these two models for higher accuracy but not in this project

# Model Evaluation using Hyperparameter Tuning and Cross Validation

In [None]:
# Model Parameters as a dictionary
model_params = {
    
    "svm" : {
        "model" : svm.SVC(gamma="auto"),
        "params" : {
            "C" : [200 , 300, 400],
            "kernel" : ["linear", "rbf"]
        }
    },
    "random_forest" : {
        "model" : RandomForestClassifier(),
        "params" : {
            "n_estimators" : [200 , 300 , 400 ]
        }
    },
    "logistic_regression" : {
        "model" : LogisticRegression(solver="liblinear" , multi_class="auto"),
        "params" : {
            "C" : [200, 300 , 400]
        }
    },
    "decision_tree" : {
        "model" : DecisionTreeClassifier(),
        "params" : {
            "criterion" : ["gini" , "entropy"],
            "splitter" : ["best" , "random"]
        }
    },
    "knn classifier" : {
        "model" : KNeighborsClassifier(),
        "params" : {
            "n_neighbors": [5 , 10 , 20],
            "weights" : ["uniform", "distance"], 
            "n_jobs" : [5 , 10 , 20]
        }
    },    
    "hist gradient classifier" : {
        "model" : HistGradientBoostingClassifier(),
        "params" : {
            "loss" : ["log_loss" , "auto" , "binary_crossentropy" , "categorical_crossentropy" ],
            "learning_rate" : [0.1 , 1 , 2 ,3],
            "max_depth" : [25 , 50 , 75 , None],
            "l2_regularization" : [0 , 1.25 , 1.5 , 2],
            "scoring" : ["f1_micro" , "loss"]
        }
    }
}

In [None]:
# Getting the best parameters
scores = []
for model_name , mp in model_params.items():
    clf = GridSearchCV(mp["model"] , mp["params"] , cv = 5 , return_train_score=False)
    clf.fit(data_train , target_train)
    scores.append({
        "model" : model_name,
        "best_score" : clf.best_score_,
        "best_params" : clf.best_params_
    })
    
scores

'''
scores = []
a = 0
for model_name , mp in model_params.items():
    a = a + 50
    clf = GridSearchCV(mp["model"] , mp["params"] , cv = 5 , return_train_score=False)
    clf.fit(data_train , target_train)
    scores.append({
        "model" : model_name,
        "best_score" : clf.best_score_,
        "best_params" : clf.best_params_
    })
    print(a , " percent")

scores

'''

In [None]:
# Use a dataframe
df = pd.DataFrame(scores).sort_values(ascending=False)
df

## The above table shows that HistGradient_classifier has the best score which is 85%.

In [None]:
hs = HistGradientBoostingClassifier()
hs.fit(data_train , target_train)

In [None]:
target_predicted = rm.predict(data_test)
print(classification_report(y_true = target_test , y_pred = target_predicted))

In [None]:
target_predicted = pd.DataFrame(target_predicted)
target_predicted

In [None]:
predicted_yes = target_predicted[target_predicted == "Yes"].count()[0]
predicted_no = target_predicted[target_predicted == "No"].count()[0]

test_yes = target_test[target_test["Churn"] == "Yes"].count()[0]
test_no = target_test[target_test["Churn"] == "No"].count()[0]

compare_yes_list = [ ["Predicted Yes" ,predicted_yes]  , ["Test Yes" , test_yes]    ] 
compare_yes_table = pd.DataFrame( compare_yes_list , columns = ["Status" , "Count"])

compare_yes_table

In [None]:
x = list(compare_yes_table["Status"])
y = list(compare_yes_table["Count"])

fig, ax = plt.subplots()    
width = 0.75 # the width of the bars 
ind = np.arange(len(y))  # the y locations for the bars

ax.barh(ind, y, height=width, color="blue")
ax.set_yticks(ind)
ax.set_yticklabels(x)
ax.set_xlabel('Amount')
ax.set_ylabel('Status')
ax.set_title('Comparison of Customer Churn ("Yes")')

for i, v in enumerate(y):
    ax.text(v + 0.25, i, str(v), color='blue', fontweight='bold') # add value labels into bars

plt.show()


In [None]:
compare_no_list = [ ["Predicted No" ,predicted_no]  , ["Test No" , test_no]    ] 
compare_no_table = pd.DataFrame( compare_no_list , columns = ["Status" , "Count"])

In [None]:
x = list(compare_no_table["Status"])
y = list(compare_no_table["Count"])

fig, ax = plt.subplots()    
width = 0.75 # the width of the bars 
ind = np.arange(len(y))  # the x locations for the groups
ax.barh(ind, y, width, color="blue")
ax.set_yticks(ind+width/2)
ax.set_yticklabels(x, minor=False)
for i, v in enumerate(y):
    ax.text(v + .25, i + .25, str(v), color='blue', fontweight='bold') #add value labels into bar
plt.title('Comparing For Customer Churn ("No")')
plt.ylabel('Status')
plt.xlabel('Amount')
plt.show()

# Prediction for our Test data

### First we are going to scale our test data before predicting

In [None]:
test.head()

In [None]:
prepared_data = preprocessor.fit_transform(test)
prepared_data

In [None]:
churn_predicted = rm.predict(prepared_data)
test["Churn"] = churn_predicted

# concatenating the churn result
# Replacing values
test["Churn"] = test["Churn"].astype(str)
test["Churn"] = test["Churn"].apply(lambda x : x.replace("1.0" , "Yes"))
test["Churn"] = test["Churn"].apply(lambda x : x.replace("0.0" , "No"))

test.head()

In [None]:
test.to_csv("Predicted_dataset.csv")

# Prediction visualization

## 1 . Payment Method

In [None]:
payment_churn = test.groupby('PaymentMethod')['Churn'].value_counts(normalize=True).unstack().reset_index()
payment_churn.rename(columns={'No': 'No Churn', 'Yes': 'Churn'}, inplace=True)

fig = px.bar(payment_churn, x='PaymentMethod', y=['No Churn', 'Churn'], barmode='stack', title='Predicted Precent of customer churn by Payment  Method')
fig.show()

In [None]:
InternetService = test.groupby('InternetService')['Churn'].value_counts(normalize=True).unstack().reset_index()
InternetService.rename(columns={'No': 'No Churn', 'Yes': 'Churn'}, inplace=True)

fig = px.bar(InternetService, x='InternetService', y= ['No Churn' , 'Churn'] , barmode='stack', title='Predicted Precent of customer churn by Internet Service')
fig.show()

# Export key components

In [None]:
components = {
    "pipline" : preprocessor,
    "models" : model_trained
}

In [None]:
%mkdir export

In [None]:
import os
import pickle

In [None]:
destination = os.path.join("." , "export")

In [None]:
with open( os.path.join( destination , "ml.pkl" ) , "wb" ) as f :
    pickle.dump(components , f)

In [None]:
obj = pd.read_pickle(r'export/ml.pkl')

In [None]:
obj

In [None]:
pip freeze > export.requirements.txt