In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
import missingno as msno

from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import warnings
import sys
from scipy import stats


In [None]:
# Colab-only step: opens the browser upload dialog. The uploaded files are
# saved under their own names in the working directory, where the next cell
# reads them with pd.read_csv.
from google.colab import files
uploaded = files.upload()

Saving customer_churn_dataset-testing-master.csv to customer_churn_dataset-testing-master.csv
Saving customer_churn_dataset-training-master.csv to customer_churn_dataset-training-master.csv


In [None]:
# Read the two uploaded CSVs; read_csv already returns a fresh frame, so no
# extra .copy() is needed.
testDF = pd.read_csv('customer_churn_dataset-testing-master.csv')
trainDF = pd.read_csv('customer_churn_dataset-training-master.csv')

# Stack training and testing rows into one frame. ignore_index=True gives the
# result a unique index; without it the test rows reuse the training rows'
# labels, which makes label-based selection and alignment ambiguous.
df = pd.concat([trainDF, testDF], ignore_index=True)

# Drop every row that has at least one missing value. Reassigning (instead of
# inplace=True) keeps the cell safe to re-run from the raw frames above.
df = df.dropna(axis=0)

# Sanity check: every column should now report zero missing values.
df.isna().sum()

CustomerID           0
Age                  0
Gender               0
Tenure               0
Usage Frequency      0
Support Calls        0
Payment Delay        0
Subscription Type    0
Contract Length      0
Total Spend          0
Last Interaction     0
Churn                0
dtype: int64

In [None]:
# Preview the first rows of the combined frame after dropping missing values.
df.head()

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,2.0,30.0,Female,39.0,14.0,5.0,18.0,Standard,Annual,932.0,17.0,1.0
1,3.0,65.0,Female,49.0,1.0,10.0,8.0,Basic,Monthly,557.0,6.0,1.0
2,4.0,55.0,Female,14.0,4.0,6.0,18.0,Basic,Quarterly,185.0,3.0,1.0
3,5.0,58.0,Male,38.0,21.0,7.0,7.0,Standard,Monthly,396.0,29.0,1.0
4,6.0,23.0,Male,32.0,20.0,5.0,8.0,Basic,Monthly,617.0,20.0,1.0


# **FEATURE ADDITIONS**

In [None]:
# Engineered ratio features: new column name -> (numerator column, denominator column).
# The dict order is the order in which the columns are appended to df.
ratio_specs = {
    'Total Spend / Age': ('Total Spend', 'Age'),
    'Age / Usage Frequency': ('Age', 'Usage Frequency'),
    'Total Spend / Usage Frequency': ('Total Spend', 'Usage Frequency'),
    'Support Calls/ Age': ('Support Calls', 'Age'),
}
for ratio_name, (numerator, denominator) in ratio_specs.items():
    df[ratio_name] = df[numerator] / df[denominator]


In [None]:
# Confirm that the four ratio columns were appended to the frame.
df.columns

Index(['CustomerID', 'Age', 'Gender', 'Tenure', 'Usage Frequency',
       'Support Calls', 'Payment Delay', 'Subscription Type',
       'Contract Length', 'Total Spend', 'Last Interaction', 'Churn',
       'Total Spend / Age', 'Age / Usage Frequency',
       'Total Spend / Usage Frequency', 'Support Calls/ Age'],
      dtype='object')



## Engineered Features

### Feature 1: Total Spend / Age
I added the feature `Total Spend / Age` to understand how a customer's spending behavior varies with age. This feature can help capture the relationship between a customer’s age and their spending patterns, potentially revealing insights about different age groups' likelihood to churn.

### Feature 2: Age / Usage Frequency
The `Age / Usage Frequency` feature was introduced to observe how frequently different age groups use the service. By combining age and usage frequency, I aim to identify if older or younger customers are more engaged with the service, which can be a strong indicator of their satisfaction and likelihood to churn.

### Feature 3: Total Spend / Usage Frequency
This feature, `Total Spend / Usage Frequency`, was added to measure the average spending per usage instance. It provides insights into the value each usage brings and helps to identify high-value customers who might be more crucial to retain.

### Feature 4: Support Calls / Age
By adding `Support Calls / Age`, I wanted to examine if there is a correlation between the age of customers and the number of support calls they make. This can reveal if certain age groups require more support, which might affect their satisfaction and churn rate.

## Why These Features Were Added
### Improved Predictive Power
These new features are designed to capture more nuanced relationships within the data, potentially improving the performance of predictive models. For instance, combining age with spending or usage frequency could highlight patterns that single features alone might miss, leading to more accurate churn predictions.

### Addressing Multicollinearity
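Another reason for creating these features is to help manage multicollinearity among the existing features. Note that each ratio is built from two existing columns and is therefore correlated with its components by construction, so the ratios only reduce redundancy when they are used in place of (rather than alongside) the raw columns. This should be confirmed with a correlation matrix or variance inflation factors before modelling, so that the stability and interpretability of the models actually improve.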

### Supporting Customer Segmentation
These features also aid in better customer segmentation. Understanding how different segments (e.g., by age or spending habits) behave can help in tailoring personalized retention strategies.

---



In [None]:
# Count the distinct values in each column.
df.nunique()

CustomerID                       442211
Age                                  48
Gender                                2
Tenure                               60
Usage Frequency                      30
Support Calls                        11
Payment Delay                        31
Subscription Type                     3
Contract Length                       3
Total Spend                       68363
Last Interaction                     30
Churn                                 2
Total Spend / Age                247065
Age / Usage Frequency               999
Total Spend / Usage Frequency    229852
Support Calls/ Age                  342
dtype: int64

In [None]:
# Split the column names by dtype: object-typed columns are treated as
# categorical features, everything else as numeric.
cats = df.select_dtypes(include=['object']).columns.tolist()
nums = df.select_dtypes(exclude=['object']).columns.tolist()
cats

['Gender', 'Subscription Type', 'Contract Length']

In [None]:
# Exclude the target from the numeric predictors. Filtering into a new list
# (instead of list.remove) keeps this cell safe to re-run: a second execution
# is a no-op rather than a ValueError because 'Churn' is already gone.
nums = [col for col in nums if col != 'Churn']

In [None]:
# Show the numeric predictor names now that 'Churn' has been removed.
nums

['CustomerID',
 'Age',
 'Tenure',
 'Usage Frequency',
 'Support Calls',
 'Payment Delay',
 'Total Spend',
 'Last Interaction',
 'Total Spend / Age',
 'Age / Usage Frequency',
 'Total Spend / Usage Frequency',
 'Support Calls/ Age']

In [None]:
# Full feature list: categorical column names first, then the numeric ones.
fcols = cats + nums


In [None]:
# Feature matrix (all columns listed in fcols) and the churn target.
x, y = df[fcols], df['Churn']

In [None]:
# Summarise the dtype and non-null count of every feature column.
x.info()

<class 'pandas.core.frame.DataFrame'>
Index: 505206 entries, 0 to 64373
Data columns (total 15 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   Gender                         505206 non-null  object 
 1   Subscription Type              505206 non-null  object 
 2   Contract Length                505206 non-null  object 
 3   CustomerID                     505206 non-null  float64
 4   Age                            505206 non-null  float64
 5   Tenure                         505206 non-null  float64
 6   Usage Frequency                505206 non-null  float64
 7   Support Calls                  505206 non-null  float64
 8   Payment Delay                  505206 non-null  float64
 9   Total Spend                    505206 non-null  float64
 10  Last Interaction               505206 non-null  float64
 11  Total Spend / Age              505206 non-null  float64
 12  Age / Usage Frequency          50520

In [None]:
# Peek at the first target values.
y.head()

0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: Churn, dtype: float64

In [None]:
# Preview the frame again, now including the engineered ratio columns.
df.head()

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn,Total Spend / Age,Age / Usage Frequency,Total Spend / Usage Frequency,Support Calls/ Age
0,2.0,30.0,Female,39.0,14.0,5.0,18.0,Standard,Annual,932.0,17.0,1.0,31.066667,2.142857,66.571429,0.166667
1,3.0,65.0,Female,49.0,1.0,10.0,8.0,Basic,Monthly,557.0,6.0,1.0,8.569231,65.0,557.0,0.153846
2,4.0,55.0,Female,14.0,4.0,6.0,18.0,Basic,Quarterly,185.0,3.0,1.0,3.363636,13.75,46.25,0.109091
3,5.0,58.0,Male,38.0,21.0,7.0,7.0,Standard,Monthly,396.0,29.0,1.0,6.827586,2.761905,18.857143,0.12069
4,6.0,23.0,Male,32.0,20.0,5.0,8.0,Basic,Monthly,617.0,20.0,1.0,26.826087,1.15,30.85,0.217391


# **ANOVA TEST**

In [None]:
# Accumulators for the ANOVA results table: `data` collects one row per
# feature and `cols` names the columns of the resulting DataFrame.
data = []
cols = ['Features', 'F', 'P', 'Rejected']

In [None]:
def anova_helper(df, num_col, cat_col='Churn'):
    """Run a one-way ANOVA of ``num_col`` across the groups defined by ``cat_col``.

    The rows of ``df`` are split into one sample per distinct, non-missing
    value of ``cat_col``; the ``num_col`` values of each sample are then passed
    to ``scipy.stats.f_oneway``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    num_col : str
        Name of the numeric column whose group means are compared.
    cat_col : str, default 'Churn'
        Name of the column whose distinct values define the groups.

    Returns
    -------
    tuple
        ``(f, p)``: the F statistic and its p-value as returned by
        ``scipy.stats.f_oneway``.
    """
    # Skip missing labels: `== NaN` never matches, so a NaN label would yield
    # an empty sample and make the F-test degenerate.
    labels = df[cat_col].dropna().unique()

    # One array of num_col values per group; f_oneway accepts arrays directly,
    # so there is no need to materialise Python lists.
    samples = [
        df.loc[df[cat_col] == label, num_col].to_numpy()
        for label in labels
    ]

    f, p = stats.f_oneway(*samples)
    return f, p

In [None]:
# Start from an empty list on every run so that re-executing this cell cannot
# append a second copy of each row to the table.
data = []
for feature in nums:
    f_statistic, p = anova_helper(df, feature)
    # 'Rejected' is True when the null hypothesis of equal group means is
    # rejected at the 0.05 significance level.
    data.append([feature, f_statistic, p, p < 0.05])

anova_table = pd.DataFrame(data=data, columns=cols)
anova_table

Unnamed: 0,Features,F,P,Rejected
0,CustomerID,370825.877797,0.0,True
1,Age,19174.768633,0.0,True
2,Tenure,229.47968,7.947464000000001e-52,True
3,Usage Frequency,1437.316654,4.542258e-314,True
4,Support Calls,183599.530044,0.0,True
5,Payment Delay,61649.955652,0.0,True
6,Total Spend,79972.870526,0.0,True
7,Last Interaction,8301.886679,0.0,True
8,Total Spend / Age,64222.75091,0.0,True
9,Age / Usage Frequency,4117.773504,0.0,True


According to this table, every numerical feature, both the original columns and the newly engineered ratios, is statistically significant with respect to Churn under the ANOVA test at an alpha value of 0.05. Thus, none of them will be eliminated during feature selection on the basis of the ANOVA test.

# **Categorical Features Analysis with Chi-Squared Test**


In [None]:

# Accumulators for the chi-squared results table: `data` collects one row per
# categorical feature and `cols` names the columns of the resulting DataFrame.
data = []
cols = ['Features', 'P', 'DOF', 'Chi2', 'Rejected']

In [None]:
# Start from an empty list on every run so that re-executing this cell cannot
# append a second copy of each row to the table.
data = []
for feature in cats:
    # Observed counts of each category against the churn label. The target is
    # read straight from df so the test does not depend on what the notebook-
    # level variable `y` happens to hold when this cell runs.
    cross_tab = pd.crosstab(df[feature], df['Churn']).values
    chi2, p, dof, expected_values = stats.chi2_contingency(cross_tab)
    # 'Rejected' is True when independence is rejected at the 0.05 level.
    data.append([feature, p, dof, chi2, p < 0.05])

In [None]:
# Assemble the collected chi-squared rows into a table and display it.
chi_table = pd.DataFrame(data=data, columns=cols)
chi_table

Unnamed: 0,Features,P,DOF,Chi2,Rejected
0,Gender,0.0,1,14426.685322,True
1,Subscription Type,3.9544649999999996e-41,2,186.062287,True
2,Contract Length,0.0,2,67861.64665,True


This table shows that all of the categorical features are statistically significant according to the Chi-Square test at an alpha value of 0.05. Thus, we will not eliminate any of these features. In this table, `Contract Length` has the highest Chi-Square value.

In [None]:
# List the columns currently in the frame.
df.columns

Index(['CustomerID', 'Age', 'Gender', 'Tenure', 'Usage Frequency',
       'Support Calls', 'Payment Delay', 'Subscription Type',
       'Contract Length', 'Total Spend', 'Last Interaction', 'Churn',
       'Total Spend / Age', 'Age / Usage Frequency',
       'Total Spend / Usage Frequency', 'Support Calls/ Age'],
      dtype='object')

In [None]:
# Rebuild the list of non-object column names from df. Unlike the earlier
# `nums`, this list still contains the target column 'Churn'.
nums = list(df.select_dtypes(exclude=['object']).columns)
nums

['CustomerID',
 'Age',
 'Tenure',
 'Usage Frequency',
 'Support Calls',
 'Payment Delay',
 'Total Spend',
 'Last Interaction',
 'Churn',
 'Total Spend / Age',
 'Age / Usage Frequency',
 'Total Spend / Usage Frequency',
 'Support Calls/ Age']

In [None]:
from scipy.stats import pearsonr

# Print the Pearson correlation p-value against Churn for every numeric
# column (the target itself is skipped) whose p-value is below 0.05.
for column in (name for name in nums if name != 'Churn'):
    pearson_result = pearsonr(df[column], df['Churn'])
    p_value_pearson = pearson_result[1]
    if p_value_pearson < 0.05:
        print(f"{column}: {p_value_pearson}")

CustomerID: 0.0
Age: 0.0
Tenure: 7.947463646610027e-52
Usage Frequency: 4.542257551e-314
Support Calls: 0.0
Payment Delay: 0.0
Total Spend: 0.0
Last Interaction: 0.0
Total Spend / Age: 0.0
Age / Usage Frequency: 0.0
Total Spend / Usage Frequency: 2.3167243928926086e-218
Support Calls/ Age: 0.0


In [None]:
# List the columns currently in the frame.
df.columns

Index(['CustomerID', 'Age', 'Gender', 'Tenure', 'Usage Frequency',
       'Support Calls', 'Payment Delay', 'Subscription Type',
       'Contract Length', 'Total Spend', 'Last Interaction', 'Churn',
       'Total Spend / Age', 'Age / Usage Frequency',
       'Total Spend / Usage Frequency', 'Support Calls/ Age'],
      dtype='object')

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Number of highest-scoring features to keep.
# NOTE(review): the original selection line was syntactically broken and its
# index was lost; 5 is assumed because the recorded output lists five columns.
# Confirm the intended value.
N_TOP_FEATURES = 5

# Candidate predictors: non-object columns only, without the target and
# without the customer identifier.
x = df.select_dtypes(exclude=['object']).drop(columns=['Churn', 'CustomerID'])
y = df['Churn']

# Fix random_state so the scores (and therefore the selected features) are
# the same on every run.
mi = mutual_info_classif(x, y, random_state=42)
sorted_mi = sorted(mi, reverse=True)

# Keep every feature whose score is at least the N-th largest score; the
# selected names stay in their original column order.
top_features = x.columns[mi >= sorted_mi[N_TOP_FEATURES - 1]]
top_features

Index(['Total Spend', 'Total Spend / Age', 'Age / Usage Frequency',
       'Total Spend / Usage Frequency', 'Support Calls/ Age'],
      dtype='object')

The column "Churn" was dropped because it is the target (y) variable that we want to predict. The column "CustomerID" was also dropped because it is an arbitrary identifier that intuitively has no real relationship with churn in a subscription model; keeping it would only add noise and invite overfitting.

In the code above, `mutual_info_classif` is used as the metric for feature selection. `mutual_info_classif` is a function from the `sklearn.feature_selection` module of Python's scikit-learn library. It estimates the mutual information between each feature and the target variable. Mutual information is a measure of the dependency between two variables: it quantifies the amount of information obtained about one variable by observing the other. The primary aim of `mutual_info_classif` here is to measure how strongly each feature depends on the target variable, with higher values indicating a stronger dependency. Thus, the mutual information scores were sorted in descending order, and the features with the highest scores with respect to Churn were selected for use in training and testing.



