In [24]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt

In [None]:
churn_df = pd.read_csv("./dataset/customer_churn_dataset-training-master.csv")

churn_df.head()

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,2.0,30.0,Female,39.0,14.0,5.0,18.0,Standard,Annual,932.0,17.0,1.0
1,3.0,65.0,Female,49.0,1.0,10.0,8.0,Basic,Monthly,557.0,6.0,1.0
2,4.0,55.0,Female,14.0,4.0,6.0,18.0,Basic,Quarterly,185.0,3.0,1.0
3,5.0,58.0,Male,38.0,21.0,7.0,7.0,Standard,Monthly,396.0,29.0,1.0
4,6.0,23.0,Male,32.0,20.0,5.0,8.0,Basic,Monthly,617.0,20.0,1.0


In [26]:
churn_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440833 entries, 0 to 440832
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   CustomerID         440832 non-null  float64
 1   Age                440832 non-null  float64
 2   Gender             440832 non-null  object 
 3   Tenure             440832 non-null  float64
 4   Usage Frequency    440832 non-null  float64
 5   Support Calls      440832 non-null  float64
 6   Payment Delay      440832 non-null  float64
 7   Subscription Type  440832 non-null  object 
 8   Contract Length    440832 non-null  object 
 9   Total Spend        440832 non-null  float64
 10  Last Interaction   440832 non-null  float64
 11  Churn              440832 non-null  float64
dtypes: float64(9), object(3)
memory usage: 40.4+ MB


In [27]:
churn_df.isnull().sum()

CustomerID           1
Age                  1
Gender               1
Tenure               1
Usage Frequency      1
Support Calls        1
Payment Delay        1
Subscription Type    1
Contract Length      1
Total Spend          1
Last Interaction     1
Churn                1
dtype: int64

In [28]:
churn_df = churn_df.dropna()

In [29]:
from sklearn.preprocessing import LabelEncoder

def encode_cats(df, colums):

    encoder = LabelEncoder()

    for col in colums:
        df[col] = encoder.fit_transform(df[col])
    
    return df

In [30]:
from sklearn.feature_selection import SelectKBest, chi2

categorical_df = churn_df.select_dtypes(include="object")

cat_columns = categorical_df.columns.tolist()

categorical_df = encode_cats(categorical_df, cat_columns)


X = categorical_df
y = churn_df["Churn"]

selector = SelectKBest(score_func=chi2, k='all')
categorical_df_new = selector.fit_transform(X, y)

scores = selector.scores_

rank_features = pd.DataFrame({
    "features": cat_columns, 
    "chi2 scores":scores
}).sort_values(by="chi2 scores", ascending=False)

rank_features

Unnamed: 0,features,chi2 scores
0,Gender,5862.870662
1,Subscription Type,85.03309
2,Contract Length,0.003056


In [35]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

def evaluate_model(model, X, y,ts):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=ts, random_state=42)

    model_pipeline = Pipeline([
        ("scaler", StandardScaler()), 
        ("model", model)
    ])

    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)

    print("Accuracy:", np.mean(y_test == y_pred))

## Before Filtering 

In [43]:
from sklearn.linear_model import LogisticRegression

cat_columns = churn_df.select_dtypes(include="object").columns.to_list()

churn_df = encode_cats(churn_df, cat_columns)

X = churn_df.drop(["CustomerID", "Churn"], axis=1)
y = churn_df["Churn"]


model = LogisticRegression(random_state=42, max_iter=500)

evaluate_model(model=model, X=X, y=y, ts=0.2)

Accuracy: 0.850431567366475


## After Filtering

In [44]:
from sklearn.linear_model import LogisticRegression

X = churn_df.drop(["CustomerID", "Churn","Subscription Type","Contract Length"], axis=1)
y = churn_df["Churn"]


model = LogisticRegression(random_state=42, max_iter=100)

evaluate_model(model=model, X=X, y=y, ts=0.2)

Accuracy: 0.8501480145632719


- Overall, we could improve model's accuracy by 1% by applying the chi2 statistical technic