# Logistic Regression: Customer Churn Prediction

This notebook applies logistic regression to predict whether a customer will churn based on demographics and service usage information.

In [6]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Read the churn data from a CSV file in the working directory and preview it
data_path = 'churn.csv'
df = pd.read_csv(data_path)
df

Unnamed: 0,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,MontlyCharges,Churn
0,30,Female,39,14,5,18,23.897436,1
1,65,Female,49,1,10,8,11.367347,1
2,55,Female,14,4,6,18,13.214286,1
3,58,Male,38,21,7,7,10.421053,1
4,23,Male,32,20,5,8,19.281250,1
...,...,...,...,...,...,...,...,...
440827,42,Male,54,15,1,3,13.266296,0
440828,25,Female,8,13,1,20,93.172500,0
440829,26,Male,35,27,1,5,27.923143,0
440830,28,Male,55,14,2,0,10.955455,0


In [None]:
# Encode Gender as a binary indicator: Male = 0, Female = 1.
# Series.map() sends every value that is not a key of the mapping to NaN, so
# running it a second time over the already-encoded 0/1 column would wipe the
# column out. Only encode while raw labels are still present, which makes this
# cell safe to re-run.
gender_codes = {'Male': 0, 'Female': 1}
if df['Gender'].isin(list(gender_codes)).any():
    df['Gender'] = df['Gender'].map(gender_codes)
df


Unnamed: 0,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,MontlyCharges,Churn
0,30,1,39,14,5,18,23.897436,1
1,65,1,49,1,10,8,11.367347,1
2,55,1,14,4,6,18,13.214286,1
3,58,0,38,21,7,7,10.421053,1
4,23,0,32,20,5,8,19.281250,1
...,...,...,...,...,...,...,...,...
440827,42,0,54,15,1,3,13.266296,0
440828,25,1,8,13,1,20,93.172500,0
440829,26,0,35,27,1,5,27.923143,0
440830,28,0,55,14,2,0,10.955455,0


In [9]:
# Split the frame into the target and the predictor matrix.
# Every column other than 'Churn' is used as a predictor
# (note the charges column is spelled 'MontlyCharges' in the displayed data).
y = df['Churn']  # Target variable
x = df.drop(columns='Churn')  # Predictors

In [11]:
# Fit logistic regression
# Add a constant column to the predictors; it appears as `const` in the summary.
x = sm.add_constant(x)
model = sm.Logit(endog=y, exog=x)
result = model.fit()

# Print the fitted-model summary table
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.413083
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                  Churn   No. Observations:               440832
Model:                          Logit   Df Residuals:                   440824
Method:                           MLE   Df Model:                            7
Date:                Fri, 22 Aug 2025   Pseudo R-squ.:                  0.3962
Time:                        15:33:06   Log-Likelihood:            -1.8210e+05
converged:                       True   LL-Null:                   -3.0158e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -3.6560      0.022   -165.842      0.000      -3.699      -3.613
Age           

In [13]:
# Extract the p-value of each fitted coefficient
p_values = result.pvalues
p_values

const               0.000000e+00
Age                 0.000000e+00
Gender              0.000000e+00
Tenure              0.000000e+00
Usage Frequency    1.233593e-169
Support Calls       0.000000e+00
Payment Delay       0.000000e+00
MontlyCharges       0.000000e+00
dtype: float64

In [14]:
# Odds ratios: exponentiate the fitted coefficients
odds_ratios = np.exp(result.params)
odds_ratios

const              0.025837
Age                1.033294
Gender             2.218062
Tenure             0.986959
Usage Frequency    0.986710
Support Calls      1.865994
Payment Delay      1.101327
MontlyCharges      0.997636
dtype: float64

In [None]:
# An odds ratio of 1.03 for Age means that for a one-unit increase in Age, the odds of Churn increase by a factor of 1.03, holding all other variables constant.
# For categorical predictors, and for odds ratios above 2, it is clearer to interpret the odds ratio as a comparison of the odds of Churn between the two categories (e.g. the odds of Churn for Female, coded 1, are about 2.2 times the odds for Male, coded 0).

### 🧠 Interpretation Questions
1. Which variables are statistically significant? Answer: those with a p-value below 0.05 — in this model, every predictor.
2. Interpret the odds ratio for `Gender`. What does it tell you?
3. Which variables increase or decrease the odds of churn?

