Logistic regression with k means clustering

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load Data

df = pd.read_csv("churn_customer.csv")

# Drop ID column

df = df.drop(columns=['customerID'])

# Convert TotalCharges to numeric (it has blanks sometimes)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# Encode target
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# One-hot encode categorical features
df = pd.get_dummies(df, drop_first=True)

# Separate Features & Target

X = df.drop(columns=['Churn'])
y = df['Churn']

# Scale numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# Clustering

kmeans = KMeans(n_clusters=4, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Add cluster feature
X['Cluster'] = clusters

# Train-Test Split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

# Logistic Regression

model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

print(classification_report(y_test, y_pred))


# Cluster-wise Churn Rate

cluster_churn = pd.concat([X, y], axis=1).groupby('Cluster')['Churn'].mean()
print("Churn rate per cluster:")
print(cluster_churn)


Train shape: (5634, 31)
Test shape: (1409, 31)
Accuracy: 0.7381121362668559
              precision    recall  f1-score   support

           0       0.91      0.72      0.80      1035
           1       0.50      0.79      0.62       374

    accuracy                           0.74      1409
   macro avg       0.70      0.76      0.71      1409
weighted avg       0.80      0.74      0.75      1409

Churn rate per cluster:
Cluster
0    0.249267
1    0.466910
2    0.147143
3    0.074050
Name: Churn, dtype: float64


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy reduced after applying clustering in the telco customer churn dataset
But recall improved for churners(class 1)