In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

In [3]:
df = pd.read_csv("connectsphere_churn_data.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,StreamingTV,StreamingMovies,ContractLength,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,DataUsage,CallDuration
0,7590-VHVEG,Female,SeniorCitizen,Yes,No,1,No,No phone service,DSL,No,...,No,No,12,Yes,Electronic check,29.85,29.85,No,40,43
1,5575-GNVDE,Male,SeniorCitizen,No,No,34,Yes,No,DSL,Yes,...,No,No,3,No,Mailed check,56.95,1889.5,No,67,32
2,3668-QPYBK,Male,SeniorCitizen,No,No,2,Yes,No,DSL,Yes,...,No,No,4,Yes,Mailed check,53.85,108.15,Yes,43,43
3,7795-CFOCW,Male,SeniorCitizen,No,No,45,No,No phone service,DSL,Yes,...,No,No,2,No,Bank transfer (automatic),42.3,1840.75,No,67,12
4,9237-HQITU,Female,SeniorCitizen,No,No,2,Yes,No,Fiber optic,No,...,No,No,12,Yes,Electronic check,70.7,151.65,Yes,121,34


In [4]:
df.dropna(inplace=True)
df['Churn'] = df['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)


In [5]:
X = df[['CallDuration', 'DataUsage', 'ContractLength']]
y = df['Churn']


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [7]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [8]:
log_model = LogisticRegression()
log_model.fit(X_train_scaled, y_train)

rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train, y_train)


In [9]:
log_preds = log_model.predict(X_test_scaled)
rf_preds = rf_model.predict(X_test)

print("Logistic Regression F1 Score:", f1_score(y_test, log_preds))
print("Random Forest F1 Score:", f1_score(y_test, rf_preds))


Logistic Regression F1 Score: 0.5
Random Forest F1 Score: 0.4


In [10]:
rf_probs = rf_model.predict_proba(X)[:, 1]
df['ChurnProbability'] = rf_probs

at_risk_customers = df[df['ChurnProbability'] > 0.6]
at_risk_customers.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,StreamingMovies,ContractLength,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,DataUsage,CallDuration,ChurnProbability
2,3668-QPYBK,Male,SeniorCitizen,No,No,2,Yes,No,DSL,Yes,...,No,4,Yes,Mailed check,53.85,108.15,1,43,43,0.66
4,9237-HQITU,Female,SeniorCitizen,No,No,2,Yes,No,Fiber optic,No,...,No,12,Yes,Electronic check,70.7,151.65,1,121,34,0.84
5,9305-CDSKC,Female,SeniorCitizen,No,No,8,Yes,Yes,Fiber optic,No,...,Yes,5,Yes,Electronic check,99.65,820.5,1,46,55,0.67
13,0280-XJGEX,Male,SeniorCitizen,No,No,49,Yes,Yes,Fiber optic,No,...,Yes,6,Yes,Bank transfer (automatic),103.7,5036.3,1,367,32,0.74
18,4190-MFLUW,Female,SeniorCitizen,Yes,Yes,10,Yes,No,DSL,No,...,No,12,No,Credit card (automatic),55.2,528.35,1,231,32,0.86
