In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv('churn_eda.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,age,gender,subscription_type,watch_hours,last_login_days,region,device,monthly_fee,churned,payment_method,number_of_profiles,avg_watch_time_per_day,favorite_genre
0,0,51,Other,Basic,14.73,29,Africa,TV,8.99,1,Gift Card,1,0.49,Action
1,1,47,Other,Standard,0.7,19,Europe,Mobile,13.99,1,Gift Card,5,0.03,Sci-Fi
2,2,27,Female,Standard,16.32,10,Asia,TV,13.99,0,Crypto,2,1.48,Drama
3,3,53,Other,Premium,4.51,12,Oceania,TV,17.99,1,Crypto,2,0.35,Horror
4,4,56,Other,Standard,1.89,13,Africa,Mobile,13.99,1,Crypto,2,0.13,Action


In [None]:
df=df.drop(columns=['Unnamed: 0',]

In [106]:
X=df.drop(columns=['churned','last_login_days',])

y=df['churned']

In [107]:
# Identifying feature types

num_features=X.select_dtypes(include=['int64','float64']).columns
cat_features=X.select_dtypes(include=['object']).columns

In [108]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [109]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.pipeline import Pipeline

In [110]:
preprocessor=ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),num_features),
        ('cat',OneHotEncoder(),cat_features)
    ]
)

In [111]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,roc_auc_score

In [112]:
model=Pipeline(
    steps=[
        ('preprocessor',preprocessor),
        ('classifier',LogisticRegression(max_iter=1000))
    ]
)

In [113]:
model.fit(X_train,y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [114]:
y_pred=model.predict(X_test)
y_prob=model.predict_proba(X_test)[:,1]
print(classification_report(y_test,y_pred))
print("ROC-AUC:",roc_auc_score(y_test,y_prob))


              precision    recall  f1-score   support

           0       0.74      0.75      0.75       461
           1       0.78      0.78      0.78       537

    accuracy                           0.76       998
   macro avg       0.76      0.76      0.76       998
weighted avg       0.76      0.76      0.76       998

ROC-AUC: 0.8555686165206396


In [115]:
# Overfit check
y_prob_train=model.predict_proba(X_train)[:,1]
print("ROC-AUC:",roc_auc_score(y_train,y_prob_train))


ROC-AUC: 0.8495230052925778


In [116]:
threshold=0.39

In [117]:
artifacts={
    "model":model,
    "threshold":threshold
}

In [118]:
import pickle
with open("churn_model.pkl","wb") as f:
    pickle.dump(artifacts,f)

In [130]:
from sklearn.metrics import precision_recall_curve
y_probs=model.predict_proba(X_test)[:,1]
precision,recall,thresholds=precision_recall_curve(y_test,y_probs)
pr_data={
    "precision":precision,
    "recall":recall,
    "thresholds":thresholds
}

In [131]:
with open("pr_curve.pkl","wb") as f:
    pickle.dump(pr_data,f)