In [95]:
import pandas as pd
import os
import kagglehub
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import SelectFromModel, SelectKBest
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [60]:
# Download latest version
path = kagglehub.dataset_download("safrin03/predictive-analytics-for-customer-churn-dataset")

print("Path to dataset files:", path)

Path to dataset files: /Users/gulsumasenacakir/.cache/kagglehub/datasets/safrin03/predictive-analytics-for-customer-churn-dataset/versions/1


In [61]:
files = os.listdir(path)
print("Files in the dataset:")
for file_name in files:
    print(file_name)

Files in the dataset:
test.csv
train.csv
data_descriptions.csv


**Data Descriptions**

In [62]:
data_descriptions = pd.read_csv('/Users/gulsumasenacakir/.cache/kagglehub/datasets/safrin03/predictive-analytics-for-customer-churn-dataset/versions/1/data_descriptions.csv')
data_descriptions

Unnamed: 0,Column_name,Column_type,Data_type,Description
0,AccountAge,Feature,integer,The age of the user's account in months.
1,MonthlyCharges,Feature,float,The amount charged to the user on a monthly ba...
2,TotalCharges,Feature,float,The total charges incurred by the user over th...
3,SubscriptionType,Feature,object,The type of subscription chosen by the user (B...
4,PaymentMethod,Feature,string,The method of payment used by the user.
5,PaperlessBilling,Feature,string,Indicates whether the user has opted for paper...
6,ContentType,Feature,string,The type of content preferred by the user (Mov...
7,MultiDeviceAccess,Feature,string,Indicates whether the user has access to the s...
8,DeviceRegistered,Feature,string,"The type of device registered by the user (TV,..."
9,ViewingHoursPerWeek,Feature,float,The number of hours the user spends watching c...


There are 19 features, one target variable (Churn) and one identifier (CustomerID).

**Train Data**

In [63]:
train = pd.read_csv('/Users/gulsumasenacakir/.cache/kagglehub/datasets/safrin03/predictive-analytics-for-customer-churn-dataset/versions/1/train.csv')
train.head()

Unnamed: 0,AccountAge,MonthlyCharges,TotalCharges,SubscriptionType,PaymentMethod,PaperlessBilling,ContentType,MultiDeviceAccess,DeviceRegistered,ViewingHoursPerWeek,...,ContentDownloadsPerMonth,GenrePreference,UserRating,SupportTicketsPerMonth,Gender,WatchlistSize,ParentalControl,SubtitlesEnabled,CustomerID,Churn
0,20,11.055215,221.104302,Premium,Mailed check,No,Both,No,Mobile,36.758104,...,10,Sci-Fi,2.176498,4,Male,3,No,No,CB6SXPNVZA,0
1,57,5.175208,294.986882,Basic,Credit card,Yes,Movies,No,Tablet,32.450568,...,18,Action,3.478632,8,Male,23,No,Yes,S7R2G87O09,0
2,73,12.106657,883.785952,Basic,Mailed check,Yes,Movies,No,Computer,7.39516,...,23,Fantasy,4.238824,6,Male,1,Yes,Yes,EASDC20BDT,0
3,32,7.263743,232.439774,Basic,Electronic check,No,TV Shows,No,Tablet,27.960389,...,30,Drama,4.276013,2,Male,24,Yes,Yes,NPF69NT69N,0
4,57,16.953078,966.325422,Premium,Electronic check,Yes,TV Shows,No,TV,20.083397,...,20,Comedy,3.61617,4,Female,0,No,No,4LGYPK7VOL,0


In [64]:
def data_quality(data):
    missing = data.isnull().sum()
    unique_val = data.nunique()
    datatypes = data.dtypes
    check = pd.DataFrame({
        'missing': missing,
        'unique value': unique_val,
        'datatypes': datatypes
    })
    return check

In [65]:
train_check = data_quality(train)
train_check

Unnamed: 0,missing,unique value,datatypes
AccountAge,0,119,int64
MonthlyCharges,0,243787,float64
TotalCharges,0,243787,float64
SubscriptionType,0,3,object
PaymentMethod,0,4,object
PaperlessBilling,0,2,object
ContentType,0,3,object
MultiDeviceAccess,0,2,object
DeviceRegistered,0,4,object
ViewingHoursPerWeek,0,243787,float64


**Test Data**

In [66]:
test = pd.read_csv('/Users/gulsumasenacakir/.cache/kagglehub/datasets/safrin03/predictive-analytics-for-customer-churn-dataset/versions/1/test.csv')
test.head()

Unnamed: 0,AccountAge,MonthlyCharges,TotalCharges,SubscriptionType,PaymentMethod,PaperlessBilling,ContentType,MultiDeviceAccess,DeviceRegistered,ViewingHoursPerWeek,AverageViewingDuration,ContentDownloadsPerMonth,GenrePreference,UserRating,SupportTicketsPerMonth,Gender,WatchlistSize,ParentalControl,SubtitlesEnabled,CustomerID
0,38,17.869374,679.036195,Premium,Mailed check,No,TV Shows,No,TV,29.126308,122.274031,42,Comedy,3.522724,2,Male,23,No,No,O1W6BHP6RM
1,77,9.912854,763.289768,Basic,Electronic check,Yes,TV Shows,No,TV,36.873729,57.093319,43,Action,2.021545,2,Female,22,Yes,No,LFR4X92X8H
2,5,15.019011,75.095057,Standard,Bank transfer,No,TV Shows,Yes,Computer,7.601729,140.414001,14,Sci-Fi,4.806126,2,Female,22,No,Yes,QM5GBIYODA
3,88,15.357406,1351.451692,Standard,Electronic check,No,Both,Yes,Tablet,35.58643,177.002419,14,Comedy,4.9439,0,Female,23,Yes,Yes,D9RXTK2K9F
4,91,12.406033,1128.949004,Standard,Credit card,Yes,TV Shows,Yes,Tablet,23.503651,70.308376,6,Drama,2.84688,6,Female,0,No,No,ENTCCHR1LR


In [67]:
test_check = data_quality(test)
test_check

Unnamed: 0,missing,unique value,datatypes
AccountAge,0,119,int64
MonthlyCharges,0,104480,float64
TotalCharges,0,104480,float64
SubscriptionType,0,3,object
PaymentMethod,0,4,object
PaperlessBilling,0,2,object
ContentType,0,3,object
MultiDeviceAccess,0,2,object
DeviceRegistered,0,4,object
ViewingHoursPerWeek,0,104480,float64


Unlike the train data, the test data does not contain the target variable, churn. 

### Models

Initially, we will evaluate multiple machine learning models to identify the best-performing one. Subsequently, this optimal model will be applied to the test data for predictions. Because the test data does not have the target variable.

First, remove the feature 'ColumnID'

In [68]:
train = train.drop('CustomerID', axis = 1)

Then, seperate the features and the target variable

In [69]:
X = train.drop('Churn', axis = 1)
y = train['Churn']

In [70]:
X_check = data_quality(X)

Now, train-test split

In [71]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

Categorical features will be one-hot encoded while numerical features will be normalized. Therefore, first I need to identify these type of features.

In [72]:
categorical_features = X_check[X_check['datatypes'] == 'object'].index.to_list()
categorical_features

['SubscriptionType',
 'PaymentMethod',
 'PaperlessBilling',
 'ContentType',
 'MultiDeviceAccess',
 'DeviceRegistered',
 'GenrePreference',
 'Gender',
 'ParentalControl',
 'SubtitlesEnabled']

In [73]:
numerical_features = X_check[(X_check['datatypes'] == 'float64') | (X_check['datatypes'] == 'int64')].index.to_list()
numerical_features

['AccountAge',
 'MonthlyCharges',
 'TotalCharges',
 'ViewingHoursPerWeek',
 'AverageViewingDuration',
 'ContentDownloadsPerMonth',
 'UserRating',
 'SupportTicketsPerMonth',
 'WatchlistSize']

**Preprocessing pipeline - Standardization and One-hot Encoding**

In [74]:
preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features), ## standardization for numerical features
            ('cat', OneHotEncoder(), categorical_features) ## one-hot encoding for categorical features
        ]
    )

#### 1) Random Forest

In [75]:
pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor), # preprocessing
    ('feature_selection', SelectFromModel(RandomForestClassifier())), #feature selection
    ('classifier', RandomForestClassifier()) # model
])

In [76]:
# model training
pipeline_rf.fit(X_train, y_train)

In [77]:
# prediction
pred_rf = pipeline_rf.predict(X_val)

In [78]:
print("Accuracy:", accuracy_score(y_val, pred_rf))
print("Classification Report:\n", classification_report(y_val, pred_rf))

Accuracy: 0.8202961565281595
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.98      0.90     39968
           1       0.51      0.11      0.18      8790

    accuracy                           0.82     48758
   macro avg       0.67      0.54      0.54     48758
weighted avg       0.77      0.82      0.77     48758



#### 2) LightGBM

In [79]:
pipeline_lgm = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('feature_selection', SelectFromModel(LGBMClassifier())),
    ('classifier', LGBMClassifier())
])

In [80]:
# model training
pipeline_lgm.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 35392, number of negative: 159637
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002143 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1541
[LightGBM] [Info] Number of data points in the train set: 195029, number of used features: 38
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.181470 -> initscore=-1.506417
[LightGBM] [Info] Start training from score -1.506417




[LightGBM] [Info] Number of positive: 35392, number of negative: 159637
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000620 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1483
[LightGBM] [Info] Number of data points in the train set: 195029, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.181470 -> initscore=-1.506417
[LightGBM] [Info] Start training from score -1.506417


In [81]:
pred_lgm = pipeline_lgm.predict(X_val)



In [82]:
print("Accuracy:", accuracy_score(y_val, pred_lgm))
print("Classification Report:\n", classification_report(y_val, pred_lgm))

Accuracy: 0.8240698962221584
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.98      0.90     39968
           1       0.56      0.11      0.18      8790

    accuracy                           0.82     48758
   macro avg       0.70      0.54      0.54     48758
weighted avg       0.78      0.82      0.77     48758



#### 3) XGBoost

In [83]:
pipeline_xgb = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('feature_selection', SelectFromModel(XGBClassifier())),
    ('classifier', XGBClassifier())
])

In [84]:
pipeline_xgb.fit(X_train, y_train)

In [85]:
pred_xgb = pipeline_xgb.predict(X_val)

In [86]:
print("Accuracy:", accuracy_score(y_val, pred_xgb))
print("Classification Report:\n", classification_report(y_val, pred_xgb))

Accuracy: 0.8217318183682678
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.98      0.90     39968
           1       0.52      0.12      0.19      8790

    accuracy                           0.82     48758
   macro avg       0.68      0.55      0.55     48758
weighted avg       0.78      0.82      0.77     48758



#### 4) Logistic Regression

In [87]:
pipeline_lr = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('feature_selection', SelectFromModel(LogisticRegression())),
    ('classifier', LogisticRegression())
])

In [88]:
pipeline_lr.fit(X_train, y_train)

In [89]:
pred_lr = pipeline_lr.predict(X_val)

In [90]:
print("Accuracy:", accuracy_score(y_val, pred_lr))
print("Classification Report:\n", classification_report(y_val, pred_lr))

Accuracy: 0.8250543500553755
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.98      0.90     39968
           1       0.57      0.12      0.19      8790

    accuracy                           0.83     48758
   macro avg       0.70      0.55      0.55     48758
weighted avg       0.79      0.83      0.77     48758



#### 5) KNN

In [91]:
pipeline_knn = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('feature_selection', SelectKBest(k=10)),  # different selection method
    ('classifier', KNeighborsClassifier(n_neighbors=5))
])

In [92]:
pipeline_knn.fit(X_train, y_train)

In [93]:
pred_knn = pipeline_knn.predict(X_val)

In [94]:
print("Accuracy:", accuracy_score(y_val, pred_knn))
print("Classification Report:\n", classification_report(y_val, pred_knn))

Accuracy: 0.800484023134665
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.93      0.88     39968
           1       0.39      0.19      0.26      8790

    accuracy                           0.80     48758
   macro avg       0.62      0.56      0.57     48758
weighted avg       0.76      0.80      0.77     48758



#### 6) Voting Classifier

In [160]:
xgb = XGBClassifier(eval_metric='logloss')
lgbm = LGBMClassifier()
log_reg = LogisticRegression()

In [161]:
voting_classifier = VotingClassifier(estimators=[
    ('xgb', xgb),
    ('lgbm', lgbm),
    ('log_reg', log_reg)
], voting='soft')

In [162]:
pipeline_voting = Pipeline(steps=[
    ('preprocessing', preprocessor),  
    ('feature_selection', SelectKBest(k=10)), 
    ('voting', voting_classifier) 
])

In [163]:
pipeline_voting.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 35392, number of negative: 159637
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001231 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1208
[LightGBM] [Info] Number of data points in the train set: 195029, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.181470 -> initscore=-1.506417
[LightGBM] [Info] Start training from score -1.506417


In [164]:
predictions = pipeline_voting.predict(X_val)

In [165]:
accuracy = accuracy_score(y_val, predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8249312933262234


#### 7) Stacking Classifier

In [166]:
stacking_classifier = StackingClassifier(
    estimators=[
        ('xgb', xgb),
        ('lgbm', lgbm),
        ('log_reg', log_reg)
    ],
    final_estimator=LogisticRegression(random_state=42)  # Meta-model
)

In [167]:
pipeline_stacking = Pipeline(steps=[
    ('preprocessing', preprocessor), 
    ('feature_selection', SelectKBest(k=10)), 
    ('stacking', stacking_classifier) 
])

In [168]:
pipeline_stacking.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 35392, number of negative: 159637
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001223 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1208
[LightGBM] [Info] Number of data points in the train set: 195029, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.181470 -> initscore=-1.506417
[LightGBM] [Info] Start training from score -1.506417
[LightGBM] [Info] Number of positive: 28314, number of negative: 127709
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000915 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1208
[LightGBM] [Info] Number of data points in the train set: 156023, number of used features: 10
[LightGBM] [In

In [169]:
predictions_stacking = pipeline_stacking.predict(X_val)

In [170]:
accuracy_stacking = accuracy_score(y_val, predictions_stacking)
print(f'Accuracy: {accuracy_stacking}')

Accuracy: 0.8251158784199516
