In [2]:
import pandas as pd
import os
import kagglehub
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import SelectFromModel, SelectKBest
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Download latest version
path = kagglehub.dataset_download("safrin03/predictive-analytics-for-customer-churn-dataset")

print("Path to dataset files:", path)

Path to dataset files: /Users/gulsumasenacakir/.cache/kagglehub/datasets/safrin03/predictive-analytics-for-customer-churn-dataset/versions/1


In [4]:
files = os.listdir(path)
print("Files in the dataset:")
for file_name in files:
    print(file_name)

Files in the dataset:
test.csv
train.csv
data_descriptions.csv


**Data Descriptions**

In [5]:
data_descriptions = pd.read_csv('/Users/gulsumasenacakir/.cache/kagglehub/datasets/safrin03/predictive-analytics-for-customer-churn-dataset/versions/1/data_descriptions.csv')
data_descriptions

Unnamed: 0,Column_name,Column_type,Data_type,Description
0,AccountAge,Feature,integer,The age of the user's account in months.
1,MonthlyCharges,Feature,float,The amount charged to the user on a monthly ba...
2,TotalCharges,Feature,float,The total charges incurred by the user over th...
3,SubscriptionType,Feature,object,The type of subscription chosen by the user (B...
4,PaymentMethod,Feature,string,The method of payment used by the user.
5,PaperlessBilling,Feature,string,Indicates whether the user has opted for paper...
6,ContentType,Feature,string,The type of content preferred by the user (Mov...
7,MultiDeviceAccess,Feature,string,Indicates whether the user has access to the s...
8,DeviceRegistered,Feature,string,"The type of device registered by the user (TV,..."
9,ViewingHoursPerWeek,Feature,float,The number of hours the user spends watching c...


There are 19 features, one target variable (Churn) and one identifier (CustomerID).

**Train Data**

In [6]:
train = pd.read_csv('/Users/gulsumasenacakir/.cache/kagglehub/datasets/safrin03/predictive-analytics-for-customer-churn-dataset/versions/1/train.csv')
train.head()

Unnamed: 0,AccountAge,MonthlyCharges,TotalCharges,SubscriptionType,PaymentMethod,PaperlessBilling,ContentType,MultiDeviceAccess,DeviceRegistered,ViewingHoursPerWeek,...,ContentDownloadsPerMonth,GenrePreference,UserRating,SupportTicketsPerMonth,Gender,WatchlistSize,ParentalControl,SubtitlesEnabled,CustomerID,Churn
0,20,11.055215,221.104302,Premium,Mailed check,No,Both,No,Mobile,36.758104,...,10,Sci-Fi,2.176498,4,Male,3,No,No,CB6SXPNVZA,0
1,57,5.175208,294.986882,Basic,Credit card,Yes,Movies,No,Tablet,32.450568,...,18,Action,3.478632,8,Male,23,No,Yes,S7R2G87O09,0
2,73,12.106657,883.785952,Basic,Mailed check,Yes,Movies,No,Computer,7.39516,...,23,Fantasy,4.238824,6,Male,1,Yes,Yes,EASDC20BDT,0
3,32,7.263743,232.439774,Basic,Electronic check,No,TV Shows,No,Tablet,27.960389,...,30,Drama,4.276013,2,Male,24,Yes,Yes,NPF69NT69N,0
4,57,16.953078,966.325422,Premium,Electronic check,Yes,TV Shows,No,TV,20.083397,...,20,Comedy,3.61617,4,Female,0,No,No,4LGYPK7VOL,0


In [7]:
def data_quality(data):
    missing = data.isnull().sum()
    unique_val = data.nunique()
    datatypes = data.dtypes
    check = pd.DataFrame({
        'missing': missing,
        'unique value': unique_val,
        'datatypes': datatypes
    })
    return check

In [8]:
train_check = data_quality(train)
train_check

Unnamed: 0,missing,unique value,datatypes
AccountAge,0,119,int64
MonthlyCharges,0,243787,float64
TotalCharges,0,243787,float64
SubscriptionType,0,3,object
PaymentMethod,0,4,object
PaperlessBilling,0,2,object
ContentType,0,3,object
MultiDeviceAccess,0,2,object
DeviceRegistered,0,4,object
ViewingHoursPerWeek,0,243787,float64


**Test Data**

In [9]:
test = pd.read_csv('/Users/gulsumasenacakir/.cache/kagglehub/datasets/safrin03/predictive-analytics-for-customer-churn-dataset/versions/1/test.csv')
test.head()

Unnamed: 0,AccountAge,MonthlyCharges,TotalCharges,SubscriptionType,PaymentMethod,PaperlessBilling,ContentType,MultiDeviceAccess,DeviceRegistered,ViewingHoursPerWeek,AverageViewingDuration,ContentDownloadsPerMonth,GenrePreference,UserRating,SupportTicketsPerMonth,Gender,WatchlistSize,ParentalControl,SubtitlesEnabled,CustomerID
0,38,17.869374,679.036195,Premium,Mailed check,No,TV Shows,No,TV,29.126308,122.274031,42,Comedy,3.522724,2,Male,23,No,No,O1W6BHP6RM
1,77,9.912854,763.289768,Basic,Electronic check,Yes,TV Shows,No,TV,36.873729,57.093319,43,Action,2.021545,2,Female,22,Yes,No,LFR4X92X8H
2,5,15.019011,75.095057,Standard,Bank transfer,No,TV Shows,Yes,Computer,7.601729,140.414001,14,Sci-Fi,4.806126,2,Female,22,No,Yes,QM5GBIYODA
3,88,15.357406,1351.451692,Standard,Electronic check,No,Both,Yes,Tablet,35.58643,177.002419,14,Comedy,4.9439,0,Female,23,Yes,Yes,D9RXTK2K9F
4,91,12.406033,1128.949004,Standard,Credit card,Yes,TV Shows,Yes,Tablet,23.503651,70.308376,6,Drama,2.84688,6,Female,0,No,No,ENTCCHR1LR


In [10]:
test_check = data_quality(test)
test_check

Unnamed: 0,missing,unique value,datatypes
AccountAge,0,119,int64
MonthlyCharges,0,104480,float64
TotalCharges,0,104480,float64
SubscriptionType,0,3,object
PaymentMethod,0,4,object
PaperlessBilling,0,2,object
ContentType,0,3,object
MultiDeviceAccess,0,2,object
DeviceRegistered,0,4,object
ViewingHoursPerWeek,0,104480,float64


Unlike the train data, the test data does not contain the target variable, churn. 

### Models

Initially, I will evaluate multiple machine learning models with default settings to identify the best-performing ones. After selecting the top 2-3 models based on initial results, I’ll apply hyperparameter tuning to optimize their performance. Finally, I will combine these optimized models using ensemble techniques (stacking or voting classifiers) to achieve the most accurate predictions. This approach balances individual model strengths, reduces prediction errors, and provides a more robust final model by leveraging the unique insights of each model.

**Data Prep**

First, remove the feature 'ColumnID'

In [11]:
train = train.drop('CustomerID', axis = 1)

In [12]:
print(train['Churn'].value_counts())
print('/n')
print('Churn rate:', train['Churn'].sum()/len(train))

Churn
0    199605
1     44182
Name: count, dtype: int64
/n
Churn rate: 0.18123197709475894


Then, seperate the features and the target variable

In [13]:
X = train.drop('Churn', axis = 1)
y = train['Churn']

In [14]:
X_check = data_quality(X)

Now, train-test split

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

Categorical features will be one-hot encoded while numerical features will be normalized. Therefore, first I need to identify these type of features.

In [16]:
categorical_features = X_check[X_check['datatypes'] == 'object'].index.to_list()
categorical_features

['SubscriptionType',
 'PaymentMethod',
 'PaperlessBilling',
 'ContentType',
 'MultiDeviceAccess',
 'DeviceRegistered',
 'GenrePreference',
 'Gender',
 'ParentalControl',
 'SubtitlesEnabled']

In [17]:
numerical_features = X_check[(X_check['datatypes'] == 'float64') | (X_check['datatypes'] == 'int64')].index.to_list()
numerical_features

['AccountAge',
 'MonthlyCharges',
 'TotalCharges',
 'ViewingHoursPerWeek',
 'AverageViewingDuration',
 'ContentDownloadsPerMonth',
 'UserRating',
 'SupportTicketsPerMonth',
 'WatchlistSize']

**Preprocessing pipeline - Standardization and One-hot Encoding**

In [18]:
preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features), ## standardization for numerical features
            ('cat', OneHotEncoder(), categorical_features) ## one-hot encoding for categorical features
        ]
    )

**A function to create, train, and evaluate a machine learning pipeline**

In [19]:
def create_model_pipeline(preprocessor, feature_selection, model):
    """
    A function to create, train, and evaluate a machine learning pipeline.
    
    Parameters:
        preprocessor: transformer
            The preprocessing steps: scaling and encoding.
        feature_selection: transformer
            The feature selection step, such as selecting important features.
        model: estimator
            The classification model to be used.
    
    Returns:
        pipeline: Pipeline object
            The trained pipeline.
        prediction: array
            The predictions on the validation set.
        metrics: dict
            A dictionary containing accuracy, precision, recall, and F1 score.
    """
    
    # Define the pipeline 
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),         # Preprocessing step 
        ('feature_selection', feature_selection), # Feature selection step
        ('classifier', model)                    # The model to be trained 
    ])
    
    # Train the pipeline
    pipeline.fit(X_train, y_train)
    
    # Make predictions on the validation set
    prediction = pipeline.predict(X_val)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_val, prediction)
    precision = precision_score(y_val, prediction)
    recall = recall_score(y_val, prediction)
    f1 = f1_score(y_val, prediction)
    
    
    # Return the trained pipeline, predictions, and metrics
    return pipeline, prediction, accuracy, precision, recall, f1


#### 1) Random Forest

In [20]:
model_rf = RandomForestClassifier(class_weight='balanced')
pipeline_rf, pred_rf, accuracy_rf, precision_rf, recall_rf, f1_rf = create_model_pipeline(preprocessor=preprocessor,
                                                                                          feature_selection=SelectFromModel(model_rf), 
                                                                                          model=model_rf)
print('Accuracy - RF:', accuracy_rf)
print('Precision - RF:', precision_rf)
print('Recall - RF:', recall_rf)
print('F1 Score - RF:', f1_rf)


Accuracy - RF: 0.8207473645350507
Precision - RF: 0.5178062678062678
Recall - RF: 0.08270762229806598
F1 Score - RF: 0.14263292132627037


#### 2) LightGBM

In [54]:
ratio_nonchurn_to_churn = len(train[train['Churn'] == 0])/len(train[train['Churn'] == 1])
ratio_nonchurn_to_churn

4.517790050246707

In [55]:
model_lgbm = LGBMClassifier(scale_pos_weight=ratio_nonchurn_to_churn, verbose=-1)
pipeline_lgbm, pred_lgbm, accuracy_lgbm, precision_lgbm, recall_lgbm, f1_lgbm = create_model_pipeline(preprocessor=preprocessor, 
                                                                                                      feature_selection=SelectFromModel(model_lgbm), 
                                                                                                      model=model_lgbm)
print('Accuracy - LightGBM:', accuracy_lgbm)
print('Precision - LightGBM:', precision_lgbm)
print('Recall - LightGBM:', recall_lgbm)
print('F1 Score - LightGBM:', f1_lgbm)



Accuracy - LightGBM: 0.6703720415111366
Precision - LightGBM: 0.3134351301496208
Recall - LightGBM: 0.6959044368600683
F1 Score - LightGBM: 0.43220518617960857




#### 3) XGBoost

In [36]:
model_xgb = XGBClassifier(scale_pos_weight=ratio_nonchurn_to_churn)
pipeline_xgb, pred_xgb, accuracy_xgb, precision_xgb, recall_xgb, f1_xgb = create_model_pipeline(preprocessor=preprocessor, 
                                                                                                   feature_selection=SelectFromModel(model_xgb), 
                                                                                                   model=model_xgb)
print('Accuracy - XGBoost:', accuracy_xgb)
print('Precision - XGBoost:', precision_xgb)
print('Recall - XGBoost:', recall_xgb)
print('F1 Score - XGBoost:', f1_xgb)

Accuracy - XGBoost: 0.680401164937036
Precision - XGBoost: 0.31639548083680197
Recall - XGBoost: 0.6658703071672355
F1 Score - XGBoost: 0.42896405144930194


#### 4) Logistic Regression

In [21]:
model_lr = LogisticRegression(class_weight='balanced')
pipeline_lr, pred_lr, accuracy_lr, precision_lr, recall_lr, f1_lr = create_model_pipeline(preprocessor=preprocessor, 
                                                                                          feature_selection=SelectFromModel(model_lr), 
                                                                                          model=model_lr)
print('Accuracy - LR:', accuracy_lr)
print('Precision - LR:', precision_lr)
print('Recall - LR:', recall_lr)
print('F1 Score - LR:', f1_lr)

Accuracy - LR: 0.6775298412568194
Precision - LR: 0.3191045243437875
Recall - LR: 0.6956769055745164
F1 Score - LR: 0.4375201230637141


#### 5) KNN

In [23]:
model_knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
pipeline_knn, pred_knn, accuracy_knn, precision_knn, recall_knn, f1_knn = create_model_pipeline(preprocessor=preprocessor, 
                                                                                                feature_selection=SelectKBest(k=10), 
                                                                                                model=model_knn)
print('Accuracy - KNN:', accuracy_knn)
print('Precision - KNN:', precision_knn)
print('Recall - KNN:', recall_knn)
print('F1 Score - KNN:', f1_knn)

Accuracy - KNN: 0.800484023134665
Precision - KNN: 0.39158576051779936
Recall - KNN: 0.19271899886234356
F1 Score - KNN: 0.2583104605062519


In [37]:
models_result = pd.DataFrame({
    'Models': ['Random Forest', 'LightGBM', 'XGBoost', 'Logistic Regression', 'KNN'],
    'Accuracy':[accuracy_rf, accuracy_lgbm, accuracy_xgb, accuracy_lr, accuracy_knn],
    'Precision': [precision_rf, precision_lgbm, precision_xgb, precision_lr, precision_knn],
    'Recall': [recall_rf, recall_lgbm, recall_xgb, recall_lr, recall_knn],
    'F1 Score': [f1_rf, f1_lgbm, f1_xgb, f1_lr, f1_knn]
})
models_result

Unnamed: 0,Models,Accuracy,Precision,Recall,F1 Score
0,Random Forest,0.820747,0.517806,0.082708,0.142633
1,LightGBM,0.6739,0.316393,0.696928,0.435209
2,XGBoost,0.680401,0.316395,0.66587,0.428964
3,Logistic Regression,0.67753,0.319105,0.695677,0.43752
4,KNN,0.800484,0.391586,0.192719,0.25831


In [68]:
def highlight_extremes(s):
    return ['color: blue' if v == s.max() else 'color: red' if v == s.min() else '' for v in s]

styled_result = models_result.style.apply(highlight_extremes, subset=['Accuracy', 'Precision', 'Recall', 'F1 Score'])
styled_result

Unnamed: 0,Models,Accuracy,Precision,Recall,F1 Score
0,Random Forest,0.820747,0.517806,0.082708,0.142633
1,LightGBM,0.6739,0.316393,0.696928,0.435209
2,XGBoost,0.680401,0.316395,0.66587,0.428964
3,Logistic Regression,0.67753,0.319105,0.695677,0.43752
4,KNN,0.800484,0.391586,0.192719,0.25831


### Hyperparameter Tunning

In [39]:
def tuning_with_pipeline(preprocessor, feature_selection, model, param_grid):
    """
    A function to create, train, and evaluate a machine learning pipeline with hyperparameter tuning.
    
    Parameters:
        preprocessor: transformer
            The preprocessing steps: scaling and encoding.
        feature_selection: transformer
            The feature selection step, such as selecting important features.
        model: estimator
            The classification model to be used.
        param_grid: dict
            The hyperparameter grid for tuning.
    
    Returns:
        best_pipeline: Pipeline object
            The trained pipeline with the best hyperparameters.
        best_prediction: array
            The predictions on the validation set using the best model.
        metrics:
            Accuracy, precision, recall, and F1 score.
    """
    
    # Define the pipeline 
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),         # Preprocessing step 
        ('feature_selection', feature_selection), # Feature selection step
        ('classifier', model)                    # The model to be trained 
    ])
    
    # Hyperparameter tuning
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)
    
    # Train the pipeline
    grid_search.fit(X_train, y_train)
    
    # Get the best pipeline
    best_pipeline = grid_search.best_estimator_
    
    # Make predictions with the best model
    best_prediction = best_pipeline.predict(X_val)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_val, best_prediction)
    precision = precision_score(y_val, best_prediction)
    recall = recall_score(y_val, best_prediction)
    f1 = f1_score(y_val, best_prediction)
    
    # Return the best trained pipeline, predictions, and metrics
    return best_pipeline, best_prediction, accuracy, precision, recall, f1


In [47]:
param_grid_lgbm = {
    'classifier__num_leaves': [31, 50, 100], 
    'classifier__max_depth': [-1, 10, 20, 30], 
    'classifier__subsample': [0.6, 0.8, 1.0] 
}

In [56]:
best_pipeline_lgbm, best_prediction_lgbm, accuracy_lgbm, precision_lgbm, recall_lgbm, f1_lgbm = tuning_with_pipeline(preprocessor, 
                                                                                                               SelectFromModel(model_lgbm), 
                                                                                                               model_lgbm, 
                                                                                                               param_grid_lgbm)


  _data = np.array(data, dtype=dtype, copy=copy,


In [57]:
print('F1 Score - LGBM:', f1_lgbm)

F1 Score - LGBM: 0.4330071708643894


In [61]:
print("Best Pipeline lgbm:", best_pipeline_lgbm.named_steps['classifier'].get_params())

Best Pipeline lgbm: {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': 10, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': None, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 0.6, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'scale_pos_weight': 4.517790050246707, 'verbose': -1}


In [40]:
param_grid_xgb = {
    'classifier__learning_rate': [0.1, 0.2, 0.3],
    'classifier__max_depth': [7, 10, 15],
    'classifier__n_estimators': [100, 300, 700],
    'classifier__reg_lambda': [0, 0.1, 1],
    'classifier__nthread': [-1]
}

In [41]:
best_pipeline_xgb, best_pred_xgb, accuracy_xgb, precision_xgb, recall_xgb, f1_xgb = tuning_with_pipeline(preprocessor=preprocessor, 
                                                                                                         feature_selection=SelectFromModel(model_xgb), 
                                                                                                         model=model_xgb,
                                                                                                         param_grid=param_grid_xgb)



In [42]:
print('F1 Score - XGBoost:', f1_xgb)
print("Best Pipeline XGBoost:", best_pipeline_xgb.named_steps['classifier'].get_params())

F1 Score - XGBoost: 0.4338990458154272
Best Pipeline XGBoost: {'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.1, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 7, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 100, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': None, 'reg_lambda': 0.1, 'sampling_method': None, 'scale_pos_weight': 4.517790050246707, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None, 'nthread': -1}


In [43]:
param_grid_lr = {
    'classifier__solver': ['lbfgs', 'liblinear', 'saga', 'newton-cholesky'],
    'classifier__C': [0.01, 0.1, 1, 10],
    'classifier__max_iter': [1000]
}

In [44]:
best_pipeline_lr, best_pred_lr, accuracy_lr, precision_lr, recall_lr, f1_lr = tuning_with_pipeline(preprocessor=preprocessor, 
                                                                                                   feature_selection=SelectFromModel(model_lr), 
                                                                                                    model=model_lr,
                                                                                                    param_grid=param_grid_lr)

In [45]:
print('F1 Score - LR:', f1_lr)
print("Best Pipeline LR:", best_pipeline_lr.named_steps['classifier'].get_params())

F1 Score - LR: 0.43736137940902914
Best Pipeline LR: {'C': 0.01, 'class_weight': 'balanced', 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


### Ensemble Model: Stacking Classifier

In [63]:
xgb = XGBClassifier(eval_metric='logloss', learning_rate =0.1, max_depth = 7, n_estimators = 100, reg_lambda= 0.1, scale_pos_weight=ratio_nonchurn_to_churn)
log_reg = LogisticRegression(max_iter=1000, penalty='l2', solver='liblinear', C = 0.01, class_weight = 'balanced')
lgbm = LGBMClassifier(subsample = 0.6, scale_pos_weight=ratio_nonchurn_to_churn,max_depth = 10, num_leaves = 31)


In [69]:
stacking_classifier = StackingClassifier(
    estimators=[
        ('xgb', xgb),
        ('lgbm', lgbm),
        ('log_reg', log_reg)
    ],
    final_estimator=log_reg # Meta-model
)

In [82]:
pipeline_stacking = Pipeline(steps=[
    ('preprocessing', preprocessor), 
    ('feature_selection', SelectFromModel(log_reg)), 
    ('stacking', stacking_classifier) 
])

In [83]:
pipeline_stacking.fit(X_train, y_train)
predictions_stacking = pipeline_stacking.predict(X_val)

In [84]:
f1_stacking = f1_score(y_val, predictions_stacking)
print(f'F1 Score - Stacking: {f1_stacking}')

F1 Score - Stacking: 0.4358367202685987
