In [77]:
import pandas as pd
import numpy as np




from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.base import clone

In [78]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [79]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [80]:
# Convert categorical variables into numerical variables
from sklearn.preprocessing import LabelEncoder, StandardScaler
categorical_features = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 
                        'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 
                        'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'TotalCharges']
le = LabelEncoder()
for feature in categorical_features:
    df[feature] = le.fit_transform(df[feature])

In [81]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,1,2,29.85,2505,No
1,5575-GNVDE,1,0,0,0,34,1,0,0,2,...,2,0,0,0,1,0,3,56.95,1466,No
2,3668-QPYBK,1,0,0,0,2,1,0,0,2,...,0,0,0,0,0,1,3,53.85,157,Yes
3,7795-CFOCW,1,0,0,0,45,0,1,0,2,...,2,2,0,0,1,0,0,42.3,1400,No
4,9237-HQITU,0,0,0,0,2,1,0,1,0,...,0,0,0,0,0,1,2,70.7,925,Yes


In [82]:
# Convert Churn variable into binary labels
df['Churn'] = (df['Churn'] == 'Yes').astype(int)

In [83]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,1,2,29.85,2505,0
1,5575-GNVDE,1,0,0,0,34,1,0,0,2,...,2,0,0,0,1,0,3,56.95,1466,0
2,3668-QPYBK,1,0,0,0,2,1,0,0,2,...,0,0,0,0,0,1,3,53.85,157,1
3,7795-CFOCW,1,0,0,0,45,0,1,0,2,...,2,2,0,0,1,0,0,42.3,1400,0
4,9237-HQITU,0,0,0,0,2,1,0,1,0,...,0,0,0,0,0,1,2,70.7,925,1


In [84]:
# Handle missing values
df = df.dropna()

In [85]:
X = df.drop(['Churn', 'customerID'], axis=1)
y = df['Churn']

In [86]:
# Split the dataset into training and test sets
from sklearn.model_selection import train_test_split, cross_val_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [87]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5634 entries, 2142 to 860
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            5634 non-null   int64  
 1   SeniorCitizen     5634 non-null   int64  
 2   Partner           5634 non-null   int64  
 3   Dependents        5634 non-null   int64  
 4   tenure            5634 non-null   int64  
 5   PhoneService      5634 non-null   int64  
 6   MultipleLines     5634 non-null   int64  
 7   InternetService   5634 non-null   int64  
 8   OnlineSecurity    5634 non-null   int64  
 9   OnlineBackup      5634 non-null   int64  
 10  DeviceProtection  5634 non-null   int64  
 11  TechSupport       5634 non-null   int64  
 12  StreamingTV       5634 non-null   int64  
 13  StreamingMovies   5634 non-null   int64  
 14  Contract          5634 non-null   int64  
 15  PaperlessBilling  5634 non-null   int64  
 16  PaymentMethod     5634 non-null   int64 

In [88]:
# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [90]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

## Decision Tree Classifier

In [93]:
dt = DecisionTreeClassifier(criterion = 'entropy',random_state= 0) 
dt.fit(X_train,y_train)

y_pred = dt.predict(X_test)

from sklearn.metrics import confusion_matrix,classification_report,accuracy_score 
print(confusion_matrix(y_test, y_pred)) 
print(classification_report(y_test,y_pred))
print('Accuracy is ' + str(accuracy_score(y_test, y_pred)*100) + ' %.')

[[816 220]
 [189 184]]
              precision    recall  f1-score   support

           0       0.81      0.79      0.80      1036
           1       0.46      0.49      0.47       373

    accuracy                           0.71      1409
   macro avg       0.63      0.64      0.64      1409
weighted avg       0.72      0.71      0.71      1409

Accuracy is 70.97232079489 %.


## Logistic Regression

In [95]:
lr = LogisticRegression(random_state= 0) 
lr.fit(X_train,y_train)

y_pred = lr.predict(X_test)

from sklearn.metrics import confusion_matrix,classification_report,accuracy_score 
print(confusion_matrix(y_test, y_pred)) 
print(classification_report(y_test,y_pred))
print('Accuracy is ' + str(accuracy_score(y_test, y_pred)*100) + ' %.')

[[940  96]
 [163 210]]
              precision    recall  f1-score   support

           0       0.85      0.91      0.88      1036
           1       0.69      0.56      0.62       373

    accuracy                           0.82      1409
   macro avg       0.77      0.74      0.75      1409
weighted avg       0.81      0.82      0.81      1409

Accuracy is 81.61816891412349 %.


## Random Forest

In [97]:
rf = RandomForestClassifier(random_state= 0) 
rf.fit(X_train,y_train)

y_pred = rf.predict(X_test)

from sklearn.metrics import confusion_matrix,classification_report,accuracy_score 
print(confusion_matrix(y_test, y_pred)) 
print(classification_report(y_test,y_pred))
print('Accuracy is ' + str(accuracy_score(y_test, y_pred)*100) + ' %.')

[[945  91]
 [200 173]]
              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1036
           1       0.66      0.46      0.54       373

    accuracy                           0.79      1409
   macro avg       0.74      0.69      0.70      1409
weighted avg       0.78      0.79      0.78      1409

Accuracy is 79.347054648687 %.


## Gradient Boosting Classifier

In [99]:
gb = GradientBoostingClassifier(random_state= 0) 
gb.fit(X_train,y_train)

y_pred = gb.predict(X_test)

from sklearn.metrics import confusion_matrix,classification_report,accuracy_score 
print(confusion_matrix(y_test, y_pred)) 
print(classification_report(y_test,y_pred))
print('Accuracy is ' + str(accuracy_score(y_test, y_pred)*100) + ' %.')

[[945  91]
 [174 199]]
              precision    recall  f1-score   support

           0       0.84      0.91      0.88      1036
           1       0.69      0.53      0.60       373

    accuracy                           0.81      1409
   macro avg       0.77      0.72      0.74      1409
weighted avg       0.80      0.81      0.80      1409

Accuracy is 81.19233498935415 %.


In [89]:
# Base models
models = [('lr', LogisticRegression(random_state=42)),
          ('dt', DecisionTreeClassifier(random_state=42)),
          ('rf', RandomForestClassifier(random_state=42)),
          ('gb', GradientBoostingClassifier(random_state=42)),
          ('nn', MLPClassifier(random_state=42))]

# Train base models and obtain their predictions
base_models = []
for name, model in models:
    pipeline = make_pipeline(clone(model))
    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
    print(f'{name}: mean={np.mean(scores):.3f}, std={np.std(scores):.3f}')
    pipeline.fit(X_train, y_train)
    base_models.append(pipeline)

predictions = np.zeros((X_train.shape[0], len(models)))
for i, model in enumerate(base_models):
    predictions[:, i] = model.predict_proba(X_train)[:, 1]

# Meta-model
meta_model = LogisticRegression(random_state=42)
scores = cross_val_score(meta_model, predictions, y_train, cv=5, scoring='accuracy')
print(f'Meta-model: mean={np.mean(scores):.3f}, std={np.std(scores):.3f}')
meta_model.fit(predictions, y_train)

# Evaluate the stacked ensemble model on the test set
test_predictions = np.zeros((X_test.shape[0], len(models)))
for i, model in enumerate(base_models):
    test_predictions[:, i] = model.predict_proba(X_test)[:, 1]
stacked_predictions = meta_model.predict_proba(test_predictions)[:, 1]
accuracy = accuracy_score(y_test, stacked_predictions.round())
precision = precision_score(y_test, stacked_predictions.round())
recall = recall_score(y_test, stacked_predictions.round())
f1 = f1


lr: mean=0.800, std=0.011
dt: mean=0.729, std=0.010
rf: mean=0.794, std=0.010
gb: mean=0.800, std=0.008




nn: mean=0.777, std=0.008
Meta-model: mean=0.999, std=0.001




NameError: name 'f1' is not defined