## Import Library

In [373]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [374]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from imblearn.over_sampling import SMOTE 

from xgboost import XGBClassifier
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, recall_score

## Load dataset

In [375]:
# Load the Telco customer churn CSV.
# The path is a raw string: a normal literal would contain invalid escape
# sequences such as "\B" and "\d", which trigger warnings on recent Python
# versions. The resulting path value is unchanged.
df = pd.read_csv(r'D:\BOOTCAMP\Day 29 - Exploring Machine Learning Models with Python (Supervised) Part II\dataset\WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [376]:
# Column dtypes and non-null counts of the freshly loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


## Data Cleaning

In [377]:
# Number of missing values in each column (isna is the same check as isnull)
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [378]:
# Show rows that are exact duplicates of an earlier row (all columns compared).
# `duplicated` has to be called: indexing with the bare method only worked
# because pandas invokes callable indexers with the frame as argument.
df[df.duplicated()]

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn


* Tidak ada nilai yang hilang maupun baris duplikat pada dataset.

In [379]:
# Strip surrounding whitespace, then parse the text as numbers;
# entries that cannot be parsed become NaN (errors='coerce').
total_charges_text = df['TotalCharges'].str.strip()
df['TotalCharges'] = pd.to_numeric(total_charges_text, errors='coerce')

In [380]:
# Re-check dtypes and non-null counts after converting TotalCharges to numeric
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


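* Jumlah nilai non-null pada kolom TotalCharges berkurang, karena nilai yang tidak dapat dikonversi ke angka (misalnya string kosong) berubah menjadi NaN.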

In [381]:
# Fill the NaNs left by the numeric conversion with the column median
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())
# Encode the target as integers: "No" -> 0, "Yes" -> 1.
# The explicit cast keeps Churn numeric instead of relying on the deprecated
# implicit downcasting of object-dtype results in `replace`.
df['Churn'] = df['Churn'].replace({"No": 0, "Yes": 1}).astype('int64')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


### Data Preprocessing

In [382]:
# Categorical columns to one-hot encode (first level of each is dropped)
categorical_cols = [
    'gender', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
    'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
]
data = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

# Cast every boolean column to 1/0 integers; other columns are left as they are
bool_cols = data.select_dtypes(include='bool').columns
data = data.astype({col: int for col in bool_cols})
data.head()


Unnamed: 0,customerID,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,7590-VHVEG,0,1,29.85,29.85,0,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
1,5575-GNVDE,0,34,56.95,1889.5,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,1
2,3668-QPYBK,0,2,53.85,108.15,1,1,0,0,1,...,0,0,0,0,0,0,1,0,0,1
3,7795-CFOCW,0,45,42.3,1840.75,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,9237-HQITU,0,2,70.7,151.65,1,0,0,0,1,...,0,0,0,0,0,0,1,0,1,0


In [383]:
# Mean-impute NaN values in TotalCharges.
# NOTE(review): an earlier cell already fills this column with its median, so
# this step is expected to leave the values unchanged - confirm it is still needed.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
total_charges_2d = data["TotalCharges"].to_numpy().reshape(-1, 1)
data["TotalCharges"] = imputer.fit_transform(total_charges_2d)

## Split Dataset

In [384]:
# Separate target and features (customerID is an identifier, not a feature)
y = data['Churn']
x = data.drop(columns=['Churn', 'customerID'])

# Oversample with SMOTE using a fixed seed.
# NOTE(review): x_sm / y_sm are not referenced by the later cells visible in
# this notebook (the splits below use x and y). If resampling is wanted, fit
# SMOTE on the training split only so synthetic rows cannot leak into test data.
sm = SMOTE(random_state=42)
x_sm, y_sm = sm.fit_resample(x, y)

print(f'''Shape of x before SMOTE: {x.shape}
Shape of x after SMOTE: {x_sm.shape}''')

print('\nBalance of positive and negative classes (%):')
y_sm.value_counts(normalize=True) * 100

Shape of x before SMOTE: (7043, 30)
Shape of x after SMOTE: (10348, 30)

Balance of positive and negative classes (%):


Churn
0    50.0
1    50.0
Name: proportion, dtype: float64

## Train and Val

In [385]:
# Train / validation split: 20% of the rows held out, fixed seed.
# NOTE(review): the following cell splits x / y again and rebinds
# x_train / y_train, so the training part of this split is not used afterwards.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [386]:
# Train / test split: 25% of the rows held out for the final test, fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

### Modeling (membandingkan hasil evaluasi 3 model)

In [387]:
# Candidate classifiers to compare. The stochastic models get a fixed seed so
# the comparison table is reproducible between runs.
cly = [
    LogisticRegression(),
    RandomForestClassifier(random_state=42),
    XGBClassifier(random_state=42),
]

# Empty results table, one row per model.
# NOTE(review): this rebinds `df`, so the customer frame loaded at the top is
# no longer reachable under that name from here on.
df = pd.DataFrame(columns=['Method', 'Accuracy'])
df

Unnamed: 0,Method,Accuracy


In [388]:
# Fit every candidate on the training split and record its test accuracy.
rows = []
for model in cly:
    model.fit(x_train, y_train)
    preds = model.predict(x_test)

    rows.append({
        'Method': type(model).__name__,  # class name, e.g. "LogisticRegression"
        'Accuracy': accuracy_score(y_test, preds),
    })

# Build the table once from the collected records. Re-running this cell now
# yields the same table instead of appending duplicate rows to `df`.
df = pd.DataFrame(rows, columns=['Method', 'Accuracy'])

In [None]:
# Display the accuracy table (at this point `df` holds the model results)
df

Unnamed: 0,Method,Accuracy
0,LogisticRegression,0.810335
1,RandomForestClassifier,0.791596
2,XGBClassifier,0.793299


In [390]:
# Standardise the features; the scaler is fitted on the training split only
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_std = scaler.fit_transform(x_train)

## cross validation

In [391]:
from sklearn.model_selection import cross_val_score

# Logistic regression model to cross-validate
model = LogisticRegression()

# Cross-validation with cv=5, scored with F1
cv_scores = cross_val_score(model, x_std, y_train, cv=5, scoring='f1')

# Print the per-fold scores first, then their mean
print("F1-score tiap fold:", cv_scores)
print("F1-score rata-rata:", cv_scores.mean())

F1-score rata-rata: [0.61923077 0.5841785  0.58333333 0.55984556 0.57668712]
F1-score rata-rata: 0.5846550555919763


In [392]:
# Sanity check: scaled features and training labels must have the same number of rows
x_std.shape, y_train.shape

((5282, 30), (5282,))

In [398]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, make_scorer

# Scorer that reports F1 for label 1 only
scorer = make_scorer(f1_score, pos_label=1)

# Shuffled 10-fold splitter with a fixed seed
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Cross-validate `model` on the scaled training data; show per-fold scores and their mean
scores = cross_val_score(model, x_std, y_train, cv=kf, scoring=scorer)
scores, scores.mean()

(array([0.61254613, 0.58436214, 0.52173913, 0.6015625 , 0.63082437,
        0.57268722, 0.59836066, 0.61538462, 0.56716418, 0.52066116]),
 np.float64(0.5825292100494786))

In [403]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model = LogisticRegression()

# Shuffled 5-fold splitter with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold metrics
f1_scores = []
recall_scores = []
precision_scores = []

for train_index, test_index in kf.split(x_std):
    x_train_, x_val = x_std[train_index], x_std[test_index]
    y_train_, y_val = y_train.values[train_index], y_train.values[test_index]

    model.fit(x_train_, y_train_)
    y_pred = model.predict(x_val)

    # Each list receives its own metric (recall_scores used to store
    # precision and precision_scores used to store F1).
    f1_scores.append(f1_score(y_val, y_pred))
    recall_scores.append(recall_score(y_val, y_pred))
    precision_scores.append(precision_score(y_val, y_pred))

# Mean and standard deviation of each metric across folds
f1_mean, f1_std = np.mean(f1_scores), np.std(f1_scores)
mae_std = f1_std  # previous (misleading) name, kept so existing references still work
recall_mean, recall_std = np.mean(recall_scores), np.std(recall_scores)
precision_mean, precision_std = np.mean(precision_scores), np.std(precision_scores)

# Refit on the whole scaled training set so the model used afterwards is
# trained on all training rows, not only on the last fold's subset.
model.fit(x_std, y_train)

## Test

In [404]:
# Scale the test features with the scaler that was fitted on the training split
X_test_std = scaler.transform(x_test)

In [405]:
# Predict labels for the scaled test set with the current `model`
y_test_pred = model.predict(X_test_std)

## Evaluation

In [409]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the test predictions
test_report = classification_report(y_test, y_test_pred)
print(test_report)

              precision    recall  f1-score   support

           0       0.85      0.90      0.88      1282
           1       0.69      0.58      0.63       479

    accuracy                           0.81      1761
   macro avg       0.77      0.74      0.75      1761
weighted avg       0.81      0.81      0.81      1761



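Model terbaik berdasarkan akurasi pada data uji adalah LogisticRegression (0.81), sedikit di atas XGBClassifier dan RandomForestClassifier (keduanya sekitar 0.79). Salah satu kemungkinan penyebabnya: RandomForestClassifier dan XGBClassifier adalah model yang lebih kompleks sehingga bisa saja terlalu cocok (overfit) dengan data pelatihan.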

kelas 0 mewakili pelanggan yang tidak churn (belum berhenti berlangganan).
kelas 1 mewakili pelanggan yang churn (sudah berhenti berlangganan).
* model memiliki performa yang baik dalam mendeteksi kelas 0 dengan nilai precision, recall, dan F1-score yang lebih tinggi dibandingkan kelas 1.
* pada kelas 1, precision dan recall lebih rendah, menunjukkan model mengalami kesulitan dalam mengidentifikasi sampel positif dengan benar.
* macro avg dan weighted avg memberikan gambaran umum performa model; weighted avg memperhitungkan jumlah sampel di setiap kelas, dan di sini menghasilkan nilai yang sama dengan akurasi model (0.81).
* accuracy: Persentase prediksi yang benar untuk seluruh sampel (0.81 atau 81%).


### Potential impact

1. precision, recall, dan F1-score untuk kelas 0:
Precision 0.85, recall 0.90, dan F1-score 0.88 menunjukkan bahwa model cukup baik dalam mengenali pelanggan yang tetap berlangganan (tidak churn).

2. precision, recall, dan F1-score untuk kelas 1 (churn):
Precision 0.69, recall 0.58, dan F1-score 0.63 menunjukkan bahwa model agak kurang optimal dalam mendeteksi pelanggan yang churn, dengan recall yang relatif rendah (58%).

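3. recall kelas 1 sebesar 0.58 berarti sekitar 42% pelanggan yang benar-benar churn pada data uji tidak terdeteksi (false negatives). Ini penting jika kamu ingin model lebih akurat dalam mengidentifikasi pelanggan yang churn agar bisa dilakukan tindakan pencegahan, karena pelanggan yang tidak terdeteksi tidak akan mendapat tindakan tersebut.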