In [234]:
# KELOMPOK 5
# AGUS ZULVANI    : 23/528776/PPA/06687
# ALIFIA INTAN AN : 23/530614/PPA/06740

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import Bunch
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [235]:
# DATA SOURCE
file_path = "/Users/aguszulvani/Downloads/churn/telecom_customer_churn.csv"
df_orig = pd.read_csv(file_path, sep = ',')

df = df_orig

# DATA PREPARATION
## DROP UNUSED FEATURES (Customer ID is no longer used, it is not affected dependence variable)
df.drop(columns=['Customer ID', 'Churn Reason', 'Churn Category'], inplace=True)

print("Original Data: ", df_orig.shape)
# print(df.dtypes)
df.head(2)

Original Data:  (7043, 35)


Unnamed: 0,Gender,Age,Married,Number of Dependents,City,Zip Code,Latitude,Longitude,Number of Referrals,Tenure in Months,Offer,Phone Service,Avg Monthly Long Distance Charges,Multiple Lines,Internet Service,Internet Type,Avg Monthly GB Download,Online Security,Online Backup,Device Protection Plan,Premium Tech Support,Streaming TV,Streaming Movies,Streaming Music,Unlimited Data,Contract,Paperless Billing,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status
0,Female,37,Yes,0,Frazier Park,93225,34.827662,-118.999073,2,9,,Yes,42.39,No,Yes,Cable,16.0,No,Yes,No,Yes,Yes,No,No,Yes,One Year,Yes,Credit Card,65.6,593.3,0.0,0,381.51,974.81,Stayed
1,Male,46,No,0,Glendale,91206,34.162515,-118.203869,0,9,,Yes,10.69,Yes,Yes,Cable,10.0,No,No,No,No,No,Yes,Yes,No,Month-to-Month,No,Credit Card,-4.0,542.4,38.33,10,96.21,610.28,Stayed


In [236]:
## CONVERT CATEGORICAL TO NUMERICAL FEATURES
# mapping_churn_status = {'Churned': 1, 'Stayed': 2, 'Joined': 3}
# df['Customer Status'] = df['Gender'].map(mapping_churn_status).fillna(0).astype(int)

mapping_gender = {'Male': 1, 'Female': 2}
df['Gender'] = df['Gender'].map(mapping_gender).fillna(0).astype(int)

mapping_yes_no = {'Yes': 1, 'No': 2}
df['Married'] = df['Married'].map(mapping_yes_no).fillna(0).astype(int)
df['Phone Service'] = df['Phone Service'].map(mapping_yes_no).fillna(0).astype(int)
df['Multiple Lines'] = df['Multiple Lines'].map(mapping_yes_no).fillna(0).astype(int)
df['Internet Service'] = df['Internet Service'].map(mapping_yes_no).fillna(0).astype(int)
df['Online Backup'] = df['Online Backup'].map(mapping_yes_no).fillna(0).astype(int)
df['Online Security'] = df['Online Security'].map(mapping_yes_no).fillna(0).astype(int)
df['Device Protection Plan'] = df['Device Protection Plan'].map(mapping_yes_no).fillna(0).astype(int)
df['Premium Tech Support'] = df['Premium Tech Support'].map(mapping_yes_no).fillna(0).astype(int)
df['Streaming TV'] = df['Streaming TV'].map(mapping_yes_no).fillna(0).astype(int)
df['Streaming Movies'] = df['Streaming Movies'].map(mapping_yes_no).fillna(0).astype(int)
df['Streaming Music'] = df['Streaming Music'].map(mapping_yes_no).fillna(0).astype(int)
df['Unlimited Data'] = df['Unlimited Data'].map(mapping_yes_no).fillna(0).astype(int)
df['Paperless Billing'] = df['Paperless Billing'].map(mapping_yes_no).fillna(0).astype(int)

df = df.fillna(0)
categorical_columns = ['City', 'Zip Code', 'Offer', 'Internet Type', 'Contract', 'Payment Method'] 
df_encoded = pd.get_dummies(df, columns=categorical_columns)

print("Data (Categorical to Numerical Conversion): ", df_encoded.shape)
print("Data (Without Categorical Numerical Conversion): ", df.shape)
# df.head(2)

# has_negative = (df_encoded < 0).any().any()
# print("Apakah ada nilai negatif di DataFrame?", has_negative)

Data (Categorical to Numerical Conversion):  (7043, 2777)
Data (Without Categorical Numerical Conversion):  (7043, 35)


In [237]:
# DATA INDEPENDENT VS DEPENDENT SPLITING
## DEPENDENT FEATURE
target_column = 'Customer Status'

## SEPARATE DEPDENDENT VS INDEPENDENT FEATURES
X = df_encoded.drop(target_column, axis=1)
y = df_encoded[target_column]

In [273]:
# FEATURE SELECTION ANOVA
## Data normalization
anova_n = 200
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## Calculate F value and P value (ANOVA)
f_values, p_values = f_classif(X_scaled, y)

feature_names = X.columns
anova_results = pd.DataFrame({'Feature': feature_names, 'F-Value': f_values, 'P-Value': p_values})
anova_results = anova_results.sort_values(by='F-Value', ascending=False)
# print("ANOVA F Score vs P Value")
# print("----------------------------------------")
# print(anova_results)

# Membuat objek SelectKBest dengan ANOVA sebagai fungsi skor
k_best = SelectKBest(score_func=f_classif, k=anova_n)  

# Melakukan seleksi fitur pada dataset
X_selected = k_best.fit_transform(X, y)

# Mendapatkan fitur-fitur terpilih
selected_features = k_best.get_support(indices=True)

# Mendapatkan nama fitur dan nama kelas
feature_names = list(X.columns)
target_names = y.unique()

selected_features_ = []
for i in selected_features:
    selected_features_.append(feature_names[i])

# drop unselected columns
selected_features_.append(target_column)
columns_to_drop = [col for col in df_encoded.columns if col not in selected_features_]
df_selected = df_encoded.drop(columns=columns_to_drop)
print("Data After Feature Selection: ", df_encoded.shape, df_selected.shape)

# Membuat objek Bunch
# bunch = Bunch(data=X.values, target=y.values, feature_names=feature_names, target_names=target_names)

Data After Feature Selection:  (7043, 2777) (7043, 201)


In [275]:
# print("\n")
# print("ANOVA Selected Features")
# print("----------------------------------------")
# for i in selected_features:
#     print(f"Fitur {i}: {bunch.feature_names[i]} : F-value = {f_values[i]}")

# print("\n")
# print("ANOVA Selected Features")
# print("----------------------------------------")
# for i in selected_features:
#     print(feature_names[i])

# print("selected features: ")
# print(selected_features_)

# columns_to_drop = ['Internet Type_0', 'Offer_Offer E']
# print("\ndrop columns: ")
# print(columns_to_drop)

# print("\nColumn Diff: ",len(selected_features), (len(columns_to_drop) - len(selected_features)))
# print("\nData Original: ", df.shape)
# print("Data Encoded: ", df_encoded.shape)

# print(df_selected.columns)
# df_selected.head(2)

In [274]:
# AFTER FEATURE SELECTION
# DATA INDEPENDENT VS DEPENDENT SPLITING
## DEPENDENT FEATURE

## SEPARATE DEPDENDENT VS INDEPENDENT FEATURES
X_anova = df_selected.drop(target_column, axis=1)
y_anova = df_selected[target_column]

# DATA TRAIN VS TEST SPLITTING
## ORIGINAL
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## ANOVA
X_train_anova, X_test_anova, y_train_anova, y_test_anova = train_test_split(X_anova, y_anova, test_size=0.25, random_state=42)

In [276]:
# GAUSIAN NAIVE BAYES
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)
print("Gaussian Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_gnb))
# print("Gaussian Naive Bayes Classification Report:\n", classification_report(y_test, y_pred_gnb))

gnb_anova = GaussianNB()
gnb_anova.fit(X_train_anova, y_train_anova)
y_pred_gnb_anova = gnb_anova.predict(X_test_anova)
print("ANOVA: ", anova_n, " Gaussian Naive Bayes Accuracy:", accuracy_score(y_test_anova, y_pred_gnb_anova))
# print("ANOVA Gaussian Naive Bayes Classification Report:\n", classification_report(y_test_anova, y_pred_gnb_anova))

Gaussian Naive Bayes Accuracy: 0.6672345258375922
ANOVA:  200  Gaussian Naive Bayes Accuracy: 0.7761476573592049


In [277]:
# RANDOM FOREST
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
# print("Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf))

rf_anova = RandomForestClassifier(n_estimators=100, random_state=42)
rf_anova.fit(X_train_anova, y_train_anova)
y_pred_rf_anova = rf_anova.predict(X_test_anova)
print("ANOVA: ", anova_n, " Random Forest Accuracy:", accuracy_score(y_test_anova, y_pred_rf_anova))
# print("ANOVA Random Forest Classification Report:\n", classification_report(y_test_anova, y_pred_rf_anova))

Random Forest Accuracy: 0.8370244179443498
ANOVA:  200  Random Forest Accuracy: 0.8428774254614293


In [278]:
# MLP
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)
print("MLP Accuracy:", accuracy_score(y_test, y_pred_mlp))
# print("MLP Classification Report:\n", classification_report(y_test, y_pred_mlp))

mlp_anova = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
mlp_anova.fit(X_train_anova, y_train_anova)
y_pred_mlp_anova = mlp_anova.predict(X_test_anova)
print("ANOVA: ", anova_n, " MLP Accuracy:", accuracy_score(y_test_anova, y_pred_mlp_anova))
# print("ANOVA MLP Classification Report:\n", classification_report(y_test_anova, y_pred_mlp_anova))

MLP Accuracy: 0.7910278250993753
ANOVA:  200  MLP Accuracy: 0.8258400378608614


In [284]:
# FEATURE SELECTION MUTAL INFORMATION
from sklearn.feature_selection import mutual_info_classif

mi_n = 200

# Hitung mutual information
mi = mutual_info_classif(X, y)

# Menampilkan hasil
# mi_df = pd.DataFrame({'Feature': X.columns, 'Mutual Information': mi}).sort_values(by='Mutual Information', ascending=False)
# mi_df.head(20)

k_best_mi = SelectKBest(score_func=mutual_info_classif, k=mi_n)  

# Melakukan seleksi fitur pada dataset
X_selected_mi = k_best_mi.fit_transform(X, y)

# Mendapatkan fitur-fitur terpilih
selected_features_mi = k_best_mi.get_support(indices=True)

selected_features_mi_ = []
for i in selected_features_mi:
    selected_features_mi_.append(feature_names[i])

# drop unselected columns
selected_features_mi_.append(target_column)
columns_to_drop_mi = [col for col in df_encoded.columns if col not in selected_features_mi_]
df_selected_mi = df_encoded.drop(columns=columns_to_drop_mi)
print("Data After Feature Selection: ", df_encoded.shape, df_selected_mi.shape)


Data After Feature Selection:  (7043, 2777) (7043, 201)


In [285]:
# AFTER FEATURE SELECTION
# DATA INDEPENDENT VS DEPENDENT SPLITING
## DEPENDENT FEATURE

## SEPARATE DEPDENDENT VS INDEPENDENT FEATURES
X_mi = df_selected_mi.drop(target_column, axis=1)
y_mi = df_selected_mi[target_column]

## ANOVA
X_train_mi, X_test_mi, y_train_mi, y_test_mi = train_test_split(X_mi, y_mi, test_size=0.25, random_state=42)

In [286]:
# GAUSIAN NAIVE BAYES
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)
print("Gaussian Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_gnb))
# print("Gaussian Naive Bayes Classification Report:\n", classification_report(y_test, y_pred_gnb))

gnb_anova = GaussianNB()
gnb_anova.fit(X_train_anova, y_train_anova)
y_pred_gnb_anova = gnb_anova.predict(X_test_anova)
print("ANOVA: ", anova_n, " Gaussian Naive Bayes Accuracy:", accuracy_score(y_test_anova, y_pred_gnb_anova))
# print("ANOVA Gaussian Naive Bayes Classification Report:\n", classification_report(y_test_anova, y_pred_gnb_anova))

gnb_mi = GaussianNB()
gnb_mi.fit(X_train_mi, y_train_mi)
y_pred_gnb_mi = gnb_mi.predict(X_test_mi)
print("MI: ", mi_n, " Gaussian Naive Bayes Accuracy:", accuracy_score(y_test_mi, y_pred_gnb_mi))
# print("ANOVA Gaussian Naive Bayes Classification Report:\n", classification_report(y_test_anova, y_pred_gnb_anova))

Gaussian Naive Bayes Accuracy: 0.6672345258375922
ANOVA:  200  Gaussian Naive Bayes Accuracy: 0.7761476573592049
MI:  200  Gaussian Naive Bayes Accuracy: 0.7160704145371948


In [287]:
# RANDOM FOREST
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
# print("Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf))

rf_anova = RandomForestClassifier(n_estimators=100, random_state=42)
rf_anova.fit(X_train_anova, y_train_anova)
y_pred_rf_anova = rf_anova.predict(X_test_anova)
print("ANOVA: ", anova_n, " Random Forest Accuracy:", accuracy_score(y_test_anova, y_pred_rf_anova))
# print("ANOVA Random Forest Classification Report:\n", classification_report(y_test_anova, y_pred_rf_anova))

rf_mi = RandomForestClassifier(n_estimators=100, random_state=42)
rf_mi.fit(X_train_mi, y_train_mi)
y_pred_rf_mi = rf_mi.predict(X_test_mi)
print("MI: ", mi_n, " Random Forest Accuracy:", accuracy_score(y_test_mi, y_pred_rf_mi))

Random Forest Accuracy: 0.8370244179443498
ANOVA:  200  Random Forest Accuracy: 0.8428774254614293
MI:  200  Random Forest Accuracy: 0.8398637137989778


In [288]:
# MLP
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)
print("MLP Accuracy:", accuracy_score(y_test, y_pred_mlp))
# print("MLP Classification Report:\n", classification_report(y_test, y_pred_mlp))

mlp_anova = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
mlp_anova.fit(X_train_anova, y_train_anova)
y_pred_mlp_anova = mlp_anova.predict(X_test_anova)
print("ANOVA: ", anova_n, " MLP Accuracy:", accuracy_score(y_test_anova, y_pred_mlp_anova))
# print("ANOVA MLP Classification Report:\n", classification_report(y_test_anova, y_pred_mlp_anova))

mlp_mi = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
mlp_mi.fit(X_train_mi, y_train_mi)
y_pred_mlp_mi = mlp_mi.predict(X_test_mi)
print("MI: ", mi_n, " MLP Accuracy:", accuracy_score(y_test_mi, y_pred_mlp_mi))


MLP Accuracy: 0.7910278250993753
ANOVA:  200  MLP Accuracy: 0.8258400378608614
MI:  200  MLP Accuracy: 0.5036910846110164
