# **Exploratory Data Analysis**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [None]:
# Load the Netflix customer churn CSV from the Kaggle input directory
df = pd.read_csv('/kaggle/input/netflix-customer-churn-dataset/netflix_customer_churn.csv')

In [None]:
# Preview the first rows of the dataset
df.head()

In [None]:
# Preview the last rows of the dataset
df.tail()

In [None]:
# Summarise column dtypes and non-null counts
df.info()

In [None]:
# Count rows per value of the `churned` column
df['churned'].value_counts()

In [None]:
# Count rows per value of the `subscription_type` column
df['subscription_type'].value_counts()

In [None]:
# Count rows per value of the `device` column
df['device'].value_counts()

In [None]:
sns.set_theme(style="whitegrid", palette="pastel")

# (column, title, x-axis label) for each panel, left to right
count_panels = [
    ('churned', 'Churn Distribution', 'Churned'),
    ('subscription_type', 'Subscription Type Distribution', 'Subscription Type'),
    ('device', 'Distribution Type Device', 'Device'),
]

# One row of three count plots, one per categorical column
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, (column, title, x_label) in zip(axes, count_panels):
    sns.countplot(x=column, data=df, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

In [None]:
sns.set_theme(style="whitegrid", palette="viridis")

# Box plot of `age`, one box per value of `churned`
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='churned', y='age', ax=ax)
ax.set_title('Age distribution', fontsize=14)
ax.set_xlabel('Churned')
ax.set_ylabel('age')
plt.show()

# Preprocessing

In [None]:
# Target column, and the feature frame without the target and the customer_id column
y = df['churned']
X = df.drop(columns=['customer_id', 'churned'])

# One-hot encode every object-dtype column, dropping the first level of each
categorical_cols = X.select_dtypes(include='object').columns
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Persist the encoded column layout so it can be reloaded alongside the models
model_columns = X_encoded.columns
joblib.dump(model_columns, 'model_columns.pkl')

X_encoded.head()

In [None]:
# Preview the last rows of the encoded feature frame
X_encoded.tail()

# Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

In [None]:
# Split the data: 20% held out for testing, stratified on y, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
# Random Forest: fit on the training split (fit returns the estimator itself)
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Save the fitted model, then predict labels for the test split
joblib.dump(rf_model, 'random_forest.pkl')
y_pred_rf = rf_model.predict(X_test)

In [None]:
# Per-class precision, recall and F1 for the Random Forest on the test split
print(classification_report(y_test, y_pred_rf))

In [None]:
# Label the forest's importance scores with the encoded column names
importances = pd.Series(rf_model.feature_importances_, index=X_encoded.columns)
top_features = importances.nlargest(10)

# Horizontal bar chart of the ten largest scores
fig, ax = plt.subplots(figsize=(10, 6))
top_features.plot.barh(ax=ax)
ax.invert_yaxis()  # put the largest importance at the top
ax.set_title("Top 10 Feature")
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
plt.show()

In [None]:
# XGBoost model: fit on the training split, save it, then predict the test split.
# `use_label_encoder` is intentionally not passed: it is a deprecated XGBoost
# parameter that current releases ignore with an "unused parameter" warning.
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
joblib.dump(xgb_model, 'xgboost_model.pkl')
y_pred_xgb = xgb_model.predict(X_test)

In [None]:
# Per-class precision, recall and F1 for XGBoost on the test split
print(classification_report(y_test, y_pred_xgb))

In [None]:
# Confusion matrix of the Random Forest predictions on the test split
cm_rf = confusion_matrix(y_test, y_pred_rf)

# Tick labels are applied in confusion_matrix's sorted label order;
# assumes the lower label means "not churned" ('Bertahan') - TODO confirm
class_labels = ['Bertahan', 'Churn']

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm_rf,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix - Random Forest', fontsize=14)
ax.set_xlabel('Prediksi')
ax.set_ylabel('Aktual')
plt.show()

In [None]:
# Confusion matrix of the XGBoost predictions on the test split
cm_xgb = confusion_matrix(y_test, y_pred_xgb)

# Tick labels are applied in confusion_matrix's sorted label order;
# assumes the lower label means "not churned" ('Bertahan') - TODO confirm
class_labels = ['Bertahan', 'Churn']

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm_xgb,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix - XGBoost', fontsize=14)
ax.set_xlabel('Prediksi')
ax.set_ylabel('Aktual')
plt.show()

In [None]:
# Scores for the ROC curves: second column of each model's predict_proba output
rf_scores = rf_model.predict_proba(X_test)[:, 1]
xgb_scores = xgb_model.predict_proba(X_test)[:, 1]

# ROC curve and area under it for Random Forest
fpr_rf, tpr_rf, _ = roc_curve(y_test, rf_scores)
roc_auc_rf = auc(fpr_rf, tpr_rf)

# ROC curve and area under it for XGBoost
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, xgb_scores)
roc_auc_xgb = auc(fpr_xgb, tpr_xgb)

# Draw both curves on one Axes, plus the diagonal reference line
fig, ax = plt.subplots(figsize=(8, 7))
ax.plot(fpr_rf, tpr_rf, color='darkorange', lw=2, label=f'Random Forest (AUC = {roc_auc_rf:.2f})')
ax.plot(fpr_xgb, tpr_xgb, color='green', lw=2, label=f'XGBoost (AUC = {roc_auc_xgb:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_title('Perbandingan ROC Curve', fontsize=16)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend(loc="lower right")
plt.show()

# Testing

In [None]:
def get_prediction(input_df, model, columns):
    """Encode a raw input frame, align it to `columns`, and score its first row.

    Args:
        input_df: DataFrame of raw features; passed to pd.get_dummies as-is.
        model: Object exposing `predict` and `predict_proba`.
        columns: Column labels the encoded frame is reindexed to.

    Returns:
        A tuple `(prediction, probability)` holding the first element of
        `model.predict(...)` and the first element of
        `model.predict_proba(...)`.
    """
    # Encode, then conform to the requested columns: columns the encoding did
    # not produce are filled with 0, and columns not requested are dropped
    aligned = pd.get_dummies(input_df).reindex(columns=columns, fill_value=0)

    label = model.predict(aligned)[0]
    class_probs = model.predict_proba(aligned)[0]
    return label, class_probs

# Reload the saved model, the saved column layout and the raw data from disk
model_to_test = joblib.load('xgboost_model.pkl')
model_columns = joblib.load('model_columns.pkl')
df = pd.read_csv('/kaggle/input/netflix-customer-churn-dataset/netflix_customer_churn.csv')

# Take one row by position and split it into raw features and the true label.
# NOTE(review): the row is drawn from the full dataset, so it may have been in
# the training split - treat this as a smoke test, not a held-out evaluation.
sample_index = 100
sample_data = df.iloc[[sample_index]]
X_sample = sample_data.drop(['customer_id', 'churned'], axis=1)
y_actual = sample_data['churned'].iloc[0]

pred, proba = get_prediction(X_sample, model_to_test, model_columns)

print(f"Index {sample_index}: Actual({y_actual})")
print(f"Prediksi({pred})")
# Also report the predict_proba output, which was computed but never shown
print(f"Probability per class: {proba}")