# Import Library

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder

# Import Data

In [14]:
# Read the training and test CSV files from the current working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Eksplorasi Data

Memahami struktur dataset, distribusi data, dan potensi masalah seperti missing values atau ketidakseimbangan kelas.

In [15]:
# Show the column layout of both frames so differences between them are visible.
for frame_label, frame in (("train_data", train_data), ("test_data", test_data)):
    print(f"Kolom di {frame_label}:", frame.columns.tolist())

# Frequency table for each categorical column of the training frame.
# value_counts() skips missing values by default, so totals can differ per column.
inspected_columns = ['HasCrCard', 'IsActiveMember', 'Geography', 'Gender', 'Complain', 'Card Type']
for position, column_name in enumerate(inspected_columns):
    # Only the first table is preceded by a blank line, separating it from the column listings.
    separator = "\n" if position == 0 else ""
    print(f"{separator}Nilai unik di {column_name}:", train_data[column_name].value_counts())

Kolom di train_data: ['ID', 'Surname', 'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Complain', 'Satisfaction Score', 'Card Type', 'Point Earned', 'Exited']
Kolom di test_data: ['ID', 'Surname', 'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Complain', 'Satisfaction Score', 'Card Type', 'Point Earned']

Nilai unik di HasCrCard: HasCrCard
Yes    5700
No     2330
Name: count, dtype: int64
Nilai unik di IsActiveMember: IsActiveMember
Yes    3948
No     3680
Name: count, dtype: int64
Nilai unik di Geography: Geography
France     4006
Germany    2027
Spain      1997
Name: count, dtype: int64
Nilai unik di Gender: Gender
Male      4162
Female    3445
Name: count, dtype: int64
Nilai unik di Complain: Complain
No     6422
Yes    1608
Name: count, dtype: int64
Nilai unik di Card Type: Card Type
PLATINUM    2017
SILVER     

# Preprocessing Data

Membersihkan data, mengkonversi data ke format yang dapat digunakan oleh model, dan menormalkan fitur untuk meningkatkan performa model.

In [16]:
# Preprocess the train/test frames: drop the surname column, integer-encode the
# categorical columns, coerce and impute two numeric columns, then standardize
# the numeric features.
# NOTE: this cell rebinds `train_data` and overwrites columns of `test_data`,
# so run it exactly once after the load cell (a second run raises KeyError on
# the 'Surname' drop instead of silently re-encoding already encoded data).
train_data = train_data.drop(['Surname'], axis=1)
test_data = test_data.drop(['Surname'], axis=1)

# NOTE(review): 'ID' remains in X because the prediction cell passes
# `test_data` (which still carries 'ID') straight to the model. A row
# identifier is rarely a meaningful feature - consider excluding it from the
# features in both places.
X = train_data.drop(['Exited'], axis=1)
y = train_data['Exited']

categorical_cols = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember', 'Complain', 'Card Type']

# Code assigned to test-set labels that never occur in the training column.
UNSEEN_CATEGORY_CODE = -1

for col in categorical_cols:
    # astype(str) turns missing values into the literal label 'nan', so they
    # are encoded as their own category rather than rejected by the encoder.
    train_labels = X[col].astype(str)
    test_labels = test_data[col].astype(str)

    # The encoder is fitted on the training labels only.
    le = LabelEncoder()
    X[col] = le.fit_transform(train_labels)

    # le.transform() raises ValueError on labels it was not fitted on (for
    # example a missing value that appears only in the test file). Map through
    # the fitted classes instead and give unseen labels a sentinel code; when
    # every test label was seen in training the codes match le.transform().
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    test_data[col] = (
        test_labels.map(code_by_label)
        .fillna(UNSEEN_CATEGORY_CODE)
        .astype('int64')
    )

numeric_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary', 'Satisfaction Score', 'Point Earned']

# These two columns are coerced to numeric (unparseable entries become NaN) and
# then imputed with the training median, which is reused for the test frame so
# both frames are filled with the same value.
for num_col in ['Satisfaction Score', 'Point Earned']:
    X[num_col] = pd.to_numeric(X[num_col], errors='coerce')
    test_data[num_col] = pd.to_numeric(test_data[num_col], errors='coerce')

    train_median = X[num_col].median()
    X[num_col] = X[num_col].fillna(train_median)
    test_data[num_col] = test_data[num_col].fillna(train_median)

# Standardize numeric features with statistics learned from the training frame.
# NOTE(review): the scaler is fitted on all of X before the train/validation
# split performed in the training cell, so validation rows contribute to the
# scaling statistics.
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])
test_data[numeric_cols] = scaler.transform(test_data[numeric_cols])

# Training Model Random Forest

Melatih model machine learning untuk memprediksi apakah pelanggan akan churn atau tidak dengan menggunakan model random forest.

In [17]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Random forest with 100 trees, seeded so repeated runs build the same forest.
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Evaluasi Model

Mengukur performa model menggunakan metrik evaluasi untuk memastikan model cukup baik dalam memprediksi churn.

In [18]:
# Score the validation split. Column 1 of predict_proba is the probability of
# the second entry in model.classes_ (presumably Exited == 1 - confirm).
y_pred_proba = model.predict_proba(X_val)[:, 1]
y_pred = model.predict(X_val)

# ROC-AUC is computed from the probabilities, accuracy from the hard labels.
roc_auc = roc_auc_score(y_val, y_pred_proba)
accuracy = accuracy_score(y_val, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")

Accuracy: 0.9988
ROC-AUC Score: 0.9998


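Hasil Accuracy pada data validasi adalah 0.9988 (ROC-AUC 0.9998). Nilai yang hampir sempurna seperti ini patut dicurigai sebagai indikasi kebocoran target (data leakage), sehingga fitur yang sangat berkorelasi dengan `Exited` perlu diperiksa sebelum hasilnya dipercaya.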

# Prediksi dan Submission

Melakukan prediksi pada data test dan menyimpan hasilnya ke file `hasil.csv`.

In [20]:
# Predict on the preprocessed test frame and write the submission file.
test_predictions = model.predict(test_data)
# Probabilities from column 1 of predict_proba; kept for inspection, not written to the file.
test_predictions_proba = model.predict_proba(test_data)[:, 1]

# 'ID' is not among the scaled or encoded columns, so it still holds the original identifiers.
hasil = pd.DataFrame({
    'ID': test_data['ID'],
    'Exited': test_predictions
})

# Write the frame built above. The previous code called `submission.to_csv`,
# but no visible cell defines `submission`, so it raised NameError on a fresh kernel.
hasil.to_csv('hasil.csv', index=False)