<a href="https://colab.research.google.com/github/andraekaaaa-beep/Analisis-Prediksi-Probabilitas-Attrition-SML-A/blob/main/Notebook/SML_Fix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier,
                               ExtraTreesClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.calibration import CalibratedClassifierCV

In [22]:
train = pd.read_csv("/content/train.csv")
test = pd.read_csv("/content/test.csv")

In [23]:
X_train_raw = train.drop(['Attrition', 'id'], axis=1)
y_train = train['Attrition']
X_test_raw = test.drop(['id'], axis=1)
test_ids = test['id'].copy()

# Remove constant columns
constant_cols = [col for col in X_train_raw.columns if X_train_raw[col].nunique() == 1]
if constant_cols:
   X_train_raw = X_train_raw.drop(constant_cols, axis=1)
   X_test_raw = X_test_raw.drop(constant_cols, axis=1)
   print(f"Removed: {constant_cols}")

print(f"initial Train: {X_train_raw.shape}, Test: {X_test_raw.shape}")

Removed: ['EmployeeCount', 'Over18', 'StandardHours']
initial Train: (1176, 31), Test: (294, 31)


In [24]:
# Encode categorical
for col in X_train_raw.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X_train_raw[col] = le.fit_transform(X_train_raw[col].astype(str))
    X_test_raw[col] = le.transform(X_test_raw[col].astype(str))

for col in X_train_raw.columns:
    if X_train_raw[col].dtype == 'bool':
        X_train_raw[col] = X_train_raw[col].astype(int)

for col in X_test_raw.columns:
    if X_test_raw[col].dtype == 'bool':
        X_test_raw[col] = X_test_raw[col].astype(int)

print("Data types after converting boolean to int:")
print(X_train_raw.dtypes.value_counts())

Data types after converting boolean to int:
int64    31
Name: count, dtype: int64


In [25]:
categorical_cols = ['Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']
X_train = pd.get_dummies(X_train_raw, columns=categorical_cols, drop_first=True)
X_test = pd.get_dummies(X_test_raw, columns=categorical_cols, drop_first=True)

In [26]:
def create_features(df):
    df = df.copy()

    # --- REVISI 1: Jaminan Tipe Data Numerik untuk semua kolom yang dihitung/dikalikan ---
    # Ini mengatasi TypeError sebelumnya
    cols_to_numeric = [
        'YearsAtCompany', 'TotalWorkingYears', 'YearsInCurrentRole',
        'NumCompaniesWorked', 'YearsWithCurrManager', 'MonthlyIncome',
        'Age', 'YearsSinceLastPromotion', 'JobLevel', 'DistanceFromHome',
        'WorkLifeBalance', 'EnvironmentSatisfaction', 'JobSatisfaction',
        'RelationshipSatisfaction', 'PerformanceRating', 'OverTime' # Sertakan OverTime
    ]

    for col in cols_to_numeric:
        # Mengubah nilai non-numerik (jika ada) menjadi NaN, lalu mengisi NaN dengan 0
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    # --- REVISI 2: Menggunakan Kolom OverTime yang Sudah Di-encode/Dibersihkan ---
    # Karena OverTime sudah di-encode sebelumnya (misalnya 0=No, 1=Yes), kita bisa langsung menggunakannya.
    df['OverTime_Binary'] = df['OverTime'] # Kolom ini sekarang sudah pasti numerik

    # --- Fitur Ratio dan Transformasi (AMAN) ---
    df['ExperienceRatio'] = df['YearsAtCompany'] / (df['TotalWorkingYears'] + 1)
    df['CurrentRoleRatio'] = df['YearsInCurrentRole'] / (df['YearsAtCompany'] + 1)
    df['JobHoppingRate'] = df['NumCompaniesWorked'] / (df['TotalWorkingYears'] + 1)
    df['ManagerStability'] = df['YearsWithCurrManager'] / (df['YearsAtCompany'] + 1)
    df['IncomePerYear'] = df['MonthlyIncome'] / (df['TotalWorkingYears'] + 1)

    df['IsYoung'] = (df['Age'] < 30).astype(int)
    df['ShortTenure'] = (df['YearsAtCompany'] < 2).astype(int)
    df['TimeWithoutPromotion'] = (df['YearsSinceLastPromotion'] > 3).astype(int)
    df['LowJobLevel'] = (df['JobLevel'] <= 1).astype(int)
    df['LongCommute'] = (df['DistanceFromHome'] > 15).astype(int)
    df['PoorWorkLife'] = (df['WorkLifeBalance'] <= 2).astype(int)

    # --- Fitur Komposit dan Interaksi (AMAN) ---
    satisfaction_cols = ['EnvironmentSatisfaction', 'JobSatisfaction', 'RelationshipSatisfaction', 'WorkLifeBalance']
    df['AvgSatisfaction'] = df[satisfaction_cols].mean(axis=1)
    df['LowSatisfaction'] = (df['AvgSatisfaction'] < 2).astype(int)

    # Menggunakan OverTime_Binary yang sudah numerik
    df['HighRisk_Flag'] = ((df['OverTime_Binary'] == 1) & (df['JobSatisfaction'] <= 2) & (df['WorkLifeBalance'] <= 2)).astype(int)
    df['Career_Stagnation'] = ((df['YearsSinceLastPromotion'] > 5) & (df['YearsAtCompany'] > 10)).astype(int)

    # Operasi Perkalian (Sekarang Aman)
    df['Income_JobLevel'] = df['MonthlyIncome'] * df['JobLevel']
    df['Age_Experience'] = df['Age'] * df['TotalWorkingYears']
    df['Satisfaction_Performance'] = df['AvgSatisfaction'] * df['PerformanceRating']

    # Attrition Risk Score (Sekarang Aman)
    df['AttritionRiskScore'] = (df['LowSatisfaction'] * 2.5 + df['OverTime_Binary'] * 2.0 + df['LongCommute'] * 1.0 +
                                 df['TimeWithoutPromotion'] * 2.0 + df['ShortTenure'] * 1.5 + df['JobHoppingRate'] * 12 +
                                 df['PoorWorkLife'] * 1.5 + df['LowJobLevel'] * 0.5 + df['HighRisk_Flag'] * 3.0 + df['Career_Stagnation'] * 2.0)

    return df

X_eng = create_features(X_train_raw)
X_test_eng = create_features(X_test_raw)
print(f"Features: {X_train_raw.shape[1]} -> {X_eng.shape[1]}")

Features: 31 -> 51


In [27]:
rf = RandomForestClassifier(n_estimators=200, max_depth=8, random_state=42, n_jobs=-1)
rf.fit(X_eng, y_train)

feat_imp = pd.DataFrame({
    'feature': X_eng.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTOP 20:")
print(feat_imp.head(20).to_string(index=False))
N_FEAT = min(45, len(feat_imp))
selected = feat_imp.head(N_FEAT)['feature'].tolist()
X_sel = X_eng[selected].copy()
X_test_sel = X_test_eng[selected].copy()
print(f"\nSelected: {N_FEAT} features")
# Plot
fig, ax = plt.subplots(figsize=(12, 10))
top = feat_imp.head(30)
ax.barh(range(len(top)), top['importance'], color='steelblue')
ax.set_yticks(range(len(top)))
ax.set_yticklabels(top['feature'], fontsize=9)
ax.set_xlabel('Importance')
ax.set_title('Feature Importances', fontweight='bold')
ax.invert_yaxis()
plt.tight_layout()
plt.savefig('01_features.png', dpi=300, bbox_inches='tight')
plt.close()


TOP 20:
                 feature  importance
      AttritionRiskScore    0.083664
          Age_Experience    0.055707
         Income_JobLevel    0.045222
          JobHoppingRate    0.044507
           MonthlyIncome    0.044037
                     Age    0.039775
       TotalWorkingYears    0.034263
           IncomePerYear    0.031106
               DailyRate    0.030253
          EmployeeNumber    0.029076
         AvgSatisfaction    0.028395
              HourlyRate    0.027371
             MonthlyRate    0.026661
        DistanceFromHome    0.026078
         OverTime_Binary    0.025226
        StockOptionLevel    0.023814
Satisfaction_Performance    0.023747
                OverTime    0.021945
 EnvironmentSatisfaction    0.019950
         ExperienceRatio    0.019476

Selected: 45 features


In [28]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
#from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score, classification_report, roc_curve
import xgboost as xgb
import lightgbm as lgb

class_ratio = len(y_train[y_train == 0]) / len(y_train[y_train == 1])
print(f"Calculated Scale Pos Weight (Ratio 0/1): {class_ratio:.2f}")

# Split data untuk validation
X_temp, X_val, y_temp, y_val = train_test_split(
    X_sel, y_train, test_size=0.2, random_state=42, stratify=y_train
)

print("\nData Split Status (ASLI, tanpa SMOTE):")
print(f"X_temp (Training untuk Tuning): {X_temp.shape}")
print(f"X_val (Validation Set): {X_val.shape}")

# Definisikan model dengan menggunakan scale_pos_weight
models = {
    'XGBoost': xgb.XGBClassifier(
        n_estimators=200, max_depth=6, learning_rate=0.1, random_state=42, n_jobs=-1,
        scale_pos_weight=class_ratio # <-- PENGGANTI SMOTE
    ),
    'LightGBM': lgb.LGBMClassifier(
        n_estimators=200, max_depth=6, learning_rate=0.1, random_state=42, n_jobs=-1,
        scale_pos_weight=class_ratio # <-- PENGGANTI SMOTE
    ),
    'RandomForest': RandomForestClassifier(
        n_estimators=200, max_depth=8, random_state=42, n_jobs=-1,
        class_weight='balanced' # <-- PENGGANTI SMOTE untuk RF
    ),
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = {}

for name, model in models.items():
    # Model dilatih dan diuji pada data training asli (X_temp, y_temp)
    cv_score = cross_val_score(model, X_temp, y_temp, cv=cv, scoring='roc_auc', n_jobs=-1)
    cv_scores[name] = cv_score
    print(f"{name} (CV AUC REAL): {cv_score.mean():.4f} (+/- {cv_score.std() * 2:.4f})")

Calculated Scale Pos Weight (Ratio 0/1): 5.19

Data Split Status (ASLI, tanpa SMOTE):
X_temp (Training untuk Tuning): (940, 45)
X_val (Validation Set): (236, 45)
XGBoost (CV AUC REAL): 0.7882 (+/- 0.0655)
LightGBM (CV AUC REAL): 0.7928 (+/- 0.0610)
RandomForest (CV AUC REAL): 0.7812 (+/- 0.0858)


In [29]:
from sklearn.model_selection import RandomizedSearchCV

# Tuning untuk XGBoost (biasanya performa terbaik)
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 6],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'reg_alpha': [0.1, 1.5, 10],
}

xgb_model = xgb.XGBClassifier(
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=class_ratio
)

random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring='roc_auc',
    random_state=42,
    n_jobs=-1,
)

print("Tuning XGBoost pada data ASLI...")
# REVISI 4: Menggunakan data asli (X_temp, y_temp)
random_search.fit(X_temp, y_temp)

print("Best parameters (melengkapi fixed scale_pos_weight):", random_search.best_params_)
print("Best CV AUC (pada data ASLI):", random_search.best_score_)

best_model = random_search.best_estimator_

Tuning XGBoost pada data ASLI...
Best parameters (melengkapi fixed scale_pos_weight): {'subsample': 0.8, 'reg_alpha': 0.1, 'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.2, 'colsample_bytree': 0.9}
Best CV AUC (pada data ASLI): 0.8184435607410279


In [30]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV

print("--- 1. TUNING LIGHTGBM ---")

# Definisikan search space untuk LGBM
lgbm_param_dist = {
    'num_leaves': [20, 31, 50, 70, 100],
    'max_depth': [-1, 4, 6, 8, 10],
    'learning_rate': [0.005, 0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 500, 1000],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1],
    'reg_lambda': [0, 0.1, 0.5, 1],
}


# Set scale_pos_weight ke rasio kelas langsung di model
lgbm_model = lgb.LGBMClassifier(
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=class_ratio
)

lgbm_random_search = RandomizedSearchCV(
    lgbm_model,
    lgbm_param_dist,
    n_iter=100, # Ulangi 30 kali
    cv=10,      # 5-fold CV untuk hasil yang lebih stabil
    scoring='roc_auc',
    random_state=42,
    n_jobs=-1
)

# Kita pakai X_sel/y_train untuk mencari parameter terbaik dari seluruh data yang tersedia.
lgbm_random_search.fit(X_sel, y_train)

best_lgbm_params = lgbm_random_search.best_params_
print("LightGBM Best Parameters:", best_lgbm_params)
print("LightGBM Best CV AUC:", lgbm_random_search.best_score_)

--- 1. TUNING LIGHTGBM ---
[LightGBM] [Info] Number of positive: 190, number of negative: 986
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000263 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2763
[LightGBM] [Info] Number of data points in the train set: 1176, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.161565 -> initscore=-1.646632
[LightGBM] [Info] Start training from score -1.646632
LightGBM Best Parameters: {'subsample': 1.0, 'reg_lambda': 0, 'reg_alpha': 0.1, 'num_leaves': 70, 'n_estimators': 1000, 'max_depth': 10, 'learning_rate': 0.1, 'colsample_bytree': 0.6}
LightGBM Best CV AUC: 0.8136211741474899


In [31]:
from sklearn.ensemble import VotingClassifier

print("\n--- 2. FINAL ENSEMBLE ---")

# Definisikan model akhir menggunakan parameter terbaik dan class weight

xgb_tuned = xgb.XGBClassifier(
    **random_search.best_params_,
    random_state=42, n_jobs=-1,
    scale_pos_weight=class_ratio
)

lgbm_tuned = lgb.LGBMClassifier(
    **best_lgbm_params,
    random_state=42, n_jobs=-1,
    scale_pos_weight=class_ratio
)

rf_balanced = RandomForestClassifier(
    n_estimators=200, max_depth=8, random_state=42, n_jobs=-1,
    class_weight='balanced'
)

# Ensemble dengan soft voting
#ensemble_final = VotingClassifier(
 #   estimators=[(name, model) for name, model in final_models_tuned.items()],
  #  voting='soft',
   # n_jobs=-1
#)

ensemble_final = VotingClassifier(
    estimators=[
        ('xgb', xgb_tuned),
        ('lgb', lgbm_tuned),
        ('rf', rf_balanced)
    ],
    voting='soft',
    # Sesuaikan bobot ini berdasarkan hasil CV AUC Anda yang paling jujur.
    # Jika LGBM terbaik, beri bobot lebih tinggi. Contoh: [1.5, 2.5, 1.0]
    weights=[1, 2, 1],
    n_jobs=-1
)

print("Training final ensemble model on all data...")

# Latih ENSEMBLE pada SELURUH data training ASLI (X_sel, y_train)
ensemble_final.fit(X_sel, y_train)

# Predict on test set
test_preds_ensemble = ensemble_final.predict_proba(X_test_sel)[:, 1]
print("Ensemble Training Selesai.")


--- 2. FINAL ENSEMBLE ---
Training final ensemble model on all data...
Ensemble Training Selesai.


In [32]:
import numpy as np
import pandas as pd
from sklearn.ensemble import VotingClassifier

# --- Asumsi: xgb_tuned, lgbm_tuned, X_sel, y_train, X_test_sel, class_ratio, test_ids sudah terdefinisi ---

print("--- 1. PELATIHAN MODEL TERBAIK PADA SELURUH DATA ---")

# 1. Pelatihan Model XGBoost (Model 1)
print("Training XGBoost...")
xgb_tuned.fit(X_sel, y_train)
xgb_preds = xgb_tuned.predict_proba(X_test_sel)[:, 1]

# 2. Pelatihan Model LightGBM (Model 2)
print("Training LightGBM...")
lgbm_tuned.fit(X_sel, y_train)
lgbm_preds = lgbm_tuned.predict_proba(X_test_sel)[:, 1]


print("\n--- 2. WEIGHTED AVERAGING & KALIBRASI ---")

# 3. Weighted Average (Averaging Berbobot)
# Beri bobot lebih tinggi pada model yang memiliki skor CV/Validation terbaik.

WEIGHT_LGBM = 0.83
WEIGHT_XGB = 0.17

ensemble_preds_raw = WEIGHT_LGBM * lgbm_preds + WEIGHT_XGB * xgb_preds

# 4. Kalibrasi (Kunci untuk skor Leaderboard)
expected_rate = y_train.mean()
current_mean = ensemble_preds_raw.mean()

calibrated_preds_final = ensemble_preds_raw * (expected_rate / current_mean)
calibrated_preds_final = np.clip(calibrated_preds_final, 0, 1)

# 5. Submission
submission_avg = pd.DataFrame({
    'id': test_ids,
    'Attrition': calibrated_preds_final
})

submission_avg.to_csv('submission_last1.csv', index=False)

print("✅ Submission file created: submission_weighted_average_final.csv")
print(f"Final Mean Prediction (Calibrated): {calibrated_preds_final.mean():.4f}")

--- 1. PELATIHAN MODEL TERBAIK PADA SELURUH DATA ---
Training XGBoost...
Training LightGBM...
[LightGBM] [Info] Number of positive: 190, number of negative: 986
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001360 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2763
[LightGBM] [Info] Number of data points in the train set: 1176, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.161565 -> initscore=-1.646632
[LightGBM] [Info] Start training from score -1.646632

--- 2. WEIGHTED AVERAGING & KALIBRASI ---
✅ Submission file created: submission_weighted_average_final.csv
Final Mean Prediction (Calibrated): 0.1437
