In [5]:
!pip install lightgbm -U imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.14.1-py3-none-any.whl.metadata (8.9 kB)
Collecting sklearn-compat<0.2,>=0.1.5 (from imbalanced-learn)
  Downloading sklearn_compat-0.1.5-py3-none-any.whl.metadata (20 kB)
Downloading imbalanced_learn-0.14.1-py3-none-any.whl (235 kB)
   ---------------------------------------- 0.0/235.4 kB ? eta -:--:--
   ------ --------------------------------- 41.0/235.4 kB 1.9 MB/s eta 0:00:01
   -------------------------- ------------- 153.6/235.4 kB 2.3 MB/s eta 0:00:01
   ---------------------------------------- 235.4/235.4 kB 2.0 MB/s eta 0:00:00
Downloading sklearn_compat-0.1.5-py3-none-any.whl (20 kB)
Installing collected packages: sklearn-compat, imbalanced-learn
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.11.0
    Uninstalling imbalanced-learn-0.11.0:
      Successfully uninstalled imbalanced-learn-0.11.0
Successfully installed imbalanced-learn-0.14.1 sklearn-compat-0.1.5


In [6]:
#import libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import joblib
import warnings
warnings.filterwarnings("ignore")

# ML libraries
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, 
    roc_auc_score, roc_curve, precision_recall_curve,
    f1_score, recall_score, precision_score
) 
import xgboost as xgb
import lightgbm as lgb
from imblearn.over_sampling import SMOTE
import shap 


import sys
sys.path.append('..')
import config


# Seed NumPy's global random generator from the project configuration.
np.random.seed(config.RANDOM_SEED)

# Plot defaults: seaborn 'whitegrid' style and a (12, 6) default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Confirm the imports worked and report the boosting library versions.
print("Libraries imported successfully")
for _lib_label, _lib_module in (("XGBoost", xgb), ("LightGBM", lgb)):
    print(f"{_lib_label} version: {_lib_module.__version__}")

Configuration loaded successfully!
Project root: C:\Users\hp\Desktop\gaf\Ghana-Armed-Forces-Personnel-Deployment-and-Attrition-Risk-Modeling
Random seed: 42
Target sample size: 1000 personnel
Libraries imported successfully
XGBoost version: 3.0.0
LightGBM version: 4.6.0


# Load Dataset 

In [10]:
# Load the feature-engineered dataset from the location defined in config.
data_path = config.PROCESSED_DATA_DIR / config.FEATURES_ENGINEERED_FILE
df = pd.read_csv(data_path)

print(f"Data loaded: {df.shape}")

# Absolute class counts of the target column.
print("\nTarget distribution: ")
print(df['attrition_risk'].value_counts())

# Same distribution as percentages, formatted to one decimal place.
print("\nTarget distribution (%)")
print(df["attrition_risk"].value_counts(normalize=True).map("{:.1%}".format))

Data loaded: (1000, 75)

Tartfet distribution: 
attrition_risk
LOW_RISK       726
MEDIUM_RISK    211
HIGH_RISK       63
Name: count, dtype: int64

Target distribution (%)
attrition_risk
LOW_RISK       72.6%
MEDIUM_RISK    21.1%
HIGH_RISK       6.3%
Name: proportion, dtype: object


# Data Preparation 

In [16]:
# Separate features and target.
# NOTE(review): 'readiness_score' is excluded from the features together with
# the target — presumably because it is another outcome variable; confirm.
X = df.drop(columns=['attrition_risk', 'readiness_score'])
y = df['attrition_risk']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Identify the categorical (object / category dtype) feature columns.
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"\nCategorical features to encode: {len(categorical_features)}")
print(categorical_features)

# One-hot encode the categorical features; drop_first=True drops the first
# level of each feature so it serves as the reference category.
X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=True)

# The categorical columns are *replaced* by their dummy columns, so the number
# of dummy variables created is larger than the net change in column count.
n_categorical = len(categorical_features)
n_dummy_columns = X_encoded.shape[1] - (X.shape[1] - n_categorical)
net_change = X_encoded.shape[1] - X.shape[1]

print(f"\nFeatures after encoding: {X_encoded.shape}")
print(
    f"Replaced {n_categorical} categorical columns with {n_dummy_columns} dummy variables "
    f"(net change: {net_change:+d} columns)"
)
      
      

Features shape: (1000, 73)
Target shape: (1000,)

Categorical features to encode: 12
['gender', 'service_branch', 'rank', 'MOS', 'marital_status', 'education_level', 'mental_health_status', 'deployment_type', 'combat_exposure_level', 'financial_stress_indicator', 'relocation_willingness', 'performance_trajectory']

Features after encoding: (1000, 100)
Added 27 dummy variables


In [19]:
# Encode the target variable (string labels -> integer codes).
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Store the label -> code mapping.
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Target encoding:")
for label, code in label_mapping.items():
    print(f"{label}: {code}")

# HIGH_RISK must be encoded as 0 for easier recall calculation; remap if the
# fitted encoder assigned it a different code.
if label_mapping['HIGH_RISK'] != 0:
    print("\n⚠️  Remapping labels so HIGH_RISK = 0 (for recall optimization)")
    # Move HIGH_RISK to the front and keep every other fitted class in its
    # existing order, instead of hard-coding the class list (which would drop
    # or mislabel any class that is not one of the three expected names).
    # NOTE(review): overriding classes_ relies on LabelEncoder.transform
    # accepting a non-sorted classes_ for string labels — confirm for the
    # installed scikit-learn version.
    remaining_classes = [c for c in label_encoder.classes_ if c != 'HIGH_RISK']
    label_encoder.classes_ = np.array(['HIGH_RISK'] + remaining_classes, dtype=object)
    y_encoded = label_encoder.transform(y)
    label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
    print("New mapping:")
    for label, code in label_mapping.items():
        print(f"  {label}: {code}")

Target encoding:
HIGH_RISK: 0
LOW_RISK: 1
MEDIUM_RISK: 2


In [20]:
# Stratified train/validation/test split (60/20/20), done in two stages:
# stage 1 holds out the test set, stage 2 divides the remainder into
# training and validation sets. Both stages use the configured random seed.
X_temp, X_test, y_temp, y_test = train_test_split(
    X_encoded,
    y_encoded,
    stratify=y_encoded,
    test_size=config.TEST_SIZE,
    random_state=config.RANDOM_SEED,
)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    stratify=y_temp,
    test_size=config.VAL_SIZE,
    random_state=config.RANDOM_SEED,
)

# (display name, features, encoded targets) for each split, in report order.
_split_report = (
    ("Training", X_train, y_train),
    ("Validation", X_val, y_val),
    ("Test", X_test, y_test),
)

# Size of each split, absolute and as a share of the full dataset.
print("Dataset splits:")
for _split_name, _split_X, _split_y in _split_report:
    _n_rows = _split_X.shape[0]
    print(f"  {_split_name}: {_n_rows} samples ({_n_rows/len(df):.1%})")

# Per-class counts in each split, to check that stratification held.
print("\nClass distribution in splits:")
for _split_name, _split_X, _split_y in _split_report:
    print(f"  {_split_name}: {np.bincount(_split_y)}")

Dataset splits:
  Training: 600 samples (60.0%)
  Validation: 200 samples (20.0%)
  Test: 200 samples (20.0%)

Class distribution in splits:
  Training: [ 37 436 127]
  Validation: [ 13 145  42]
  Test: [ 13 145  42]


# Handling Class Imbalance with SMOTE 

In [21]:
def _print_class_counts(y_codes):
    """Print the number of samples per encoded class.

    Class names are read from ``label_encoder.classes_`` so the printed label
    always matches the code, rather than being typed by hand for each line.
    """
    for code, name in enumerate(label_encoder.classes_):
        print(f"  Class {code} ({name}): {np.sum(y_codes == code)}")


print("Class distribution before smote")
_print_class_counts(y_train)

# Apply SMOTE to the training split only, so the validation and test sets keep
# their original class distribution.
# NOTE(review): SMOTE interpolates between samples, so synthetic rows can get
# non-binary values in the one-hot encoded columns; consider whether
# imblearn's SMOTENC is a better fit for these mixed features.
smote = SMOTE(random_state=config.RANDOM_SEED)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print("\nClass distribution AFTER SMOTE:")
_print_class_counts(y_train_balanced)
print(f"\nTraining set increased from {len(y_train)} to {len(y_train_balanced)} samples")

Class distribution before smote
class 0 (HIGH_RISK): 37
 class 1 (HIGH_RISK): 436
 class 2 (MEDIUM_RISK): 127

Class distribution AFTER SMOTE:
  Class 0 (HIGH_RISK): 436
  Class 1 (LOW_RISK): 436
  Class 2 (MEDIUM_RISK): 436

Training set increased from 600 to 1308 samples


# Feature Scaling 

In [23]:
# Record the encoded feature (column) names.
feature_names = list(X_encoded.columns)

# Standardize features: fit the scaler on the SMOTE-balanced training data
# only, then apply the same fitted transform to every split.
scaler = StandardScaler().fit(X_train_balanced)
X_train_scaled = scaler.transform(X_train_balanced)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

print("Features scaled using standardscaler")
print(f"Total features: {len(feature_names)}")

Features scaled using standardscaler
Total features: 100
