In [1]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
warnings.filterwarnings('ignore')

#machine learning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, 
    r2_score, mean_absolute_percentage_error
)
import xgboost as xgb
import lightgbm as lgb
import shap

import sys
sys.path.append('..')
import config

# Plot defaults: seaborn "whitegrid" theme with a 12x6 default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Seed NumPy's global RNG from the project config so runs are repeatable
np.random.seed(seed=config.RANDOM_SEED)

print("Libraries imported successfully")

Configuration loaded successfully!
Project root: C:\Users\hp\Desktop\gaf\Ghana-Armed-Forces-Personnel-Deployment-and-Attrition-Risk-Modeling
Random seed: 42
Target sample size: 1000 personnel
Libraries imported successfully


# Load Dataset

In [3]:
# Read the feature-engineered dataset; its location is built from two config
# entries (PROCESSED_DATA_DIR is presumably a pathlib.Path, since it is joined with `/`)
data_path = config.PROCESSED_DATA_DIR / config.FEATURES_ENGINEERED_FILE
df = pd.read_csv(data_path)

# Report the frame dimensions, then summary statistics of the regression target
print(f"Data Shape: {df.shape}")
print("Readiness score statistics:", df['readiness_score'].describe(), sep="\n")

Data Shape: (1000, 75)
Readiness score statistics:
count    1000.000000
mean       71.153700
std         9.110339
min        42.400000
25%        64.900000
50%        71.300000
75%        77.525000
max        99.300000
Name: readiness_score, dtype: float64


# Data Preparation

In [7]:
# Separate features and target.
# The target is `readiness_score`; `attrition_risk` is removed from the features
# as well (presumably the target of a separate model -- confirm).
y = df['readiness_score']
X = df.drop(columns=['attrition_risk', 'readiness_score'])

print(f"Features shape: {X.shape}", f"Target shape: {y.shape}", sep="\n")

# Columns stored as object/category dtype are treated as categorical
categorical_features = list(X.select_dtypes(include=['object', 'category']).columns)
print(f"\nCategorical features: {len(categorical_features)}")

# One-hot encode the categoricals; drop_first=True omits the first level of each
# column so the resulting dummies are not perfectly collinear
X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=True)
print(f"\nFeatures after encoding: {X_encoded.shape}")

Features shape: (1000, 73)
Target shape: (1000,)

Categorical features: 12

Features after encoding: (1000, 100)


In [9]:
# Hold out a test set; the test fraction and the shuffle seed both come from
# the project config (an 80/20 split in the recorded run)
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=config.TEST_SIZE,
    random_state=config.RANDOM_SEED,
)

# Show each partition's row count and its share of the full dataset
print("Dataset splits:")
print(f" Training: {len(X_train)} samples ({len(X_train)/len(df):.1%})")
print(f" Test: {len(X_test)} samples ({len(X_test)/len(df):.1%})")

Dataset splits:
 Training: 800 samples (80.0%)
 Test: 200 samples (20.0%)


In [10]:
# Standardize features. The scaler is fitted on the training rows only, and the
# same fitted statistics are then applied to both partitions, so nothing from
# the test set influences the scaling parameters.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Keep the encoded column names alongside the scaled outputs
feature_names = list(X_encoded.columns)

print("Features scaled")
print()