In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Import project configuration.
# The parent directory is appended to the module search path before the import;
# presumably that is where config.py lives — confirm against the repo layout.
import sys
sys.path.append('..')
import config

# Seed NumPy's legacy global random generator with the value from the project config
np.random.seed(config.RANDOM_SEED)

print("Libraries imported successfully")

Configuration loaded successfully!
Project root: C:\Users\hp\Desktop\gaf\Ghana-Armed-Forces-Personnel-Deployment-and-Attrition-Risk-Modeling
Random seed: 42
Target sample size: 1000 personnel
Libraries imported successfully


# Load Data


In [4]:
# Locate the raw personnel file via the project config and read it;
# the contract end date column is parsed into datetimes as part of the load.
data_path = config.RAW_DATA_DIR / config.PERSONNEL_DATA_FILE
df = pd.read_csv(data_path).assign(
    contract_end_date=lambda frame: pd.to_datetime(frame['contract_end_date'])
)

print(f"Data shape: {df.shape}")
df.head(3)

Data shape: (1000, 51)


Unnamed: 0,personnel_id,name,age,gender,service_branch,rank,MOS,years_of_service,contract_end_date,marital_status,...,promotion_eligible,peer_rating_score,civilian_job_offers,financial_stress_indicator,family_support_score,relocation_willingness,attrition_risk_score,attrition_risk,readiness_score,readiness_category
0,MIL00001,Allison Hill,35,Male,Army,Officer,Military Police,18,2026-09-22,Single,...,True,81.3,0.0,Low,95.6,Low,0.0,,62.0,Not Ready
1,MIL00002,Noah Rhodes,23,Female,Army,Junior,Cyber Operations,2,2025-03-01,Divorced,...,False,,0.0,Low,24.6,High,34.614493,LOW_RISK,70.9,Limited
2,MIL00003,Angie Henderson,29,Male,Army,Junior,Special Forces,5,2027-01-20,Married,...,True,73.8,0.0,Low,61.7,Medium,0.0,,67.3,Not Ready


# Temporal Features 

In [7]:
def create_temporal_features(
    df,
    training_recency_decay=None,
    deployment_recency_decay=None,
    contract_pressure_threshold=None,
    never_deployed_sentinel=999,
    urgency_horizon_months=48,
    training_gap_days=365,
):
    """
    Create time-based features with decay functions.

    Six columns are written onto ``df`` in place and the same frame is
    returned: ``training_recency_score``, ``deployment_recency_score``,
    ``contract_pressure``, ``contract_urgency_score``,
    ``time_in_rank_estimate`` and ``training_gap``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``days_since_last_training``,
        ``months_since_last_deployment``, ``months_until_contract_end``,
        ``rank`` and ``years_of_service``.
    training_recency_decay : float, optional
        Decay constant (in days) for the training recency score.
        Defaults to ``config.TRAINING_RECENCY_DECAY``.
    deployment_recency_decay : float, optional
        Decay constant (in months) for the deployment recency score.
        Defaults to ``config.DEPLOYMENT_RECENCY_DECAY``.
    contract_pressure_threshold : float, optional
        Contracts ending in fewer months than this are flagged.
        Defaults to ``config.CONTRACT_PRESSURE_THRESHOLD``.
    never_deployed_sentinel : float, default 999
        Values of ``months_since_last_deployment`` at or above this are
        treated as "never deployed" and get a recency score of 0.
    urgency_horizon_months : float, default 48
        Horizon over which contract urgency scales from 1 (ending now)
        down to 0 (ending at or beyond the horizon).
    training_gap_days : float, default 365
        More days than this since the last training sets ``training_gap``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the six columns added or overwritten.
    """
    # Project configuration is only consulted for values the caller did not
    # supply, so the function can also be used without the config module.
    if training_recency_decay is None:
        training_recency_decay = config.TRAINING_RECENCY_DECAY
    if deployment_recency_decay is None:
        deployment_recency_decay = config.DEPLOYMENT_RECENCY_DECAY
    if contract_pressure_threshold is None:
        contract_pressure_threshold = config.CONTRACT_PRESSURE_THRESHOLD

    # Training recency score (exponential decay)
    # Score decreases as training becomes stale
    df['training_recency_score'] = np.exp(-df['days_since_last_training'] / training_recency_decay)

    # Deployment recency score
    # Sentinel value marks never-deployed personnel, who get 0 recency
    df['deployment_recency_score'] = np.where(
        df['months_since_last_deployment'] < never_deployed_sentinel,
        np.exp(-df['months_since_last_deployment'] / deployment_recency_decay),
        0  # Never deployed = 0 recency
    )

    # Contract pressure (1 when the contract ends sooner than the threshold)
    df['contract_pressure'] = (df['months_until_contract_end'] < contract_pressure_threshold).astype(int)

    # Contract urgency
    # Higher values = more urgent; months are clipped to [0, horizon] first
    df['contract_urgency_score'] = (
        urgency_horizon_months
        - df['months_until_contract_end'].clip(0, urgency_horizon_months)
    ) / urgency_horizon_months

    # Time in current rank (estimated based on years of service):
    # 'Junior' keeps all years, 'NCO' subtracts 5, every other rank subtracts 10,
    # floored at 0.
    df['time_in_rank_estimate'] = np.where(
        df['rank'] == 'Junior',
        df['years_of_service'],
        np.where(
            df['rank'] == 'NCO',
            np.maximum(0, df['years_of_service'] - 5),
            np.maximum(0, df['years_of_service'] - 10)
        )
    )

    # Training gap indicator
    df['training_gap'] = (df['days_since_last_training'] > training_gap_days).astype(int)

    print(f"creating 6 temporal features")

    return df

# Run the temporal step on the working frame
df = create_temporal_features(df)

# Preview the new temporal columns next to the personnel identifier
temporal_cols = [
    'training_recency_score',
    'deployment_recency_score',
    'contract_pressure',
    'contract_urgency_score',
    'time_in_rank_estimate',
    'training_gap',
]
df[['personnel_id', *temporal_cols]].head()

    

creating 6 temporal features


Unnamed: 0,personnel_id,training_recency_score,deployment_recency_score,contract_pressure,contract_urgency_score,time_in_rank_estimate,training_gap
0,MIL00001,0.033934,0.20529,0,0.5625,8,1
1,MIL00002,0.454349,0.0,1,0.958333,2,0
2,MIL00003,0.088724,0.716531,0,0.479167,5,1
3,MIL00004,0.860708,0.311403,0,0.083333,5,0
4,MIL00005,0.787502,0.082085,0,0.479167,1,0


# Interaction Features 

In [9]:
def create_interaction_features(df):
    """
    Add ten ratio / product features to ``df`` in place and return it.

    Denominators taken from ``years_of_service``,
    ``months_deployed_last_3yrs`` and ``disciplinary_actions`` have zeros
    swapped for ones before dividing. ``training_recency_score`` must
    already be present on the frame.
    """

    def _zero_safe(column):
        # Same column with every 0 replaced by 1, for use as a denominator
        return df[column].replace(0, 1)

    # Training hours accumulated per year served
    df['training_hours_per_year_service'] = df['total_training_hours'].div(
        _zero_safe('years_of_service')
    )

    # Composite of health and performance, scaled down by 100
    df['health_performance_index'] = (
        df['health_index'].mul(df['performance_review_score']).div(100)
    )

    # Leave days relative to months deployed in the last three years
    df['leave_to_deployment_ratio'] = df['annual_leave_taken'].div(
        _zero_safe('months_deployed_last_3yrs')
    )

    # Training recency kept only where skills are flagged current
    skills_flag = df['skills_current'].astype(int)
    df['skill_decay_factor'] = df['training_recency_score'].mul(skills_flag)

    # Physical fitness score relative to age
    df['fitness_to_age_ratio'] = df['physical_fitness_score'].div(df['age'])

    # Deployments per year served
    df['deployments_per_year'] = df['total_deployments'].div(
        _zero_safe('years_of_service')
    )

    # Commendations per disciplinary action
    df['commendations_to_discipline_ratio'] = df['commendations'].div(
        _zero_safe('disciplinary_actions')
    )

    # Average training score weighted by specialized courses completed
    df['training_efficiency'] = df['training_score_average'].mul(
        df['specialized_courses_completed']
    )

    # Share of a 30-day leave allowance that was used
    df['leave_utilization_rate'] = df['annual_leave_taken'].div(30)

    # Months deployed (last 3 yrs) over total service expressed in months
    service_months = _zero_safe('years_of_service').mul(12)
    df['deployment_load'] = df['months_deployed_last_3yrs'].div(service_months)

    print(" Created 10 interaction features")
    return df

# Run the interaction step on the working frame
df = create_interaction_features(df)

# Preview a subset of the new columns next to the personnel identifier
interaction_cols = [
    'training_hours_per_year_service',
    'health_performance_index',
    'skill_decay_factor',
    'deployments_per_year',
    'training_efficiency',
]
df[['personnel_id', *interaction_cols]].head()

 Created 10 interaction features


Unnamed: 0,personnel_id,training_hours_per_year_service,health_performance_index,skill_decay_factor,deployments_per_year,training_efficiency
0,MIL00001,68.111111,67.0878,0.0,0.333333,732.8
1,MIL00002,210.0,75.411,0.454349,0.0,239.4
2,MIL00003,109.6,67.65,0.0,0.2,0.0
3,MIL00004,131.2,75.1689,0.860708,0.4,0.0
4,MIL00005,0.0,64.213,0.787502,2.0,296.8


# Aggregated Features 

In [10]:
def create_aggregated_features(df):
    """
    Create aggregated features based on peer groups.

    Columns are written onto ``df`` in place and the same frame is returned.
    Seven features are added (``unit_avg_readiness``, ``relative_readiness``,
    ``branch_attrition_rate``, ``mos_retention_rate``,
    ``peer_group_performance``, ``performance_trajectory``,
    ``rank_avg_health``) plus the helper column ``service_bracket`` used to
    define the peer groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``rank``, ``MOS``, ``service_branch``,
        ``readiness_score``, ``attrition_risk``, ``years_of_service``,
        ``performance_review_score`` and ``health_index``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns.
    """

    # Unit average readiness (by rank + MOS)
    df['unit_avg_readiness'] = df.groupby(['rank', 'MOS'])['readiness_score'].transform('mean')

    # Relative readiness (vs unit average)
    df['relative_readiness'] = df['readiness_score'] - df['unit_avg_readiness']

    # Share of rows labelled HIGH_RISK within each group. Rows whose label is
    # missing count in the denominator but not the numerator.
    # NOTE(review): these rates are derived from attrition_risk itself; if that
    # column is the modelling target, the two features below leak it — confirm.
    is_high_risk = df['attrition_risk'] == 'HIGH_RISK'

    # Branch attrition rate (historical, based on current data)
    df['branch_attrition_rate'] = is_high_risk.groupby(df['service_branch']).transform('mean')

    # MOS retention rate (inverse of attrition)
    df['mos_retention_rate'] = 1 - is_high_risk.groupby(df['MOS']).transform('mean')

    # Peer group performance (by rank + years of service bracket).
    # include_lowest=True puts 0 years of service into '0-5', and the open
    # upper edge puts anything above 15 into '16+'. Without these, such rows
    # got a missing bracket, a missing peer mean, and silently fell into the
    # 'Average' trajectory.
    df['service_bracket'] = pd.cut(
        df['years_of_service'],
        bins=[0, 5, 10, 15, np.inf],
        labels=['0-5', '6-10', '11-15', '16+'],
        include_lowest=True,
    )
    df['peer_group_performance'] = (
        df.groupby(['rank', 'service_bracket'], observed=True)['performance_review_score']
        .transform('mean')
    )

    # Performance trajectory (relative to peer group). The bands are
    # asymmetric as written: anything above the peer mean is 'Above Average',
    # while 'Below Average' requires being more than 5 points under it.
    df['performance_trajectory'] = np.where(
        df['performance_review_score'] > df['peer_group_performance'],
        'Above Average',
        np.where(
            df['performance_review_score'] < df['peer_group_performance'] - 5,
            'Below Average',
            'Average'
        )
    )

    # Average health by rank
    df['rank_avg_health'] = df.groupby('rank')['health_index'].transform('mean')

    print(f"Created 7 aggregated features")
    return df

# Run the aggregation step on the working frame
df = create_aggregated_features(df)

# Preview a subset of the new columns next to the personnel identifier
agg_cols = [
    'unit_avg_readiness',
    'relative_readiness',
    'branch_attrition_rate',
    'mos_retention_rate',
    'peer_group_performance',
    'performance_trajectory',
]
df[['personnel_id', *agg_cols]].head()

Created 7 aggregated features


Unnamed: 0,personnel_id,unit_avg_readiness,relative_readiness,branch_attrition_rate,mos_retention_rate,peer_group_performance,performance_trajectory
0,MIL00001,71.2625,-9.2625,0.063934,0.969697,81.711224,Average
1,MIL00002,70.295833,0.604167,0.063934,0.935065,82.145141,Average
2,MIL00003,70.162791,-2.862791,0.063934,0.962025,82.145141,Average
3,MIL00004,70.488235,2.911765,0.080214,0.966667,82.145141,Above Average
4,MIL00005,71.008,4.992,0.044335,0.916667,82.145141,Average


# Risk Indicator Features 

In [11]:
def create_risk_indicators(df):
    """
    Add five binary risk flags and one weighted composite score to ``df``.

    The frame is modified in place and returned. ``contract_pressure`` and
    ``time_in_rank_estimate`` must already be present on the frame.
    """
    # Wellness: low health index, long medical leave, or a mental health concern
    poor_health = df['health_index'] < 70
    long_medical_leave = df['days_on_medical_leave'] > 30
    mental_health_concern = df['mental_health_status'] == 'Concern'
    df['wellness_red_flag'] = (
        poor_health | long_medical_leave | mental_health_concern
    ).astype(int)

    # Stress: high financial stress or weak family support
    financial_strain = df['financial_stress_indicator'] == 'High'
    weak_family_support = df['family_support_score'] < 50
    df['high_stress_indicator'] = (financial_strain | weak_family_support).astype(int)

    # Fatigue: more than 24 months deployed in the last three years
    heavily_deployed = df['months_deployed_last_3yrs'] > 24
    df['deployment_fatigue'] = heavily_deployed.astype(int)

    # Engagement: weak performance review or repeated disciplinary actions
    weak_review = df['performance_review_score'] < 70
    repeat_discipline = df['disciplinary_actions'] > 1
    df['low_engagement'] = (weak_review | repeat_discipline).astype(int)

    # Stagnation: long time in rank while not eligible for promotion
    long_in_rank = df['time_in_rank_estimate'] > 5
    not_eligible = ~df['promotion_eligible']
    df['career_stagnation'] = (long_in_rank & not_eligible).astype(int)

    # Composite: weighted sum of the flags below plus a civilian-job-offer term
    # (career_stagnation is not part of the sum)
    flag_weights = (
        ('contract_pressure', 3),
        ('wellness_red_flag', 2),
        ('high_stress_indicator', 2),
        ('low_engagement', 1),
        ('deployment_fatigue', 1),
    )
    composite = None
    for flag_name, weight in flag_weights:
        weighted = df[flag_name] * weight
        composite = weighted if composite is None else composite + weighted
    has_job_offer = (df['civilian_job_offers'] > 0).astype(int)
    df['retention_risk_composite'] = composite + has_job_offer * 2

    print(" Created 6 risk indicator features")
    return df

# Run the risk-indicator step on the working frame
df = create_risk_indicators(df)

# Preview the new columns next to the personnel identifier
risk_cols = [
    'wellness_red_flag',
    'high_stress_indicator',
    'deployment_fatigue',
    'low_engagement',
    'career_stagnation',
    'retention_risk_composite',
]
df[['personnel_id', *risk_cols]].head()

 Created 6 risk indicator features


Unnamed: 0,personnel_id,wellness_red_flag,high_stress_indicator,deployment_fatigue,low_engagement,career_stagnation,retention_risk_composite
0,MIL00001,0,0,0,0,0,0
1,MIL00002,0,1,0,0,0,5
2,MIL00003,0,0,1,0,0,1
3,MIL00004,0,0,1,0,0,3
4,MIL00005,0,1,1,0,0,3


# Feature Summary 

In [None]:
# Engineered features grouped by the step that creates them. Every count
# printed below is derived from this mapping, so the summary cannot drift
# away from the list.
engineered_features_by_category = {
    'Temporal': [
        'training_recency_score', 'deployment_recency_score', 'contract_pressure',
        'contract_urgency_score', 'time_in_rank_estimate', 'training_gap',
    ],
    'Interaction': [
        'training_hours_per_year_service', 'health_performance_index',
        'leave_to_deployment_ratio', 'skill_decay_factor', 'fitness_to_age_ratio',
        'deployments_per_year', 'commendations_to_discipline_ratio',
        'training_efficiency', 'leave_utilization_rate', 'deployment_load',
    ],
    'Aggregated': [
        'unit_avg_readiness', 'relative_readiness', 'branch_attrition_rate',
        'mos_retention_rate', 'peer_group_performance', 'performance_trajectory',
        'rank_avg_health',
    ],
    'Risk Indicators': [
        'wellness_red_flag', 'high_stress_indicator', 'deployment_fatigue',
        'low_engagement', 'career_stagnation', 'retention_risk_composite',
    ],
}

# Flat list of all engineered features, in category order
engineered_feature_list = [
    feat
    for feats in engineered_features_by_category.values()
    for feat in feats
]

# Intermediate column written during aggregation that is not itself listed
# as an engineered feature
helper_columns = ['service_bracket']

# Count engineered features from the frame itself instead of a hard-coded
# original column count (the previous constant, 54, did not match the
# 51 columns reported when the raw data was loaded).
current_features = len(df.columns)
engineered_features = sum(feat in df.columns for feat in engineered_feature_list)
helper_count = sum(col in df.columns for col in helper_columns)
original_features = current_features - engineered_features - helper_count

print("\n" + "-" * 60)
print("FEATURE ENGINEERING SUMMARY")
print("-" * 60)
print(f"Original features: {original_features}")
print(f"Engineered features: {engineered_features}")
print(f"Helper columns: {helper_count}")
print(f"Total features: {current_features}")
print("\nEngineered Features by Category:")
for category, feats in engineered_features_by_category.items():
    print(f" {category}: {len(feats)}")
print(f" Total Engineered: {len(engineered_feature_list)}")
print("-" * 60)

# List all engineered features
print("\nEngineered Features List:")
for i, feat in enumerate(engineered_feature_list, 1):
    print(f"  {i:2d}. {feat}")