In [None]:
# Preprocessing.ipynb

# Imports
import pickle

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedGroupKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# ----- CONTROL FLAG -----
# Gates only the final save step of this script: when True, the pickled
# preprocessor and all train/test CSV files are rewritten; when False, nothing
# is written and a notice is printed instead. All loading, labelling, splitting
# and resampling above the save step still runs either way.
overwrite_files = True  # <<< Set this to TRUE because we MUST re-process the data

# Read the raw export into memory
df = pd.read_csv('full summary.csv')

# Parse the date columns. The snapshot and hire dates are parsed strictly
# (an unparseable value raises), whereas unparseable termination dates are
# coerced to NaT instead of raising.
for strict_date_column in ('snapshot_date', 'hire_date'):
    df[strict_date_column] = pd.to_datetime(df[strict_date_column])
df['termination_date'] = pd.to_datetime(df['termination_date'], errors='coerce')

# Step 1: Order rows by employee, then chronologically within each employee,
# and renumber the index 0..n-1 so it matches the sorted order.
df = df.sort_values(by=['employee_id', 'snapshot_date'], ignore_index=True)

# Prediction horizon: a snapshot counts as a positive example when the
# employee's termination falls within this many days after the snapshot date.
PREDICTION_HORIZON_DAYS = 90  # one quarter; set to 30 for a one-month horizon

# --- Build the forward-looking target `future_terminated_flag` ---
# Earliest recorded termination date per employee, broadcast onto every row of
# that employee. Employees with no termination date on any row get NaT.
first_termination_date = df.groupby('employee_id')['termination_date'].transform('min')

# Whole days from each snapshot to that termination (NaN where there is none).
days_until_termination = (first_termination_date - df['snapshot_date']).dt.days

# A snapshot is labelled 1 only if it was taken strictly before the
# termination AND the termination is at most PREDICTION_HORIZON_DAYS away.
# Comparisons against NaT/NaN evaluate to False, so employees who never
# terminated, and snapshots taken on or after the termination date, stay 0.
terminates_within_horizon = (
    (df['snapshot_date'] < first_termination_date)
    & (days_until_termination <= PREDICTION_HORIZON_DAYS)
)
df['future_terminated_flag'] = terminates_within_horizon.astype('int64')

print(f"Number of future terminations identified (1s): {df['future_terminated_flag'].sum()}")
print(f"Original ever_terminated_flag (at snapshot) still present for comparison: {df['ever_terminated_flag'].sum()}")


# --- Feature engineering (date-derived features) ---
# Tenure at snapshot time, approximated in 30-day "months" by floor-dividing
# the number of elapsed days since hire.
tenure_days = (df['snapshot_date'] - df['hire_date']).dt.days
df['months_since_hire'] = tenure_days // 30
# TODO: add 'months_since_last_training' if it is not already well-captured

# Columns that remain in `df` for later export but are withheld from the
# model's feature matrix (identifier, raw date, and other excluded columns).
model_features_to_exclude = [
    'employee_id',
    'snapshot_date',
    'hire_date',
    'termination_date',
    'ever_terminated_flag',
    'risk_of_exit_score',
    'target_variable',
]

# Target vector, and the feature matrix with the target and the excluded
# columns removed.
y = df['future_terminated_flag']
non_feature_columns = ['future_terminated_flag', *model_features_to_exclude]
X = df.drop(columns=non_feature_columns)

# Audit trail: record which columns feed the model and which were withheld.
safe_features = X.columns.tolist()  # columns that go into the model
leakage_or_id_features = model_features_to_exclude  # columns kept out of X

# One feature name per line in each file.
for path, features in (
    ('safe_features.txt', safe_features),
    ('dropped_features.txt', leakage_or_id_features),
):
    with open(path, 'w') as f:
        f.writelines(f"{feature}\n" for feature in features)

print("Safe features and dropped features documented.")

# Identify numeric and categorical features by dtype
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Split data into training and testing sets BEFORE any preprocessing.
#
# The data holds several snapshot rows per employee, so a plain row-level
# split would place rows of the same employee on both sides and leak
# near-duplicate records into the test set. Splitting by employee_id keeps
# every employee entirely in train or entirely in test, while still
# stratifying on the target as far as the grouping allows.
#
# n_splits=5 makes the held-out fold roughly 20% of the rows; only the first
# fold is used. Row labels from `df` are preserved, so `X_test.index` can still
# be used to look up the original rows.
group_splitter = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
train_positions, test_positions = next(
    group_splitter.split(X, y, groups=df['employee_id'])
)
X_train, X_test = X.iloc[train_positions], X.iloc[test_positions]
y_train, y_test = y.iloc[train_positions], y.iloc[test_positions]

# --- Preprocessing pipelines ---
# Numeric columns: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode into a dense array. Categories not seen during fit are
# ignored at transform time rather than raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Route each column group to its pipeline; columns in neither group are
# passed through unchanged.
column_pipelines = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(column_pipelines, remainder='passthrough')

# Learn the preprocessing statistics from the training data only and
# transform it in the same call; the test data is transformed with the
# already-fitted preprocessor.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Oversample with SMOTE on the TRAINING data only; the test set is left as-is.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(
    X_train_transformed,
    y_train,
)

# --- SAVE ARTIFACTS ---
if not overwrite_files:
    print("overwrite_files=False --> Skipping file save. Using existing X_train/X_test CSV files.")
else:
    print("Saving new preprocessed files...")

    # Persist the fitted preprocessor so it can be reloaded later
    with open('preprocessor.pkl', 'wb') as f:
        pickle.dump(preprocessor, f)

    # Column labels for the transformed matrices, as reported by the fitted
    # preprocessor
    feature_names = preprocessor.get_feature_names_out()

    # Resampled training set
    train_frame = pd.DataFrame(X_train_resampled, columns=feature_names)
    train_frame.to_csv('X_train_resampled.csv', index=False)
    y_train_resampled.to_csv('y_train_resampled.csv', index=False)

    # Transformed (not resampled) test set
    test_frame = pd.DataFrame(X_test_transformed, columns=feature_names)
    test_frame.to_csv('X_test_transformed.csv', index=False)
    y_test.to_csv('y_test.csv', index=False)

    # Full rows of `df` for the test set, looked up by the test split's index
    # labels. This keeps every column of `df`, including the engineered
    # 'months_since_hire' and the 'future_terminated_flag' target.
    original_test_set_full_data = df.loc[X_test.index].copy()
    original_test_set_full_data.to_csv('original_test_set_full_data.csv', index=False)
    print("Full original test set data saved for modeling export.")

    print("Preprocessing completed and files saved. NOW RUN MODELING.IPYNB")
    