In [1]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from itertools import combinations

# Set pandas display options
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Load datasets
df = pd.read_csv("train.csv")
df.set_index('employee_id', inplace=True)

dfvalidation = pd.read_csv("test.csv")
dfvalidation.set_index('employee_id', inplace=True)
dfvalidation['is_promoted'] = 5  # Placeholder value

# Merge datasets for consistent preprocessing
dfmerged = pd.concat([df, dfvalidation])

# Fill missing values
dfmerged['previous_year_rating'].fillna(3.0, inplace=True)  # Fill with mode
dfmerged['education'].fillna('Bachelor\'s', inplace=True)   # Fill with mode

# One-hot encoding
dfmerged = pd.concat([
    dfmerged[['no_of_trainings', 'age', 'previous_year_rating', 'length_of_service', 
              'KPIs_met >80%', 'awards_won?', 'avg_training_score', 'is_promoted']],
    pd.get_dummies(dfmerged['gender'], drop_first=True),
    pd.get_dummies(dfmerged['education'], drop_first=True),
    pd.get_dummies(dfmerged['recruitment_channel'], drop_first=True),
    pd.get_dummies(dfmerged['department'], drop_first=True),
    pd.get_dummies(dfmerged['region'], drop_first=True)
], axis=1)

# Feature interaction function (optional - uncomment if needed)
"""
def add_interactions(df):
    combos = list(combinations(list(df.columns), 2))
    colnames = list(df.columns) + ['_'.join(x) for x in combos]
    
    poly = PolynomialFeatures(interaction_only=True, include_bias=False)
    df = poly.fit_transform(df)
    df = pd.DataFrame(df)
    df.columns = colnames
    
    noint_indices = [i for i, x in enumerate(list((df == 0).all())) if x]
    df = df.drop(df.columns[noint_indices], axis=1)
    
    return df
"""

# Separate training and validation sets
X_validation = dfmerged[54808:]
X_train_full = dfmerged[:54808]

# Define features and target
y = X_train_full.is_promoted
X = X_train_full.drop(['is_promoted'], axis=1)

# Create train/test split for model evaluation
seed = 2
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

# Define and train XGBoost model with optimized parameters
model = XGBClassifier(
    learning_rate=0.1, 
    n_estimators=200, 
    max_depth=4, 
    min_child_weight=7, 
    gamma=0.4,
    nthread=4, 
    subsample=0.8, 
    colsample_bytree=0.8, 
    objective='binary:logistic',
    scale_pos_weight=3,  # Handles class imbalance
    seed=29
)

# Train the model
model.fit(X_train, y_train)

# Make predictions on test set
y_pred = model.predict(X_test)

# Evaluate model performance
print("Model Performance Metrics:")
print("-" * 30)
conf_matrix = confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix:\n{conf_matrix}")

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc_test = roc_auc_score(y_test, model.predict_proba(X_test)[:,1])
roc_auc_train = roc_auc_score(y_train, model.predict_proba(X_train)[:,1])

print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Precision: {precision:.4f} ({precision*100:.2f}%)")
print(f"Recall: {recall:.4f} ({recall*100:.2f}%)")
print(f"F1 Score: {f1:.4f} ({f1*100:.2f}%)")
print(f"ROC AUC (Test): {roc_auc_test:.4f}")
print(f"ROC AUC (Train): {roc_auc_train:.4f}")

# Check for overfitting
print(f"\nOverfitting Check: difference between train and test ROC AUC: {abs(roc_auc_train - roc_auc_test):.4f}")

# Train on full training data
print("\nTraining final model on full training dataset...")
final_model = XGBClassifier(
    learning_rate=0.1, 
    n_estimators=200, 
    max_depth=4, 
    min_child_weight=7, 
    gamma=0.4,
    nthread=4, 
    subsample=0.8, 
    colsample_bytree=0.8, 
    objective='binary:logistic',
    scale_pos_weight=3,
    seed=29
)
final_model.fit(X, y)

# Make predictions on validation set
X_val = X_validation.drop(['is_promoted'], axis=1)
y_val_pred = final_model.predict(X_val)

# Create submission file
submission = X_validation.copy()
submission['is_promoted'] = y_val_pred
submission = submission[['is_promoted']]
submission.reset_index(inplace=True)
print(f"Prediction distribution:\n{submission['is_promoted'].value_counts()}")

# Save submission
submission.to_csv("solution.csv", index=False)
print("Submission file saved as 'solution.csv'")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dfmerged['previous_year_rating'].fillna(3.0, inplace=True)  # Fill with mode
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dfmerged['education'].fillna('Bachelor\'s', inplace=True)   # Fill with mode


Model Performance Metrics:
------------------------------
Confusion Matrix:
[[9780  235]
 [ 541  406]]
Accuracy: 0.9292 (92.92%)
Precision: 0.6334 (63.34%)
Recall: 0.4287 (42.87%)
F1 Score: 0.5113 (51.13%)
ROC AUC (Test): 0.9093
ROC AUC (Train): 0.9304

Overfitting Check: difference between train and test ROC AUC: 0.0211

Training final model on full training dataset...
Prediction distribution:
is_promoted
0    22202
1     1288
Name: count, dtype: int64
Submission file saved as 'solution.csv'
