# Libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV



# Relevant Data

In [6]:
file_path = "/Users/adammcmahan/Desktop/550_milestone3.csv"
data = pd.read_csv(file_path)

print(data.head())
department_list = data["Department"].unique().tolist()
print(department_list)

   Employee_ID        Department  Gender  Age  Years_At_Company  \
0            1                IT    Male   55                 2   
1            2           Finance    Male   29                 0   
2            3           Finance    Male   55                 8   
3            4  Customer Support  Female   48                 7   
4            5       Engineering  Female   36                 3   

   Performance_Score  Monthly_Salary  Work_Hours_Per_Week  Projects_Handled  \
0                  5            6750                   33                32   
1                  5            7500                   34                34   
2                  3            5850                   37                27   
3                  2            4800                   52                10   
4                  2            4800                   38                11   

   Overtime_Hours  Sick_Days  Remote_Work_Frequency  Team_Size  \
0              22          2                      0     

In [8]:
# Address the remaining categories
# Gender to binary, change float to int.
data["Gender"] = data["Gender"].map({"Male": 0, "Female": 1})

# Convert Department to numerical labels
data["Department"], _ = pd.factorize(data["Department"])

# Remove "Date_Only" and "Resigned" column
data = data.drop(columns=["Date_Only"])
data = data.drop(columns=["Resigned"])
data = data.drop(columns=["Hire_Year"])

print(data.head())

   Employee_ID  Department  Gender  Age  Years_At_Company  Performance_Score  \
0            1           0     0.0   55                 2                  5   
1            2           1     0.0   29                 0                  5   
2            3           1     0.0   55                 8                  3   
3            4           2     1.0   48                 7                  2   
4            5           3     1.0   36                 3                  2   

   Monthly_Salary  Work_Hours_Per_Week  Projects_Handled  Overtime_Hours  \
0            6750                   33                32              22   
1            7500                   34                34              13   
2            5850                   37                27               6   
3            4800                   52                10              28   
4            4800                   38                11              29   

   Sick_Days  Remote_Work_Frequency  Team_Size  Training_Hours

# Modeling: Regression

In [11]:
# Training Data (all data)

# Feature-target split
X = data.drop(columns=["Monthly_Salary"])
y = data["Monthly_Salary"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train model
rf_model.fit(X_train, y_train)

# Predictions
y_pred = rf_model.predict(X_test)

print("MSE:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

MSE: 1416556.7033027778
R² Score: 0.24746778354581533


In [12]:
# Factors that impact salary the most

importances = rf_model.feature_importances_
feature_importance = dict(zip(X_train.columns, importances))

print("Feature Importance:", feature_importance)

Feature Importance: {'Employee_ID': 0.10289719713820542, 'Department': 0.04258006110660982, 'Gender': 0.015207377598981808, 'Age': 0.06864675730291722, 'Years_At_Company': 0.04476277591361344, 'Performance_Score': 0.2581393864889729, 'Work_Hours_Per_Week': 0.06618960598112313, 'Projects_Handled': 0.07364633140758893, 'Overtime_Hours': 0.06551158904363474, 'Sick_Days': 0.05153111299524283, 'Remote_Work_Frequency': 0.02959773941179642, 'Team_Size': 0.05692583675983975, 'Training_Hours': 0.08172501842544258, 'Promotions': 0.018155049735733283, 'Employee_Satisfaction_Score': 0.024484160690297848}


In [13]:
# Training data (Selected Data)

# Define features by selecting only the specified columns
selected_features = ["Department", "Performance_Score", "Projects_Handled", "Training_Hours"]
X = data[selected_features]  # Keep only the chosen predictors
y = data["Monthly_Salary"]  # Target variable

# Verify updated feature set
print(X.columns.tolist())

['Department', 'Performance_Score', 'Projects_Handled', 'Training_Hours']


In [14]:
# Training
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

In [15]:
# Calculate MSE
mse = mean_squared_error(y_test, y_pred)
print("MSE:", mse)

# Calculate R^2
r2 = r2_score(y_test, y_pred)
print("R² Score:", r2)

MSE: 1724453.333248354
R² Score: 0.08390064018226884


In [16]:
# Extract and display feature importance
importances = rf_model.feature_importances_
feature_importance = dict(zip(selected_features, importances))

print("Feature Importance:", feature_importance)

Feature Importance: {'Department': 0.15250451046169453, 'Performance_Score': 0.27760517889079434, 'Projects_Handled': 0.261942930681219, 'Training_Hours': 0.30794737996629207}


In [17]:
# Regression model 3:

# Define new selected features
selected_features = [
    "Department", "Performance_Score", "Projects_Handled", "Training_Hours", 
    "Promotions", "Years_At_Company", "Work_Hours_Per_Week", 
    "Overtime_Hours", "Sick_Days", "Employee_Satisfaction_Score", "Team_Size"
]
X = data[selected_features]
y = data["Monthly_Salary"]

# Train-Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Model
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict & Evaluate
from sklearn.metrics import mean_squared_error, r2_score
y_pred = rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Display Results
print("MSE:", mse)
print("R² Score:", r2)

# Feature Importance
importances = rf_model.feature_importances_
feature_importance = dict(zip(selected_features, importances))
print("Feature Importance:", feature_importance)

MSE: 1424152.3767
R² Score: 0.2434326546139861
Feature Importance: {'Department': 0.059425758586833834, 'Performance_Score': 0.2581384438619459, 'Projects_Handled': 0.1040580948149528, 'Training_Hours': 0.1184209904281996, 'Promotions': 0.025936490772670975, 'Years_At_Company': 0.06314155517227071, 'Work_Hours_Per_Week': 0.09285650367706476, 'Overtime_Hours': 0.09119197309687702, 'Sick_Days': 0.07218652508175795, 'Employee_Satisfaction_Score': 0.03521461445805513, 'Team_Size': 0.0794290500493713}


In [18]:
# Regression model 4

# Define selected features (removing low-impact columns)
selected_features = [
    "Department", "Performance_Score", "Projects_Handled", "Training_Hours", 
    "Years_At_Company", "Work_Hours_Per_Week", "Overtime_Hours", "Sick_Days", "Team_Size"
]
X = data[selected_features]
y = data["Monthly_Salary"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize model with hyperparameter tuning
rf_model = RandomForestRegressor(n_estimators=200, max_depth=15, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate model performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("R² Score:", r2)

# Feature importance analysis
importances = rf_model.feature_importances_
feature_importance = dict(zip(selected_features, importances))

print("Feature Importance:", feature_importance)

MSE: 1398907.9711831023
R² Score: 0.2568435038884318
Feature Importance: {'Department': 0.04451975593903744, 'Performance_Score': 0.47552394602215786, 'Projects_Handled': 0.08139008309451497, 'Training_Hours': 0.09414396421512732, 'Years_At_Company': 0.04764607542246087, 'Work_Hours_Per_Week': 0.07129504861667636, 'Overtime_Hours': 0.07082282019703288, 'Sick_Days': 0.05436131928604714, 'Team_Size': 0.060296987206945075}


In [19]:
# Creating a new column to reflect yearly salary. 

data["Yearly_Salary"] = data["Monthly_Salary"] * 12

In [20]:
# regressions with new salary (repeating test 4)

y = data["Yearly_Salary"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize model with hyperparameter tuning
rf_model = RandomForestRegressor(n_estimators=200, max_depth=15, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate model performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("R² Score:", r2)

# Feature importance analysis
importances = rf_model.feature_importances_
feature_importance = dict(zip(selected_features, importances))

print("Feature Importance:", feature_importance)

MSE: 201476069.45414916
R² Score: 0.25672057483501076
Feature Importance: {'Department': 0.0445577376273967, 'Performance_Score': 0.47551793029280304, 'Projects_Handled': 0.08135554022962842, 'Training_Hours': 0.0941674980456601, 'Years_At_Company': 0.047643262573201124, 'Work_Hours_Per_Week': 0.07126246567591284, 'Overtime_Hours': 0.07081212854037465, 'Sick_Days': 0.05438880284609155, 'Team_Size': 0.060294634168931605}


# Modeling: Hyperparameter Tuning

In [32]:
# interactions
# Create interaction features for skill impact
data["Perf_Training"] = data["Performance_Score"] * data["Training_Hours"]
data["Perf_Projects"] = data["Performance_Score"] * data["Projects_Handled"]

# Define selected features (excluding original columns now combined)
selected_features = [
    "Department", "Years_At_Company", "Work_Hours_Per_Week", 
    "Overtime_Hours", "Sick_Days", "Team_Size", "Perf_Training", "Perf_Projects"
]

X = data[selected_features]
y = data["Yearly_Salary"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 15, 20],
    "min_samples_split": [2, 5, 10]
}

grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_rf_model = grid_search.best_estimator_

# Predictions & evaluation
y_pred_rf = best_rf_model.predict(X_test)

mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("Random Forest MSE:", mse_rf)
print("Random Forest R² Score:", r2_rf)

# Feature Importance
rf_importances = best_rf_model.feature_importances_
rf_feature_importance = dict(zip(selected_features, rf_importances))
print("Random Forest Feature Importance:", rf_feature_importance)

xgb_model = xgb.XGBRegressor(n_estimators=200, max_depth=6, learning_rate=0.05, random_state=42)
xgb_model.fit(X_train, y_train)

# Predictions & evaluation
y_pred_xgb = xgb_model.predict(X_test)

mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print("XGBoost MSE:", mse_xgb)
print("XGBoost R² Score:", r2_xgb)

# Feature Importance
xgb_importances = xgb_model.feature_importances_
xgb_feature_importance = dict(zip(selected_features, xgb_importances))
print("XGBoost Feature Importance:", xgb_feature_importance)



Random Forest MSE: 221903368.08616656
Random Forest R² Score: 0.1813608022028831
Random Forest Feature Importance: {'Department': 0.03977011291024907, 'Perf_Training': 0.45606401499814997, 'Perf_Projects': 0.504165872091601}
XGBoost MSE: 219852757.84627217
XGBoost R² Score: 0.18892592191696167
XGBoost Feature Importance: {'Department': 0.04414561, 'Perf_Training': 0.36240137, 'Perf_Projects': 0.59345305}


In [42]:
# Step 1: Define Features
selected_features = ["Department", "Performance_Score", "Years_At_Company", "Work_Hours_Per_Week", 
                     "Overtime_Hours", "Sick_Days", "Team_Size", "Perf_Training", "Perf_Projects"]
X = data[selected_features]
y = data["Yearly_Salary"]

# Step 2: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Hyperparameter Tuning (Place this **before** model training)
param_grid_rf = {
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 15, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 3, 5],
    "max_features": ["sqrt", "log2"]
}
grid_search_rf = GridSearchCV(RandomForestRegressor(random_state=42), param_grid_rf, cv=5, n_jobs=-1)
grid_search_rf.fit(X_train, y_train)  # Find best model parameters

# Step 4: Train the best model using optimized parameters
best_rf_model = grid_search_rf.best_estimator_
best_rf_model.fit(X_train, y_train)

# Step 5: Make Predictions & Evaluate Performance
y_pred_rf = best_rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("Random Forest MSE:", mse_rf)
print("Random Forest R² Score:", r2_rf)



Random Forest MSE: 199243603.1938555
Random Forest R² Score: 0.26495652187897945


# Milestone Discussion

The model-building process has been a continual exploration, not just of predictions but of understanding how salary trends play out in the data. The first models, regression-based approaches, helped establish an initial baseline and highlight the impact of key workforce factors, like upskilling (training). Random Forest emerged as one of the stronger models, balancing complexity with interpretability and showing promise in capturing nonlinear interactions. Through hyperparameter tuning—adjusting parameters like the number of estimators and max depth—the best output landed at an MSE of 199,243,603.19 with an R² score of 0.26, reflecting a moderate predictive performance. Given this result, it’s likely that this represents one of the best possible outputs with the available data, and further tuning may not yield significant gains. That said, continuing to explore different models isn’t just about chasing a better score, it’s about understanding how each model interacts with the data, refining insights, and pushing the analysis forward. There’s still value in testing tweaks, adjusting features, and experimenting with different approaches, even if the numbers don’t shift dramatically. The process is just as much about learning from the models as it is about improving them.