In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pprint

In [2]:
train_df: pd.DataFrame = pd.read_pickle('train_df_20241121_223002.pkl')
val_df: pd.DataFrame = pd.read_pickle('val_df_20241121_223002.pkl')
test_df: pd.DataFrame = pd.read_pickle('test_df_20241121_223002.pkl')

In [3]:
# Explore the data
print(len(train_df.columns))

# for col in train_df.columns:
#     unique_values = train_df[col].unique()
#     if len(unique_values) > 5:
#         unique_values = unique_values[:5]
#     print(f'{col}: {unique_values}')

# Invalid columns when training:MainBranch: object, RemoteWork: object, DevType: object, Country: object, AISelect: object
invalid_cols = ['MainBranch', 'RemoteWork', 'DevType', 'Country', 'AISelect']
for col in train_df.columns:
    if col in invalid_cols:
        unique_values = train_df[col].unique()
        # if len(unique_values) > 5:
        #     unique_values = unique_values[:5]
        print(f'{col} ({len(unique_values)}): {unique_values}')

# Fix for error: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`
# Convert invalid columns to category type
for col in invalid_cols:
    train_df[col] = train_df[col].astype('category')
    val_df[col] = val_df[col].astype('category')
    test_df[col] = test_df[col].astype('category')


438
MainBranch (2): ['I am a developer by profession'
 'I am not primarily a developer, but I write code sometimes as part of my work/studies']
RemoteWork (3): ['Remote' 'Hybrid (some remote, some in-person)' 'In-person']
DevType (34): ['Developer, embedded applications or devices' 'Developer, back-end'
 'Other (please specify):' 'Developer, desktop or enterprise applications'
 'Academic researcher' 'Developer, full-stack' 'Security professional'
 'Project manager' 'Developer, game or graphics' 'Developer, mobile'
 'Developer, front-end' 'DevOps specialist' 'Engineering manager'
 'Research & Development role' 'Data or business analyst'
 'Cloud infrastructure engineer' 'Data engineer' 'Scientist'
 'Data scientist or machine learning specialist' 'Developer, QA or test'
 'Developer, AI' 'Engineer, site reliability' 'Designer' 'Blockchain'
 'Developer Advocate' 'Senior Executive (C-Suite, VP, etc.)' 'Student'
 'System administrator' 'Database administrator' 'Educator'
 'Hardware Engineer' 

In [4]:
# Prepare the data
target = 'ConvertedCompYearly'
X_train = train_df.drop(target, axis=1)
y_train = train_df[target]
X_val = val_df.drop(target, axis=1)
y_val = val_df[target]
X_test = test_df.drop(target, axis=1)
y_test = test_df[target]



In [7]:
# Initialize XGBoost model
xgb_model = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=6,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    enable_categorical=True
)


In [8]:
# Train the model with early stopping

# Error w categorical columns: 
# Invalid columns:MainBranch: object, RemoteWork: object, DevType: object, Country: object, AISelect: object

xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    verbose=100
)



[0]	validation_0-rmse:165123.21600	validation_1-rmse:99057.99768
[100]	validation_0-rmse:98079.20581	validation_1-rmse:57746.76978
[200]	validation_0-rmse:62677.59998	validation_1-rmse:43398.70877
[300]	validation_0-rmse:41439.55873	validation_1-rmse:38541.82834
[400]	validation_0-rmse:27293.90096	validation_1-rmse:36630.88925
[500]	validation_0-rmse:18355.30554	validation_1-rmse:36125.38273
[600]	validation_0-rmse:12826.72235	validation_1-rmse:35973.85609
[700]	validation_0-rmse:9138.64256	validation_1-rmse:36206.78824
[800]	validation_0-rmse:6561.30896	validation_1-rmse:36442.24571
[900]	validation_0-rmse:4874.22386	validation_1-rmse:36708.94001
[999]	validation_0-rmse:3870.74721	validation_1-rmse:36953.69735


In [10]:
# Make predictions
train_pred = xgb_model.predict(X_train)
val_pred = xgb_model.predict(X_val)
test_pred = xgb_model.predict(X_test)

# Calculate metrics
print("\nModel Performance Metrics:")
print("-" * 30)

# Training metrics
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
train_r2 = r2_score(y_train, train_pred)
print(f"Training RMSE: {train_rmse:.4f}")
print(f"Training R2: {train_r2:.4f}")

# Validation metrics
val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))
val_r2 = r2_score(y_val, val_pred)
print(f"\nValidation RMSE: {val_rmse:.4f}")
print(f"Validation R2: {val_r2:.4f}")

# Test metrics
test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
test_r2 = r2_score(y_test, test_pred)
print(f"\nTest RMSE: {test_rmse:.4f}")
print(f"Test R2: {test_r2:.4f}")

# Feature importance plot
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': xgb_model.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

import plotly.graph_objects as go

fig = go.Figure(data=[go.Bar(
    x=feature_importance['feature'],
    y=feature_importance['importance'],
    text=feature_importance['importance'],
    textposition='inside',
    orientation='v',
    marker=dict(
        line=dict(color='rgba(246, 78, 139, 1.1)', width=1),
        color='rgba(58, 71, 80, 0.6)'
    ),
)])
fig.update_layout(
    title_text='XGBoost Feature Importance',
    xaxis_title_text='Features',
    yaxis_title_text='Importance',
    bargap=0.2,
    bargroupgap=0.1
)
fig.show()



Model Performance Metrics:
------------------------------
Training RMSE: 3870.7472
Training R2: 0.9995

Validation RMSE: 36953.6974
Validation R2: 0.8630

Test RMSE: 140903.9077
Test R2: 0.8032
