In [11]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pprint

In [30]:
train_df: pd.DataFrame = pd.read_pickle('train_df_20241121_223002.pkl')
val_df: pd.DataFrame = pd.read_pickle('val_df_20241121_223002.pkl')
test_df: pd.DataFrame = pd.read_pickle('test_df_20241121_223002.pkl')

In [3]:
# Run 1 debugging:

# Explore the data
print(len(train_df.columns))

# for col in train_df.columns:
#     unique_values = train_df[col].unique()
#     if len(unique_values) > 5:
#         unique_values = unique_values[:5]
#     print(f'{col}: {unique_values}')

# Invalid columns when training:MainBranch: object, RemoteWork: object, DevType: object, Country: object, AISelect: object
invalid_cols = ['MainBranch', 'RemoteWork', 'DevType', 'Country', 'AISelect']
for col in train_df.columns:
    if col in invalid_cols:
        unique_values = train_df[col].unique()
        # if len(unique_values) > 5:
        #     unique_values = unique_values[:5]
        print(f'{col} ({len(unique_values)}): {unique_values}')

# Fix for error: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`
# Convert invalid columns to category type
for col in invalid_cols:
    train_df[col] = train_df[col].astype('category')
    val_df[col] = val_df[col].astype('category')
    test_df[col] = test_df[col].astype('category')


438
MainBranch (2): ['I am a developer by profession'
 'I am not primarily a developer, but I write code sometimes as part of my work/studies']
RemoteWork (3): ['Remote' 'Hybrid (some remote, some in-person)' 'In-person']
DevType (34): ['Developer, embedded applications or devices' 'Developer, back-end'
 'Other (please specify):' 'Developer, desktop or enterprise applications'
 'Academic researcher' 'Developer, full-stack' 'Security professional'
 'Project manager' 'Developer, game or graphics' 'Developer, mobile'
 'Developer, front-end' 'DevOps specialist' 'Engineering manager'
 'Research & Development role' 'Data or business analyst'
 'Cloud infrastructure engineer' 'Data engineer' 'Scientist'
 'Data scientist or machine learning specialist' 'Developer, QA or test'
 'Developer, AI' 'Engineer, site reliability' 'Designer' 'Blockchain'
 'Developer Advocate' 'Senior Executive (C-Suite, VP, etc.)' 'Student'
 'System administrator' 'Database administrator' 'Educator'
 'Hardware Engineer' 

In [46]:

# Run 2 debugging:
# Compare distributions
def compare_sets(train, val, test):
    print("Salary Statistics:")
    print("\nTraining:")
    pd.set_option('display.float_format', lambda x: '%.2f' % x)
    print(train['ConvertedCompYearly'].describe())
    print("\nValidation:")
    print(val['ConvertedCompYearly'].describe())
    print("\nTest:")
    print(test['ConvertedCompYearly'].describe())
    pd.reset_option('display.float_format')
    
    # Compare feature distributions too
    for col in train.columns:
        if len(train[col].unique()) == 2: continue
        if train[col].dtype in ['int64', 'float64']:
            print(f"\n{col} mean values:")
            print(f"Train: {train[col].mean()}")
            print(f"Val: {val[col].mean()}")
            print(f"Test: {test[col].mean()}")
            
# Prepare the data
target = 'ConvertedCompYearly'
# X_train = train_df.drop(target, axis=1)
X_train = train_df
y_train = train_df[target]
# X_val = val_df.drop(target, axis=1)
X_val = val_df
y_val = val_df[target]
# X_test = test_df.drop(target, axis=1)
X_test = test_df
y_test = test_df[target]
compare_sets(X_train, X_val, X_test)

Salary Statistics:

Training:
count      15705.00
mean       87206.58
std       166328.35
min            1.00
25%        35444.00
50%        66877.00
75%       110000.00
max     13818022.00
Name: ConvertedCompYearly, dtype: float64

Validation:
count      3365.00
mean      87091.53
std       99846.19
min           1.00
25%       35444.00
50%       68740.00
75%      111417.00
max     2048046.00
Name: ConvertedCompYearly, dtype: float64

Test:
count       3366.00
mean       93135.21
std       317679.98
min            1.00
25%        36389.00
50%        67666.00
75%       110861.50
max     16256603.00
Name: ConvertedCompYearly, dtype: float64

Age mean values:
Train: 0.3536544321644608
Val: 0.35347060072171504
Test: 0.35446906035141323

EdLevel mean values:
Train: 0.6627719409954366
Val: 0.6559683011391777
Test: 0.6600316894434543

YearsCode mean values:
Train: 0.29893564557309715
Val: 0.2989831891151706
Test: 0.29997786399170484

YearsCodePro mean values:
Train: 0.2041949922280278
Val: 0

In [4]:
# Prepare the data
target = 'ConvertedCompYearly'
X_train = train_df.drop(target, axis=1)
y_train = train_df[target]
X_val = val_df.drop(target, axis=1)
y_val = val_df[target]
X_test = test_df.drop(target, axis=1)
y_test = test_df[target]



In [21]:
# Initialize XGBoost model

# Run 1: overfitting
# xgb_model = xgb.XGBRegressor(
#     n_estimators=1000,
#     learning_rate=0.01,
#     max_depth=6,
#     min_child_weight=1,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     random_state=42,
#     enable_categorical=True
# )

# Run 2: data leakage, or validation set was split weirdly? (okay validation performance, bad test performance)
xgb_model = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=4,  # Reduced from 6
    min_child_weight=3,  # Increased from 1
    subsample=0.8,
    colsample_bytree=0.6,  # Reduced from 0.8
    reg_alpha=1,  # Added L1 regularization
    reg_lambda=1,  # Added L2 regularization
    random_state=42,
    enable_categorical=True
)


In [22]:
# Train the model with early stopping

# Error w categorical columns: 
# Invalid columns:MainBranch: object, RemoteWork: object, DevType: object, Country: object, AISelect: object

xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    verbose=100
)



[0]	validation_0-rmse:165208.60519	validation_1-rmse:98552.89581
[100]	validation_0-rmse:113005.51754	validation_1-rmse:56201.47884
[200]	validation_0-rmse:84299.10809	validation_1-rmse:40707.66277
[300]	validation_0-rmse:67432.93179	validation_1-rmse:36133.33934
[400]	validation_0-rmse:52459.31208	validation_1-rmse:35066.87056
[500]	validation_0-rmse:43395.52373	validation_1-rmse:34865.28414
[600]	validation_0-rmse:37080.00069	validation_1-rmse:34939.20552
[700]	validation_0-rmse:32341.53889	validation_1-rmse:34857.36998
[800]	validation_0-rmse:28255.95165	validation_1-rmse:35039.81955
[900]	validation_0-rmse:24742.28120	validation_1-rmse:35375.14007
[999]	validation_0-rmse:22192.07405	validation_1-rmse:35329.04046


In [27]:
# Make predictions
train_pred = xgb_model.predict(X_train)
val_pred = xgb_model.predict(X_val)
test_pred = xgb_model.predict(X_test)


# Print statistics for target variable
print("\nTarget Variable Statistics:")
print("-" * 30)
print(f"Train Mean: {y_train.mean():.4f}, Std: {y_train.std():.4f}, Skew: {y_train.skew():.4f}")
print(f"Validation Mean: {y_val.mean():.4f}, Std: {y_val.std():.4f}, Skew: {y_val.skew():.4f}")
print(f"Test Mean: {y_test.mean():.4f}, Std: {y_test.std():.4f}, Skew: {y_test.skew():.4f}")

# Plot distribution of target variable
fig = make_subplots(rows=1, cols=3, subplot_titles=["Train", "Validation", "Test"])

fig.add_trace(go.Histogram(x=y_train, nbinsx=50), row=1, col=1)
fig.add_trace(go.Histogram(x=y_val, nbinsx=50), row=1, col=2)
fig.add_trace(go.Histogram(x=y_test, nbinsx=50), row=1, col=3)

fig.update_layout(title_text="Target Variable Distribution", height=400, width=1200)
fig.show()


# Calculate metrics
print("\nModel Performance Metrics:")
print("-" * 30)

print('''\nNotes: 
- MAPE is not a great metric to use because of scaling issues (could predict $1000 when the true value is $80k for one sample which produces an outlier prediction that screws with the entire metric)
- SMAPE (Symmetric Mean Absolute Percentage Error) is an interesting metric that is more resistant to outliers, but is kinda weird to interpret given that it has both actual and predicted in numerator and denominator which introduces issues with 0 predictions
''')

def smape(actual, predicted):
    """Calculate Symmetric Mean Absolute Percentage Error
    SMAPE = (100/n) * Σ(|actual - predicted| / (|actual| + |predicted|))"""
    return 100 * np.mean(np.abs(predicted - actual) / (np.abs(actual) + np.abs(predicted)))

# Training metrics
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
train_r2 = r2_score(y_train, train_pred)
train_mape = np.mean(np.abs((y_train - train_pred) / y_train)) * 100
train_smape = smape(y_train, train_pred)
print(f"Training RMSE: {train_rmse:.4f}")
print(f"Training R2: {train_r2:.4f}")
print(f"Training MAPE: {train_mape:.4f}%")
print(f"Training SMAPE: {train_smape:.4f}%")

# Validation metrics
val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))
val_r2 = r2_score(y_val, val_pred)
val_mape = np.mean(np.abs((y_val - val_pred) / y_val)) * 100
val_smape = smape(y_val, val_pred)
print(f"\nValidation RMSE: {val_rmse:.4f}")
print(f"Validation R2: {val_r2:.4f}")
print(f"Validation MAPE: {val_mape:.4f}%")
print(f"Validation SMAPE: {val_smape:.4f}%")

# Test metrics
test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
test_r2 = r2_score(y_test, test_pred)
test_mape = np.mean(np.abs((y_test - test_pred) / y_test)) * 100
test_smape = smape(y_test, test_pred)
print(f"\nTest RMSE: {test_rmse:.4f}")
print(f"Test R2: {test_r2:.4f}")
print(f"Test MAPE: {test_mape:.4f}%")
print(f"Test SMAPE: {test_smape:.4f}%")


# Feature importance plot
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': xgb_model.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)


fig = go.Figure(data=[go.Bar(
    x=feature_importance['feature'],
    y=feature_importance['importance'],
    text=feature_importance['importance'],
    textposition='inside',
    orientation='v',
    marker=dict(
        line=dict(color='rgba(246, 78, 139, 1.1)', width=1),
        color='rgba(58, 71, 80, 0.6)'
    ),
)])
fig.update_layout(
    title_text='XGBoost Feature Importance',
    xaxis_title_text='Features',
    yaxis_title_text='Importance',
    bargap=0.2,
    bargroupgap=0.1
)
fig.show()



Target Variable Statistics:
------------------------------
Train Mean: 87206.5813, Std: 166328.3515, Skew: 48.6787
Validation Mean: 87091.5302, Std: 99846.1929, Skew: 7.7224
Test Mean: 93135.2077, Std: 317679.9762, Skew: 41.8945



Model Performance Metrics:
------------------------------

Notes: 
- MAPE is not a great metric to use because of scaling issues (could predict $1000 when the true value is $80k for one sample which produces an outlier prediction that screws with the entire metric)
- SMAPE (Symmetric Mean Absolute Percentage Error) is an interesting metric that is more resistent to outliers, but is kinda weird to interpret given that it has both actual and predicted in numerator and denominator which introduces issues with 0 predictions

Training RMSE: 22192.0740
Training R2: 0.9822
Training MAPE: 793.2532%
Training SMAPE: 7.1322%

Validation RMSE: 35329.0408
Validation R2: 0.8748
Validation MAPE: 3874.1284%
Validation SMAPE: 9.8353%

Test RMSE: 251651.5313
Test R2: 0.3723
Test MAPE: 3543.1077%
Test SMAPE: 10.4336%
