<a href="https://colab.research.google.com/github/cbonnin88/Compensation_Analysis/blob/main/compensation_training_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import gdown as gd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, r2_score

In [None]:
# Fetch the compensation dataset from Google Drive, then load it with pandas.
url = 'https://drive.google.com/uc?id=1u-DL-ILOiZx9-xlGbZATRHoFBhkon5GM'
csv_path = 'employee_compensation_data.csv'

# quiet=True keeps gdown's progress output out of the notebook.
gd.download(url, output=csv_path, quiet=True)
df_comp = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the loaded data.
df_comp.head(n=5)

Unnamed: 0,employee_id,job_title,department,city,country,work_model,experience_years,tenure,age,performance_rating,education_level,base_salary,bonus,stock_options
0,MKT1001,Content Strategist,Marketing,Copenhagen,Denmark,Hybrid,11,4,29,3.8,Bachelor,161721,9796,29157
1,FIN1001,Compliance Officer,Finance & Legal,London,United Kingdom,Remote,12,11,32,4.4,Bachelor,257554,28076,139590
2,SAL1001,Customer Success Manager,Sales & Business Development,Copenhagen,Denmark,Hybrid,4,2,23,3.1,PhD,196028,74397,231288
3,DAT1001,Machine Learning Engineer,Data & Analytics,Dusseldorf,Germany,On-site,12,12,30,3.0,PhD,69178,2885,83320
4,SAL1002,Sales Manager,Sales & Business Development,Copenhagen,Denmark,On-site,16,9,40,3.4,Master,348794,66758,61917


In [None]:
# Choose the model inputs (X) and the column to predict (y).
target = 'base_salary'
features = [
    'experience_years',
    'job_title',
    'department',
    'country',
    'education_level',
]

X = df_comp.loc[:, features]
y = df_comp.loc[:, target]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Preprocessing step: one-hot encode the categorical columns listed below and
# pass every remaining input column through to the model unchanged.
categorical_features = ['job_title','department','country','education_level']

# handle_unknown='ignore' avoids an error when a category appears at
# prediction time that was not present when the encoder was fitted.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

preprocessor = make_column_transformer(
    (one_hot_encoder, categorical_features),
    remainder='passthrough',
)

In [None]:
# Chain the preprocessing step with an ordinary least-squares regressor,
# then fit the whole pipeline on the training split.
regressor = LinearRegression()
model = make_pipeline(preprocessor, regressor)

model.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [None]:
# Score the fitted pipeline on the held-out test split.
y_pred = model.predict(X_test)
mae, r2 = (
    mean_absolute_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)

In [None]:
# Report both evaluation metrics, one per line.
print(
    f'Model Mean Absolute Error: €{mae:,.2f}',
    f'Model R-squared: {r2:.4f}',
    sep='\n',
)

Model Mean Absolute Error: €77,803.44
Model R-squared: -0.0005


In [None]:
# Describe one hypothetical employee as a single-row frame whose columns
# match the feature columns the model was trained on.
new_employee_record = {
    'experience_years': 10,
    'job_title': 'Head of Product Analytics',
    'department': 'Product & Design',
    'country': 'France',
    'education_level': 'Bachelor',
}
new_employee_data = pd.DataFrame([new_employee_record])

In [None]:
# Predict the salary for the hypothetical employee.
#
# The pipeline's one-hot encoder is configured with handle_unknown='ignore',
# so a category value that was not present in the training split is encoded
# as all zeros without raising. The prediction is then made as if that field
# carried no information. Check the inputs against the fitted encoder's known
# categories first, so the reader is told when that happens.
fitted_preprocessor = model[0]
# transformers_ holds (name, fitted_transformer, columns) tuples; the first
# entry is the one-hot encoder and the columns it was applied to.
_, fitted_encoder, encoded_columns = fitted_preprocessor.transformers_[0]

for column, known_values in zip(encoded_columns, fitted_encoder.categories_):
    unseen_values = list(set(new_employee_data[column]) - set(known_values))
    if unseen_values:
        print(
            f'Warning: {column} value(s) {unseen_values} were not seen during '
            f'training and are ignored by the model.'
        )

predicted_salary = model.predict(new_employee_data)
print(f'\nPredicted Salary for the new employee: €{predicted_salary[0]:,.2f}')


Predicted Salary for the new employee: €194,890.46
