<a href="https://colab.research.google.com/github/cbonnin88/Compensation_Analysis/blob/main/income_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [128]:
import polars as pl
import pandas as pd
import polars.selectors as cs
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import plotly.express as px
import numpy as np
import gdown as gd

In [129]:
# Fetch the synthetic employee dataset from Google Drive into the working directory.
url = 'https://drive.google.com/uc?id=1pdVt8l0Scn-AzHuom_mztZuUmv-Xiv0f'
downloaded_path = gd.download(url,'synthetic_employee_data.csv',quiet=True)

# quiet=True suppresses gdown's progress/diagnostic output, so a failed fetch
# would otherwise only surface later as a confusing "file not found" on read.
# NOTE(review): depending on the installed gdown version a failure may return
# None instead of raising -- this guard covers the None case.
if downloaded_path is None:
    raise RuntimeError(f'Could not download the dataset from {url}')

# Columns that must be parsed as strings; every listed name maps to pl.Utf8.
enforced_dtypes = dict.fromkeys(
    ('gender', 'education_level', 'job_title', 'department', 'city', 'country'),
    pl.Utf8,
)

In [130]:
# Read the downloaded CSV, applying the string dtype overrides defined earlier.
income_data = pl.read_csv(
    'synthetic_employee_data.csv',
    schema_overrides=enforced_dtypes,
)

# Preview the first rows as the cell output.
income_data.head()

age,gender,education_level,job_title,department,years_of_experience,performance_rating,hours_per_week,city,country,salary
i64,str,str,str,str,i64,i64,i64,str,str,f64
59,"""Male""","""PhD""","""Compensation Analyst""","""Sales""",35,2,43,"""London""","""United Kingdom""",335451.51
49,"""Male""","""PhD""","""Senior Data Scientist""","""Finance""",26,2,37,"""Paris""","""France""",378571.95
35,"""Male""","""Bachelors""","""Junior Data Analyst""","""Finance""",11,2,34,"""Copenhagen""","""Denmark""",107713.8
63,"""Female""","""Bachelors""","""Sales Representative""","""Human Resources""",37,3,39,"""Paris""","""France""",228020.41
28,"""Male""","""Masters""","""Talent Acquisition Specialst""","""Product""",6,3,39,"""Remote""","""Denmark""",42083.32


In [131]:
# Median of the 'salary' column; reported here with thousands separators.
median_salary = income_data.get_column('salary').median()
print(f'The median salary in the dataset is: €{median_salary:,.2f}')

The median salary in the dataset is: €140,849.92


In [132]:
# Add the binary target 'income_above_median':
# 1 when salary is strictly greater than median_salary, 0 in every other case.
income_data = income_data.with_columns(
    pl.when(pl.col('salary') > median_salary)
    .then(1)       # non-expression inputs are treated as literals
    .otherwise(0)
    .alias('income_above_median')
)

In [133]:
# Split the frame into features (X) and target (y).
# 'salary' is removed from the features together with the target column,
# because the target is computed directly from it.
X = income_data.drop(['salary', 'income_above_median'])
y = income_data.select(pl.col('income_above_median'))

In [134]:
# Derive the feature lists from the Polars schema of X:
# numeric-typed columns first, string-typed columns second.
numerical_features, categorical_features = (
    X.select(selector).columns for selector in (cs.numeric(), cs.string())
)

In [135]:
# Show which columns landed in each feature group (one line per group).
print(
    f'Identified Numerical Features: {numerical_features}',
    f'Identified Categorical Features: {categorical_features}',
    sep='\n',
)

Identified Numerical Features: ['age', 'years_of_experience', 'performance_rating', 'hours_per_week']
Identified Categorical Features: ['gender', 'education_level', 'job_title', 'department', 'city', 'country']


In [136]:
# Convert the Polars frames to pandas objects for use with the scikit-learn pipeline.
X_pd = X.to_pandas()
# Squeeze only the column axis so the single-column target always becomes a Series.
# A bare .squeeze() collapses every length-1 axis, which would turn a one-row
# frame into a scalar instead of a Series.
y_pd = y.to_pandas().squeeze(axis='columns')

In [137]:
# Numerical columns are standardised with StandardScaler (default settings).
numeric_transformer = StandardScaler()

In [138]:
# Categorical columns are one-hot encoded; handle_unknown='ignore' keeps
# transform from raising on categories that were not present during fit.
categorical_transformer = OneHotEncoder(
    handle_unknown='ignore',
)

In [139]:
# Route each feature group to its transformer. The order here ('num' then 'cat')
# is also the order of the columns the downstream classifier receives.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
)

In [140]:
# Logistic regression classifier; its coefficients are inspected later in the
# notebook. The iteration cap is raised to 1000.
model = LogisticRegression(
    max_iter=1000,
)

In [141]:
# Chain preprocessing and the classifier into a single estimator so both are
# fitted and applied together.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', model),
    ],
)

In [142]:
# Hold out 20% of the pandas data for testing; the fixed random_state makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_pd,
    y_pd,
    test_size=0.2,
    random_state=42,
)

In [143]:
# Fit the preprocessing steps and the classifier on the training split.
# Left as the cell's last expression so the fitted pipeline is displayed.
pipeline.fit(X_train, y_train)

In [144]:
# Predict the class label for every row of the test split.
y_pred = pipeline.predict(X_test)

In [149]:
# Share of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Model Accuracy: {accuracy:.2f}')

Model Accuracy: 0.77


In [153]:
# Per-class precision, recall and F1 on the test split.
# (Heading spelling corrected: "Classification".)
print('Classification Report')
print(classification_report(y_test,y_pred))

Classifcation Report
              precision    recall  f1-score   support

           0       0.77      0.76      0.77      2494
           1       0.77      0.77      0.77      2506

    accuracy                           0.77      5000
   macro avg       0.77      0.77      0.77      5000
weighted avg       0.77      0.77      0.77      5000



# **An interactive confusion matrix using Plotly**

In [154]:
# Counts of true labels versus predicted labels on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [174]:
fig_income = px.imshow(
    cm,
    text_auto = True,
    labels=dict(
        x='Predicted Label', y='True Label', color='Count'),
        x=['Below Median', 'Above Median'],
        y=['Below Median','Above Median'],
        color_continuous_scale = px.colors.sequential.Viridis
    )
fig_income.update_layout(
    title_text='Confusion Matrix',
    title_x=0.5,
    coloraxis_showscale=False
)



fig_income.show()

In [176]:
# Rebuild the column names the classifier was trained on: the numerical names
# as-is, followed by the expanded one-hot names from the fitted 'cat' encoder.
# This mirrors the 'num' then 'cat' order declared in the ColumnTransformer.
one_hot_feature_names = (
    pipeline.named_steps['preprocessor']
    .named_transformers_['cat']
    .get_feature_names_out(categorical_features)
)
all_feature_names = np.concatenate((numerical_features, one_hot_feature_names))

In [178]:
# Pull the fitted weights out of the logistic regression step;
# the first row of coef_ is taken as the per-feature coefficient vector.
classifier_step = 'classifier'
coefficients = pipeline.named_steps[classifier_step].coef_[0]
del classifier_step

In [182]:
# Pair every feature name with its coefficient in a two-column pandas frame.
feature_importance = pd.DataFrame(
    {
        'feature': all_feature_names,
        'coefficient': coefficients,
    }
)

In [188]:
# Collect the ten largest and the ten smallest coefficients (rounded to two
# decimals) into one frame for plotting.
top_10 = feature_importance.sort_values('coefficient', ascending=False).head(10).round(2)
bottom_10 = feature_importance.sort_values('coefficient').head(10).round(2)
plot_df = pd.concat((top_10, bottom_10))

In [189]:
# Tag each row by the sign of its coefficient: strictly positive values are
# 'Positive', everything else (including zero) is 'Negative'.
plot_df['influence'] = np.where(plot_df['coefficient'].gt(0), 'Positive', 'Negative')

In [190]:
print('Feature Importance Chart:\n')

# Horizontal bar chart of the selected coefficients, coloured by sign.
fig_importance = px.bar(
    plot_df.sort_values(by='coefficient'),
    x='coefficient',
    y='feature',
    orientation='h',
    color='influence',
    color_discrete_map={'Positive': '#77dd77', 'Negative': '#ff6961'},
    labels={'coefficient':'Coefficient (Impact on Salary)','feature':'Feature'},
    title='Top Features Influencing Salary Prediction',
)

# Centre the title and order the bars by their coefficient totals.
fig_importance.update_layout(
    title_x=0.5,
    yaxis={'categoryorder': 'total ascending'},
)
fig_importance.show()

Feature Importance Chart:



In [192]:
# Tabular view of the ten largest coefficients, rounded to two decimals.
print('Top Ten Features Influencing an Above-Median Salary\n')
display(feature_importance.sort_values('coefficient', ascending=False).head(10).round(2))

Top Ten Features Influencing an Above-Median Salary


Unnamed: 0,feature,coefficient
11,job_title_Chief of Product,2.6
34,job_title_Vice President of Data,2.26
15,job_title_Director of Sales,1.67
10,education_level_PhD,1.32
1,years_of_experience,1.02
22,job_title_Principal Engineer,0.94
19,job_title_Lead Frontend Engineer,0.92
44,city_London,0.81
27,job_title_Sales Manager,0.8
20,job_title_Lead Product Analyst,0.7


In [193]:
# Tabular view of the ten smallest (most negative) coefficients, rounded to
# two decimals. (Heading spelling corrected: "Features".)
print('Top Ten Features Influencing a Below-Median Salary\n')
display(feature_importance.sort_values(by='coefficient',ascending=True).round(2).head(10))

Top Ten Featurs Influencing a Below-Median Salary



Unnamed: 0,feature,coefficient
18,job_title_Junior Data Analyst,-2.12
13,job_title_Data Analyst,-1.55
8,education_level_High School,-1.35
17,job_title_Frontend Engineer,-1.27
32,job_title_Talent Acquisition Specialst,-1.17
25,job_title_Product Owner,-1.16
33,job_title_UX/UI Designer,-0.98
46,city_Remote,-0.78
16,job_title_Financial and Planning Analyst,-0.72
23,job_title_Product Analyst,-0.67
