In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Load the 2024 global salary dataset (path is relative to the notebook's
# working directory) and preview the first five rows.
file_path = 'Global_Salary_Data_2024.csv'
data = pd.read_csv(filepath_or_buffer=file_path)
data.head(n=5)


Unnamed: 0,Country,JobTitle,Category,CostofLivingIndex,RentIndex,CostofLivingPlusRentIndex,GroceriesIndex,RestaurantPriceIndex,LocalPurchasingPowerIndex,SalaryRange
0,Canada,Materials Planner,Automotive,68.4,36.0,53.1,70.7,64.0,83.3,50-60
1,Canada,Field Inspector,Construction / Building / Installation,68.4,36.0,53.1,70.7,64.0,83.3,50-60
2,China,Tax Analyst,Accounting and Finance,34.1,13.4,24.3,37.6,21.0,60.5,50-60
3,China,Corporate Officer,Executive and Management,34.1,13.4,24.3,37.6,21.0,60.5,50-60
4,China,Retail District Manager,Executive and Management,34.1,13.4,24.3,37.6,21.0,60.5,50-60


In [2]:
# Separate the target column from the predictors and hold out 30% of the rows
# as a test split (seeded so the split is reproducible).
y = data['SalaryRange']
X = data.drop(columns='SalaryRange')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Columns routed through each preprocessing branch. Columns listed in neither
# group are not passed on to the model (ColumnTransformer's default
# remainder='drop').
num_cols = [
    'CostofLivingIndex',
    'RentIndex',
    'CostofLivingPlusRentIndex',
    'LocalPurchasingPowerIndex',
]
cat_cols = ['Country', 'JobTitle', 'Category']

# Numeric branch: distance-weighted KNN imputation, then outlier-robust scaling.
numerical_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=7, weights='distance')),
    ('scaler', RobustScaler()),
])

# Categorical branch: mode imputation, then one-hot encoding.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='infrequent_if_exist', drop='first')),
])

# The step names below are referenced by the hyper-parameter grid keys
# ('preprocessor__num_transforms__imputer__...'), so they must not be renamed.
preprocessor = ColumnTransformer(transformers=[
    ('num_transforms', numerical_transformer, num_cols),
    ('cat_transforms', categorical_transformer, cat_cols),
])


In [3]:
# Baseline model: the shared preprocessor followed by a random forest with
# default hyper-parameters. The forest is seeded with the same value used for
# the train/test split so that cross-validation and test metrics are
# reproducible on a fresh "Restart & Run All".
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42))
])

In [4]:
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score, balanced_accuracy_score, log_loss
import warnings
warnings.filterwarnings('ignore')

# Metrics evaluated on every CV fold, referenced by scikit-learn's built-in
# scorer names. These are the same scorers the previous
# make_scorer(..., needs_proba=True) calls built by hand, but the names keep
# working on scikit-learn >= 1.6, where the `needs_proba` argument was removed.
# The dictionary keys are unchanged because later cells rely on them
# (e.g. refit="neg_log_loss").
scoring = {
    'neg_log_loss': 'neg_log_loss',            # negated log loss on predict_proba
    'roc_auc': 'roc_auc_ovo',                  # one-vs-one multiclass ROC AUC
    'accuracy': 'accuracy',
    'balanced_accuracy': 'balanced_accuracy',
}

# 5-fold cross-validation of the baseline pipeline on the training split only;
# the test split stays untouched until the final evaluation.
cv_results = cross_validate(model_pipeline, X_train, y_train, scoring=scoring, cv=5)

# Report the mean of each metric across folds.
for metric in scoring:
    print(f"{metric}: {cv_results['test_' + metric].mean()}")

neg_log_loss: -1.7868617444002548
roc_auc: 0.718853160475802
accuracy: 0.4540758081731764
balanced_accuracy: 0.332020877416422


In [5]:
# Train the baseline pipeline on the training split, then score the held-out
# test split with class probabilities and with hard class labels.
y_pred_proba = model_pipeline.fit(X_train, y_train).predict_proba(X_test)
y_pred = model_pipeline.predict(X_test)

# Metrics computed from hard labels.
accuracy = accuracy_score(y_test, y_pred)
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

# Metrics computed from predicted probabilities.
log_loss_score = log_loss(y_test, y_pred_proba)
roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovo')

print(f"Log loss: {log_loss_score}")
print(f"Accuracy: {accuracy}")
print(f"Balanced accuracy: {balanced_accuracy}")
print(f"ROC AUC: {roc_auc}")

Log loss: 1.7660805849394403
Accuracy: 0.4554735176674452
Balanced accuracy: 0.33418498213423825
ROC AUC: 0.7268828849486194


In [6]:
# Preprocessing hyper-parameters searched for every candidate model. Defined
# once and merged into each model-specific grid so the two grids cannot drift
# apart.
preprocessing_grid = {
    'preprocessor__num_transforms__imputer__n_neighbors': [5, 7],
    'preprocessor__num_transforms__imputer__weights': ['uniform', 'distance'],
    'preprocessor__cat_transforms__imputer__strategy': ['most_frequent', 'constant'],
}

param_grid = [
    {
        **preprocessing_grid,
        # Seeded so that the CV ranking of forest candidates is reproducible.
        'model': [RandomForestClassifier(random_state=42)],
        'model__n_estimators': [100, 200, 300],
        'model__max_depth': [None, 10, 20, 30],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4],
        'model__bootstrap': [True, False]
    },
    {
        **preprocessing_grid,
        'model': [LogisticRegression(max_iter=1000)],
        'model__C': [0.1, 1.0, 10.0],
        'model__penalty': ['l2'],
        'model__solver': ['lbfgs', 'liblinear']
    }
]

# Exhaustive search over both grids with 5-fold CV on the training split.
# All metrics in `scoring` are recorded; the candidate with the best
# 'neg_log_loss' is selected and refit on the whole training split.
grid_search = GridSearchCV(model_pipeline, param_grid, scoring=scoring, cv=5, n_jobs=-1, refit="neg_log_loss")
grid_search.fit(X_train, y_train)

# best_score_ is the mean cross-validated value of the refit metric
# (negated log loss, so closer to 0 is better).
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")

# Evaluate on the test set.
# NOTE: Pipeline.score() delegates to the classifier's default score (mean
# accuracy), so this number is NOT on the same scale as the log-loss based
# cross-validation score printed above.
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print(f"Test set score: {test_score}")





Best parameters: {'model': LogisticRegression(max_iter=1000), 'model__C': 10.0, 'model__penalty': 'l2', 'model__solver': 'lbfgs', 'preprocessor__cat_transforms__imputer__strategy': 'most_frequent', 'preprocessor__num_transforms__imputer__n_neighbors': 5, 'preprocessor__num_transforms__imputer__weights': 'uniform'}
Best cross-validation score: -0.9689534387692369
Test set score: 0.6659007006705342


In [7]:
# `best_model` is GridSearchCV.best_estimator_, which the search already refit
# on the whole training split (refit="neg_log_loss"). It is therefore evaluated
# as-is: fitting it again would only repeat that work and, for a stochastic
# model, would evaluate a different fit than the one the search selected.

# Predict the probabilities
y_pred_proba = best_model.predict_proba(X_test)

# Predict the class labels
y_pred = best_model.predict(X_test)

# Evaluate all metrics on the held-out test split
log_loss_score = log_loss(y_test, y_pred_proba)
accuracy = accuracy_score(y_test, y_pred)
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovo')

print(f"Log loss: {log_loss_score}")
print(f"Accuracy: {accuracy}")
print(f"Balanced accuracy: {balanced_accuracy}")
print(f"ROC AUC: {roc_auc}")


Log loss: 0.9242528432174086
Accuracy: 0.6659007006705342
Balanced accuracy: 0.555759652016011
ROC AUC: 0.8651152866227414


In [8]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Salary brackets in ascending order; this is the order shown on both axes.
# NOTE(review): assumes these strings match the SalaryRange labels exactly as
# they appear in the data -- confirm against data['SalaryRange'].unique().
class_names = ['50-60', '60-70', '70-80', '80-90', '90-100', '100+']

# When `labels` is omitted, confusion_matrix orders rows/columns by sorted
# label, which for these strings puts '100+' first and would not match the
# tick labels above. Passing the display order explicitly keeps the matrix and
# the ticks aligned. Any label that occurs in the data but is missing from
# class_names is appended so that it is shown rather than silently dropped.
observed_labels = set(y_test) | set(y_pred)
tick_labels = class_names + sorted(observed_labels - set(class_names))

# Confusion Matrix (rows: actual class, columns: predicted class)
conf_matrix = confusion_matrix(y_test, y_pred, labels=tick_labels)
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=tick_labels, yticklabels=tick_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

ModuleNotFoundError: No module named 'matplotlib'