In [45]:
import pandas as pd
import numpy as np

# Importing and loading the processed data

data_path = '../data/processed/Cleaned_Data.csv'
data = pd.read_csv(data_path)

# Removing the columns that are not needed for the model
data = data.drop(columns=['GroceriesIndex', 'RestaurantPriceIndex', 'SalaryUSD', 'SalaryRoundedUSD'])

data

Unnamed: 0,Country,JobTitle,Category,CostofLivingIndex,RentIndex,CostofLivingPlusRentIndex,LocalPurchasingPowerIndex,SalaryRangeinK
0,Afghanistan,Cardiovascular Specialist,Health and Medical,22.8,2.8,13.3,22.4,50-60
1,Afghanistan,Chief of Surgery,Health and Medical,22.8,2.8,13.3,22.4,60-70
2,Afghanistan,Invasive Cardiologist,Health and Medical,22.8,2.8,13.3,22.4,50-60
3,Afghanistan,Physician - Cardiology,Health and Medical,22.8,2.8,13.3,22.4,50-60
4,Afghanistan,Surgeon - Cardiothoracic,Health and Medical,22.8,2.8,13.3,22.4,50-60
...,...,...,...,...,...,...,...,...
88480,Venezuela,Telecommunication Service Delivery Manager,Telecommunication,39.4,5.7,23.4,12.6,50-60
88481,Venezuela,Telecommunication Solution Architect,Telecommunication,39.4,5.7,23.4,12.6,50-60
88482,Venezuela,Telecommunications Analyst,Telecommunication,39.4,5.7,23.4,12.6,50-60
88483,Venezuela,Telecommunications Assistant Manager,Telecommunication,39.4,5.7,23.4,12.6,50-60


In [49]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestClassifier

# Splitting the data into features and target
X = data.drop('SalaryRangeinK', axis=1)
y = data['SalaryRangeinK']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Defining numeriacal and categorical columns
num_cols = ['CostofLivingIndex', 'RentIndex', 'CostofLivingPlusRentIndex', 'LocalPurchasingPowerIndex']
cat_cols = ['Country', 'JobTitle', 'Category']
# Defining the numerical and categorical transformers
numerical_transformer = Pipeline([('imputer', KNNImputer(n_neighbors=7, weights='distance')), ('scaler', RobustScaler())])
categorical_transformer = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')), ('encoder', OneHotEncoder(handle_unknown='infrequent_if_exist', drop='first'))])
# Building the preprocessor
preprocessor = ColumnTransformer(transformers=[('num_transforms', numerical_transformer, num_cols), ('cat_transforms', categorical_transformer, cat_cols)])
# Displaying the preprocessor
preprocessor


In [50]:
# Creating the model pipeline
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier())
])
model_pipeline

In [55]:
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score, balanced_accuracy_score, log_loss
# Defining the scoring metrics
scoring = {
    'neg_log_loss': make_scorer(log_loss, greater_is_better=False, needs_proba=True),
    'roc_auc': make_scorer(roc_auc_score, multi_class='ovo', needs_proba=True), 
    'accuracy': make_scorer(accuracy_score),
    'balanced_accuracy': make_scorer(balanced_accuracy_score)
}

# Cross validating the model
cv_results = cross_validate(model_pipeline, X_train, y_train, scoring=scoring, cv=5)



In [56]:
# Create a dataframe from the cv_results dictionary
results_df = pd.DataFrame(cv_results)

# Sort the dataframe by the 'test_neg_log_loss' column in ascending order
sorted_results_df = results_df.sort_values('test_neg_log_loss')

# Display the sorted dataframe
print(sorted_results_df)

    fit_time  score_time  test_neg_log_loss  test_roc_auc  test_accuracy  \
3  26.207441    1.970603          -1.342091      0.691453       0.550046   
0  27.128890    2.118309          -1.325424      0.690086       0.556364   
2  26.748992    2.491123          -1.318178      0.690945       0.550572   
1  26.153416    2.008344          -1.264944      0.678158       0.548948   
4  26.685874    2.203571          -1.263642      0.687170       0.552377   

   test_balanced_accuracy  
3                0.339193  
0                0.344625  
2                0.337550  
1                0.338161  
4                0.338758  
