# <ins>**Model Development and Tuning**</ins>


---


## **Salary Prediction using different models**


In [13]:
# Required Imports

# General Purpose Libraries
import numpy as np
import pandas as pd

# Machine Learning Models
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.naive_bayes import GaussianNB

# Model Evaluation and Metrics
from sklearn.metrics import (
    f1_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    make_scorer,
)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Data Splitting and Model Tuning
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress Warnings
import warnings
warnings.filterwarnings("ignore")


# Deep Learning
# from tensorflow import keras
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout

In [2]:
# Load the prepared job-postings table; the path is relative to the kernel's working directory
df_job_postings = pd.read_csv("../data/transformed/job_postings_prepared.csv")

In [3]:
# Overview of the frame: column names, non-null counts and dtypes
df_job_postings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121032 entries, 0 to 121031
Data columns (total 33 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   job_id                          121032 non-null  int64  
 1   job_posting_url                 121032 non-null  object 
 2   company_id                      121032 non-null  int64  
 3   name                            121032 non-null  object 
 4   country                         121032 non-null  object 
 5   country_enc                     121032 non-null  int64  
 6   state                           121032 non-null  object 
 7   city                            121032 non-null  object 
 8   title                           121032 non-null  object 
 9   title_enc                       121032 non-null  int64  
 10  description                     121032 non-null  object 
 11  formatted_experience_level      121032 non-null  object 
 12  formatted_experi

In [4]:
# Pearson correlation (pandas default) of each numeric candidate column with the
# salary target; the cell displays the "normalized_salary" column of the matrix.
df_job_postings.loc[
    :,
    [
        "remote_allowed",
        "work_type_enc",
        "company_id",
        "formatted_experience_level_enc",
        "normalized_salary",
        "country_enc",
        "title_enc",
        "experience_derived",
        "workhours_derived",
        "employee_count",
        "follower_count",
        "applies",
        "views",
    ],
].corr().loc[:, "normalized_salary"]

remote_allowed                    0.005464
work_type_enc                    -0.004289
company_id                       -0.005628
formatted_experience_level_enc    0.002822
normalized_salary                 1.000000
country_enc                       0.004896
title_enc                        -0.000540
experience_derived                0.010634
workhours_derived                 0.002314
employee_count                    0.001129
follower_count                   -0.000630
applies                          -0.000559
views                            -0.000971
Name: normalized_salary, dtype: float64

In [5]:
# Number of missing values in each column
df_job_postings.isna().sum()

job_id                               0
job_posting_url                      0
company_id                           0
name                                 0
country                              0
country_enc                          0
state                                0
city                                 0
title                                0
title_enc                            0
description                          0
formatted_experience_level           0
formatted_experience_level_enc       0
work_type                            0
work_type_enc                        0
remote_allowed                       0
normalized_salary                 7834
experience_derived                   0
workhours_derived                    0
currency                             0
views                                0
applies                              0
listed_time                          0
original_listed_time                 0
expiry                               0
url                      

In [6]:
# Regression target and the predictor columns fed to the models in this section
target = "normalized_salary"
features = [
    "experience_derived",
    "workhours_derived",
    "remote_allowed",
    "employee_count",
    "work_type_enc",
    "formatted_experience_level_enc",
    "country_enc",
    "follower_count",
]

# Rows with a missing target cannot be used for supervised training
df_job_postings_filtered = df_job_postings.dropna(subset=[target])

X, y = df_job_postings_filtered[features], df_job_postings_filtered[target]

# Hold out 20% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

def f1_regression(y_true, y_pred, threshold=50000):
    """Score regression output as a binary "at or above threshold" classification.

    Both the true and the predicted values are binarised with ``value >= threshold``
    and the two resulting label vectors are compared with ``f1_score``.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, same length as ``y_true``.
    threshold : float, default 50000
        Cut-off (in the target's own unit); values at or above it form the
        positive class.

    Returns
    -------
    float
        F1 score of the positive class.
    """
    # Coerce to NumPy first so plain lists/tuples work as well as Series/ndarrays
    y_true_binary = (np.asarray(y_true) >= threshold).astype(int)
    y_pred_binary = (np.asarray(y_pred) >= threshold).astype(int)
    return f1_score(y_true_binary, y_pred_binary)


# Scorer object so GridSearchCV can rank candidates by the thresholded F1
f1_scorer = make_scorer(f1_regression)

In [7]:
# Baseline: ordinary least squares; the only tunable choice is whether to fit an intercept
model = LinearRegression()
param_grid = {"fit_intercept": [True, False]}

# 5-fold cross-validated grid search, candidates ranked by the thresholded F1 scorer
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=5,
)
grid_search.fit(X_train, y_train)

# Winning configuration and the estimator refitted with it
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Score the refitted estimator on the held-out split
y_pred = best_model.predict(X_test)
f1 = f1_regression(y_test, y_pred)

print(f"Best Hyperparameters: {best_params}")
print(f"F1 Score: {f1}")

Best Hyperparameters: {'fit_intercept': True}
F1 Score: 0.8462088332907838


In [8]:
# Lasso: linear regression with an L1 penalty
model = Lasso()

# Search space: 7 penalty strengths x 3 iteration caps
param_grid = {
    "alpha": [0.001, 0.01, 0.1, 0.5, 1, 5, 10],  # Regularization strength
    "max_iter": [1000, 5000, 10000],  # Maximum number of iterations
}

# f1_regression / f1_scorer are defined once in the data-preparation cell and are
# reused here rather than redefined in this cell.
grid_search = GridSearchCV(model, param_grid, scoring=f1_scorer, cv=5)
grid_search.fit(X_train, y_train)

# Best estimator (refitted on the whole training split) and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Held-out predictions
y_pred = best_model.predict(X_test)

# Thresholded F1 on the test split
f1 = f1_regression(y_test, y_pred)

# Standard regression metrics (computed for inspection; not printed by default)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print results
print(f"Best Hyperparameters: {best_params}")
print(f"F1 Score (Threshold): {f1}")
# print(f"Mean Absolute Error (MAE): {mae}")
# print(f"Mean Squared Error (MSE): {mse}")
# print(f"R² Score: {r2}")


Best Hyperparameters: {'alpha': 10, 'max_iter': 1000}
F1 Score (Threshold): 0.8462088332907838


In [9]:
# Ridge: linear regression with an L2 penalty.
# The "sag" and "saga" solvers shuffle the data, so a fixed seed is needed for the
# grid search to give the same result on every run.
model = Ridge(random_state=42)

# Search space: 6 penalty strengths x 5 solvers
param_grid = {
    "alpha": [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strengths
    "solver": ["svd", "cholesky", "lsqr", "sag", "saga"],  # Solver options
}

# f1_regression / f1_scorer are defined once in the data-preparation cell and are
# reused here rather than redefined in this cell.
grid_search = GridSearchCV(model, param_grid, scoring=f1_scorer, cv=5)
grid_search.fit(X_train, y_train)

# Best estimator (refitted on the whole training split) and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Held-out predictions
y_pred = best_model.predict(X_test)

# Thresholded F1 on the test split
f1 = f1_regression(y_test, y_pred)

# Standard regression metrics (computed for inspection; not printed by default)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print results
print(f"Best Hyperparameters: {best_params}")
print(f"F1 Score (Threshold): {f1}")
# print(f"Mean Absolute Error (MAE): {mae}")
# print(f"Mean Squared Error (MSE): {mse}")
# print(f"R² Score: {r2}")

Best Hyperparameters: {'alpha': 0.001, 'solver': 'lsqr'}
F1 Score (Threshold): 0.8474852372225616


In [10]:
# ElasticNet: linear regression with a blended L1/L2 penalty
model = ElasticNet()

# Search space for the grid search
param_grid = dict(
    alpha=[0.001, 0.01, 0.1, 0.5, 1, 5, 10, 100],  # overall penalty strength
    l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9],  # share of the penalty that is L1
    max_iter=[1000, 5000, 10000],  # iteration cap for the solver
)

# 5-fold cross-validated search ranked by the thresholded F1 scorer
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=5,
)
grid_search.fit(X_train, y_train)

# Winning configuration and the estimator refitted with it
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Held-out predictions and the thresholded F1 on them
y_pred = best_model.predict(X_test)
f1 = f1_regression(y_test, y_pred)

# Standard regression metrics; kept in the namespace but not printed
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report
print(f"Best Hyperparameters: {best_params}")
print(f"F1 Score (Threshold): {f1}")

Best Hyperparameters: {'alpha': 100, 'l1_ratio': 0.1, 'max_iter': 1000}
F1 Score (Threshold): 0.8466554587599979


In [11]:
# Single regression tree; the seed fixes tie-breaking and feature sub-sampling
model = DecisionTreeRegressor(random_state=42)

# Search space: tree depth, split/leaf size limits and features considered per split
param_grid = dict(
    max_depth=[None, 5, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=[None, "sqrt", "log2"],
)

# 5-fold cross-validated search ranked by the thresholded F1 scorer
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=5,
)
grid_search.fit(X_train, y_train)

# Winning configuration and the estimator refitted with it
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Held-out predictions and the thresholded F1 on them
y_pred = best_model.predict(X_test)
f1 = f1_regression(y_test, y_pred)

# Standard regression metrics; kept in the namespace but not printed
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report
print(f"Best Hyperparameters: {best_params}")
print(f"F1 Score (Threshold): {f1}")

Best Hyperparameters: {'max_depth': None, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 10}
F1 Score (Threshold): 0.8865240282289683


In [None]:
# Random forest regressor; the seed makes the bootstrap/feature sampling reproducible
model = RandomForestRegressor(random_state=42)
param_grid = {
    "n_estimators": [100, 200, 300],  # Number of trees in the forest
    "max_depth": [None, 10, 20, 30],  # Maximum depth of the trees
    "min_samples_split": [2, 5, 10],  # Minimum number of samples required to split an internal node
    "min_samples_leaf": [1, 2, 4],  # Minimum number of samples required to be at a leaf node
}

# 108 candidates x 5 folds = 540 forest fits: run them in parallel on all cores
# (n_jobs=-1), the same setting the XGBoost search in this notebook uses.
# Parallelism does not change which candidate wins.
grid_search = GridSearchCV(model, param_grid, scoring=f1_scorer, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best estimator (refitted on the whole training split) and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Evaluate on the held-out split with the thresholded F1
y_pred = best_model.predict(X_test)
f1 = f1_regression(y_test, y_pred)

print(f"Best Hyperparameters: {best_params}")
print(f"F1 Score: {f1}")

In [14]:
# Naive Bayes is a classifier, so the salary target is discretised into (up to) ten
# quantile bins; duplicate bin edges are dropped, which may leave fewer than ten bins.
y_bins = pd.qcut(y, q=10, labels=False, duplicates="drop")

# Split features, bin labels and raw salaries in ONE call so all three are guaranteed
# to share the same row assignment (raw salaries are needed for evaluation).
X_train, X_test, y_train_bins, y_test_bins, y_train, y_test = train_test_split(
    X, y_bins, y, test_size=0.2, random_state=42
)

# Model and parameter grid
model = GaussianNB()
param_grid = {
    "var_smoothing": np.logspace(-9, -5, 10)  # Smoothing parameter
}

# Tune on classification accuracy over the bin labels
grid_search = GridSearchCV(model, param_grid, scoring="accuracy", cv=5)
grid_search.fit(X_train, y_train_bins)

# Best model and its predicted bin for each test row
best_model = grid_search.best_estimator_
predicted_classes = best_model.predict(X_test)

# Turn each predicted bin back into a salary: the median training salary of that bin.
# The classifier can only predict bins it saw in training, so every lookup succeeds.
bin_medians = y_train.groupby(y_train_bins).median()
y_pred = pd.Series(predicted_classes).map(bin_medians).to_numpy()

# Thresholded F1; f1_regression is defined once in the data-preparation cell and is
# reused here rather than redefined in this cell.
f1 = f1_regression(y_test, y_pred)

# Standard regression metrics (computed for inspection; not printed by default)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Results
print(f"Best Hyperparameters: {grid_search.best_params_}")
print(f"F1 Score (Threshold): {f1}")
# print(f"Mean Absolute Error (MAE): {mae}")
# print(f"Mean Squared Error (MSE): {mse}")
# print(f"R² Score: {r2}")

Best Hyperparameters: {'var_smoothing': np.float64(5.99484250318941e-08)}
F1 Score (Threshold): 0.14114557689151486


In [15]:
# KNN is distance-based, so features must be standardised. The scaler lives inside a
# Pipeline so that, during cross-validation, it is fitted on each training fold only;
# scaling the whole training split up front would leak validation-fold statistics
# into the scores used to pick the hyperparameters.
model = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("knn", KNeighborsRegressor()),
    ]
)

# Pipeline parameters are addressed as "<step>__<parameter>"
param_grid = {
    "knn__n_neighbors": [3, 5, 7, 9, 11],
    "knn__weights": ["uniform", "distance"],
    "knn__p": [1, 2],
}

# Perform GridSearchCV on the unscaled features; the pipeline scales internally
grid_search = GridSearchCV(model, param_grid, scoring=f1_scorer, cv=5)
grid_search.fit(X_train, y_train)

# Best pipeline (scaler + KNN, refitted on the whole training split)
best_model = grid_search.best_estimator_
# Drop the "knn__" step prefix so the report shows the estimator's own parameter names
best_params = {
    name.split("__", 1)[1]: value
    for name, value in grid_search.best_params_.items()
}

# Predictions and Evaluation (the pipeline applies the fitted scaler to X_test)
y_pred = best_model.predict(X_test)

# Thresholded F1 on the test split
f1 = f1_regression(y_test, y_pred)

# Standard regression metrics (computed for inspection; not printed by default)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print results
print(f"Best Hyperparameters: {best_params}")
print(f"F1 Score (Threshold): {f1}")
# print(f"Mean Absolute Error (MAE): {mae}")
# print(f"Mean Squared Error (MSE): {mse}")
# print(f"R² Score: {r2}")

Best Hyperparameters: {'n_neighbors': 11, 'p': 2, 'weights': 'distance'}
F1 Score (Threshold): 0.8958917948142328


In [16]:
# Standardise the features: statistics are learned from the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Gradient-boosted trees with a fixed seed
model = XGBRegressor(random_state=42)

# Search space (3^5 = 243 candidates)
param_grid = dict(
    n_estimators=[100, 200, 300],  # number of boosting rounds
    learning_rate=[0.01, 0.1, 0.2],  # step-size shrinkage, used to prevent overfitting
    max_depth=[3, 5, 7],  # maximum depth of each tree
    subsample=[0.8, 0.9, 1.0],  # fraction of training rows sampled per tree
    colsample_bytree=[0.8, 0.9, 1.0],  # fraction of columns sampled per tree
)

# 5-fold cross-validated search on all cores, ranked by the thresholded F1 scorer
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)

# Winning configuration and the estimator refitted with it
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Held-out predictions and the thresholded F1 on them
y_pred = best_model.predict(X_test_scaled)
f1 = f1_regression(y_test, y_pred)

# Standard regression metrics; kept in the namespace but not printed
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report
print(f"Best Hyperparameters: {best_params}")
print(f"F1 Score (Threshold): {f1}")

Best Hyperparameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.9}
F1 Score (Threshold): 0.8475163748502689


In [None]:
# Feed-forward neural network regressor for normalized_salary.
#
# NOTE(review): Sequential, Dense and Dropout are used below, but the Keras imports in
# the imports cell are commented out, so this cell raises NameError as written.
# NOTE(review): this cell rebinds features, X, y and the train/test split using a
# different feature list from the earlier models ("location_enc" is used instead of
# "country_enc", and "remote_allowed" is not included).
# NOTE(review): "location_enc" is not among the columns visible in the (truncated)
# info() output above -- confirm the column exists.
# NOTE(review): the features are passed to the network unscaled -- confirm intended.
features = [
    "experience_derived",
    "workhours_derived",
    "employee_count",
    "work_type_enc",
    "formatted_experience_level_enc",
    "location_enc",
    "follower_count",
]  # Predictor columns for the network
target = "normalized_salary"

# Before splitting, remove rows with NaN in the target variable
df_job_postings_final = df_job_postings.dropna(subset=[target])

X = df_job_postings_final[features]
y = df_job_postings_final[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)  # 80/20 split with a fixed seed

# 1. Build the Neural Network Model
model = Sequential()
model.add(
    Dense(64, activation="relu", input_shape=(X_train.shape[1],))
)  # First dense layer: 64 ReLU units; input width = number of feature columns
model.add(Dropout(0.2))  # Dropout layer for regularization (rate 0.2)
model.add(Dense(32, activation="relu"))  # Hidden layer with 32 ReLU units
model.add(Dropout(0.2))
model.add(Dense(1))  # Output layer with 1 unit and no activation (for regression)

# 2. Compile the Model
model.compile(
    optimizer="adam", loss="mean_squared_error"
)  # Using Adam optimizer and MSE loss

# 3. Train the Model
model.fit(
    X_train, y_train, epochs=100, batch_size=32, verbose=1
)  # Adjust epochs and batch size as needed

# 4. Model Evaluation
y_pred = model.predict(X_test)
y_pred = y_pred.flatten()  # Flatten the predictions to 1-D so they line up with y_test


# Thresholded F1 for regression output.
# NOTE(review): this repeats the f1_regression definition from the data-preparation
# cell (same body) and shadows it.
def f1_regression(y_true, y_pred, threshold=50000):  # Adjust threshold as needed
    """Binarise both inputs with ``>= threshold`` and return their F1 score."""
    y_pred_binary = (y_pred >= threshold).astype(int)
    y_true_binary = (y_true >= threshold).astype(int)
    return f1_score(y_true_binary, y_pred_binary)


f1 = f1_regression(y_test, y_pred)  # Use the adapted F1 score for regression

print(f"F1 Score: {f1}")

## **Job Recommendation**