In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import re
from nltk.tokenize.regexp import RegexpTokenizer
warnings.filterwarnings("ignore")


In [None]:
# Load the salary dataset from a CSV path relative to the notebook's working directory.
data = pd.read_csv("data/salary_data_cleaned.csv")
# Preview the first 10 rows as a quick sanity check of the load.
data.head(10)


In [None]:
#### Cleaning
def clean_text(text : str) -> str:
    r"""
    Normalise raw text: strip unwanted characters, then lower-case the result.

    Every character that is not an ASCII letter, a digit, whitespace
    (newlines included) or an apostrophe (straight ' or curly ’) is removed.

    Args:
        text (str): The input text to clean.
    Returns:
        str: The filtered, lower-cased text.
    """
    # Negated character class: anything outside the allowed set is deleted.
    disallowed = r"[^a-zA-Z0-9\s\n'’]"
    return re.sub(disallowed, "", text).lower()

# Overwrite the description column with its cleaned, lower-cased form (element-wise).
data["Job Description"] = data["Job Description"].map(clean_text)
data.head(10)

In [None]:
### Tokenization
# Token pattern; the alternatives are tried left to right at each position:
#   1. a run of ASCII letters, optionally followed by a curly apostrophe (’) and more letters
#   2. a dollar sign followed by digits and/or dots
#   3. any other run of non-whitespace characters (fallback)
# NOTE(review): clean_text removes "$" and "." before this runs, so alternative 2
# presumably never matches on the cleaned column — confirm.
tokenizer = RegexpTokenizer(r"[A-Za-z]+(?:’[A-Za-z]+)?|\$[\d\.]+|\S+")

# Tokenize each cleaned job description into a list of tokens.
data["tokens"] = data["Job Description"].apply(tokenizer.tokenize)



In [None]:
### Stop words

from nltk.corpus import stopwords

# NLTK's English stop words, extended with a curly-apostrophe (’) variant of every
# entry that contains a straight apostrophe, so both spellings are filtered.
stop_words = stopwords.words("english")
stop_words = stop_words + [word.replace("'", "’") for word in stop_words if "'" in word]

# Membership is tested once per token, so look words up in a set (constant time)
# instead of scanning the list. `stop_words` itself stays a list for other cells.
stop_word_set = set(stop_words)

data["tokens_clean"] = data["tokens"].apply(
    lambda tokens: [token for token in tokens if token not in stop_word_set]
)
data["tokens_clean"]

In [None]:
#### Lemmatization

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Lemmatize every remaining token, always treating it as a verb (pos="v").
data["lemmas"] = data["tokens_clean"].apply(lambda x: [lemmatizer.lemmatize(i, pos = "v") for i in x])
# Display the resulting column of lemma lists.
data["lemmas"]

In [None]:
### Combining all

# Re-assemble each lemma list into a single space-separated string.
data["final_text"] = data["lemmas"].apply(" ".join)

In [None]:
### Machine learning model

from sklearn.model_selection import train_test_split

# Features are the preprocessed description strings; the target is the `avg_salary` column.
x = data["final_text"]
y = data["avg_salary"]

# First split: hold out 20% of the rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123)
# Second split: take 10% of the remaining training rows as a validation set
# (later wrapped in the pool used as the early-stopping eval set).
x_training , x_val, y_training, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=123)
# Row counts of the final training, test and validation partitions.
print(x_training.shape, x_test.shape, x_val.shape)

In [None]:
### Vectorizing

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with the vocabulary capped at 1000 features.
vect = TfidfVectorizer(max_features=1000)

# The vocabulary/IDF weights are fitted on the training partition only;
# validation and test texts are merely transformed with that fitted state.
x_train_vect = vect.fit_transform(x_training)
x_val_vect = vect.transform(x_val)
x_test_vect = vect.transform(x_test)

In [None]:
### Modeling


from catboost import CatBoostRegressor, Pool
# Hyperparameters of the baseline regressor.
params = dict(
    learning_rate=0.1,
    iterations=10000,
    depth=5,
    verbose=False,
    l2_leaf_reg=0.1,
)
model = CatBoostRegressor(**params)

# Wrap the TF-IDF matrices and their targets in CatBoost pools; the validation
# pool is the reference set for early stopping.
train_pool = Pool(x_train_vect, y_training)
eval_pool = Pool(x_val_vect, y_val)

model.fit(train_pool, eval_set=eval_pool, early_stopping_rounds=100)

In [None]:
### Evaluating model

from sklearn.metrics import mean_absolute_error, mean_squared_error

# Score the fitted baseline model on the held-out test matrix.
preds = model.predict(x_test_vect)
print("MAE:", mean_absolute_error(y_test, preds))
print("MSE:", mean_squared_error(y_test, preds))

In [None]:
### Analyzing errors

import seaborn as sns

# Predicted vs. actual salaries; the red y = x line marks perfect predictions.
sns.scatterplot(x=y_test, y=preds)
sns.lineplot(x=y_test, y=y_test, color="red")
plt.gca().set(xlabel="Actual", ylabel="Predicted", title="Error analysis")
plt.show()

In [None]:
### Cross validation

from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

# 5-fold cross-validation over the rows of the training matrix.
kf = KFold(n_splits=5)

# Densify the sparse TF-IDF matrix and pull out the target array once, up front:
# both are loop-invariant, and converting inside the loop repeated the full
# sparse-to-dense copy three times per fold.
x_train_dense = x_train_vect.toarray()
y_training_values = y_training.values

mae_cross_validation = []           # MAE on each held-out fold
mse_cross_validation = []           # MSE on each held-out fold
mae_training_cross_validation = []  # MAE on the rows the fold's model was fitted on
for train_index, test_index in kf.split(x_train_dense):
    x_fold_train, y_fold_train = x_train_dense[train_index], y_training_values[train_index]
    x_fold_test, y_fold_test = x_train_dense[test_index], y_training_values[test_index]

    # Refit the shared `model` on this fold; the validation pool drives early stopping.
    model.fit(x_fold_train, y_fold_train, early_stopping_rounds=100, eval_set=eval_pool)
    preds = model.predict(x_fold_test)
    preds_training = model.predict(x_fold_train)

    mae = mean_absolute_error(y_fold_test, preds)
    mae_training = mean_absolute_error(y_fold_train, preds_training)
    mae_cross_validation.append(mae)
    mae_training_cross_validation.append(mae_training)
    mse = mean_squared_error(y_fold_test, preds)
    mse_cross_validation.append(mse)

In [None]:
### Analyzing cross validation results

# Per-fold MAE on the held-out rows, then the matching MAE on the rows used for fitting.
print(mae_cross_validation)
print(mae_training_cross_validation)

In [None]:
### Hyperparameter tuning
import optuna


def objective(trial):
    """
    Optuna objective: train/hold-out MAE gap of a CatBoost regressor under 5-fold CV.

    Samples a hyperparameter set from `trial`, fits the model on each fold of the
    training matrix (early stopping against `eval_pool`) and compares the mean MAE
    on the fitted rows with the mean MAE on the held-out rows.

    Args:
        trial: The Optuna trial used to sample hyperparameters.
    Returns:
        float: Absolute difference between the mean in-fold (training) MAE and
        the mean held-out MAE.

    NOTE(review): the returned value measures only the gap, not the error level,
    so a model that is equally poor on both sets scores well — confirm intended.
    """
    # Define the hyperparameter search space
    params = {
        "iterations" : 10000,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10.0, log=True),
        "random_strength": trial.suggest_float("random_strength", 1e-3, 10.0, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "verbose": False,  # Disable logging
        "random_seed": 123,  # Set a fixed random seed for reproducibility
    }
    model = CatBoostRegressor(**params)
    kf = KFold(n_splits=5)

    # Loop-invariant: densify the sparse matrix and extract the targets once
    # rather than three times per fold.
    x_dense = x_train_vect.toarray()
    y_values = y_training.values

    mae_cross_validation = []
    mae_training_cross_validation = []
    for train_index, test_index in kf.split(x_dense):
        x_fold_train, y_fold_train = x_dense[train_index], y_values[train_index]
        x_fold_test, y_fold_test = x_dense[test_index], y_values[test_index]

        model.fit(x_fold_train, y_fold_train, early_stopping_rounds=100, eval_set=eval_pool)

        mae_cross_validation.append(
            mean_absolute_error(y_fold_test, model.predict(x_fold_test))
        )
        mae_training_cross_validation.append(
            mean_absolute_error(y_fold_train, model.predict(x_fold_train))
        )

    mean_training_mae = np.mean(mae_training_cross_validation)
    mean_holdout_mae = np.mean(mae_cross_validation)
    return abs(mean_training_mae - mean_holdout_mae)

# Seed the sampler so the search proposes the same sequence of trials on every
# run (the seed value matches the one used elsewhere in this notebook).
study = optuna.create_study(
    direction="minimize",  # Minimize overfitting
    sampler=optuna.samplers.TPESampler(seed=123),
)
study.optimize(objective, n_trials=20)

In [None]:
# Hard-coded hyperparameters for the tuned model; presumably copied from a
# previous Optuna run's `study.best_params` — confirm.
# NOTE(review): unlike the search space in `objective`, no "iterations" key is set
# here, so the library default applies — confirm that is intended.
best_params = {
    'learning_rate': 0.010004323405318344,
     'depth': 4,
     'l2_leaf_reg': 0.49946428453675523,
     'random_strength': 0.10560927281613303,
     'bagging_temperature': 0.27154199895296605,
     'border_count': 34,
     'min_data_in_leaf': 45,
    "verbose": False,  # Disable logging
    "random_seed": 123,
}

# 5-fold cross-validation of the regressor built from `best_params`.
kf = KFold(n_splits=5)
model = CatBoostRegressor(**best_params)

# Densify the sparse TF-IDF matrix and pull out the target array once, up front:
# both are loop-invariant, and converting inside the loop repeated the full
# sparse-to-dense copy three times per fold.
x_train_dense = x_train_vect.toarray()
y_training_values = y_training.values

mae_cross_validation = []           # MAE on each held-out fold
mse_cross_validation = []           # MSE on each held-out fold
mae_training_cross_validation = []  # MAE on the rows the fold's model was fitted on
for train_index, test_index in kf.split(x_train_dense):
    x_fold_train, y_fold_train = x_train_dense[train_index], y_training_values[train_index]
    x_fold_test, y_fold_test = x_train_dense[test_index], y_training_values[test_index]

    # Refit the shared `model` on this fold; the validation pool drives early stopping.
    model.fit(x_fold_train, y_fold_train, early_stopping_rounds=100, eval_set=eval_pool)
    preds = model.predict(x_fold_test)
    preds_training = model.predict(x_fold_train)

    mae = mean_absolute_error(y_fold_test, preds)
    mae_training = mean_absolute_error(y_fold_train, preds_training)
    mae_cross_validation.append(mae)
    mae_training_cross_validation.append(mae_training)
    mse = mean_squared_error(y_fold_test, preds)
    mse_cross_validation.append(mse)

In [None]:
# Per-fold MAE on the held-out rows, then the matching MAE on the rows used for fitting.
print(mae_cross_validation)
print(mae_training_cross_validation)

# Refit the tuned model on the whole training pool (early stopping against the
# validation pool) and score it on the held-out test matrix.
model.fit(train_pool, early_stopping_rounds = 100, eval_set = eval_pool)
preds = model.predict(x_test_vect)
print("MAE:", mean_absolute_error(y_test, preds))
print("MSE:", mean_squared_error(y_test, preds))

In [None]:
# Predicted vs. actual salaries; the red y = x line marks perfect predictions.
sns.scatterplot(x=y_test, y=preds)
sns.lineplot(x=y_test, y=y_test, color="red")
plt.gca().set(xlabel="Actual", ylabel="Predicted", title="Error analysis")
plt.show()

In [None]:
### All in a sklearn pipeline

### Preprocessing steps
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin


class TextPreprocessing(TransformerMixin, BaseEstimator):
    """
    Stateless scikit-learn transformer that turns one raw-text DataFrame column
    into a Series of cleaned, space-joined lemma strings.

    Per document: strip disallowed characters and lower-case, tokenize, drop
    stop words, lemmatize every token as a verb, re-join with single spaces.

    Args:
        text_column: Name of the column in ``X`` that holds the raw text.
        lang: Language name passed to NLTK's stop-word corpus.
    """

    def __init__(self, text_column, lang = "english"):
        self.text_column = text_column
        self.lang = lang

    @staticmethod
    def _clean_text(text : str) -> str:
        r"""
        Remove every character that is not an ASCII letter, a digit, whitespace
        or an apostrophe (straight ' or curly ’), then lower-case the text.

        Args:
            text (str): The input text to clean.
        Returns:
            str: The cleaned text.
        """
        return re.sub(r"[^a-zA-Z0-9\s\n'’]", "", text).lower()

    def fit(self, X, y=None):
        """
        Learn nothing: the preprocessing has no fitted state.

        Returns:
            TextPreprocessing: ``self``, as the scikit-learn estimator API expects
            (previously this returned the transformed data).
        """
        return self

    def transform(self, X, y = None):
        """
        Preprocess the configured text column of ``X``.

        Args:
            X: DataFrame-like object indexable by ``self.text_column``; it is
               read only, never modified.
            y: Ignored; accepted for API compatibility.
        Returns:
            The text column with every document replaced by its space-joined
            lemmas (same index as the input column).
        """
        tokenizer = RegexpTokenizer(r"[A-Za-z]+(?:’[A-Za-z]+)?|\$[\d\.]+|\S+")

        # Stop words plus a curly-apostrophe variant of every entry containing a
        # straight apostrophe; kept in a set for constant-time lookups per token.
        base_stop_words = stopwords.words(self.lang)
        stop_words = set(base_stop_words)
        stop_words.update(word.replace("'", "’") for word in base_stop_words if "'" in word)

        lemmatizer = WordNetLemmatizer()

        def preprocess(text):
            # clean -> tokenize -> drop stop words -> lemmatize as verb -> join
            tokens = tokenizer.tokenize(self._clean_text(text))
            lemmas = [
                lemmatizer.lemmatize(token, pos = "v")
                for token in tokens
                if token not in stop_words
            ]
            return " ".join(lemmas)

        return X[self.text_column].apply(preprocess)

    def fit_transform(self, X, y=None):
        """Fit (a no-op) and return the transformed text column of ``X``."""
        return self.fit(X, y).transform(X)


In [None]:

text_feature = ["Job Description"]
text_transformer = Pipeline(
    steps=[
        ("preprocessor", TextPreprocessing(text_column = "Job Description", lang = "english")),
    ]
)

# The original list also held a one-element ("vectorizer", ) entry. ColumnTransformer
# expects (name, transformer, columns) triples, so that entry has been removed.
# NOTE(review): `preprocessor` is not used by `clf` below — consider deleting it
# together with `text_transformer` / `text_feature` if nothing else needs them.
preprocessor = ColumnTransformer(
    transformers=[
        ("text", text_transformer, text_feature),
    ]
)

# End-to-end model: raw DataFrame -> preprocessed text -> TF-IDF -> CatBoost regressor.
clf = Pipeline(
    steps=[
        ("preprocessor", TextPreprocessing(text_column = "Job Description", lang = "english")),
        ("vectorize", TfidfVectorizer(max_features=1000)),
        ("catboost", CatBoostRegressor(**best_params))
    ]
)

# Split whole rows this time, because the pipeline picks the text column itself.
# NOTE(review): same test_size and random_state as the earlier split, so this
# presumably holds out the same rows — confirm.
train, test = train_test_split(data, test_size=0.2, random_state=123)

clf.fit(train, train["avg_salary"])

In [None]:
# Predict salaries for the held-out rows with the fitted end-to-end pipeline.
preds = clf.predict(test)

# MAE of the pipeline on the held-out rows (shown as the cell's output).
mean_absolute_error(test["avg_salary"], preds)

In [None]:
### Export as pickle

import pickle

# Write the fitted pipeline to disk. The context manager guarantees the file is
# flushed and closed even if pickling fails (the bare open() call never closed it).
# NOTE(review): pickle stores classes by reference; since TextPreprocessing is
# defined in this notebook, the loading application presumably needs the same
# class definition available when unpickling — confirm.
with open("../myapp/predict_salary/model.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)