## Import and Clean Data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
raw_data = pd.read_csv("../data/Salary.csv")
education_levels = ["high school", "bachelor degree", "masters degree", "phd"]
raw_data

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Country,Race,Senior
0,32.0,Male,1,Software Engineer,5.0,90000.0,UK,White,0
1,28.0,Female,2,Data Analyst,3.0,65000.0,USA,Hispanic,0
2,45.0,Male,3,Manager,15.0,150000.0,Canada,White,1
3,36.0,Female,1,Sales Associate,7.0,60000.0,USA,Hispanic,0
4,52.0,Male,2,Director,20.0,200000.0,USA,Asian,0
...,...,...,...,...,...,...,...,...,...
6679,49.0,Female,3,Director of Marketing,20.0,200000.0,UK,Mixed,0
6680,32.0,Male,0,Sales Associate,3.0,50000.0,Australia,Australian,0
6681,30.0,Female,1,Financial Manager,4.0,55000.0,China,Chinese,0
6682,46.0,Male,2,Marketing Manager,14.0,140000.0,China,Korean,0


In [3]:
df = raw_data[["Job Title","Education Level","Years of Experience","Country","Salary"]]
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [4]:
isomapping = {"usa":"us", "china":"cn","australia":"au","uk":"uk","canada":"ca"}
df['country'] = df['country'].str.lower().map(isomapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['country'] = df['country'].str.lower().map(isomapping)


In [5]:
df['job_title']=df['job_title'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['job_title']=df['job_title'].str.lower()


In [6]:
df['education_level']

0       1
1       2
2       3
3       1
4       2
       ..
6679    3
6680    0
6681    1
6682    2
6683    0
Name: education_level, Length: 6684, dtype: int64

In [7]:
sampled_df = df.sample(n=1000, replace=True, random_state=42)


## Setting up Lin Reg Model with word embeddings

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sentence_transformers import SentenceTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


  from .autonotebook import tqdm as notebook_tqdm


In [9]:
features = ["job_title","education_level", "years_of_experience", "country"]

In [10]:
X = sampled_df[features]
y = sampled_df['salary']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [12]:
def evaluate_regression_metrics(y_true, y_pred):
    """
    Evaluate regression metrics: Mean Absolute Error, Mean Squared Error, and R-squared.

    Parameters:
    y_true (array-like): Actual target values.
    y_pred (array-like): Predicted target values.

    Returns:
    dict: A dictionary containing MAE, MSE, and R-squared.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    
    return {
        'Mean Absolute Error': mae,
        'Mean Squared Error': mse,
        'R-squared': r2
    }

In [13]:
class SentenceTransformerEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, model_name="nomic-ai/nomic-embed-text-v1"):
        self.model = SentenceTransformer(model_name, trust_remote_code=True)
        self.model_name = model_name


    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Encode the text data into embeddings
        return self.model.encode(X.tolist())

In [14]:
testencoder = SentenceTransformerEncoder()


<All keys matched successfully>


array([[-1.89172104e-02,  2.17174776e-02, -3.16437855e-02,
        -3.48579846e-02, -4.34047468e-02,  5.42021058e-02,
         4.22584414e-02,  2.57984661e-02,  5.88271068e-03,
        -7.82303512e-03, -3.05450168e-02,  1.99776050e-03,
         3.77475098e-02,  1.61173958e-02, -2.33827275e-04,
        -1.81740951e-02,  4.70037349e-02, -3.03299800e-02,
        -1.85661893e-02, -6.36119172e-02, -9.83864721e-03,
        -2.40593441e-02, -2.94035450e-02, -3.77770923e-02,
         1.27935231e-01,  1.02397474e-02,  2.22618654e-02,
         3.58354300e-02, -2.95041781e-02, -1.24794040e-02,
         4.79243770e-02, -5.13793668e-03, -2.14560772e-03,
        -6.41320087e-03, -2.02097390e-02, -6.76152855e-02,
        -6.37955358e-03,  3.37701151e-03,  1.47272050e-02,
         7.73539916e-02,  3.84754427e-02, -4.89214025e-02,
        -1.45329740e-02, -1.96289234e-02, -1.33265024e-02,
         5.19962311e-02,  1.00266980e-02,  7.87181705e-02,
         3.71829234e-02,  4.18935670e-03, -3.84247340e-0

In [15]:
preprocessor = ColumnTransformer(
    transformers=[
        ('job_title', SentenceTransformerEncoder(), 'job_title'),  # Encode job_title
        ('years_of_experience', StandardScaler(), ['years_of_experience']),  # Scale years_of_experience
        ('country', OneHotEncoder(handle_unknown='ignore'), ['country'])  # One-hot encode country
    ]
)

word_processing_pipeline = Pipeline([
    ('preprocessor', preprocessor),  
    (('linreg'), LinearRegression())
])

<All keys matched successfully>


In [16]:
word_processing_pipeline.fit(X_train, y_train)

<All keys matched successfully>


0,1,2
,steps,"[('preprocessor', ...), ('linreg', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('job_title', ...), ('years_of_experience', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,model_name,'nomic-ai/nomic-embed-text-v1'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [17]:
word_processing_pipeline.score(X_test, y_test)

0.8060153286422945

In [18]:
evaluate_regression_metrics(word_processing_pipeline.predict(X_test), y_test)

{'Mean Absolute Error': 17519.27089207211,
 'Mean Squared Error': 552511509.8639787,
 'R-squared': 0.7880120093660397}

## Exporting the Model

In [19]:
import pickle as pkl

In [20]:
filepath = "../models/word_embedding_model.pkl"
with open(filepath, "wb") as f:
    pkl.dump(word_processing_pipeline, f)