In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import xgboost as xgb

import warnings

# To ignore all warnings
warnings.filterwarnings("ignore")

2023-11-11 18:09:05.534127: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Preprocessing

In [2]:
filepath = '../data/'

In [3]:
df = pd.read_csv('../data/clean_job_postings_w_salary.csv')
df

Unnamed: 0.1,Unnamed: 0,company,job title,location,job description,salary estimate,company_size,company_type,company_sector,company_industry,...,rating,clean_job_title,clean_job_description,programming_languages,skills,seniority,job_category,num_of_skills,num_of_programming_languages,clean_job_description_filtered
0,0,Microsoft,Data & Applied Scientist,"Redmond, WA",Microsoft 365 is a key part of the company’s c...,$123486 /yr (est.),10000+ Employees,Company - Public,Information Technology,Computer Hardware Development,...,4.4,data & applied scientist,microsoft is a key part of the company’s cloud...,"['python', 'r', 'sql']","['analysis', 'analytics', 'clustering', 'compu...",junior,data scientist,36,3,microsoft key part company cloud strategy over...
1,1,UT Southwestern Medical Center,Data Scientist or Bioinformatician (remote),Remote,Center Information:\nThe Quantitative Biomedic...,$93500 /yr (est.),10000+ Employees,Hospital,Healthcare,Health Care Services & Hospitals,...,4.0,data scientist or bioinformatician remote,center information the quantitative biomedical...,"['python', 'perl', 'r']","['analysis', 'bioinformatics', 'biology', 'bio...",junior,data scientist,21,3,center information quantitative biomedical res...
2,2,Notion,"Data Scientist, Growth","New York, NY",About Us:\nWe're on a mission to make it possi...,$137853 /yr (est.),201 to 500 Employees,Company - Private,Information Technology,Enterprise Software & Network Solutions,...,4.9,data scientist growth,about us we're on a mission to make it possibl...,"['python', 'r', 'sql']","['analytics', 'business', 'creative', 'dashboa...",senior,data scientist,15,3,u mission possible person team company able ta...
3,3,Net2Aspire,Jr. Data Scientist,Remote, Apply Statistical and Machine Learning metho...,$72500 /yr (est.),Unknown,Company - Public,,,...,2.0,jr. data scientist, apply statistical and machine learning metho...,[],"['business', 'customer experience', 'dashboard...",junior,data scientist,13,0,apply statistical machine method specific busi...
4,4,Ntropy Network,Data Scientist,Remote,"Over the last few decades, technological innov...",$155000 /yr (est.),1 to 50 Employees,Company - Private,,,...,0.0,data scientist,"over the last few decades, technological innov...","['python', 'go', 'rust', 'hack', 'sql']","['access', 'algorithms', 'api', 'aws', 'comput...",mid,data scientist,20,5,last decade technological innovation key ingre...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,762,CVS Health,"Senior Machine Learning Engineer ( Python , ML...",Connecticut,Analytics & Behavior Change is an innovation e...,$135000 /yr (est.),10000+ Employees,Company - Public,Healthcare,Health Care Services & Hospitals,...,3.1,senior machine learning engineer python ml d...,analytics & behavior change is an innovation e...,"['python', 'r']","['agile', 'algorithms', 'ambitious', 'analytic...",senior,machine learning engineer,34,2,analytics behavior change innovation engine en...
763,763,Morgan Stanley,Machine Learning Researcher,"New York, NY",Machine Learning Researcher\nJob Number:\n3227...,$143796 /yr (est.),10000+ Employees,Company - Public,Financial Services,Investment & Asset Management,...,4.0,machine learning researcher,machine learning researcher job number posting...,"['python', 'java', 'c', 'c++', 'r']","[""bachelor's degree"", 'banking', 'business', '...",senior,machine learning engineer,17,5,machine researcher job number date jan primary...
764,764,MIT Lincoln Laboratory,Machine Learning Software Developer,"Lexington, MA",Laboratory Description\nMIT Lincoln Laboratory...,$117724 /yr (est.),1001 to 5000 Employees,Nonprofit Organization,Aerospace & Defense,Aerospace & Defense,...,4.3,machine learning software developer,"laboratory description mit lincoln laboratory,...","['python', 'java', 'c', 'c++', 'julia', 'reason']","['algorithms', 'analysis', 'applied mathematic...",mid,machine learning engineer,16,6,laboratory description mit lincoln laboratory ...
765,765,Morgan Stanley,Machine Learning Researcher,"New York, NY",Machine Learning Researcher\nJob Number:\n3227...,$143796 /yr (est.),10000+ Employees,Company - Public,Financial Services,Investment & Asset Management,...,4.0,machine learning researcher,machine learning researcher job number posting...,"['python', 'java', 'c', 'c++', 'r']","[""bachelor's degree"", 'banking', 'business', '...",senior,machine learning engineer,17,5,machine researcher job number date jan primary...


In [4]:
# Encoding seniority using an ordinal encoder to capture different levels of employees
df['seniority'] = pd.Categorical(df['seniority'], categories=['junior', 'mid', 'senior'], ordered=True)
ordinal_encoder = OrdinalEncoder(categories=[['junior', 'mid', 'senior']], dtype=int)
df['seniority_encoded'] = ordinal_encoder.fit_transform(df[['seniority']])

df

Unnamed: 0.1,Unnamed: 0,company,job title,location,job description,salary estimate,company_size,company_type,company_sector,company_industry,...,clean_job_title,clean_job_description,programming_languages,skills,seniority,job_category,num_of_skills,num_of_programming_languages,clean_job_description_filtered,seniority_encoded
0,0,Microsoft,Data & Applied Scientist,"Redmond, WA",Microsoft 365 is a key part of the company’s c...,$123486 /yr (est.),10000+ Employees,Company - Public,Information Technology,Computer Hardware Development,...,data & applied scientist,microsoft is a key part of the company’s cloud...,"['python', 'r', 'sql']","['analysis', 'analytics', 'clustering', 'compu...",junior,data scientist,36,3,microsoft key part company cloud strategy over...,0
1,1,UT Southwestern Medical Center,Data Scientist or Bioinformatician (remote),Remote,Center Information:\nThe Quantitative Biomedic...,$93500 /yr (est.),10000+ Employees,Hospital,Healthcare,Health Care Services & Hospitals,...,data scientist or bioinformatician remote,center information the quantitative biomedical...,"['python', 'perl', 'r']","['analysis', 'bioinformatics', 'biology', 'bio...",junior,data scientist,21,3,center information quantitative biomedical res...,0
2,2,Notion,"Data Scientist, Growth","New York, NY",About Us:\nWe're on a mission to make it possi...,$137853 /yr (est.),201 to 500 Employees,Company - Private,Information Technology,Enterprise Software & Network Solutions,...,data scientist growth,about us we're on a mission to make it possibl...,"['python', 'r', 'sql']","['analytics', 'business', 'creative', 'dashboa...",senior,data scientist,15,3,u mission possible person team company able ta...,2
3,3,Net2Aspire,Jr. Data Scientist,Remote, Apply Statistical and Machine Learning metho...,$72500 /yr (est.),Unknown,Company - Public,,,...,jr. data scientist, apply statistical and machine learning metho...,[],"['business', 'customer experience', 'dashboard...",junior,data scientist,13,0,apply statistical machine method specific busi...,0
4,4,Ntropy Network,Data Scientist,Remote,"Over the last few decades, technological innov...",$155000 /yr (est.),1 to 50 Employees,Company - Private,,,...,data scientist,"over the last few decades, technological innov...","['python', 'go', 'rust', 'hack', 'sql']","['access', 'algorithms', 'api', 'aws', 'comput...",mid,data scientist,20,5,last decade technological innovation key ingre...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,762,CVS Health,"Senior Machine Learning Engineer ( Python , ML...",Connecticut,Analytics & Behavior Change is an innovation e...,$135000 /yr (est.),10000+ Employees,Company - Public,Healthcare,Health Care Services & Hospitals,...,senior machine learning engineer python ml d...,analytics & behavior change is an innovation e...,"['python', 'r']","['agile', 'algorithms', 'ambitious', 'analytic...",senior,machine learning engineer,34,2,analytics behavior change innovation engine en...,2
763,763,Morgan Stanley,Machine Learning Researcher,"New York, NY",Machine Learning Researcher\nJob Number:\n3227...,$143796 /yr (est.),10000+ Employees,Company - Public,Financial Services,Investment & Asset Management,...,machine learning researcher,machine learning researcher job number posting...,"['python', 'java', 'c', 'c++', 'r']","[""bachelor's degree"", 'banking', 'business', '...",senior,machine learning engineer,17,5,machine researcher job number date jan primary...,2
764,764,MIT Lincoln Laboratory,Machine Learning Software Developer,"Lexington, MA",Laboratory Description\nMIT Lincoln Laboratory...,$117724 /yr (est.),1001 to 5000 Employees,Nonprofit Organization,Aerospace & Defense,Aerospace & Defense,...,machine learning software developer,"laboratory description mit lincoln laboratory,...","['python', 'java', 'c', 'c++', 'julia', 'reason']","['algorithms', 'analysis', 'applied mathematic...",mid,machine learning engineer,16,6,laboratory description mit lincoln laboratory ...,1
765,765,Morgan Stanley,Machine Learning Researcher,"New York, NY",Machine Learning Researcher\nJob Number:\n3227...,$143796 /yr (est.),10000+ Employees,Company - Public,Financial Services,Investment & Asset Management,...,machine learning researcher,machine learning researcher job number posting...,"['python', 'java', 'c', 'c++', 'r']","[""bachelor's degree"", 'banking', 'business', '...",senior,machine learning engineer,17,5,machine researcher job number date jan primary...,2


In [5]:

with open(filepath +'lang.pkl', 'rb') as f:
    lang = pickle.load(f)
prog_lang = lang.values.flatten().tolist()

with open(filepath +'skills.pkl', 'rb') as f:
    skills = pickle.load(f)
skills = skills.values.flatten().tolist()

In [6]:
for lang in prog_lang:
    df[lang] = df['programming_languages'].apply(lambda x: 1 if lang in x else 0)

In [7]:
for skill in skills:
    df[skill] = df['skills'].apply(lambda x: 1 if skill in x else 0)
df

Unnamed: 0.1,Unnamed: 0,company,job title,location,job description,salary estimate,company_size,company_type,company_sector,company_industry,...,xhtml,xilinx ise,xml publisher,xsl,yarn,yii,zero inflated models,zk,zoom,zycus
0,0,Microsoft,Data & Applied Scientist,"Redmond, WA",Microsoft 365 is a key part of the company’s c...,$123486 /yr (est.),10000+ Employees,Company - Public,Information Technology,Computer Hardware Development,...,0,0,0,0,0,0,0,0,0,0
1,1,UT Southwestern Medical Center,Data Scientist or Bioinformatician (remote),Remote,Center Information:\nThe Quantitative Biomedic...,$93500 /yr (est.),10000+ Employees,Hospital,Healthcare,Health Care Services & Hospitals,...,0,0,0,0,0,0,0,0,0,0
2,2,Notion,"Data Scientist, Growth","New York, NY",About Us:\nWe're on a mission to make it possi...,$137853 /yr (est.),201 to 500 Employees,Company - Private,Information Technology,Enterprise Software & Network Solutions,...,0,0,0,0,0,0,0,0,0,0
3,3,Net2Aspire,Jr. Data Scientist,Remote, Apply Statistical and Machine Learning metho...,$72500 /yr (est.),Unknown,Company - Public,,,...,0,0,0,0,0,0,0,0,0,0
4,4,Ntropy Network,Data Scientist,Remote,"Over the last few decades, technological innov...",$155000 /yr (est.),1 to 50 Employees,Company - Private,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,762,CVS Health,"Senior Machine Learning Engineer ( Python , ML...",Connecticut,Analytics & Behavior Change is an innovation e...,$135000 /yr (est.),10000+ Employees,Company - Public,Healthcare,Health Care Services & Hospitals,...,0,0,0,0,0,0,0,0,0,0
763,763,Morgan Stanley,Machine Learning Researcher,"New York, NY",Machine Learning Researcher\nJob Number:\n3227...,$143796 /yr (est.),10000+ Employees,Company - Public,Financial Services,Investment & Asset Management,...,0,0,0,0,0,0,0,0,0,0
764,764,MIT Lincoln Laboratory,Machine Learning Software Developer,"Lexington, MA",Laboratory Description\nMIT Lincoln Laboratory...,$117724 /yr (est.),1001 to 5000 Employees,Nonprofit Organization,Aerospace & Defense,Aerospace & Defense,...,0,0,0,0,0,0,0,0,0,0
765,765,Morgan Stanley,Machine Learning Researcher,"New York, NY",Machine Learning Researcher\nJob Number:\n3227...,$143796 /yr (est.),10000+ Employees,Company - Public,Financial Services,Investment & Asset Management,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Encoding company_revenue using an ordinal encoder to capture revenue
default_revenue = 'Unknown / Non-Applicable'
df['company_revenue'].fillna(default_revenue, inplace=True)

revenue_list_reordered = [
    'Less than $1 million (USD)',
    '$1 to $5 million (USD)',
    '$5 to $25 million (USD)',
    '$25 to $100 million (USD)',
    '$100 to $500 million (USD)',
    '$500 million to $1 billion (USD)',
    '$1 to $5 billion (USD)',
    '$5 to $10 billion (USD)',
    '$10+ billion (USD)',
    'Unknown / Non-Applicable'
]

df['company_revenue'] = pd.Categorical(df['company_revenue'], categories=revenue_list_reordered, ordered=True)
ordinal_encoder = OrdinalEncoder(categories=[revenue_list_reordered], dtype=int)
df['company_revenue_encoded'] = ordinal_encoder.fit_transform(df[['company_revenue']])

In [9]:
df['company_size'].unique().tolist()

['10000+ Employees',
 '201 to 500 Employees',
 'Unknown',
 '1 to 50 Employees',
 '1001 to 5000 Employees',
 '501 to 1000 Employees',
 nan,
 '5001 to 10000 Employees',
 '51 to 200 Employees']

In [10]:
# Encoding company_employees using an ordinal encoder to capture revenue
default_size = 'Unknown'
df['company_size'].fillna(default_size, inplace=True)

size_list_reordered = [
    '1 to 50 Employees',
    '51 to 200 Employees',
    '201 to 500 Employees',
    '501 to 1000 Employees',
    '1001 to 5000 Employees',
    '5001 to 10000 Employees',
    '10000+ Employees',
    'Unknown'
]

df['company_size'] = pd.Categorical(df['company_size'], categories=size_list_reordered, ordered=True)
ordinal_encoder = OrdinalEncoder(categories=[size_list_reordered], dtype=int)
df['company_size_encoded'] = ordinal_encoder.fit_transform(df[['company_size']])

In [11]:
df_encoded = pd.get_dummies(df, columns=['location', 'company_type', 'job_category', 'company_sector', 'company_industry'])
df_encoded

Unnamed: 0.1,Unnamed: 0,company,job title,job description,salary estimate,company_size,company_founded,company_revenue,salary,rating,...,company_industry_Sporting Goods Stores,company_industry_Sports & Recreation,company_industry_Staffing & Subcontracting,company_industry_State & Regional Agencies,company_industry_Stock Exchanges,company_industry_Taxi & Car Services,company_industry_Telecommunications Services,company_industry_Transportation Equipment Manufacturing,company_industry_Video Game Publishing,company_industry_Wholesale
0,0,Microsoft,Data & Applied Scientist,Microsoft 365 is a key part of the company’s c...,$123486 /yr (est.),10000+ Employees,1975.0,$10+ billion (USD),123486.0,4.4,...,0,0,0,0,0,0,0,0,0,0
1,1,UT Southwestern Medical Center,Data Scientist or Bioinformatician (remote),Center Information:\nThe Quantitative Biomedic...,$93500 /yr (est.),10000+ Employees,1943.0,$1 to $5 billion (USD),93500.0,4.0,...,0,0,0,0,0,0,0,0,0,0
2,2,Notion,"Data Scientist, Growth",About Us:\nWe're on a mission to make it possi...,$137853 /yr (est.),201 to 500 Employees,2016.0,Unknown / Non-Applicable,137853.0,4.9,...,0,0,0,0,0,0,0,0,0,0
3,3,Net2Aspire,Jr. Data Scientist, Apply Statistical and Machine Learning metho...,$72500 /yr (est.),Unknown,,Unknown / Non-Applicable,72500.0,2.0,...,0,0,0,0,0,0,0,0,0,0
4,4,Ntropy Network,Data Scientist,"Over the last few decades, technological innov...",$155000 /yr (est.),1 to 50 Employees,,Unknown / Non-Applicable,155000.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,762,CVS Health,"Senior Machine Learning Engineer ( Python , ML...",Analytics & Behavior Change is an innovation e...,$135000 /yr (est.),10000+ Employees,1963.0,$10+ billion (USD),135000.0,3.1,...,0,0,0,0,0,0,0,0,0,0
763,763,Morgan Stanley,Machine Learning Researcher,Machine Learning Researcher\nJob Number:\n3227...,$143796 /yr (est.),10000+ Employees,1935.0,$10+ billion (USD),143796.0,4.0,...,0,0,0,0,0,0,0,0,0,0
764,764,MIT Lincoln Laboratory,Machine Learning Software Developer,Laboratory Description\nMIT Lincoln Laboratory...,$117724 /yr (est.),1001 to 5000 Employees,1951.0,Unknown / Non-Applicable,117724.0,4.3,...,0,0,0,0,0,0,0,0,0,0
765,765,Morgan Stanley,Machine Learning Researcher,Machine Learning Researcher\nJob Number:\n3227...,$143796 /yr (est.),10000+ Employees,1935.0,$10+ billion (USD),143796.0,4.0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
df_encoded.drop(columns=['Unnamed: 0',
 'company',
 'job title',
 'job description',
 'salary estimate',
 'company_size',
 'company_founded',
 'company_revenue',
 'rating',
 'clean_job_title',
 'clean_job_description',
 'programming_languages',
 'skills',
 'seniority',
 'clean_job_description_filtered',], axis=1, inplace=True)


In [13]:
df_encoded.to_csv(filepath + "Encoded_data.csv")

In [14]:
X = df_encoded.drop('salary', axis =1)
y = df_encoded['salary']

In [15]:
# First, split into train and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

# Then, split the temporary set into test and validation sets
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Print the shapes of the resulting sets
print("Train set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)
print("Validation set shape:", X_val.shape, y_val.shape)

Train set shape: (613, 2817) (613,)
Test set shape: (77, 2817) (77,)
Validation set shape: (77, 2817) (77,)


## Models

### Model 1: Linear regression

In [16]:
model = LinearRegression()
model.fit(X_train, y_train)


# Train the model
model.fit(X_train, y_train)

y_pred_in_sample = model.predict(X_train)

# Evaluate the model
mse_in_sample = mean_squared_error(y_train, y_pred_in_sample)
print(f'In sample MSE: {mse_in_sample}')

r2_in_sample = r2_score(y_train, y_pred_in_sample)
print(f'In sample R2: {r2_in_sample}')

# Make predictions on the test set
y_pred_oos = model.predict(X_test)

# Evaluate the model
mse_oos = mean_squared_error(y_test, y_pred_oos)
print(f'Out of sample MSE: {mse_oos}')

r2_oos = r2_score(y_test, y_pred_oos)
print(f'Out of sample R2: {r2_oos}')


In sample MSE: 2994934.2370251585
In sample R2: 0.9979859642825738
Out of sample MSE: 1.0570225221645438e+28
Out of sample R2: -5.340197189458185e+18


### Model 2: CART

In [17]:
# Create a decision tree regressor
model = DecisionTreeRegressor(random_state=42)

# Train the model
model.fit(X_train, y_train)

y_pred_in_sample = model.predict(X_train)

# Evaluate the model
mse_in_sample = mean_squared_error(y_train, y_pred_in_sample)
print(f'In sample MSE: {mse_in_sample}')

r2_in_sample = r2_score(y_train, y_pred_in_sample)
print(f'In sample R2: {r2_in_sample}')

# Make predictions on the test set
y_pred_oos = model.predict(X_test)

# Evaluate the model
mse_oos = mean_squared_error(y_test, y_pred_oos)
print(f'Out of sample MSE: {mse_oos}')

r2_oos = r2_score(y_test, y_pred_oos)
print(f'Out of sample R2: {r2_oos}')

In sample MSE: 2994928.2797172377
In sample R2: 0.9979859682887489
Out of sample MSE: 1776818437.4187014
Out of sample R2: 0.10233125342964311


### Model 3: Random Forest

In [18]:
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

y_pred_in_sample = model.predict(X_train)

# Evaluate the model
mse_in_sample = mean_squared_error(y_train, y_pred_in_sample)
print(f'In sample MSE: {mse_in_sample}')

r2_in_sample = r2_score(y_train, y_pred_in_sample)
print(f'In sample R2: {r2_in_sample}')

# Make predictions on the test set
y_pred_oos = model.predict(X_test)

# Evaluate the model
mse_oos = mean_squared_error(y_test, y_pred_oos)
print(f'Out of sample MSE: {mse_oos}')

r2_oos = r2_score(y_test, y_pred_oos)
print(f'Out of sample R2: {r2_oos}')


In sample MSE: 126213420.72377051
In sample R2: 0.9151239001465656
Out of sample MSE: 1446813959.8386157
Out of sample R2: 0.26905324342783354


### Model 4: Support Vector Regressor


In [19]:
# Standardize features (important for SVR)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create a support vector regression model
model = SVR(kernel='linear', C=1.0)

# Train the model
model.fit(X_train_scaled, y_train)

y_pred_in_sample = model.predict(X_train_scaled)

# Evaluate the model
mse_in_sample = mean_squared_error(y_train, y_pred_in_sample)
print(f'In sample MSE: {mse_in_sample}')

r2_in_sample = r2_score(y_train, y_pred_in_sample)
print(f'In sample R2: {r2_in_sample}')

# Make predictions on the test set
y_pred_oos = model.predict(X_test_scaled)

# Evaluate the model
mse_oos = mean_squared_error(y_test, y_pred_oos)
print(f'Out of sample MSE: {mse_oos}')

r2_oos = r2_score(y_test, y_pred_oos)
print(f'Out of sample R2: {r2_oos}')

In sample MSE: 1336845813.0284731
In sample R2: 0.10099688238716265
Out of sample MSE: 1993892564.7696807
Out of sample R2: -0.007337047905125793


### Model 5: Neural Network


In [20]:
# Build a simple neural network using Keras
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1)  # Output layer for regression task
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.2)

y_pred_in_sample = model.predict(X_train_scaled)

# Evaluate the model
mse_in_sample = mean_squared_error(y_train, y_pred_in_sample)
print(f'In sample MSE: {mse_in_sample}')

r2_in_sample = r2_score(y_train, y_pred_in_sample)
print(f'In sample R2: {r2_in_sample}')

# Make predictions on the test set
y_pred_oos = model.predict(X_test_scaled)

# Evaluate the model
mse_oos = mean_squared_error(y_test, y_pred_oos)
print(f'Out of sample MSE: {mse_oos}')

r2_oos = r2_score(y_test, y_pred_oos)
print(f'Out of sample R2: {r2_oos}')


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
In sample MSE: 2944025839.611091
In sample R2: -0.9798007985284021
Out of sample MSE: 6765964965.124168
Out of sample R2: -2.4182419327016125


### Model 6: XGBoost

In [21]:
# Create an XGBoost regressor
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Train the model
model.fit(X_train_scaled, y_train)

y_pred_in_sample = model.predict(X_train_scaled)

# Evaluate the model
mse_in_sample = mean_squared_error(y_train, y_pred_in_sample)
print(f'In sample MSE: {mse_in_sample}')

r2_in_sample = r2_score(y_train, y_pred_in_sample)
print(f'In sample R2: {r2_in_sample}')

# Make predictions on the test set
y_pred_oos = model.predict(X_test_scaled)

# Evaluate the model
mse_oos = mean_squared_error(y_test, y_pred_oos)
print(f'Out of sample MSE: {mse_oos}')

r2_oos = r2_score(y_test, y_pred_oos)
print(f'Out of sample R2: {r2_oos}')


In sample MSE: 22494522.295501504
In sample R2: 0.984872866058461
Out of sample MSE: 1373308771.5436008
Out of sample R2: 0.3061888949123275
