In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical




import warnings

# To ignore all warnings
warnings.filterwarnings("ignore")

2023-12-07 22:21:06.485810: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Preprocessing

In [4]:
filepath = '../data/'

In [5]:
df = pd.read_csv('../data/clean_job_postings_w_salary.csv').drop("Unnamed: 0", axis=1)
df

Unnamed: 0,company,job title,location,job description,salary estimate,company_size,company_type,company_sector,company_industry,company_founded,...,rating,clean_job_title,clean_job_description,programming_languages,skills,seniority,job_category,num_of_skills,num_of_programming_languages,clean_job_description_filtered
0,Microsoft,Data & Applied Scientist,"Redmond, WA",Microsoft 365 is a key part of the company’s c...,$123486 /yr (est.),10000+ Employees,Company - Public,Information Technology,Computer Hardware Development,1975.0,...,4.4,data & applied scientist,microsoft is a key part of the company’s cloud...,"['python', 'r', 'sql']","['analysis', 'analytics', 'clustering', 'compu...",junior,data scientist,36,3,microsoft key part company cloud strategy over...
1,UT Southwestern Medical Center,Data Scientist or Bioinformatician (remote),Remote,Center Information:\nThe Quantitative Biomedic...,$93500 /yr (est.),10000+ Employees,Hospital,Healthcare,Health Care Services & Hospitals,1943.0,...,4.0,data scientist or bioinformatician remote,center information the quantitative biomedical...,"['python', 'perl', 'r']","['analysis', 'bioinformatics', 'biology', 'bio...",junior,data scientist,21,3,center information quantitative biomedical res...
2,Notion,"Data Scientist, Growth","New York, NY",About Us:\nWe're on a mission to make it possi...,$137853 /yr (est.),201 to 500 Employees,Company - Private,Information Technology,Enterprise Software & Network Solutions,2016.0,...,4.9,data scientist growth,about us we're on a mission to make it possibl...,"['python', 'r', 'sql']","['analytics', 'business', 'creative', 'dashboa...",senior,data scientist,15,3,u mission possible person team company able ta...
3,Net2Aspire,Jr. Data Scientist,Remote, Apply Statistical and Machine Learning metho...,$72500 /yr (est.),Unknown,Company - Public,,,,...,2.0,jr. data scientist, apply statistical and machine learning metho...,[],"['business', 'customer experience', 'dashboard...",junior,data scientist,13,0,apply statistical machine method specific busi...
4,Ntropy Network,Data Scientist,Remote,"Over the last few decades, technological innov...",$155000 /yr (est.),1 to 50 Employees,Company - Private,,,,...,0.0,data scientist,"over the last few decades, technological innov...","['python', 'go', 'rust', 'hack', 'sql']","['access', 'algorithms', 'api', 'aws', 'comput...",mid,data scientist,20,5,last decade technological innovation key ingre...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,CVS Health,"Senior Machine Learning Engineer ( Python , ML...",Connecticut,Analytics & Behavior Change is an innovation e...,$135000 /yr (est.),10000+ Employees,Company - Public,Healthcare,Health Care Services & Hospitals,1963.0,...,3.1,senior machine learning engineer python ml d...,analytics & behavior change is an innovation e...,"['python', 'r']","['agile', 'algorithms', 'ambitious', 'analytic...",senior,machine learning engineer,34,2,analytics behavior change innovation engine en...
763,Morgan Stanley,Machine Learning Researcher,"New York, NY",Machine Learning Researcher\nJob Number:\n3227...,$143796 /yr (est.),10000+ Employees,Company - Public,Financial Services,Investment & Asset Management,1935.0,...,4.0,machine learning researcher,machine learning researcher job number posting...,"['python', 'java', 'c', 'c++', 'r']","[""bachelor's degree"", 'banking', 'business', '...",senior,machine learning engineer,17,5,machine researcher job number date jan primary...
764,MIT Lincoln Laboratory,Machine Learning Software Developer,"Lexington, MA",Laboratory Description\nMIT Lincoln Laboratory...,$117724 /yr (est.),1001 to 5000 Employees,Nonprofit Organization,Aerospace & Defense,Aerospace & Defense,1951.0,...,4.3,machine learning software developer,"laboratory description mit lincoln laboratory,...","['python', 'java', 'c', 'c++', 'julia', 'reason']","['algorithms', 'analysis', 'applied mathematic...",mid,machine learning engineer,16,6,laboratory description mit lincoln laboratory ...
765,Morgan Stanley,Machine Learning Researcher,"New York, NY",Machine Learning Researcher\nJob Number:\n3227...,$143796 /yr (est.),10000+ Employees,Company - Public,Financial Services,Investment & Asset Management,1935.0,...,4.0,machine learning researcher,machine learning researcher job number posting...,"['python', 'java', 'c', 'c++', 'r']","[""bachelor's degree"", 'banking', 'business', '...",senior,machine learning engineer,17,5,machine researcher job number date jan primary...


In [6]:
# Encoding seniority using an ordinal encoder to capture different levels of employees
df['seniority'] = pd.Categorical(df['seniority'], categories=['junior', 'mid', 'senior'], ordered=True)
ordinal_encoder = OrdinalEncoder(categories=[['junior', 'mid', 'senior']], dtype=int)
df['seniority_encoded'] = ordinal_encoder.fit_transform(df[['seniority']])

df

Unnamed: 0,company,job title,location,job description,salary estimate,company_size,company_type,company_sector,company_industry,company_founded,...,clean_job_title,clean_job_description,programming_languages,skills,seniority,job_category,num_of_skills,num_of_programming_languages,clean_job_description_filtered,seniority_encoded
0,Microsoft,Data & Applied Scientist,"Redmond, WA",Microsoft 365 is a key part of the company’s c...,$123486 /yr (est.),10000+ Employees,Company - Public,Information Technology,Computer Hardware Development,1975.0,...,data & applied scientist,microsoft is a key part of the company’s cloud...,"['python', 'r', 'sql']","['analysis', 'analytics', 'clustering', 'compu...",junior,data scientist,36,3,microsoft key part company cloud strategy over...,0
1,UT Southwestern Medical Center,Data Scientist or Bioinformatician (remote),Remote,Center Information:\nThe Quantitative Biomedic...,$93500 /yr (est.),10000+ Employees,Hospital,Healthcare,Health Care Services & Hospitals,1943.0,...,data scientist or bioinformatician remote,center information the quantitative biomedical...,"['python', 'perl', 'r']","['analysis', 'bioinformatics', 'biology', 'bio...",junior,data scientist,21,3,center information quantitative biomedical res...,0
2,Notion,"Data Scientist, Growth","New York, NY",About Us:\nWe're on a mission to make it possi...,$137853 /yr (est.),201 to 500 Employees,Company - Private,Information Technology,Enterprise Software & Network Solutions,2016.0,...,data scientist growth,about us we're on a mission to make it possibl...,"['python', 'r', 'sql']","['analytics', 'business', 'creative', 'dashboa...",senior,data scientist,15,3,u mission possible person team company able ta...,2
3,Net2Aspire,Jr. Data Scientist,Remote, Apply Statistical and Machine Learning metho...,$72500 /yr (est.),Unknown,Company - Public,,,,...,jr. data scientist, apply statistical and machine learning metho...,[],"['business', 'customer experience', 'dashboard...",junior,data scientist,13,0,apply statistical machine method specific busi...,0
4,Ntropy Network,Data Scientist,Remote,"Over the last few decades, technological innov...",$155000 /yr (est.),1 to 50 Employees,Company - Private,,,,...,data scientist,"over the last few decades, technological innov...","['python', 'go', 'rust', 'hack', 'sql']","['access', 'algorithms', 'api', 'aws', 'comput...",mid,data scientist,20,5,last decade technological innovation key ingre...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,CVS Health,"Senior Machine Learning Engineer ( Python , ML...",Connecticut,Analytics & Behavior Change is an innovation e...,$135000 /yr (est.),10000+ Employees,Company - Public,Healthcare,Health Care Services & Hospitals,1963.0,...,senior machine learning engineer python ml d...,analytics & behavior change is an innovation e...,"['python', 'r']","['agile', 'algorithms', 'ambitious', 'analytic...",senior,machine learning engineer,34,2,analytics behavior change innovation engine en...,2
763,Morgan Stanley,Machine Learning Researcher,"New York, NY",Machine Learning Researcher\nJob Number:\n3227...,$143796 /yr (est.),10000+ Employees,Company - Public,Financial Services,Investment & Asset Management,1935.0,...,machine learning researcher,machine learning researcher job number posting...,"['python', 'java', 'c', 'c++', 'r']","[""bachelor's degree"", 'banking', 'business', '...",senior,machine learning engineer,17,5,machine researcher job number date jan primary...,2
764,MIT Lincoln Laboratory,Machine Learning Software Developer,"Lexington, MA",Laboratory Description\nMIT Lincoln Laboratory...,$117724 /yr (est.),1001 to 5000 Employees,Nonprofit Organization,Aerospace & Defense,Aerospace & Defense,1951.0,...,machine learning software developer,"laboratory description mit lincoln laboratory,...","['python', 'java', 'c', 'c++', 'julia', 'reason']","['algorithms', 'analysis', 'applied mathematic...",mid,machine learning engineer,16,6,laboratory description mit lincoln laboratory ...,1
765,Morgan Stanley,Machine Learning Researcher,"New York, NY",Machine Learning Researcher\nJob Number:\n3227...,$143796 /yr (est.),10000+ Employees,Company - Public,Financial Services,Investment & Asset Management,1935.0,...,machine learning researcher,machine learning researcher job number posting...,"['python', 'java', 'c', 'c++', 'r']","[""bachelor's degree"", 'banking', 'business', '...",senior,machine learning engineer,17,5,machine researcher job number date jan primary...,2


In [7]:

with open(filepath +'lang.pkl', 'rb') as f:
    lang = pickle.load(f)
prog_lang = lang.values.flatten().tolist()

with open(filepath +'skills.pkl', 'rb') as f:
    skills = pickle.load(f)
skills = skills.values.flatten().tolist()

In [8]:
prog_lang_counts = {}

for lang in prog_lang:
    prog_lang_counts[lang] = sum(df['programming_languages'].apply(lambda x: 1 if lang in x else 0))

for lang in prog_lang:
    if prog_lang_counts[lang] < 20:
        del prog_lang_counts[lang]

prog_lang_counts

{'javascript': 31,
 'shell': 22,
 'python': 517,
 'java': 131,
 'c': 189,
 'c++': 25,
 'go': 32,
 'scala': 54,
 'r': 351,
 'matlab': 40,
 'sas': 45,
 'sql': 396,
 'q': 399,
 'j': 151,
 'tla': 40,
 'v': 154,
 'reason': 37,
 'al': 62,
 'org': 25}

In [9]:
for lang in prog_lang_counts.keys():
    df[lang] = df['programming_languages'].apply(lambda x: 1 if lang in x else 0)
df

Unnamed: 0,company,job title,location,job description,salary estimate,company_size,company_type,company_sector,company_industry,company_founded,...,matlab,sas,sql,q,j,tla,v,reason,al,org
0,Microsoft,Data & Applied Scientist,"Redmond, WA",Microsoft 365 is a key part of the company’s c...,$123486 /yr (est.),10000+ Employees,Company - Public,Information Technology,Computer Hardware Development,1975.0,...,0,0,1,1,0,0,0,0,0,0
1,UT Southwestern Medical Center,Data Scientist or Bioinformatician (remote),Remote,Center Information:\nThe Quantitative Biomedic...,$93500 /yr (est.),10000+ Employees,Hospital,Healthcare,Health Care Services & Hospitals,1943.0,...,0,0,0,0,0,0,0,0,0,0
2,Notion,"Data Scientist, Growth","New York, NY",About Us:\nWe're on a mission to make it possi...,$137853 /yr (est.),201 to 500 Employees,Company - Private,Information Technology,Enterprise Software & Network Solutions,2016.0,...,0,0,1,1,0,0,0,0,0,0
3,Net2Aspire,Jr. Data Scientist,Remote, Apply Statistical and Machine Learning metho...,$72500 /yr (est.),Unknown,Company - Public,,,,...,0,0,0,0,0,0,0,0,0,0
4,Ntropy Network,Data Scientist,Remote,"Over the last few decades, technological innov...",$155000 /yr (est.),1 to 50 Employees,Company - Private,,,,...,0,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,CVS Health,"Senior Machine Learning Engineer ( Python , ML...",Connecticut,Analytics & Behavior Change is an innovation e...,$135000 /yr (est.),10000+ Employees,Company - Public,Healthcare,Health Care Services & Hospitals,1963.0,...,0,0,0,0,0,0,0,0,0,0
763,Morgan Stanley,Machine Learning Researcher,"New York, NY",Machine Learning Researcher\nJob Number:\n3227...,$143796 /yr (est.),10000+ Employees,Company - Public,Financial Services,Investment & Asset Management,1935.0,...,0,0,0,0,1,0,1,0,0,0
764,MIT Lincoln Laboratory,Machine Learning Software Developer,"Lexington, MA",Laboratory Description\nMIT Lincoln Laboratory...,$117724 /yr (est.),1001 to 5000 Employees,Nonprofit Organization,Aerospace & Defense,Aerospace & Defense,1951.0,...,0,0,0,0,1,0,1,1,0,0
765,Morgan Stanley,Machine Learning Researcher,"New York, NY",Machine Learning Researcher\nJob Number:\n3227...,$143796 /yr (est.),10000+ Employees,Company - Public,Financial Services,Investment & Asset Management,1935.0,...,0,0,0,0,1,0,1,0,0,0


In [10]:
skill_counts = {}

for skill in skills:
    skill_counts[skill] = sum(df['skills'].apply(lambda x: 1 if skill in x else 0))

skill_counts
for skill in skills:
    if skill_counts[skill] < 100:
        del skill_counts[skill]

len(skill_counts.keys())


65

In [117]:
with open("skills_final.pkl", "wb") as fd:
    pickle.dump(list(skill_counts.keys()), fd)

with open("prog_lang.pkl", "wb") as fd:
    pickle.dump(list(prog_lang_counts.keys()), fd)

In [11]:
for skill in skill_counts.keys():
    df[skill] = df['skills'].apply(lambda x: 1 if skill in x else 0)
df

Unnamed: 0,company,job title,location,job description,salary estimate,company_size,company_type,company_sector,company_industry,company_founded,...,search,software,spark,statistics,strategy,tableau,tensorflow,testing,training,visualization
0,Microsoft,Data & Applied Scientist,"Redmond, WA",Microsoft 365 is a key part of the company’s c...,$123486 /yr (est.),10000+ Employees,Company - Public,Information Technology,Computer Hardware Development,1975.0,...,1,0,0,1,1,0,0,1,0,1
1,UT Southwestern Medical Center,Data Scientist or Bioinformatician (remote),Remote,Center Information:\nThe Quantitative Biomedic...,$93500 /yr (est.),10000+ Employees,Hospital,Healthcare,Health Care Services & Hospitals,1943.0,...,1,0,0,1,0,0,0,0,1,0
2,Notion,"Data Scientist, Growth","New York, NY",About Us:\nWe're on a mission to make it possi...,$137853 /yr (est.),201 to 500 Employees,Company - Private,Information Technology,Enterprise Software & Network Solutions,2016.0,...,0,1,0,0,0,0,0,0,0,0
3,Net2Aspire,Jr. Data Scientist,Remote, Apply Statistical and Machine Learning metho...,$72500 /yr (est.),Unknown,Company - Public,,,,...,0,1,0,0,0,1,0,0,0,1
4,Ntropy Network,Data Scientist,Remote,"Over the last few decades, technological innov...",$155000 /yr (est.),1 to 50 Employees,Company - Private,,,,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,CVS Health,"Senior Machine Learning Engineer ( Python , ML...",Connecticut,Analytics & Behavior Change is an innovation e...,$135000 /yr (est.),10000+ Employees,Company - Public,Healthcare,Health Care Services & Hospitals,1963.0,...,0,1,0,0,0,0,0,0,0,0
763,Morgan Stanley,Machine Learning Researcher,"New York, NY",Machine Learning Researcher\nJob Number:\n3227...,$143796 /yr (est.),10000+ Employees,Company - Public,Financial Services,Investment & Asset Management,1935.0,...,1,1,0,0,1,0,0,0,0,0
764,MIT Lincoln Laboratory,Machine Learning Software Developer,"Lexington, MA",Laboratory Description\nMIT Lincoln Laboratory...,$117724 /yr (est.),1001 to 5000 Employees,Nonprofit Organization,Aerospace & Defense,Aerospace & Defense,1951.0,...,1,0,0,0,0,0,0,0,0,0
765,Morgan Stanley,Machine Learning Researcher,"New York, NY",Machine Learning Researcher\nJob Number:\n3227...,$143796 /yr (est.),10000+ Employees,Company - Public,Financial Services,Investment & Asset Management,1935.0,...,1,1,0,0,1,0,0,0,0,0


In [12]:
# Encoding company_revenue using an ordinal encoder to capture revenue
default_revenue = 'Unknown / Non-Applicable'
df['company_revenue'].fillna(default_revenue, inplace=True)

revenue_list_reordered = [
    'Less than $1 million (USD)',
    '$1 to $5 million (USD)',
    '$5 to $25 million (USD)',
    '$25 to $100 million (USD)',
    '$100 to $500 million (USD)',
    '$500 million to $1 billion (USD)',
    '$1 to $5 billion (USD)',
    '$5 to $10 billion (USD)',
    '$10+ billion (USD)',
    'Unknown / Non-Applicable'
]

df['company_revenue'] = pd.Categorical(df['company_revenue'], categories=revenue_list_reordered, ordered=True)
ordinal_encoder = OrdinalEncoder(categories=[revenue_list_reordered], dtype=int)
df['company_revenue_encoded'] = ordinal_encoder.fit_transform(df[['company_revenue']])

In [13]:
df['company_size'].unique().tolist()

['10000+ Employees',
 '201 to 500 Employees',
 'Unknown',
 '1 to 50 Employees',
 '1001 to 5000 Employees',
 '501 to 1000 Employees',
 nan,
 '5001 to 10000 Employees',
 '51 to 200 Employees']

In [14]:
# Encoding company_employees using an ordinal encoder to capture revenue
default_size = 'Unknown'
df['company_size'].fillna(default_size, inplace=True)

size_list_reordered = [
    '1 to 50 Employees',
    '51 to 200 Employees',
    '201 to 500 Employees',
    '501 to 1000 Employees',
    '1001 to 5000 Employees',
    '5001 to 10000 Employees',
    '10000+ Employees',
    'Unknown'
]

df['company_size'] = pd.Categorical(df['company_size'], categories=size_list_reordered, ordered=True)
ordinal_encoder = OrdinalEncoder(categories=[size_list_reordered], dtype=int)
df['company_size_encoded'] = ordinal_encoder.fit_transform(df[['company_size']])

In [15]:
df_encoded = pd.get_dummies(df, columns=['location', 'company_type', 'job_category', 'company_sector', 'company_industry'])
df_encoded

Unnamed: 0,company,job title,job description,salary estimate,company_size,company_founded,company_revenue,salary,rating,clean_job_title,...,company_industry_Sporting Goods Stores,company_industry_Sports & Recreation,company_industry_Staffing & Subcontracting,company_industry_State & Regional Agencies,company_industry_Stock Exchanges,company_industry_Taxi & Car Services,company_industry_Telecommunications Services,company_industry_Transportation Equipment Manufacturing,company_industry_Video Game Publishing,company_industry_Wholesale
0,Microsoft,Data & Applied Scientist,Microsoft 365 is a key part of the company’s c...,$123486 /yr (est.),10000+ Employees,1975.0,$10+ billion (USD),123486.0,4.4,data & applied scientist,...,0,0,0,0,0,0,0,0,0,0
1,UT Southwestern Medical Center,Data Scientist or Bioinformatician (remote),Center Information:\nThe Quantitative Biomedic...,$93500 /yr (est.),10000+ Employees,1943.0,$1 to $5 billion (USD),93500.0,4.0,data scientist or bioinformatician remote,...,0,0,0,0,0,0,0,0,0,0
2,Notion,"Data Scientist, Growth",About Us:\nWe're on a mission to make it possi...,$137853 /yr (est.),201 to 500 Employees,2016.0,Unknown / Non-Applicable,137853.0,4.9,data scientist growth,...,0,0,0,0,0,0,0,0,0,0
3,Net2Aspire,Jr. Data Scientist, Apply Statistical and Machine Learning metho...,$72500 /yr (est.),Unknown,,Unknown / Non-Applicable,72500.0,2.0,jr. data scientist,...,0,0,0,0,0,0,0,0,0,0
4,Ntropy Network,Data Scientist,"Over the last few decades, technological innov...",$155000 /yr (est.),1 to 50 Employees,,Unknown / Non-Applicable,155000.0,0.0,data scientist,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,CVS Health,"Senior Machine Learning Engineer ( Python , ML...",Analytics & Behavior Change is an innovation e...,$135000 /yr (est.),10000+ Employees,1963.0,$10+ billion (USD),135000.0,3.1,senior machine learning engineer python ml d...,...,0,0,0,0,0,0,0,0,0,0
763,Morgan Stanley,Machine Learning Researcher,Machine Learning Researcher\nJob Number:\n3227...,$143796 /yr (est.),10000+ Employees,1935.0,$10+ billion (USD),143796.0,4.0,machine learning researcher,...,0,0,0,0,0,0,0,0,0,0
764,MIT Lincoln Laboratory,Machine Learning Software Developer,Laboratory Description\nMIT Lincoln Laboratory...,$117724 /yr (est.),1001 to 5000 Employees,1951.0,Unknown / Non-Applicable,117724.0,4.3,machine learning software developer,...,0,0,0,0,0,0,0,0,0,0
765,Morgan Stanley,Machine Learning Researcher,Machine Learning Researcher\nJob Number:\n3227...,$143796 /yr (est.),10000+ Employees,1935.0,$10+ billion (USD),143796.0,4.0,machine learning researcher,...,0,0,0,0,0,0,0,0,0,0


In [109]:
with open("company_industry.pkl", "wb") as fd:
    pickle.dump(list(df["company_industry"].unique()), fd)


In [16]:
df_encoded.drop(columns=[
 'company',
 'job title',
 'job description',
 'salary estimate',
 'company_size',
 'company_founded',
 'company_revenue',
 'rating',
 'clean_job_title',
 'clean_job_description',
 'programming_languages',
 'skills',
 'seniority',
 'clean_job_description_filtered',], axis=1, inplace=True)


In [18]:
df_encoded.to_csv(filepath + "Encoded_data.csv")
df_encoded

Unnamed: 0,salary,num_of_skills,num_of_programming_languages,seniority_encoded,javascript,shell,python,java,c,c++,...,company_industry_Sporting Goods Stores,company_industry_Sports & Recreation,company_industry_Staffing & Subcontracting,company_industry_State & Regional Agencies,company_industry_Stock Exchanges,company_industry_Taxi & Car Services,company_industry_Telecommunications Services,company_industry_Transportation Equipment Manufacturing,company_industry_Video Game Publishing,company_industry_Wholesale
0,123486.0,36,3,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,93500.0,21,3,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,137853.0,15,3,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,72500.0,13,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,155000.0,20,5,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,135000.0,34,2,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
763,143796.0,17,5,2,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
764,117724.0,16,6,1,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
765,143796.0,17,5,2,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [75]:
df_encoded = df_encoded[df_encoded["salary"] >= 10000]

In [76]:
df_encoded

Unnamed: 0,salary,num_of_skills,num_of_programming_languages,seniority_encoded,javascript,shell,python,java,c,c++,...,company_industry_Sports & Recreation,company_industry_Staffing & Subcontracting,company_industry_State & Regional Agencies,company_industry_Stock Exchanges,company_industry_Taxi & Car Services,company_industry_Telecommunications Services,company_industry_Transportation Equipment Manufacturing,company_industry_Video Game Publishing,company_industry_Wholesale,Salary_category
0,123486.0,36,3,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1,93500.0,21,3,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,137853.0,15,3,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,72500.0,13,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,155000.0,20,5,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,135000.0,34,2,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2
763,143796.0,17,5,2,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,2
764,117724.0,16,6,1,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,2
765,143796.0,17,5,2,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,2


In [77]:
min_salary = df_encoded.describe()["salary"].loc["min"]
max_salary = df_encoded.describe()["salary"].loc["max"]
range_salary = max_salary - min_salary
breaks = [np.floor(min_salary + (i+1) * range_salary/6) for i in range(6)]

def stratified_salary(x):
    for i in range(6):
        if x <= breaks[i]:
            return i
        

df_encoded["Salary_category"] = df_encoded["salary"].apply(lambda x : stratified_salary(x))



In [78]:
df_encoded

Unnamed: 0,salary,num_of_skills,num_of_programming_languages,seniority_encoded,javascript,shell,python,java,c,c++,...,company_industry_Sports & Recreation,company_industry_Staffing & Subcontracting,company_industry_State & Regional Agencies,company_industry_Stock Exchanges,company_industry_Taxi & Car Services,company_industry_Telecommunications Services,company_industry_Transportation Equipment Manufacturing,company_industry_Video Game Publishing,company_industry_Wholesale,Salary_category
0,123486.0,36,3,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1,93500.0,21,3,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,137853.0,15,3,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,72500.0,13,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,155000.0,20,5,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,135000.0,34,2,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2
763,143796.0,17,5,2,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,2
764,117724.0,16,6,1,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,2
765,143796.0,17,5,2,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,2


In [79]:
X = df_encoded.drop(['salary', "Salary_category"], axis =1)
y = df_encoded['Salary_category']

## Models

In [229]:
# First, split into train and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

# Then, split the temporary set into test and validation sets
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Print the shapes of the resulting sets
print("Train set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)
print("Validation set shape:", X_val.shape, y_val.shape)

Train set shape: (613, 435) (613,)
Test set shape: (77, 435) (77,)
Validation set shape: (77, 435) (77,)


### Model 1: Linear regression

In [202]:
model = LinearRegression()
model.fit(X_train, y_train)


# Train the model
model.fit(X_train, y_train)

y_pred_in_sample = model.predict(X_train)

# Evaluate the model
mse_in_sample = mean_squared_error(y_train, y_pred_in_sample)
print(f'In sample MSE: {mse_in_sample}')

r2_in_sample = r2_score(y_train, y_pred_in_sample)
print(f'In sample R2: {r2_in_sample}')

# Make predictions on the test set
y_pred_oos = model.predict(X_test)

# Evaluate the model
mse_oos = mean_squared_error(y_test, y_pred_oos)
print(f'Out of sample MSE: {mse_oos}')

r2_oos = r2_score(y_test, y_pred_oos)
print(f'Out of sample R2: {r2_oos}')


In sample MSE: 382539192.4220538
In sample R2: 0.74274974477614
Out of sample MSE: 3.990340586277854e+26
Out of sample R2: -2.015965141422481e+17


### Model 2: CART

In [237]:
# Create a decision tree regressor
model = DecisionTreeClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

y_pred_in_sample = model.predict(X_train)

# Evaluate the model

# Train the model
model.fit(X_train, y_train)

y_pred_in_sample = model.predict(X_train)

#  Evaluate the model

in_sample_accuracy = accuracy_score(y_train, y_pred_in_sample)

print(f'In sample accuracy: {in_sample_accuracy}')

y_pred_oos = model.predict(X_test)

oos_sample_accuracy = accuracy_score(y_test, y_pred_oos)

print(f'OOS sample accuracy: {oos_sample_accuracy}')

# mse_in_sample = mean_squared_error(y_train, y_pred_in_sample)
# print(f'In sample MSE: {mse_in_sample}')

# r2_in_sample = r2_score(y_train, y_pred_in_sample)
# print(f'In sample R2: {r2_in_sample}')

# # Make predictions on the test set
# y_pred_oos = model.predict(X_test)

# # Evaluate the model
# mse_oos = mean_squared_error(y_test, y_pred_oos)
# print(f'Out of sample MSE: {mse_oos}')

# r2_oos = r2_score(y_test, y_pred_oos)
# print(f'Out of sample R2: {r2_oos}')

In sample accuracy: 0.99836867862969
OOS sample accuracy: 0.5064935064935064


### Model 3: Random Forest

In [236]:
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

y_pred_in_sample = model.predict(X_train)

#  Evaluate the model

in_sample_accuracy = accuracy_score(y_train, y_pred_in_sample)

print(f'In sample accuracy: {in_sample_accuracy}')

y_pred_oos = model.predict(X_test)

oos_sample_accuracy = accuracy_score(y_test, y_pred_oos)

print(f'OOS sample accuracy: {oos_sample_accuracy}')



# mse_in_sample = mean_squared_error(y_train, y_pred_in_sample)
# print(f'In sample MSE: {mse_in_sample}')

# r2_in_sample = r2_score(y_train, y_pred_in_sample)
# print(f'In sample R2: {r2_in_sample}')

# # Make predictions on the test set
# y_pred_oos = model.predict(X_test)

# # Evaluate the model
# mse_oos = mean_squared_error(y_test, y_pred_oos)
# print(f'Out of sample MSE: {mse_oos}')

# r2_oos = r2_score(y_test, y_pred_oos)
# print(f'Out of sample R2: {r2_oos}')


In sample accuracy: 0.99836867862969
OOS sample accuracy: 0.6233766233766234


### Model 4: Support Vector Regressor


In [205]:
# Standardize features (important for SVR)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create a support vector regression model
model = SVR(kernel='linear', C=1.0)

# Train the model
model.fit(X_train_scaled, y_train)

y_pred_in_sample = model.predict(X_train_scaled)

# Evaluate the model
mse_in_sample = mean_squared_error(y_train, y_pred_in_sample)
print(f'In sample MSE: {mse_in_sample}')

r2_in_sample = r2_score(y_train, y_pred_in_sample)
print(f'In sample R2: {r2_in_sample}')

# Make predictions on the test set
y_pred_oos = model.predict(X_test_scaled)

# Evaluate the model
mse_oos = mean_squared_error(y_test, y_pred_oos)
print(f'Out of sample MSE: {mse_oos}')

r2_oos = r2_score(y_test, y_pred_oos)
print(f'Out of sample R2: {r2_oos}')

In sample MSE: 1410737550.3835034
In sample R2: 0.0513061090753818
Out of sample MSE: 2038474476.8882895
Out of sample R2: -0.029860333530955296


### Model 5: Neural Network


In [261]:
# Build a simple neural network using Keras
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.4, random_state=42)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1))
# y_test_scaled = scaler.transform(y_test.values.reshape(-1, 1))

model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(6, activation='softmax')  # Use 'softmax' for multiclass classification
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_scaled, y_train, epochs=60, batch_size=32, validation_split=0.2)


# Evaluate the model on the test set
y_pred_is = model.predict(X_train_scaled)
y_pred_is_classes = y_pred_is.argmax(axis=1)  # Convert softmax probabilities to class labels

is_sample_accuracy = accuracy_score(y_train, y_pred_is_classes)
print(f'In sample sample accuracy: {is_sample_accuracy}')

# Evaluate the model on the test set
y_pred_oos = model.predict(X_test_scaled)
y_pred_oos_classes = y_pred_oos.argmax(axis=1)  # Convert softmax probabilities to class labels

oos_sample_accuracy = accuracy_score(y_test, y_pred_oos_classes)
print(f'OOS sample accuracy: {oos_sample_accuracy}')


# # Train the model
# model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.4)

# y_pred_in_sample = model.predict(X_train_scaled)

# # Evaluate the model
# mse_in_sample = mean_squared_error(y_train, y_pred_in_sample)
# print(f'In sample MSE: {mse_in_sample}')

# r2_in_sample = r2_score(y_train, y_pred_in_sample)
# print(f'In sample R2: {r2_in_sample}')

# # Make predictions on the test set
# y_pred_oos = model.predict(X_test_scaled)

# # Evaluate the model
# mse_oos = mean_squared_error(y_test, y_pred_oos)
# print(f'Out of sample MSE: {mse_oos}')

# r2_oos = r2_score(y_test, y_pred_oos)
# print(f'Out of sample R2: {r2_oos}')


Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60
In sample sample accuracy: 0.7217391304347827
OOS sample accuracy: 0.5700325732899023


### Model 6: XGBoost

In [None]:
# Create an XGBoost regressor
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [10, 20, 30, 40, 50],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.1, 0.01, 0.001]
}

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = scaler.transform(y_test.values.reshape(-1, 1))

# model = xgb.XGBRegressor()

# grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error')
# grid_search.fit(X_train_scaled, y_train)


# model = grid_search.best_estimator_

# model.fit(X_train_scaled, y_train)

model = xgb.XGBRegressor(objective='reg:squarederror',
                         n_estimators=40,  # number of trees
                            learning_rate=0.1,  # step size shrinkage to prevent overfitting
                            max_depth=6,  # maximum depth of a tree
                            subsample=0.8,  # fraction of observations to be randomly sampled
                            colsample_bytree=0.8,  # fraction of features to be randomly sampled
                            random_state=42)

# Train the model
model.fit(X_train_scaled, y_train_scaled)

y_pred_in_sample = model.predict(X_train_scaled)

# Evaluate the model
mse_in_sample = mean_squared_error(y_train_scaled, y_pred_in_sample)
print(f'In sample MSE: {mse_in_sample}')

r2_in_sample = r2_score(y_train_scaled, y_pred_in_sample)
print(f'In sample R2: {r2_in_sample}')

# Make predictions on the test set
y_pred_oos = model.predict(X_test_scaled)

# Evaluate the model
mse_oos = mean_squared_error(y_test_scaled, y_pred_oos)
print(f'Out of sample MSE: {mse_oos}')

r2_oos = r2_score(y_test, y_pred_oos)
print(f'Out of sample R2: {r2_oos}')


In sample MSE: 0.039409356515004164
In sample R2: 0.9605906434849958
Out of sample MSE: 0.8826853037090323
Out of sample R2: -7.077724454996494


In [80]:
df_encoded = pd.read_csv("../data/Encoded_data.csv").drop("Unnamed: 0", axis=1)

min_salary = df_encoded.describe()["salary"].loc["min"]
max_salary = df_encoded.describe()["salary"].loc["max"]
range_salary = max_salary - min_salary
breaks = [np.floor(min_salary + (i+1) * range_salary/6) for i in range(6)]

def stratified_salary(x):
    for i in range(6):
        if x <= breaks[i]:
            return i
        

df_encoded["Salary_category"] = df_encoded["salary"].apply(lambda x : stratified_salary(x))


X = df_encoded.drop(["salary", "Salary_category"], axis=1)
y= df_encoded['Salary_category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




In [95]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'multi:softmax',  # Multiclass classification
    'num_class': len(set(y)),      # Number of classes in your target variable
    'eval_metric': 'merror',       # Multiclass classification error rate
    'max_depth': 10,                # Maximum depth of a tree
    'learning_rate': 0.1,          # Learning rate
    'n_estimators': 100            # Number of boosting rounds
}

# Train the XGBoost model
model = xgb.train(params, dtrain, num_boost_round=100)

# Make predictions on the test set
y_pred_is = model.predict(dtrain)

# Convert predicted probabilities to class labels for multiclass classification
# If you're working with binary classification, you can skip this step
y_pred_labels_is = [round(value) for value in y_pred_is]

# Evaluate the model
accuracy = accuracy_score(y_train, y_pred_labels_is)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Make predictions on the test set
y_pred = model.predict(dtest)

# Convert predicted probabilities to class labels for multiclass classification
# If you're working with binary classification, you can skip this step
y_pred_labels = [round(value) for value in y_pred]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_labels)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 99.84%
Accuracy: 59.87%


In [96]:
import pickle 

with open("Salary_classifier.pkl", "wb") as f:
    pickle.dump(model, f)

In [97]:
breaks, min_salary

([57844.0, 105675.0, 153506.0, 201337.0, 249168.0, 297000.0], 10013.0)

In [99]:
def salary_range(i):
    if i == 0:
        return "10,000 - 57,844"
    if i == 1:
        return "57,844 - 105,675"
    if i == 2:
        return "105,675 - 153,506"
    if i == 3: 
        return "153,506 - 201,337"
    if i == 4:
        return "201,337 - 249168"
    return "249168 - 297,000"


In [71]:
# with open("Salary_classifier.pkl", "rb") as fd:
#     model = pickle.load(fd)
d_test = xgb.DMatrix(X_test.iloc[[0]])

model.predict(d_test)

array([1.], dtype=float32)

In [102]:

print(X_train.columns)


Index(['num_of_skills', 'num_of_programming_languages', 'seniority_encoded',
       'javascript', 'shell', 'python', 'java', 'c', 'c++', 'go',
       ...
       'company_industry_Sporting Goods Stores',
       'company_industry_Sports & Recreation',
       'company_industry_Staffing & Subcontracting',
       'company_industry_State & Regional Agencies',
       'company_industry_Stock Exchanges',
       'company_industry_Taxi & Car Services',
       'company_industry_Telecommunications Services',
       'company_industry_Transportation Equipment Manufacturing',
       'company_industry_Video Game Publishing', 'company_industry_Wholesale'],
      dtype='object', length=435)


In [2]:
import pickle
with open("input_data.pkl", "rb") as fd:
    input_data = pickle.load(fd)
input_data

{'skills': ['data analytics',
  'deep learning',
  'excel',
  'languages',
  'machine learning'],
 'prog_lang': ['python', 'java', 'sql'],
 'seniority': 'Junior',
 'location': 'Redmond, WA',
 'company_type': 'Company - Public',
 'job_category': 'data scientist',
 'company_sector': 'Information Technology',
 'company_industry': 'Computer Hardware Development',
 'size': '1 to 50 Employees'}

In [119]:
df = pd.DataFrame([input_data])

df["prog_langs"] = df["prog_lang"].apply(lambda x: x.strip().split(","))
df["skills"] = df["skills"].apply(lambda x: x.strip().split(","))
df


Unnamed: 0,skills,prog_lang,seniority,location,company_type,job_category,company_sector,company_industry,size,prog_langs
0,[analytics],java,Junior,"Redmond, WA",Company - Public,data scientist,Nonprofit & NGO,Computer Hardware Development,1 to 50 Employees,[java]


In [118]:
with open("skills_final.pkl", "rb") as fd:
    skills_final = pickle.load(fd)

with open("prog_lang.pkl", "rb") as fd:
    prog_lang_final = pickle.load(fd)

In [132]:
cols = list(X_train.columns)

input_data = {}
for col in cols:
    input_data[col] = 0

if df["seniority"].values[0] == "Junior":
    input_data["seniority_encoded"] = 0
elif df["seniority"].values[0] == "Mid":
    input_data["seniority_encoded"] = 1
else: 
    input_data["seniority_encoded"] = 2

input_data["location_" + df["location"].values[0]] = 1
input_data["job_category_" + df["job_category"].values[0]] = 1
input_data["company_sector_" + df["company_sector"].values[0]] = 1
input_data["company_industry_" + df["company_industry"].values[0]] = 1

size_list_reordered = [
    '1 to 50 Employees',
    '51 to 200 Employees',
    '201 to 500 Employees',
    '501 to 1000 Employees',
    '1001 to 5000 Employees',
    '5001 to 10000 Employees',
    '10000+ Employees',
    'Unknown'
]

revenue_list_reordered = [
    'Less than $1 million (USD)',
    '$1 to $5 million (USD)',
    '$5 to $25 million (USD)',
    '$25 to $100 million (USD)',
    '$100 to $500 million (USD)',
    '$500 million to $1 billion (USD)',
    '$1 to $5 billion (USD)',
    '$5 to $10 billion (USD)',
    '$10+ billion (USD)',
    'Unknown / Non-Applicable'
]


for i, size in enumerate(size_list_reordered):
    if df["size"].values[0] == size:
        input_data["company_size_encoded"] = i

for i, size in enumerate(revenue_list_reordered):
    if df["size"].values[0] == size:
        input_data["company_revenue_encoded"] = i

for lang in df["prog_langs"].values[0]:
    input_data[lang] = 1

for skill in df["skills"].values[0]:
    input_data[skill] = 1

input_data['num_of_programming_languages'] = len(df["prog_langs"].values[0])
input_data['num_of_skills'] = len(df["skills"].values[0])





# list(X_train.columns)

Unnamed: 0,num_of_skills,num_of_programming_languages,seniority_encoded,javascript,shell,python,java,c,c++,go,...,company_industry_Sporting Goods Stores,company_industry_Sports & Recreation,company_industry_Staffing & Subcontracting,company_industry_State & Regional Agencies,company_industry_Stock Exchanges,company_industry_Taxi & Car Services,company_industry_Telecommunications Services,company_industry_Transportation Equipment Manufacturing,company_industry_Video Game Publishing,company_industry_Wholesale
0,1,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [137]:
pd.DataFrame([input_data])

Unnamed: 0,num_of_skills,num_of_programming_languages,seniority_encoded,javascript,shell,python,java,c,c++,go,...,company_industry_Sporting Goods Stores,company_industry_Sports & Recreation,company_industry_Staffing & Subcontracting,company_industry_State & Regional Agencies,company_industry_Stock Exchanges,company_industry_Taxi & Car Services,company_industry_Telecommunications Services,company_industry_Transportation Equipment Manufacturing,company_industry_Video Game Publishing,company_industry_Wholesale
0,1,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [136]:

with open("columns.pkl", "wb") as f:
    pickle.dump(list(X_train.columns), f)