# Feature Engineering

First, I will clean up the dataset by dropping the columns that will not be used as model features.

In [2]:
import pandas as pd

In [3]:
# Load the combined dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/processed/ai_job_dataset_combined.csv")

In [4]:
# Columns that are excluded from the modelling frame.
columns_to_drop = [
    'job_id', 'salary_currency', 'company_location', 'employee_residence',
    'posting_date', 'application_deadline', 'job_description_length',
    'benefits_score', 'company_name', 'required_skills', 'employment_type',
    'remote_ratio',
]

# drop() returns a new frame, so `df` stays untouched without an explicit copy.
# errors='ignore' skips any listed column that is absent from `df`.
df_model = df.drop(columns=columns_to_drop, errors='ignore')

# Show which columns remain.
df_model.columns


Index(['job_title', 'salary_usd', 'experience_level', 'company_size',
       'education_required', 'years_experience', 'industry'],
      dtype='object')

In [5]:
# Persist the column-pruned frame; index=False leaves the row index out of the file.
output_path = '../data/processed/ai_job_cleaned.csv'
df_model.to_csv(output_path, index=False)

# NOTE(review): the message says "combined dataset", but this file is the cleaned one.
print(f"Saved combined dataset to: {output_path}")

Saved combined dataset to: ../data/processed/ai_job_cleaned.csv


# Ordinal Encoding 
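Next, I will encode the ordered categorical columns (`experience_level`, `company_size`, `education_required`) as integers that preserve their order.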

In [6]:
# Ordinal codes for experience_level; a larger code is a higher level in this ordering
# (presumably entry < mid < senior < executive — confirm against the data dictionary).
experience_order = {'EN': 0, 'MI': 1, 'SE': 2, 'EX': 3}

# Series.map() turns every value that is not a key of the dict into NaN without
# any warning. That also happens when this cell is re-run after the column has
# already been encoded (0..3 are not keys), which would wipe the whole column.
# Encode into a temporary first and fail loudly instead of corrupting df_model.
experience_raw = df_model['experience_level']
experience_encoded = experience_raw.map(experience_order)
experience_unmapped = experience_raw[
    experience_encoded.isna() & experience_raw.notna()
].unique()
if len(experience_unmapped) > 0:
    raise ValueError(
        "experience_level contains values missing from experience_order "
        f"(was this cell already run?): {list(experience_unmapped)}"
    )
df_model['experience_level'] = experience_encoded



In [7]:
# Show the encoded values and a preview of the frame together. Only the last
# expression of a cell is rendered automatically, so a bare .unique() placed
# above display() is computed and then silently discarded.
display(df_model['experience_level'].unique(), df_model.head())

Unnamed: 0,job_title,salary_usd,experience_level,company_size,education_required,years_experience,industry
0,AI Research Scientist,90376,2,M,Bachelor,9,Automotive
1,AI Software Engineer,61895,0,M,Master,1,Media
2,AI Specialist,152626,1,L,Associate,2,Education
3,NLP Engineer,80215,2,M,PhD,7,Consulting
4,AI Consultant,54624,0,S,Master,0,Media


In [8]:
# Inspect the raw company_size categories before assigning them an order.
df_model['company_size'].unique()

array(['M', 'L', 'S'], dtype=object)

In [9]:
# Ordinal codes for company_size, ordered S < M < L.
size_order = {'S': 0, 'M': 1, 'L': 2}

# Series.map() silently yields NaN for any value that is not a key of the dict,
# including the already-encoded integers if this cell is run a second time.
# Validate on a temporary before overwriting the column.
size_raw = df_model['company_size']
size_encoded = size_raw.map(size_order)
size_unmapped = size_raw[size_encoded.isna() & size_raw.notna()].unique()
if len(size_unmapped) > 0:
    raise ValueError(
        "company_size contains values missing from size_order "
        f"(was this cell already run?): {list(size_unmapped)}"
    )
df_model['company_size'] = size_encoded

In [10]:
# Check the encoded values; a NaN here would mean a category was missing from size_order.
df_model['company_size'].unique()

array([1, 2, 0])

In [11]:
# Preview the first rows with company_size encoded.
display(df_model.head())

Unnamed: 0,job_title,salary_usd,experience_level,company_size,education_required,years_experience,industry
0,AI Research Scientist,90376,2,1,Bachelor,9,Automotive
1,AI Software Engineer,61895,0,1,Master,1,Media
2,AI Specialist,152626,1,2,Associate,2,Education
3,NLP Engineer,80215,2,1,PhD,7,Consulting
4,AI Consultant,54624,0,0,Master,0,Media


In [12]:
# Inspect the raw education_required categories before assigning them an order.
df_model['education_required'].unique()

array(['Bachelor', 'Master', 'Associate', 'PhD'], dtype=object)

In [13]:
# Ordinal codes for education_required, ordered Associate < Bachelor < Master < PhD.
education_order = {'Associate': 0, 'Bachelor': 1, 'Master': 2, 'PhD': 3}

# Series.map() silently yields NaN for any value that is not a key of the dict,
# including the already-encoded integers if this cell is run a second time.
# Validate on a temporary before overwriting the column.
education_raw = df_model['education_required']
education_encoded = education_raw.map(education_order)
education_unmapped = education_raw[
    education_encoded.isna() & education_raw.notna()
].unique()
if len(education_unmapped) > 0:
    raise ValueError(
        "education_required contains values missing from education_order "
        f"(was this cell already run?): {list(education_unmapped)}"
    )
df_model['education_required'] = education_encoded

In [14]:
# Check the encoded values; a NaN here would mean a category was missing from education_order.
df_model['education_required'].unique()

array([1, 2, 0, 3])

In [15]:
# Preview the first rows with education_required encoded.
display(df_model.head())

Unnamed: 0,job_title,salary_usd,experience_level,company_size,education_required,years_experience,industry
0,AI Research Scientist,90376,2,1,1,9,Automotive
1,AI Software Engineer,61895,0,1,2,1,Media
2,AI Specialist,152626,1,2,0,2,Education
3,NLP Engineer,80215,2,1,3,7,Consulting
4,AI Consultant,54624,0,0,2,0,Media


In [16]:
# One-hot encode industry: one indicator column per distinct value, named industry_<value>.
industry_dummies = pd.get_dummies(df_model['industry'], prefix='industry')


In [17]:
# Attach the indicator columns side by side (axis=1); rows are matched on the index.
df_model = pd.concat([df_model, industry_dummies], axis=1)


In [18]:
# Remove the original text column now that its indicator columns are attached.
# Re-running this cell raises KeyError because the column is already gone.
df_model.drop(columns='industry', inplace=True)

In [19]:
# Preview the first rows with the industry_* indicator columns in place.
display(df_model.head())

Unnamed: 0,job_title,salary_usd,experience_level,company_size,education_required,years_experience,industry_Automotive,industry_Consulting,industry_Education,industry_Energy,...,industry_Gaming,industry_Government,industry_Healthcare,industry_Manufacturing,industry_Media,industry_Real Estate,industry_Retail,industry_Technology,industry_Telecommunications,industry_Transportation
0,AI Research Scientist,90376,2,1,1,9,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,AI Software Engineer,61895,0,1,2,1,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
2,AI Specialist,152626,1,2,0,2,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,NLP Engineer,80215,2,1,3,7,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
4,AI Consultant,54624,0,0,2,0,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False


In [20]:
# One-hot encode job_title: one indicator column per distinct value, named title_<value>.
job_dummies = pd.get_dummies(df_model['job_title'], prefix='title')


In [21]:
# Attach the title_* indicator columns side by side (axis=1); rows are matched on the index.
df_model = pd.concat([df_model, job_dummies], axis=1)


In [93]:
# Remove the raw job_title text column now that its title_* indicators exist.
# NOTE(review): this cell's execution count (93) is out of sequence, and the saved
# previews further down still list job_title, so it appears to have been run after
# them. Restart the kernel and Run All so the saved outputs match this code.
df_model.drop(columns='job_title', inplace=True)


In [22]:
# Preview the first rows with the title_* indicator columns in place.
display(df_model.head())

Unnamed: 0,job_title,salary_usd,experience_level,company_size,education_required,years_experience,industry_Automotive,industry_Consulting,industry_Education,industry_Energy,...,title_Data Scientist,title_Deep Learning Engineer,title_Head of AI,title_ML Ops Engineer,title_Machine Learning Engineer,title_Machine Learning Researcher,title_NLP Engineer,title_Principal Data Scientist,title_Research Scientist,title_Robotics Engineer
0,AI Research Scientist,90376,2,1,1,9,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,AI Software Engineer,61895,0,1,2,1,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,AI Specialist,152626,1,2,0,2,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,NLP Engineer,80215,2,1,3,7,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
4,AI Consultant,54624,0,0,2,0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [23]:
# Persist the encoded frame; index=False leaves the row index out of the file.
output_path = '../data/processed/ai_job_model_ready.csv'
df_model.to_csv(output_path, index=False)

# NOTE(review): the message says "combined dataset", but this file is the encoded one.
print(f"Saved combined dataset to: {output_path}")

Saved combined dataset to: ../data/processed/ai_job_model_ready.csv


# Everything is Encoded

In [24]:
from sklearn.preprocessing import StandardScaler


In [25]:
# Scaler instance reused by the cells below (fitted later, then saved with joblib).
scaler = StandardScaler()
# Columns to standardize. NOTE(review): this list includes salary_usd, which is
# later used as the prediction target.
columns_to_scale = ['years_experience', 'salary_usd']


In [26]:
# Work on a copy so df_model keeps the unscaled values.
df_scaled = df_model.copy()
# Fit the scaler on columns_to_scale and overwrite those columns with the
# standardized values.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so test rows influence the fitted mean/std (data leakage).
# Consider fitting on the training split only.
df_scaled[columns_to_scale] = scaler.fit_transform(df_scaled[columns_to_scale])


In [27]:
# Preview the first rows after scaling years_experience and salary_usd.
display(df_scaled.head())

Unnamed: 0,job_title,salary_usd,experience_level,company_size,education_required,years_experience,industry_Automotive,industry_Consulting,industry_Education,industry_Energy,...,title_Data Scientist,title_Deep Learning Engineer,title_Head of AI,title_ML Ops Engineer,title_Machine Learning Engineer,title_Machine Learning Researcher,title_NLP Engineer,title_Principal Data Scientist,title_Research Scientist,title_Robotics Engineer
0,AI Research Scientist,-0.454683,2,1,1,0.482845,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,AI Software Engineer,-0.912364,0,1,2,-0.952823,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,AI Specialist,0.545655,1,2,0,-0.773364,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,NLP Engineer,-0.617967,2,1,3,0.123928,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
4,AI Consultant,-1.029207,0,0,2,-1.132281,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [28]:
# Persist the scaled frame; index=False leaves the row index out of the file.
output_path = '../data/processed/ai_job_model_scaled.csv'
df_scaled.to_csv(output_path, index=False)

# NOTE(review): the message says "combined dataset", but this file is the scaled one.
print(f"Saved combined dataset to: {output_path}")

Saved combined dataset to: ../data/processed/ai_job_model_scaled.csv


# Train / Test Split
I will now separate the data into training and testing sets.

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
# Target: the salary_usd column of df_scaled (already standardized at this point).
y = df_scaled['salary_usd']

# Features: every other column of df_scaled.
# NOTE(review): the saved preview of df_scaled still lists the raw job_title text
# column; if it is present here it ends up in X — verify before training.
X = df_scaled.drop(columns='salary_usd')

In [31]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
# Report the dimensions of every split, one line per split.
for split_label, split_data in [
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
]:
    print(split_label, split_data.shape)

X_train shape: (24000, 40)
X_test shape: (6000, 40)
y_train shape: (24000,)
y_test shape: (6000,)


In [33]:
# Write each split to its own CSV. index=False omits the row index, so the files
# carry no row identifier: X and y rows correspond by position only.
X_train.to_csv('../data/processed/X_train.csv', index=False)
X_test.to_csv('../data/processed/X_test.csv', index=False)
y_train.to_csv('../data/processed/y_train.csv', index=False)
y_test.to_csv('../data/processed/y_test.csv', index=False)


In [36]:
import joblib

# Persist the fitted scaler so the same transformation can be reapplied later.
# NOTE(review): despite the file name, this scaler was fitted on both
# years_experience and salary_usd (see columns_to_scale), so transform() and
# inverse_transform() expect both columns, in that order.
joblib.dump(scaler, '../models/salary_scaler.pkl')

['../models/salary_scaler.pkl']