# **Stacking Regressor**

## **Importing Libararies**

In [180]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import StackingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRFRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

## **Data Overview**

In [181]:
df = pd.read_csv('Dataset salary 2024.csv')
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2024,SE,FT,AI Engineer,202730,USD,202730,US,0,US,M
1,2024,SE,FT,AI Engineer,92118,USD,92118,US,0,US,M
2,2024,SE,FT,Data Engineer,130500,USD,130500,US,0,US,M
3,2024,SE,FT,Data Engineer,96000,USD,96000,US,0,US,M
4,2024,SE,FT,Machine Learning Engineer,190000,USD,190000,US,0,US,M


In [182]:
df.shape

(16534, 11)

In [183]:
df.isnull().sum()

work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

In [184]:
df.describe(include='all')

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
count,16534.0,16534,16534,16534,16534.0,16534,16534.0,16534,16534.0,16534,16534
unique,,4,4,155,,23,,88,,77,3
top,,SE,FT,Data Engineer,,USD,,US,,US,M
freq,,10670,16454,3464,,15294,,14467,,14518,15306
mean,2023.226866,,,,163727.0,,149686.777973,,32.00375,,
std,0.713558,,,,340205.7,,68505.293156,,46.245158,,
min,2020.0,,,,14000.0,,15000.0,,0.0,,
25%,2023.0,,,,101763.0,,101125.0,,0.0,,
50%,2023.0,,,,142200.0,,141300.0,,0.0,,
75%,2024.0,,,,187200.0,,185900.0,,100.0,,


In [185]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16534 entries, 0 to 16533
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           16534 non-null  int64 
 1   experience_level    16534 non-null  object
 2   employment_type     16534 non-null  object
 3   job_title           16534 non-null  object
 4   salary              16534 non-null  int64 
 5   salary_currency     16534 non-null  object
 6   salary_in_usd       16534 non-null  int64 
 7   employee_residence  16534 non-null  object
 8   remote_ratio        16534 non-null  int64 
 9   company_location    16534 non-null  object
 10  company_size        16534 non-null  object
dtypes: int64(4), object(7)
memory usage: 1.4+ MB


In [186]:
df['salary'].equals(df['salary_in_usd'])

False

## **Train Test Split**

In [187]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['salary', 
                                                                    'salary_in_usd', 'salary_currency']), 
                                                    df['salary'],
                                                    test_size=0.3,
                                                    random_state=42)

In [188]:
X_train.head(1)

Unnamed: 0,work_year,experience_level,employment_type,job_title,employee_residence,remote_ratio,company_location,company_size
14159,2023,MI,FT,Data Scientist,US,100,US,M


## **Feature Engineering**

In [189]:
# numerical_columns = [0, 6]
# ordinal_columns = [8]
# nominal_columns = [1, 2, 3, 4, 5, 7]

In [201]:
df['experience_level'].value_counts()

experience_level
SE    10670
MI     4038
EN     1325
EX      501
Name: count, dtype: int64

In [190]:
numerical_columns = ['work_year', 'remote_ratio']
ordinal_columns = ['experience_level', 'company_size']
nominal_columns = ['employment_type', 'job_title', 'employee_residence', 'company_location']

In [191]:
handle_numerical = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler())
])

In [192]:
handle_ordinal = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OrdinalEncoder())
])

In [193]:
handle_nominal = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(handle_unknown='ignore', drop='first'))
])

In [194]:
preprocessing = ColumnTransformer(transformers=[
    ('numerical', handle_numerical, numerical_columns),
    ('ordinal', handle_ordinal, ordinal_columns),
    ('nominal', handle_nominal, nominal_columns)
], remainder='passthrough')

## **Modeling and Evaluation**

In [195]:
base_models = [
    ('LinearRegression', make_pipeline(preprocessing, LinearRegression())),
    ('RandomForestRegressor', make_pipeline(preprocessing, RandomForestRegressor())),
    ('DecisionTreeRegressor', make_pipeline(preprocessing, DecisionTreeRegressor())),
    ('GradientBoostingRegressor', make_pipeline(preprocessing, GradientBoostingRegressor()))
]

In [196]:
meta_model = XGBRFRegressor()

In [197]:
stacking_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5
)

In [198]:
stacking_regressor.fit(X_train, y_train)



In [199]:
y_pred = stacking_regressor.predict(X_test)



In [200]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"R2 Score: {r2}")

Mean Absolute Error (MAE): 57122.4019004485
Mean Squared Error (MSE): 81223055133.11938
R2 Score: 0.1286662220954895
