In [67]:
import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.model_selection import KFold , train_test_split , cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error , mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator , TransformerMixin
df = pd.read_csv('salaries.csv')
df = df.drop(df[["salary_in_usd" , 'employee_residence' , 'remote_ratio' , 'employment_type' , 'salary_currency']] , axis = 1)
target_encode_cols = ['company_location' , 'job_title']
ordinal_columns = ['experience_level' , 'company_size']
ordinal_mapping = {
    "experience_level":{
        "SE":0,
        "MI":1,
        "EN":2,
        "EX":4
    },
    "company_size":{
        "S":0,
        "M":1,
        "L":2
    }
}
ordinal_cols = [
    'experience_level',
    'company_size'
]
class OrdinalMapper(BaseEstimator , TransformerMixin):
    def __init__(self , mappings):
        self.mappings = mappings
    def fit(self , X , y = None):
        return self
    def transform(self , X):
        X = X.copy()
        for col ,  mappings in self.mappings.items():
            X[col] = X[col].map(mappings)
        return X
preprocessor = ColumnTransformer(
    transformers=[
        (
            "target_enc",
            ce.TargetEncoder(smoothing=10),
            target_encode_cols
        ),
        (
            "ordinal",
            OrdinalMapper(ordinal_mapping),
            ordinal_cols
        )
    ],
    remainder="passthrough",
    verbose_feature_names_out=False
)
model = RandomForestRegressor(
    n_estimators = 300,
    max_depth = None,
    random_state = 42,
    n_jobs = -1
)
pipeline = Pipeline(
    steps = [
        ("preprocess" , preprocessor),
        ('model' , model )
    ]
)
X = df.drop(columns = ['salary'])
y = np.log(df['salary']) 
X_train , X_test , y_train , y_test = train_test_split(
    X , y , random_state = 42 , test_size = 0.2
)
pipeline.fit(X_train , y_train)
pred = pipeline.predict(X_test)
Xt = preprocessor.fit_transform(X_train, y_train)
print(Xt.dtype)
cv = KFold(n_splits = 5 , shuffle = True , random_state = 42)
cv_mae = cross_val_score(
    pipeline,
    X,
    y,
    scoring = "neg_mean_absolute_error",
    cv = cv,
    n_jobs = -1
)
new_data = pd.DataFrame([{
    "company_location": "IN",
    "experience_level": "SE",
    "company_size": "M",
    "job_title": 'Lead AI Engineer',
    "work_year": 2025,
}])
og_pred = pipeline.predict(X_test)
log_salary = pipeline.predict(new_data)
salary_predict = np.exp(log_salary)
mae = mean_absolute_error(y_test , og_pred)
mse = mean_squared_error(y_test , og_pred)
rmse = np.sqrt(mse)
print("The Predicted Salary is :" , int(salary_predict[0]))
print(f"The mae is:{mae}\nThe rmse is:{rmse}")

float64
The Predicted Salary is : 65128
The mae is:0.2940907528549742
The rmse is:0.3833235730824941
