In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error

from scipy.sparse import hstack

from xgboost import XGBRegressor


In [3]:
df=pd.read_csv('it_job_descriptions_cleaned.csv')
df=df.drop(columns=['combined','gender'])

In [4]:
df.head()

Unnamed: 0,experience,qualification,work_type,job_title,role,job_description,skills,responsibilities,salary_avg
0,4.0,B.Tech,Contract,Data Analyst,Data Scientist,Data Scientists use their expertise in data an...,Machine learning algorithms and libraries (e.g...,Apply machine learning algorithms and statisti...,87000.0
1,5.0,MCA,Temporary,Data Analyst,Data Scientist,Data Scientists use their expertise in data an...,Machine learning algorithms and libraries (e.g...,Apply machine learning algorithms and statisti...,90500.0
2,3.0,BBA,Contract,Data Analyst,Business Intelligence Analyst,Business Intelligence Analysts gather and anal...,"Data analysis and visualization tools (e.g., T...",Gather and analyze data to provide insights fo...,81000.0
3,4.0,BA,Full-Time,Data Analyst,Data Scientist,Data Scientists use their expertise in data an...,Machine learning algorithms and libraries (e.g...,Apply machine learning algorithms and statisti...,88000.0
4,3.0,MBA,Intern,Data Analyst,Business Intelligence Analyst,Business Intelligence Analysts gather and anal...,"Data analysis and visualization tools (e.g., T...",Gather and analyze data to provide insights fo...,95500.0


In [5]:
df.describe()

Unnamed: 0,experience,salary_avg
count,101397.0,101397.0
mean,2.503881,82482.129649
std,1.707419,7515.167785
min,0.0,67500.0
25%,1.0,76000.0
50%,3.0,82500.0
75%,4.0,89000.0
max,5.0,97500.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101397 entries, 0 to 101396
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   experience        101397 non-null  float64
 1   qualification     101397 non-null  object 
 2   work_type         101397 non-null  object 
 3   job_title         101397 non-null  object 
 4   role              101397 non-null  object 
 5   job_description   101397 non-null  object 
 6   skills            101397 non-null  object 
 7   responsibilities  101397 non-null  object 
 8   salary_avg        101397 non-null  float64
dtypes: float64(2), object(7)
memory usage: 7.0+ MB


In [7]:
df["salary_avg"] = np.log1p(df["salary_avg"])
df["text"] = (
    df["job_description"].fillna("") + " " +
    df["skills"].fillna("") + " " +
    df["responsibilities"].fillna("")
)

In [8]:
numerical_cols = ["experience"]

categorical_cols = [
    "qualification",
    "work_type",
    "job_title",
    "role"
]

target = "salary_avg"


In [9]:
X_train, X_val, y_train, y_val = train_test_split(
    df, df[target],
    test_size=0.2,
    random_state=42
)


In [10]:
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)

X_train_cat = ohe.fit_transform(X_train[categorical_cols])
X_val_cat = ohe.transform(X_val[categorical_cols])


In [11]:
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words="english"
)

X_train_text = tfidf.fit_transform(X_train["text"])
X_val_text = tfidf.transform(X_val["text"])


In [12]:
X_train_num = X_train[numerical_cols].values
X_val_num = X_val[numerical_cols].values


In [13]:
from scipy.sparse import csr_matrix

X_train_num = csr_matrix(X_train_num)
X_val_num = csr_matrix(X_val_num)


In [14]:
X_train_final = hstack([X_train_num, X_train_cat, X_train_text])
X_val_final = hstack([X_val_num, X_val_cat, X_val_text])


In [15]:
import xgboost as xgb

dtrain = xgb.DMatrix(X_train_final, label=y_train)
dval = xgb.DMatrix(X_val_final, label=y_val)

params = {
    "objective": "reg:squarederror",
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "tree_method": "hist",
    "eval_metric": "rmse"
}

evals = [(dtrain, "train"), (dval, "eval")]

bst = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=evals,
    early_stopping_rounds=50,
    verbose_eval=True
)

# Predict
preds = bst.predict(dval)


[0]	train-rmse:0.09158	eval-rmse:0.09191
[1]	train-rmse:0.09157	eval-rmse:0.09191
[2]	train-rmse:0.09156	eval-rmse:0.09191
[3]	train-rmse:0.09155	eval-rmse:0.09191
[4]	train-rmse:0.09153	eval-rmse:0.09191
[5]	train-rmse:0.09152	eval-rmse:0.09191
[6]	train-rmse:0.09152	eval-rmse:0.09191
[7]	train-rmse:0.09151	eval-rmse:0.09191
[8]	train-rmse:0.09150	eval-rmse:0.09192
[9]	train-rmse:0.09149	eval-rmse:0.09192
[10]	train-rmse:0.09148	eval-rmse:0.09191
[11]	train-rmse:0.09147	eval-rmse:0.09192
[12]	train-rmse:0.09146	eval-rmse:0.09192
[13]	train-rmse:0.09145	eval-rmse:0.09192
[14]	train-rmse:0.09144	eval-rmse:0.09192
[15]	train-rmse:0.09143	eval-rmse:0.09192
[16]	train-rmse:0.09142	eval-rmse:0.09192
[17]	train-rmse:0.09141	eval-rmse:0.09192
[18]	train-rmse:0.09141	eval-rmse:0.09192
[19]	train-rmse:0.09140	eval-rmse:0.09192
[20]	train-rmse:0.09139	eval-rmse:0.09192
[21]	train-rmse:0.09138	eval-rmse:0.09193
[22]	train-rmse:0.09137	eval-rmse:0.09193
[23]	train-rmse:0.09136	eval-rmse:0.09194
[2

In [16]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Reverse log if you used log1p
preds_salary = np.expm1(preds)
y_val_salary = np.expm1(y_val)

# Metrics
rmse = np.sqrt(mean_squared_error(y_val_salary, preds_salary))
mae = mean_absolute_error(y_val_salary, preds_salary)
r2 = r2_score(y_val_salary, preds_salary)
mape = np.mean(np.abs((y_val_salary - preds_salary) / y_val_salary)) * 100

print("Evaluation Metrics:")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R^2 Score: {r2:.4f}")
print(f"MAPE: {mape:.2f}%")


Evaluation Metrics:
RMSE: 7546.43
MAE: 6492.49
R^2 Score: -0.0033
MAPE: 7.94%


In [17]:
new_record = {
    "experience": 4.0,
    "qualification": "B.Tech",
    "work_type": "Contract",
    "job_title": "Data Analyst",
    "role": "Data Scientist",
    "job_description": "Data Scientists use their expertise in data and analytics to generate insights and predictions.",
    "skills": "Machine learning algorithms and libraries (e.g., scikit-learn, TensorFlow, PyTorch)",
    "responsibilities": "Apply machine learning algorithms and statistical models to solve business problems."
}
df_new = pd.DataFrame([new_record])
from scipy.sparse import csr_matrix, hstack
X_num = csr_matrix(df_new[["experience"]].values)
X_cat = ohe.transform(df_new[categorical_cols])
text_combined = df_new["job_description"] + " " + df_new["skills"] + " " + df_new["responsibilities"]
X_text = tfidf.transform(text_combined)
X_final = hstack([X_num, X_cat, X_text])
dnew = xgb.DMatrix(X_final)
pred_log = bst.predict(dnew)
pred_salary = np.expm1(pred_log)  
print(f"Predicted Salary: {pred_salary[0]:.2f}")


Predicted Salary: 82147.91


In [19]:
# Save model in XGBoost native format
bst.save_model("xgb_salary_model.json")

import joblib

joblib.dump(ohe, "ohe.pkl")
joblib.dump(tfidf, "tfidf.pkl")
joblib.dump(categorical_cols, "categorical_cols.pkl")


['categorical_cols.pkl']