In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 120)

RANDOM_STATE = 42


In [6]:
from pathlib import Path

DATA_PATH = Path("..") / "data" / "raw" / "salaries.csv"
df = pd.read_csv(DATA_PATH)

df.head()


FileNotFoundError: [Errno 2] No such file or directory: '..\\data\\raw\\salaries.csv'

In [None]:
print("Shape:", df.shape)
display(df.sample(5, random_state=RANDOM_STATE))
df.info()
df.describe(include="all").T


In [None]:
df.columns = (
    df.columns
      .str.strip()
      .str.lower()
      .str.replace(" ", "_")
)

df.columns


In [None]:
TARGET = "salary_in_usd"

df = df.dropna(subset=[TARGET])

df = df.drop(columns=["salary", "salary_currency"], errors="ignore")

print("Shape after basic cleanup:", df.shape)


In [None]:
missing = df.isna().sum().sort_values(ascending=False)
missing[missing > 0]


In [None]:
duplicates = df.duplicated().sum()
duplicates


In [None]:
df = df.drop_duplicates()


In [None]:
experience_map = {
    "EN": 0,
    "MI": 1,
    "SE": 2,
    "EX": 3
}

df["experience_level_encoded"] = df["experience_level"].map(experience_map)


In [None]:
lower, upper = df[TARGET].quantile([0.01, 0.99])
df[TARGET] = df[TARGET].clip(lower, upper)


In [None]:
plt.figure()
sns.histplot(df[TARGET], bins=50, kde=True)
plt.title("Distribution of Salary (USD)")
plt.xlabel("Salary (USD)")
plt.ylabel("Count")
plt.show()


In [None]:
plt.figure()
sns.boxplot(
    x="experience_level",
    y=TARGET,
    data=df,
    order=["EN", "MI", "SE", "EX"]
)
plt.title("Salary by Experience Level")
plt.xlabel("Experience Level")
plt.ylabel("Salary (USD)")
plt.show()


In [None]:
plt.figure()
sns.boxplot(
    x="company_size",
    y=TARGET,
    data=df,
    order=["S", "M", "L"]
)
plt.title("Salary by Company Size")
plt.xlabel("Company Size")
plt.ylabel("Salary (USD)")
plt.show()


In [None]:
plt.figure()
sns.boxplot(
    x="remote_ratio",
    y=TARGET,
    data=df
)
plt.title("Salary vs Remote Ratio")
plt.xlabel("Remote Ratio (%)")
plt.ylabel("Salary (USD)")
plt.show()


In [None]:
top_titles = df["job_title"].value_counts().head(10).index

plt.figure()
sns.boxplot(
    y="job_title",
    x=TARGET,
    data=df[df["job_title"].isin(top_titles)]
)
plt.title("Salary Distribution for Top 10 Job Titles")
plt.xlabel("Salary (USD)")
plt.ylabel("Job Title")
plt.show()


In [None]:
features = [
    "experience_level",
    "employment_type",
    "job_title",
    "company_size",
    "remote_ratio",
    "company_location"
]

X = df[features]
y = df[TARGET]


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)


In [None]:
categorical_features = X.select_dtypes(include="object").columns
numeric_features = X.select_dtypes(exclude="object").columns

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)

model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", LinearRegression())
    ]
)


In [None]:
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

rmse, mae, r2


In [None]:
plt.figure()
sns.scatterplot(x=y_test, y=y_pred, alpha=0.3)
plt.plot([y_test.min(), y_test.max()],
         [y_test.min(), y_test.max()],
         linestyle="--")
plt.xlabel("Actual Salary")
plt.ylabel("Predicted Salary")
plt.title("Actual vs Predicted Salary")
plt.show()
