---
title: "Multiple Linear Regression - Salary Prediction"
subtitle: ""

bibliography: references.bib
csl: csl/econometrica.csl
format: 
  html:
    toc: true
    number-sections: true
    df-print: paged
    code: false
    code-tools: true
    section-divs: true
---




# Multiple Linear Regression


In [None]:
#| echo: false
import os, json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import plotly.graph_objects as go

In [None]:
#| echo: false
# Read the cleaned job-postings table. The path is relative, so it is
# resolved against the current working directory.
postings_csv = 'files/cleaned_job_postings.csv'
df = pd.read_csv(postings_csv)

In [None]:
#| echo: false
# Preview the first five rows (five is also head()'s default).
df.head(n=5)

In [None]:
#| echo: false
#df.columns

In [None]:
# --- Feature engineering on the raw postings frame -----------------------
# These columns are added to `df` in place; later cells read them via df_model.

# Midpoint of the posted experience range. mean(axis=1) skips NaN, so a
# posting with only one bound uses that bound; both missing yields NaN.
df["exp_mid"] = df[["MIN_YEARS_EXPERIENCE", "MAX_YEARS_EXPERIENCE"]].mean(axis=1)

# Skill count is estimated as (number of commas + 1).
# A missing or blank skills field must count as 0 skills, not 1: without the
# .where() guard an empty string has zero commas and would be scored as 1.
# NOTE(review): assumes skills are comma-separated and that skill names do
# not themselves contain commas -- confirm against the cleaned CSV.
skills_text = df["SPECIALIZED_SKILLS_NAME"].fillna("").str.strip()
df["skill_count"] = (skills_text.str.count(",") + 1).where(skills_text != "", 0)

# 1 when the skills text mentions "Python" (case-insensitive substring), else 0.
df["has_python"] = df["SPECIALIZED_SKILLS_NAME"].str.contains("Python", case=False, na=False).astype(int)

# 1 when the minimum education level is exactly one of the listed labels.
# NOTE(review): isin() is an exact, case-sensitive match -- verify the labels
# match the values actually present in MIN_EDULEVELS_NAME.
df["edu_ge_bachelors"] = df["MIN_EDULEVELS_NAME"].isin(
    ["Bachelor's Degree", "Master's Degree", "Doctoral Degree"]
).astype(int)

# Numeric predictors passed to the models unchanged.
keep_num  = ["exp_mid", "MODELED_DURATION", "skill_count",
             "has_python", "edu_ge_bachelors"]

# Categorical predictors; these are one-hot encoded in a later cell.
keep_cat  = ["EMPLOYMENT_TYPE_NAME", "REMOTE_TYPE_NAME",
             "STATE_NAME",
             "SOC_2021_4_NAME"]

# Modelling frame: rows with a known SALARY, restricted to predictors + target.
df_model = (
    df.dropna(subset=["SALARY"])
      .loc[:, keep_num + keep_cat + ["SALARY"]]
)

In [None]:
# One-hot encode the categorical predictors. drop_first=True removes one
# level per category (avoiding perfectly collinear dummies) and dtype=float
# makes the indicator columns floats.
df_dummies = pd.get_dummies(df_model, columns=keep_cat, drop_first=True, dtype=float)

In [None]:
#| echo: false
# Sanity check of the encoded frame: overall shape, then the dtypes of the
# first ten columns, each on its own line.
print(df_dummies.shape, df_dummies.dtypes.head(10), sep="\n")

In [None]:
# Target is SALARY; the feature matrix is every other column of df_dummies.
y = df_dummies['SALARY']
X = df_dummies.drop(columns=['SALARY'])

# Hold out 30% of the rows for evaluation; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=688
)

In [None]:
# Fit ordinary least squares on the training split. The fit call is the
# cell's last expression, so the fitted estimator is displayed.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Predict salaries for the held-out rows, then show summary statistics of
# the predictions as a quick plausibility check.
y_pred = model.predict(X_test)
pred_summary = pd.Series(y_pred).describe()
pred_summary

In [None]:
# Evaluate the linear model on the held-out split.
# mean_squared_error returns the mean *squared* error, so the square root is
# taken to make the value reported under the "RMSE" label a true RMSE
# (expressed in the same units as SALARY).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"R-squared: {r2:.4f}")

In [None]:
# Pair each feature name with its fitted coefficient, ordered from the most
# positive to the most negative coefficient.
coef_df = pd.DataFrame({"Feature": X.columns, "Coefficient": model.coef_})
coef_df = coef_df.sort_values("Coefficient", ascending=False)

# Ten largest positive coefficients.
coef_df.head(10)

In [None]:
# Drop features whose name contains "Unknown" or the literal text "[None]".
# The pattern is a raw string so that `\[` reaches the regex engine as an
# escaped bracket instead of being an invalid Python string escape (which
# emits a DeprecationWarning/SyntaxWarning on modern Python versions).
coef_cleaned = coef_df[~coef_df['Feature'].str.contains(r"Unknown|\[None\]", regex=True)]

# Ten largest remaining positive coefficients.
coef_cleaned.head(10)

# Random Forest


In [None]:
# Linear-regression baseline fitted on the same training split
# (fit() returns the estimator itself).
lm = LinearRegression()
lm = lm.fit(X_train, y_train)

In [None]:
# Random forest with 300 trees; the seed is fixed for reproducible fits
# (fit() returns the estimator itself).
rf = RandomForestRegressor(n_estimators=300, random_state=688)
rf = rf.fit(X_train, y_train)

# Visualization 


In [None]:
#| echo: false
# Directory that exported figure HTML files are written to.
figures_folder = "figures"
# exist_ok=True makes creation idempotent and avoids the check-then-create
# race of testing os.path.exists() first. If the path exists but is not a
# directory this raises immediately instead of failing later on write.
os.makedirs(figures_folder, exist_ok=True)

In [None]:
# 1. Coefficient bar chart
#coef = pd.Series(lm.coef_, index=X_train.columns).sort_values()
#fig = coef.tail(15).plot(kind="barh", figsize=(6,5))
#plt.title("Top 15 Positive MLR Coefficients")
#plt.tight_layout()

#fig.write_html(os.path.join(figures_folder, "MLR_Coefficients.html"))

# Unsupervised Learning - Kmeans 


In [None]:
# Standardise every column of df_dummies before clustering.
# NOTE(review): df_dummies still contains SALARY, so salary is part of the
# clustering input -- confirm this is intended.
scaler = StandardScaler()
X_scaled = scaler.fit(df_dummies).transform(df_dummies)

In [None]:
# Within-cluster sum of squares (KMeans inertia) for k = 2..10, collected
# in the same order as K_range for the elbow plot.
K_range = range(2, 11)
wcss = []

for k in K_range:
    km = KMeans(n_clusters=k, n_init=10, random_state=688).fit(X_scaled)
    wcss += [km.inertia_]

In [None]:
# Elbow plot: WCSS against the number of clusters, saved as a standalone
# HTML file inside figures_folder.
elbow_trace = go.Scatter(
    x=list(K_range),
    y=wcss,
    mode='lines+markers',
    marker={"size": 8},
    line={"width": 2},
    name='WCSS',
)

fig = go.Figure(data=[elbow_trace])
fig.update_layout(
    title='Elbow Method - Within-Cluster Sum of Squares',
    xaxis_title='Number of Clusters (k)',
    yaxis_title='WCSS',
    template='plotly_white',
    width=800,
    height=500,
)

elbow_html_path = os.path.join(figures_folder, "Elbow_Chart.html")
fig.write_html(elbow_html_path)

<iframe src="figures/Elbow_Chart.html" width="100%" height="500"></iframe>


In [None]:
# From the elbow Plot, we take k = 4
kmeans = KMeans(n_clusters=4, n_init=10, random_state=688)

# One label per row of X_scaled, i.e. per row of df_dummies (same order).
cluster_labels = kmeans.fit_predict(X_scaled)

# df_dummies only holds the rows of df that have a SALARY, so the labels are
# attached by index rather than by position. A plain positional assignment
# raises a length-mismatch error as soon as any row was dropped. Rows of df
# that were not clustered get <NA>; the nullable "Int64" dtype keeps the
# labels as integers even when such gaps exist.
df['cluster'] = pd.Series(cluster_labels, index=df_dummies.index, dtype="Int64")

In [None]:
# Number of postings assigned to each cluster, ordered by cluster label.
cluster_sizes = df['cluster'].value_counts().sort_index()
print(cluster_sizes)