---
title: "ML Methods"
author:
  - name: Mahira Ayub
    affiliations:
      - id: bu
        name: Boston University
        city: Boston
        state: MA
  - name: Ava Godsy
    affiliations:
      - ref: bu
  - name: Joshua Lawrence
    affiliations:
      - ref: bu
date: today
format: 
  html:
    theme: minty
    bibliography: references.bib
    csl: csl/econometrica.csl
    toc: true
---

In [2]:
import pandas as pd
df_clean = pd.read_csv("lightcast_job_postings.csv")


  df_clean = pd.read_csv("lightcast_job_postings.csv")


In [5]:
df_clean.columns


Index(['ID', 'LAST_UPDATED_DATE', 'LAST_UPDATED_TIMESTAMP', 'DUPLICATES',
       'POSTED', 'EXPIRED', 'DURATION', 'SOURCE_TYPES', 'SOURCES', 'URL',
       ...
       'NAICS_2022_2', 'NAICS_2022_2_NAME', 'NAICS_2022_3',
       'NAICS_2022_3_NAME', 'NAICS_2022_4', 'NAICS_2022_4_NAME',
       'NAICS_2022_5', 'NAICS_2022_5_NAME', 'NAICS_2022_6',
       'NAICS_2022_6_NAME'],
      dtype='object', length=131)

In [7]:
# Show all columns that contain the word "state" (case-insensitive)
[col for col in df_clean.columns if 'state' in col.lower()]


['STATE', 'STATE_NAME']

In [11]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import plotly.graph_objects as go

# --------------------------------------------------
# 1. Load and clean the dataset
# --------------------------------------------------
df_clean = pd.read_csv("lightcast_job_postings.csv")  # adjust path if needed

# Convert salary-related columns to numeric (replace with actual salary column name)
# Try to find which column represents salary (SALARY, SALARY_FROM, AVERAGE_SALARY, etc.)
salary_cols = [col for col in df_clean.columns if 'salary' in col.lower()]
print("Salary-related columns found:", salary_cols)

# Choose one salary column
salary_col = salary_cols[0]  # use the first match (adjust manually if needed)

df_clean[salary_col] = pd.to_numeric(df_clean[salary_col], errors='coerce')

# Drop rows where salary or state name is missing
df_clean = df_clean.dropna(subset=[salary_col, 'STATE_NAME'])

# --------------------------------------------------
# 2. Compute average salary per state
# --------------------------------------------------
state_salary = df_clean.groupby('STATE_NAME')[salary_col].mean().reset_index()
state_salary.rename(columns={salary_col: 'AVERAGE_SALARY'}, inplace=True)

# --------------------------------------------------
# 3. Prepare data for clustering
# --------------------------------------------------
X = state_salary[['AVERAGE_SALARY']].copy()
# Add a numeric index for plotting purposes
state_salary['STATE_INDEX'] = np.arange(len(state_salary))

# --------------------------------------------------
# 4. Run KMeans clustering
# --------------------------------------------------
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
state_salary['CLUSTER'] = kmeans.fit_predict(X)
centroids = kmeans.cluster_centers_

# --------------------------------------------------
# 5. Visualization with Plotly
# --------------------------------------------------
colors = ['#3D7C6A', '#B14E53', '#297C8A']

fig = go.Figure()

# Add points for each cluster
for i in range(3):
    cluster_data = state_salary[state_salary['CLUSTER'] == i]
    fig.add_trace(go.Scatter(
        x=cluster_data['STATE_INDEX'],
        y=cluster_data['AVERAGE_SALARY'],
        mode='markers+text',
        name=f'Cluster {i+1}',
        text=cluster_data['STATE_NAME'],
        textposition="top center",
        marker=dict(color=colors[i], size=10, opacity=0.7),
        hovertemplate=(
            'State: %{text}<br>'
            'Average Salary: $%{y:,.0f}<br>'
            'Cluster: ' + str(i+1) + '<extra></extra>'
        )
    ))

# Add centroid marker
fig.add_trace(go.Scatter(
    x=np.arange(3),
    y=centroids.flatten(),
    mode='markers',
    name='Centroids',
    marker=dict(color='black', size=15, symbol='x', line=dict(width=2)),
    hovertemplate='Centroid Salary: $%{y:,.0f}<extra></extra>'
))

# --------------------------------------------------
# 6. Style the layout
# --------------------------------------------------
fig.update_layout(
    title=dict(text='KMeans Clustering by Average Salary per State', font=dict(size=18)),
    xaxis=dict(title='State Index (for plotting only)', tickfont=dict(size=12)),
    yaxis=dict(title='Average Salary ($)', tickfont=dict(size=12)),
    font=dict(family='Verdana', size=14),
    plot_bgcolor='#f8f9fa',
    paper_bgcolor='white',
    hovermode='closest'
)

fig.show()


  df_clean = pd.read_csv("lightcast_job_postings.csv")  # adjust path if needed


Salary-related columns found: ['SALARY', 'SALARY_TO', 'SALARY_FROM']


In [None]:
import plotly.graph_objects as go
from sklearn.cluster import KMeans
import numpy as np

# Prepare the data
X = df_clean[['STATE_INDEX', 'SALARY']].dropna()
soc_labels = df_clean.loc[X.index, 'SOC']

# Perform KMeans clustering
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X)
centroids = kmeans.cluster_centers_

# Define colors
colors = ['#3D7C6A', '#B14E53', '#297C8A']

# Create figure
fig = go.Figure()

# Add scatter plots for each cluster
for i in range(3):
    mask = clusters == i
    fig.add_trace(go.Scatter(
        x=X.loc[mask, 'STATE_NAME'],
        y=X.loc[mask, 'SALARY'],
        mode='markers',
        name=f'Cluster {i + 1}',
        marker=dict(
            color=colors[i],
            size=8,
            opacity=0.6
        ),
        text=[f'SOC: {soc}<br>State Index: {si:.1f}<br>Salary: ${sal:,.0f}' 
              for soc, si, sal in zip(soc_labels[mask], 
                                      X.loc[mask, 'STATE_NAME'], 
                                      X.loc[mask, 'SALARY'])],
        hovertemplate='%{text}<extra></extra>'
    ))

# Add centroids
fig.add_trace(go.Scatter(
    x=centroids[:, 0],
    y=centroids[:, 1],
    mode='markers',
    name='Centroids',
    marker=dict(
        color='black',
        size=15,
        symbol='x',
        line=dict(width=2)
    ),
    hovertemplate='Centroid<br>State : %{x:.1f}<br>Salary: $%{y:,.0f}<extra></extra>'
))

# Update layout
fig.update_layout(
    title=dict(
        text='KMeans Clustering: State Index vs Salary by SOC',
        font=dict(family='Verdana', size=18)
    ),
    xaxis=dict(
        title=dict(text='State Index', font=dict(family='Verdana', size=14)),
        tickfont=dict(family='Verdana', size=14)
    ),
    yaxis=dict(
        title=dict(text='Salary ($)', font=dict(family='Verdana', size=14)),
        tickfont=dict(family='Verdana', size=14)
    ),
    font=dict(family='Verdana', size=14),
    legend=dict(font=dict(family='Verdana', size=14)),
    hovermode='closest',
    plot_bgcolor='#f8f9fa',
    paper_bgcolor='white'
)

# Show the plot
fig.show()

KeyError: 'SOC'