# Analysis (Cleaned)

Replicates the visualizations from `analysis.ipynb` using the real-world datasets generated by `data_loading_script.ipynb` (cached under `data_cache/`).

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Notebook-wide matplotlib defaults, grouped by rc namespace.
# Large canvas; on-screen dpi 300, saved-file dpi 400.
plt.rc("figure", figsize=(16, 10), dpi=300)
plt.rc("savefig", dpi=400)
# Grid on every axes, with enlarged title and axis-label fonts.
plt.rc("axes", grid=True, titlesize=18, labelsize=14)
# Tick and legend font sizes.
plt.rc("xtick", labelsize=12)
plt.rc("ytick", labelsize=12)
plt.rc("legend", fontsize=11)

DATA_DIR = Path("data_cache")

# Wage columns (one per occupation) expected in the cached CSV.
wage_cols = ["Biomedical Engineers", "Electrical Engineers", "Management Analysts", "Software Developers"]

# Load the cached metrics and give the metro / rent columns the names used for plotting.
master_df = pd.read_csv(DATA_DIR / "master_real_world_metrics.csv").rename(
    columns={
        "area_name": "Metro",
        "median_gross_rent": "Median_Rent",
    }
)

# Force wages and rent to numeric dtype; values that cannot be parsed become NaN.
for col in wage_cols + ["Median_Rent"]:
    master_df[col] = pd.to_numeric(master_df[col], errors="coerce")

# Short profession key -> wage column in master_df.
# The key is used as the prefix of every derived column below.
prof_map = dict(
    Engineering="Electrical Engineers",
    Biomed_Engineer="Biomedical Engineers",
    Tech="Software Developers",
    Finance="Management Analysts",
)

# Rent figures do not depend on the profession, so compute them once.
median_rent = master_df["Median_Rent"]
annual_rent = median_rent * 12

# For each profession add four columns:
#   <prof>_Salary             annual wage (copy of the wage column)
#   <prof>_Monthly_Salary     annual wage / 12
#   <prof>_Housing_Burden     median rent as a percentage of monthly salary
#   <prof>_Salary_Efficiency  annual wage / annual rent
for prof, wage_col in prof_map.items():
    annual_wage = master_df[wage_col]
    monthly_salary = annual_wage / 12
    master_df[f"{prof}_Salary"] = annual_wage
    master_df[f"{prof}_Monthly_Salary"] = monthly_salary
    master_df[f"{prof}_Housing_Burden"] = (median_rent / monthly_salary) * 100
    master_df[f"{prof}_Salary_Efficiency"] = annual_wage / annual_rent

master_df.head()


## Visualization 1: Salary efficiency by metro

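Ratio of annual wage to annual rent (higher is better). Each curve ranks metros from highest to lowest efficiency for one profession; selected metros are marked using their Engineering (Electrical Engineers) salary efficiency.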

In [None]:

# Metros to call out on the chart; names must match the `Metro` column exactly.
highlight_metros = [
    'Cleveland-Elyria, OH', 'San Francisco-Oakland-Berkeley, CA',
    'Seattle-Tacoma-Bellevue, WA', 'Boston-Cambridge-Newton, MA-NH',
    'Austin-Round Rock-Georgetown, TX', 'Pittsburgh, PA'
]
# Colour per profession, keyed by display label (profession key with '_' -> ' ').
colors = {
    'Engineering': '#FF6B35',
    'Biomed Engineer': '#4ECDC4',
    'Tech': '#F38181',
    'Finance': '#AA96DA'
}

fig, ax = plt.subplots(figsize=(16, 10))

# Each profession is drawn as its own descending curve, so the x coordinate of
# a point is the metro's rank *within that profession* (0 = most efficient),
# not its row position in master_df.
ranked_by_prof = {}
for prof in prof_map:
    eff_col = f"{prof}_Salary_Efficiency"
    subset = (
        master_df[['Metro', eff_col]]
        .dropna()
        .sort_values(eff_col, ascending=False)
        .reset_index(drop=True)  # index now equals the rank plotted on x
    )
    ranked_by_prof[prof] = subset
    label = prof.replace('_', ' ')
    ax.plot(subset.index, subset[eff_col], label=label, color=colors.get(label))

# Mark the highlighted metros on the Engineering curve. The x position must be
# looked up in the same ranked frame that produced the curve; using the
# master_df row index would place the markers off the line.
eng_col = 'Engineering_Salary_Efficiency'
eng_ranked = ranked_by_prof['Engineering']
for metro in highlight_metros:
    positions = eng_ranked.index[eng_ranked['Metro'] == metro]
    if len(positions) == 0:
        # Metro absent from the data or has no Engineering value: nothing to mark.
        continue
    idx = positions[0]
    eff = eng_ranked.loc[idx, eng_col]
    ax.scatter(idx, eff, color='black', zorder=5)
    ax.annotate(metro.split(',')[0], (idx, eff), textcoords="offset points", xytext=(0,10), ha='center')

# Ticks are hidden because rank positions differ per profession; the axis
# label explains what the horizontal direction means.
ax.set_xticks([])
ax.set_xlabel('Metros ranked by salary efficiency (highest first, per profession)')
ax.set_ylabel('Salary efficiency (annual wage / annual rent)')
ax.set_title('Metro salary efficiency across professions')
ax.legend()
plt.show()


## Visualization 2: Cost-adjusted salary rankings

Top 12 metros by salary efficiency (annual wage / annual rent) for each profession.

In [None]:

# Number of top-ranked metros shown in each panel.
TOP_N = 12

# One panel per profession, laid out on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(20, 12))
axes = axes.flatten()

for ax, prof in zip(axes, prof_map):
    eff_col = f"{prof}_Salary_Efficiency"
    top_cities = (
        master_df[['Metro', eff_col]]
        .dropna()
        .sort_values(eff_col, ascending=False)
        .head(TOP_N)
    )
    label = prof.replace('_', ' ')
    # seaborn >= 0.13 deprecates `palette` without `hue`; mapping the category
    # itself to `hue` keeps the per-bar gradient. dodge=False keeps bars
    # full-width on versions that would otherwise dodge by hue.
    sns.barplot(
        data=top_cities, y='Metro', x=eff_col,
        hue='Metro', dodge=False, palette='crest', ax=ax,
    )
    # The hue legend just repeats the y tick labels; drop it if one was drawn.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    ax.set_title(f"{label}: top salary efficiency")
    ax.set_xlabel('Salary efficiency (annual wage / annual rent)')
    ax.set_ylabel('')

plt.tight_layout()
plt.show()


## Visualization 3: Housing burden by profession for key metros

In [None]:

# Metros compared side by side; names must match the `Metro` column exactly.
comparison_metros = [
    'Cleveland-Elyria, OH',
    'Pittsburgh, PA',
    'Austin-Round Rock-Georgetown, TX',
    'Denver-Aurora-Lakewood, CO',
    'Boston-Cambridge-Newton, MA-NH',
    'Seattle-Tacoma-Bellevue, WA',
    'New York-Newark-Jersey City, NY-NJ-PA',
    'San Francisco-Oakland-Berkeley, CA',
]

# Keep only the comparison metros (copy so later edits cannot touch master_df).
subset = master_df.loc[master_df['Metro'].isin(comparison_metros)].copy()

# Reshape to long form: one record per (metro, profession) pair, emitted metro
# by metro with professions in prof_map order.
plot_rows = [
    {
        'Metro': row['Metro'],
        'Profession': prof.replace('_',' '),
        'Housing_Burden_Pct': row[f'{prof}_Housing_Burden'],
    }
    for _, row in subset.iterrows()
    for prof in prof_map
]
plot_df = pd.DataFrame(plot_rows)

# Grouped horizontal bars: one group per metro, one bar per profession,
# coloured by the `colors` mapping (keyed by profession display label).
plt.figure(figsize=(18, 10))
burden_ax = sns.barplot(
    data=plot_df,
    x='Housing_Burden_Pct',
    y='Metro',
    hue='Profession',
    palette=colors,
)
burden_ax.set_xlabel('Housing burden (% of salary to cover median rent)')
burden_ax.set_ylabel('Metro')
burden_ax.set_title('Rent burden comparison across professions')
burden_ax.legend(title='Profession')
plt.tight_layout()
plt.show()
