In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Input locations, relative to the notebook's working directory.
RAW_CSV = "../raw_data/csv/eqls_2007and2011.csv"
PROCESSED_CSV = "../data/eqls_processed.csv"

# Load the raw file and its processed counterpart so they can be compared.
df_raw = pd.read_csv(RAW_CSV)
df_processed = pd.read_csv(PROCESSED_CSV)

In [None]:
# Size of the processed dataset as (rows, columns).
df_processed.shape

In [None]:
# Column labels available in the processed dataset.
df_processed.columns

In [None]:
# Preview only the first rows rather than rendering the whole frame;
# the full size is already reported by the `.shape` cell.
df_processed.head()

In [None]:
# Compare the distribution of employment status in the raw and processed data
# as a grouped bar chart, and save the figure as SVG.

# Display labels for the employment-status codes.
# NOTE(review): assumes 'Y11_EmploymentStatus' holds these numeric codes in both
# frames; if it held strings, every bar would silently be 0 after the reindex.
label_map = {
    1.0: "Employed",
    2.0: "Unemployed",
    3.0: "Unable to work",
    4.0: "Retired",
    5.0: "Homemaker",
    6.0: "Student",
    7.0: "Other"
}

# Proportion of each code in each dataset.
counts_raw = df_raw['Y11_EmploymentStatus'].value_counts(normalize=True)
counts_processed = df_processed['Y11_EmploymentStatus'].value_counts(normalize=True)

# Put both series on the same ordered set of codes so the bars line up.
# A code missing from one dataset gets proportion 0; codes that are not in
# label_map are left out of the plot.
all_idx = sorted(label_map.keys())
counts_raw = counts_raw.reindex(all_idx, fill_value=0)
counts_processed = counts_processed.reindex(all_idx, fill_value=0)

# Plot
plt.style.use("ggplot")
fig, ax = plt.subplots(figsize=(10, 6))

x = range(len(all_idx))
bar_width = 0.4

# Two bars per category, each shifted half a bar width either side of the tick.
ax.bar([i - bar_width/2 for i in x], counts_raw, width=bar_width, label="Raw", alpha=0.8)
ax.bar([i + bar_width/2 for i in x], counts_processed, width=bar_width, label="Processed", alpha=0.8)

ax.set_xticks(list(x))
ax.set_xticklabels([label_map[val] for val in all_idx])
# Title and x-axis label so the figure can be read on its own.
ax.set_xlabel("Employment status")
ax.set_ylabel("Proportion")
ax.set_title("Employment status: raw vs. processed data")
ax.legend()

fig.tight_layout()

# savefig does not create missing directories, so make sure the output folder
# exists before writing (otherwise a fresh checkout fails here).
fig_path = Path("../figs/employment_status_comparison.svg")
fig_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(fig_path, format="svg", bbox_inches="tight")