In [5]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the income table from table_4a.csv
df = pd.read_csv("./code_and_data/data/excel/table_4a.csv")


def _columns_for(prefix):
    """Return the 'year'/'age' id columns followed by every df column starting with *prefix*."""
    return ['year', 'age'] + [name for name in df.columns if name.startswith(prefix)]


# Column lists for each income type (id columns first, then the prefixed statistics)
total_inc_cols = _columns_for('total_inc_')
w2_inc_cols = _columns_for('w2_inc_')
nw_inc_cols = _columns_for('nw_inc_')

# Wide per-income-type views of the table
total_inc_df = df[total_inc_cols]
w2_inc_df = df[w2_inc_cols]
nw_inc_df = df[nw_inc_cols]

# Long format: the former column name goes into 'percentile', its value into 'income'
total_inc_df_melted = total_inc_df.melt(
    id_vars=['year', 'age'], var_name='percentile', value_name='income'
)
w2_inc_df_melted = w2_inc_df.melt(
    id_vars=['year', 'age'], var_name='percentile', value_name='income'
)
nw_inc_df_melted = nw_inc_df.melt(
    id_vars=['year', 'age'], var_name='percentile', value_name='income'
)

# Create a function to plot income distributions by age
def plot_income_dist_by_age(df, income_type, ages):
    """Plot the p10/p50/p90/mean income values for each age, one panel per age.

    Args:
        df: Long-format frame with 'age', 'percentile' and 'income' columns.
            'percentile' may hold either short labels ('p10', 'mean') or the
            full source column names produced by melting
            ('total_inc_p10', 'w2_inc_mean', ...).
        income_type: Text used in each panel title (e.g. "Total").
        ages: Sequence of ages; one vertically stacked subplot is drawn per age.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    wanted = ['p10', 'p50', 'p90', 'mean']

    # Melting keeps the full column name (e.g. 'total_inc_p10') as the
    # 'percentile' value, so compare on the suffix after the last underscore.
    # Values without an underscore pass through unchanged.
    labels = df['percentile'].astype(str).str.rsplit('_', n=1).str[-1]
    keep = labels.isin(wanted)

    # squeeze=False always yields a 2-D array of Axes, so a single age works too.
    fig, axs = plt.subplots(
        len(ages), 1, figsize=(10, 6*len(ages)), sharex=True, squeeze=False
    )
    axs = axs[:, 0]

    for ax, age in zip(axs, ages):
        mask = (df['age'] == age) & keep
        ax.plot(labels[mask], df.loc[mask, 'income'], marker='o')
        ax.set_title(f"{income_type} Income Distribution at Age {age}")
        ax.set_ylabel("Income (2012 USD)")
    axs[-1].set_xlabel("Percentile")
    plt.tight_layout()
    plt.show()

# Draw the per-age distribution panels for a few selected ages,
# one figure per income type
ages = [30, 40, 50, 60]
for melted_frame, type_label in (
    (total_inc_df_melted, "Total"),
    (w2_inc_df_melted, "Wage"),
    (nw_inc_df_melted, "Non-Wage"),
):
    plot_income_dist_by_age(melted_frame, type_label, ages)

# Average the *_mean column within each age for every income type
mean_total_inc_by_age = (
    total_inc_df.groupby('age')['total_inc_mean'].mean().reset_index()
)
mean_w2_inc_by_age = (
    w2_inc_df.groupby('age')['w2_inc_mean'].mean().reset_index()
)
mean_nw_inc_by_age = (
    nw_inc_df.groupby('age')['nw_inc_mean'].mean().reset_index()
)

# Overlay the three mean-income curves on a single figure
fig, ax = plt.subplots(figsize=(10, 6))
for by_age, value_col, curve_label in (
    (mean_total_inc_by_age, 'total_inc_mean', 'Total Income'),
    (mean_w2_inc_by_age, 'w2_inc_mean', 'Wage Income'),
    (mean_nw_inc_by_age, 'nw_inc_mean', 'Non-Wage Income'),
):
    ax.plot(by_age['age'], by_age[value_col], label=curve_label)
ax.set_xlabel("Age")
ax.set_ylabel("Mean Income (2012 USD)")
ax.set_title("Mean Income by Age and Income Type")
ax.legend()
plt.show()

Index(['total_inc_p10', 'total_inc_p20', 'total_inc_p30', 'total_inc_p40',
       'total_inc_p50', 'total_inc_p60', 'total_inc_p70', 'total_inc_p80',
       'total_inc_p90', 'total_inc_p99', 'total_inc_mean'],
      dtype='object')


KeyError: "The following 'id_vars' are not present in the DataFrame: ['age', 'year']"