In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so every run regenerates the same synthetic data.
np.random.seed(42)

# Study design: number of counties, the month-start dates of 2024, and genders.
n_counties = 58
months = pd.date_range(start="2024-01-01", end="2024-12-01", freq="MS")
genders = ["Male", "Female"]

# County lookup table: one row per county.
# The two random draws are made in the same order the columns appear
# (region first, then urban) so the seeded RNG stream is consumed unchanged.
county_codes = range(1, n_counties + 1)
region_draw = np.random.choice(["North", "Central", "South"], size=n_counties)
urban_draw = np.random.choice([0, 1], size=n_counties)

counties = pd.DataFrame({
    "county_code": county_codes,
    "county_name": [f"County_{code}" for code in county_codes],
    "region": region_draw,
    "urban": urban_draw,
})

# --- Dataset 2: county_education_2024 ---
# One row per county: the county lookup columns plus three simulated measures.
# The draws are made in the order education_avg, college_share, median_income
# so the seeded RNG stream is consumed exactly as before.
education_avg_draw = np.random.normal(13.5, 1.2, n_counties).round(2)
college_share_draw = np.random.normal(0.35, 0.1, n_counties).clip(0, 1)  # keep within [0, 1]
median_income_draw = np.random.normal(60000, 12000, n_counties).round(0)

# assign() returns a new frame, so `counties` itself is left untouched.
education_data = counties.assign(
    education_avg=education_avg_draw,
    college_share=college_share_draw,
    median_income=median_income_draw,
)

# --- Dataset 1: county_wages_2024 (DAILY DATA) ---
# Builds one row per county x day x gender, except that one randomly chosen
# county has one randomly chosen month removed entirely.

# Generate all dates in 2024
all_dates = pd.date_range("2024-01-01", "2024-12-31", freq="D")

# Randomly select one month and one county to skip for data cleaning practice in STATA
missing_month = np.random.choice(months)
missing_county_code = np.random.choice(counties["county_code"])

# Resolve the skipped month's number once instead of on every loop iteration.
missing_month_number = pd.Timestamp(missing_month).month

rows = []
for _, c in counties.iterrows():
    county_code = c["county_code"]
    is_missing_county = county_code == missing_county_code

    # Education effect is a county-level constant, so it is looked up once per
    # county rather than once per generated row (this lookup draws no random
    # numbers, so hoisting it leaves the generated values unchanged).
    #
    # Motivation: the Mincer equation suggests each additional year of
    # education yields a rate of return to schooling of about 5–8% per year,
    # ranging from a low of 1% to more than 20% in some countries
    # (see References [2]). Here this is simplified to: counties with more
    # educated populations have slightly higher wages, roughly +$0.8 per
    # additional year of average schooling above 12.
    education_effect = (education_data.loc[
        education_data["county_code"] == county_code, "education_avg"
    ].values[0] - 12) * 0.8

    for date in all_dates:
        # Skip the entire randomly selected month for the missing county
        if is_missing_county and date.month == missing_month_number:
            continue

        date_str = date.strftime("%Y-%m-%d")

        for g in genders:
            base_wage = np.random.normal(30, 5)

            # Gender gap: women are assigned $3 less than men, following
            # existing literature (see References [1]).
            gender_effect = -3 if g == "Female" else 0

            # The order of the random draws below (wage noise, hours,
            # employment rate, population) must not change, otherwise the
            # seeded output differs from previous runs.
            wage = np.round(base_wage + gender_effect + education_effect + np.random.normal(0, 2), 2)
            hours = np.round(np.random.normal(38, 4), 1)
            emp_rate = np.clip(np.random.normal(0.9, 0.05), 0, 1)
            # NOTE(review): population is redrawn for every county/day/gender
            # row, so a county's population differs from row to row. Left
            # as-is to keep the generated values unchanged; confirm intended.
            population = np.random.randint(200000, 1500000)
            rows.append([
                county_code, c["county_name"], date_str,
                g, wage, hours, emp_rate, population, c["urban"]
            ])

wages_data = pd.DataFrame(rows, columns=[
    "county_code", "county_name", "date", "gender", "wage",
    "hours", "employment_rate", "population", "urban"
])

# --- Validate, then save as CSVs ---
# Every county contributes one row per day per gender, except the missing
# county, which loses every date whose month number matches the missing month
# (the same rule the generation loop applies). Check the count before writing
# so a malformed dataset is never exported.
skipped_days = int((all_dates.month == pd.Timestamp(missing_month).month).sum())
expected_rows = (n_counties * len(all_dates) - skipped_days) * len(genders)
assert len(wages_data) == expected_rows, (
    f"Wage dataset has {len(wages_data)} rows, expected {expected_rows}"
)

wages_data.to_csv("county_wages_2024.csv", index=False)
education_data.to_csv("county_education_2024.csv", index=False)

# Summary for the reader; the printed text is unchanged.
print("Files saved as county_wages_2024.csv and county_education_2024.csv")
print("Wage dataset shape:", wages_data.shape)
print("Education dataset shape:", education_data.shape)
print(f"\nExpected rows: {n_counties} counties × 366 days (2024 is a leap year) × {len(genders)} genders - missing month")
print(f"Actual rows: {len(wages_data)}")

Files saved as county_wages_2024.csv and county_education_2024.csv
Wage dataset shape: (42394, 9)
Education dataset shape: (58, 7)

Expected rows: 58 counties × 366 days (2024 is a leap year) × 2 genders - missing month
Actual rows: 42394
