# QCEW Data Exploration

This notebook loads QCEW CSV files and examines the data structure, columns, and data types.

In [3]:
import pandas as pd
import os
from pathlib import Path
import sys
sys.path.append('../src')
from data_download import main as download_data

# Set up data directory
data_dir = Path('../data/raw')

# Check if data exists, if not, download
csv_files = list(data_dir.glob('*.csv'))
if not csv_files:
    print("No CSV files found. Downloading data...")
    download_data()
    csv_files = list(data_dir.glob('*.csv'))

# List all CSV files
print(f"Found {len(csv_files)} CSV files:")
for file in sorted(csv_files):
    print(f"  {file.name}")

# Load the first file to examine structure
if csv_files:
    sample_file = sorted(csv_files)[0]
    print(f"\nLoading sample file: {sample_file.name}")
    df = pd.read_csv(sample_file)
    
    print(f"\nData shape: {df.shape}")
    print(f"\nColumns: {list(df.columns)}")
    print(f"\nData types:\n{df.dtypes}")
    
    print(f"\nFirst 5 rows:")
    display(df.head())
    
    print(f"\nSummary statistics:")
    display(df.describe())
else:
    print("No CSV files found!")

Found 20 CSV files:
  qcew_06000_2020_q1.csv
  qcew_06000_2020_q2.csv
  qcew_06000_2020_q3.csv
  qcew_06000_2020_q4.csv
  qcew_06000_2021_q1.csv
  qcew_06000_2021_q2.csv
  qcew_06000_2021_q3.csv
  qcew_06000_2021_q4.csv
  qcew_06000_2022_q1.csv
  qcew_06000_2022_q2.csv
  qcew_06000_2022_q3.csv
  qcew_06000_2022_q4.csv
  qcew_06000_2023_q1.csv
  qcew_06000_2023_q2.csv
  qcew_06000_2023_q3.csv
  qcew_06000_2023_q4.csv
  qcew_06000_2024_q1.csv
  qcew_06000_2024_q2.csv
  qcew_06000_2024_q3.csv
  qcew_06000_2024_q4.csv

Loading sample file: qcew_06000_2020_q1.csv

Data shape: (3199, 42)

Columns: ['area_fips', 'own_code', 'industry_code', 'agglvl_code', 'size_code', 'year', 'qtr', 'disclosure_code', 'qtrly_estabs', 'month1_emplvl', 'month2_emplvl', 'month3_emplvl', 'total_qtrly_wages', 'taxable_qtrly_wages', 'qtrly_contributions', 'avg_wkly_wage', 'lq_disclosure_code', 'lq_qtrly_estabs', 'lq_month1_emplvl', 'lq_month2_emplvl', 'lq_month3_emplvl', 'lq_total_qtrly_wages', 'lq_taxable_qtrly_wa

Unnamed: 0,area_fips,own_code,industry_code,agglvl_code,size_code,year,qtr,disclosure_code,qtrly_estabs,month1_emplvl,...,oty_month3_emplvl_chg,oty_month3_emplvl_pct_chg,oty_total_qtrly_wages_chg,oty_total_qtrly_wages_pct_chg,oty_taxable_qtrly_wages_chg,oty_taxable_qtrly_wages_pct_chg,oty_qtrly_contributions_chg,oty_qtrly_contributions_pct_chg,oty_avg_wkly_wage_chg,oty_avg_wkly_wage_pct_chg
0,6000,0,10,50,0,2020,1,,1604765,17609773,...,120120,0.7,18441242800,5.8,2458278545,2.9,72163997,2.0,58,4.1
1,6000,1,10,51,0,2020,1,,3340,248885,...,5807,2.4,177793902,3.6,0,0.0,0,0.0,24,1.6
2,6000,1,101,52,0,2020,1,,1,3,...,2,200.0,43851,146.8,0,0.0,0,0.0,-407,-17.7
3,6000,1,1013,53,0,2020,1,,1,3,...,2,200.0,43851,146.8,0,0.0,0,0.0,-407,-17.7
4,6000,1,102,52,0,2020,1,,3339,248882,...,5805,2.4,177750051,3.6,0,0.0,0,0.0,24,1.6



Summary statistics:


Unnamed: 0,area_fips,own_code,agglvl_code,size_code,year,qtr,qtrly_estabs,month1_emplvl,month2_emplvl,month3_emplvl,...,oty_month3_emplvl_chg,oty_month3_emplvl_pct_chg,oty_total_qtrly_wages_chg,oty_total_qtrly_wages_pct_chg,oty_taxable_qtrly_wages_chg,oty_taxable_qtrly_wages_pct_chg,oty_qtrly_contributions_chg,oty_qtrly_contributions_pct_chg,oty_avg_wkly_wage_chg,oty_avg_wkly_wage_pct_chg
count,3199.0,3199.0,3199.0,3199.0,3199.0,3199.0,3199.0,3199.0,3199.0,3199.0,...,3199.0,3199.0,3199.0,3199.0,3199.0,3199.0,3199.0,3199.0,3199.0,3199.0
mean,6000.0,4.0794,57.036261,0.0,2020.0,1.0,4526.205,49970.75,50384.83,49796.85,...,341.082213,1.786027,52530640.0,5.87121,6909198.0,5.707721,189602.6,2.089997,43.964051,3.859206
std,0.0,1.406435,1.345527,0.0,0.0,0.0,53890.74,494638.7,498781.1,492212.9,...,4725.360532,30.465697,540405100.0,24.485398,76450660.0,92.147669,3884892.0,64.530005,319.158286,14.519247
min,6000.0,0.0,50.0,0.0,2020.0,1.0,0.0,0.0,0.0,0.0,...,-45348.0,-100.0,-1042577000.0,-100.0,-61480200.0,-98.4,-16132720.0,-98.9,-8718.0,-100.0
25%,6000.0,3.0,56.0,0.0,2020.0,1.0,19.0,306.5,301.0,296.0,...,-76.0,-2.6,0.0,0.0,-29618.5,-0.4,-69740.0,-5.7,0.0,0.0
50%,6000.0,5.0,57.0,0.0,2020.0,1.0,135.0,2948.0,2993.0,2926.0,...,0.0,0.0,1013425.0,3.6,0.0,0.0,0.0,0.0,33.0,3.1
75%,6000.0,5.0,58.0,0.0,2020.0,1.0,819.0,14859.0,15028.5,14666.0,...,122.0,2.8,11734690.0,8.2,1175854.0,4.0,224.0,0.1,72.0,6.0
max,6000.0,8.0,96.0,0.0,2020.0,1.0,1604765.0,17609770.0,17756210.0,17545680.0,...,120120.0,600.0,18441240000.0,387.4,2458279000.0,3585.0,80556780.0,2086.9,6134.0,315.6
