In [None]:
import pandas as pd

# Function to load and process the dictionaries
def load_dictionaries(supersector_dict_path, industry_dict_path, datatype_dict_path, period_dict_path):
    """Load the four tab-separated lookup files into code -> name dictionaries.

    Args:
        supersector_dict_path: Path to the supersector lookup file
            (key in column 0, name in column 1).
        industry_dict_path: Path to the industry lookup file
            (key in column 0, name in column 3).
        datatype_dict_path: Path to the data-type lookup file
            (key in column 0, name in column 1).
        period_dict_path: Path to the period lookup file
            (key in column 0, name in column 2).

    Returns:
        A tuple ``(supersector_dict, industry_dict, datatype_dict,
        period_dict)`` of plain dictionaries with whitespace-stripped keys
        and values.
    """
    # Helper function to clean and load dictionary files
    def load_clean_dict(file_path, key_col, value_col):
        # dtype=str keeps codes such as "05" verbatim (no integer inference
        # that would drop leading zeros) and guarantees the .str accessor
        # works whether or not the file starts with a text header row.
        raw = pd.read_csv(file_path, sep='\t', header=None,
                          usecols=[key_col, value_col], dtype=str)
        # Select by column label rather than position: usecols returns the
        # columns in file order, not in the order they were requested.
        keys = raw[key_col].str.strip()
        values = raw[value_col].str.strip()
        # NOTE(review): because header=None, a header line in the file (if
        # any) ends up as one extra, harmless entry in the dictionary.
        return dict(zip(keys, values))

    # Load all dictionaries
    supersector_dict = load_clean_dict(supersector_dict_path, 0, 1)
    industry_dict = load_clean_dict(industry_dict_path, 0, 3)
    datatype_dict = load_clean_dict(datatype_dict_path, 0, 1)
    period_dict = load_clean_dict(period_dict_path, 0, 2)

    return supersector_dict, industry_dict, datatype_dict, period_dict


# Function to process the series file
def process_series_file(series_file, supersector_dict, industry_dict, datatype_dict):
    """Read the series definition file and attach human-readable names.

    Args:
        series_file: Path to the tab-separated series file. It must contain
            the columns ``series_id``, ``supersector_code``,
            ``industry_code`` and ``data_type_code`` (surrounding whitespace
            in the header names is ignored).
        supersector_dict: Mapping of 2-character supersector code -> name.
        industry_dict: Mapping of 8-character industry code -> name.
        datatype_dict: Mapping of 2-character data-type code -> name.

    Returns:
        A DataFrame with the columns ``series_id``, ``supersector_code``,
        ``supersector``, ``industry_code``, ``industry``,
        ``data_type_code`` and ``datatype``. Codes without a dictionary
        entry get a missing name.

    Raises:
        KeyError: If a required column is absent; a message is printed
            before the error is re-raised.
    """
    try:
        # Read every column as text. With type inference a single missing
        # code turns the whole column into floats ("5000000.0"), which
        # zero-padding cannot repair and which then fails to map for all rows.
        series_data = pd.read_csv(series_file, sep='\t', skipinitialspace=True, dtype=str)

        # Strip any whitespace characters from the column names
        series_data.columns = series_data.columns.str.strip()

        # Strip whitespace from the identifier and normalise code widths
        series_data['series_id'] = series_data['series_id'].str.strip()
        code_widths = (
            ('supersector_code', 2),
            ('industry_code', 8),
            ('data_type_code', 2),
        )
        for column, width in code_widths:
            series_data[column] = series_data[column].str.strip().str.zfill(width)

        # Replace placeholder '-' with NA
        series_data = series_data.replace('-', pd.NA)

        # Map dictionary values to their names
        series_data['supersector'] = series_data['supersector_code'].map(supersector_dict)
        series_data['industry'] = series_data['industry_code'].map(industry_dict)
        series_data['datatype'] = series_data['data_type_code'].map(datatype_dict)

        # Keep only the necessary columns
        series_data = series_data[['series_id',
                                   'supersector_code', 'supersector',
                                   'industry_code', 'industry',
                                   'data_type_code', 'datatype']]

        return series_data

    except KeyError as e:
        print(f"KeyError encountered: {e}. Check column names or data.")
        raise  # Re-raise the error after logging the message


# Function to process the AllEarnAE data file
def process_AllEarnAE_file(AllEarnAE_file, series_data, period_dict, start_year):
    """Load a tab-separated data file, filter it by year and label each row.

    Args:
        AllEarnAE_file: Path to the data file. It must contain the columns
            ``series_id``, ``year``, ``period`` and ``value`` (surrounding
            whitespace in the header names is ignored).
        series_data: DataFrame keyed by ``series_id`` that provides the
            ``supersector``, ``industry`` and ``datatype`` columns.
        period_dict: Mapping of period code -> period name.
        start_year: Rows whose year is below this value (or not numeric)
            are dropped.

    Returns:
        A DataFrame with the columns ``series_id``, ``year``, ``value``,
        ``supersector``, ``industry``, ``datatype`` and ``period``. Rows
        whose ``series_id`` is unknown to ``series_data`` are kept with
        missing labels (left join).

    Raises:
        KeyError: If a required column is absent; a message is printed
            before the error is re-raised.
    """
    try:
        # Read the data file
        data = pd.read_csv(AllEarnAE_file, sep='\t', engine='python')

        # Strip any whitespace characters from the column names
        data.columns = data.columns.str.strip()

        # Explicitly clean the series_id column for consistency
        data['series_id'] = data['series_id'].astype(str).str.strip()

        # Convert the year column to numeric; unparsable years become NaN and
        # are removed by the comparison below.
        data['year'] = pd.to_numeric(data['year'], errors='coerce')

        # Take an explicit copy of the filtered rows so the column assignment
        # below does not write into a slice of the original frame.
        data = data.loc[data['year'] >= start_year].copy()

        # Strip the period codes before the lookup, matching the stripped
        # dictionary keys and the treatment of series_id above.
        data['period'] = data['period'].astype(str).str.strip().map(period_dict)

        # Merge with series data based on series_id
        merged1_data = pd.merge(data, series_data, on='series_id', how='left')

        # Keep only the relevant columns
        final1_data = merged1_data[['series_id', 'year', 'value',
                                    'supersector', 'industry', 'datatype', 'period']]

        return final1_data

    except KeyError as e:
        print(f"KeyError encountered: {e}. Check column names or data.")
        raise  # Re-raise the error after logging the message

# Function to process the ...HrsEarn data files
def process_HrsEarn_file(input_file, series_data, period_dict, start_year):
    """Load a tab-separated data file, label each row and drop index series.

    Args:
        input_file: Path to the data file. It must contain the columns
            ``series_id``, ``year``, ``period`` and ``value`` (surrounding
            whitespace in the header names is ignored).
        series_data: DataFrame keyed by ``series_id`` that provides the
            ``supersector``, ``industry`` and ``datatype`` columns.
        period_dict: Mapping of period code -> period name.
        start_year: Rows whose year is below this value (or not numeric)
            are dropped.

    Returns:
        A DataFrame with the columns ``series_id``, ``year``, ``value``,
        ``supersector``, ``industry``, ``datatype`` and ``period``, without
        the rows whose ``datatype`` starts with ``"INDEXES"``. Rows with a
        missing ``datatype`` are kept.

    Raises:
        KeyError: If a required column is absent; a message is printed
            before the error is re-raised.
    """
    try:
        # Read the data file
        data = pd.read_csv(input_file, sep='\t', engine='python')

        # Strip any whitespace characters from the column names
        data.columns = data.columns.str.strip()

        # Explicitly clean the series_id column for consistency
        data['series_id'] = data['series_id'].astype(str).str.strip()

        # Convert the year column to numeric; unparsable years become NaN and
        # are removed by the comparison below.
        data['year'] = pd.to_numeric(data['year'], errors='coerce')

        # Take an explicit copy of the filtered rows so the column assignment
        # below does not write into a slice of the original frame.
        data = data.loc[data['year'] >= start_year].copy()

        # Strip the period codes before the lookup, matching the stripped
        # dictionary keys and the treatment of series_id above.
        data['period'] = data['period'].astype(str).str.strip().map(period_dict)

        # Merge with series data based on series_id
        merged_data = pd.merge(data, series_data, on='series_id', how='left')

        # Keep only the relevant columns
        final_data = merged_data[['series_id', 'year', 'value',
                                  'supersector', 'industry', 'datatype', 'period']]

        # Filter out rows where datatype starts with "INDEXES"; na=False keeps
        # rows whose datatype is missing instead of dropping them.
        is_index_series = final_data['datatype'].str.startswith("INDEXES", na=False)
        final_data = final_data[~is_index_series]

        return final_data

    except KeyError as e:
        print(f"KeyError encountered: {e}. Check column names or data.")
        raise

# Main function to execute the process
def main():
    """Run the whole pipeline: dictionaries, series file, then each data file.

    Every data file listed below is passed through ``process_HrsEarn_file``
    and written to its paired CSV. Any failure is reported on the console
    and stops the run at that point.
    """
    # Earliest year kept in the output, to decrease output file size;
    # change as needed (first available year is 2006)
    start_year = 2014

    # Lookup files, in the argument order expected by load_dictionaries
    dictionary_paths = (
        'dictionaries/ce.supersector.txt',
        'dictionaries/ce.industry.txt',
        'dictionaries/ce.datatype.txt',
        'dictionaries/ce.period.txt',
    )
    series_path = 'ce.series.txt'

    # (input data file, output csv file) pairs
    jobs = [
        ('ce.data.02b.AllRealEarningsAE.txt', 'ce_AllEarnAE_processed_data.csv'),
        ('ce.data.05b.TotalPrivate.AllEmployeeHoursAndEarnings.txt', 'ce_TotalPrivateAEHrsEarn_processed_data.csv'),
        ('ce.data.50b.Information.AllEmployeeHoursAndEarnings.txt', 'ce_InfoAEHrsEarn_processed_data.csv'),
        ('ce.data.55b.FinancialActivities.AllEmployeeHoursAndEarnings.txt', 'ce_FinlAEHrsEarn_processed_data.csv'),
        ('ce.data.60b.ProfessionalBusinessServices.AllEmployeeHoursAndEarnings.txt', 'ce_ProfAEHrsEarn_processed_data.csv'),
        ('ce.data.65b.EducationAndHealthCare.AllEmployeeHoursAndEarnings.txt', 'ce_EducHlthAEHrsEarn_processed_data.csv'),
        ('ce.data.70b.LeisureAndHospitality.AllEmployeeHoursAndEarnings.txt', 'ce_LeisAEHrsEarn_processed_data.csv'),
        ('ce.data.80b.OtherServices.AllEmployeeHoursAndEarnings.txt', 'ce_OtherAEHrsEarn_processed_data.csv'),
    ]

    # Step 1: lookup dictionaries (nothing else can run without them)
    print("Loading dictionaries...")
    try:
        lookups = load_dictionaries(*dictionary_paths)
        supersector_dict, industry_dict, datatype_dict, period_dict = lookups
    except Exception as err:
        print(f"Error loading dictionaries: {err}")
        return

    # Step 2: series definitions with their names attached
    print("Processing series file...")
    try:
        series_data = process_series_file(series_path, supersector_dict, industry_dict, datatype_dict)
    except Exception as err:
        print(f"Error processing series file: {err}")
        return

    # Step 3: convert each data file, write the csv and report its row count
    for source_name, target_name in jobs:
        print(f"Processing {source_name}...")
        try:
            processed = process_HrsEarn_file(source_name, series_data, period_dict, start_year)
            processed.to_csv(target_name, index=False)
            row_count = len(processed)
            print(f"Saved processed data to {target_name}. Number of rows: {row_count}")
        except Exception as err:
            print(f"Error processing {source_name}: {err}")
            return

# Run the pipeline only when this file is executed directly, not when imported
if __name__ == "__main__":
    main()


Loading dictionaries...
Processing series file...
Processing ce.data.02b.AllRealEarningsAE.txt...
Saved processed data to ce_AllEarnAE_processed_data.csv. Number of rows: 302992
Processing ce.data.05b.TotalPrivate.AllEmployeeHoursAndEarnings.txt...
Saved processed data to ce_TotalPrivateAEHrsEarn_processed_data.csv. Number of rows: 5713
Processing ce.data.50b.Information.AllEmployeeHoursAndEarnings.txt...
Saved processed data to ce_InfoAEHrsEarn_processed_data.csv. Number of rows: 37534
Processing ce.data.55b.FinancialActivities.AllEmployeeHoursAndEarnings.txt...
Saved processed data to ce_FinlAEHrsEarn_processed_data.csv. Number of rows: 101318
Processing ce.data.60b.ProfessionalBusinessServices.AllEmployeeHoursAndEarnings.txt...
Saved processed data to ce_ProfAEHrsEarn_processed_data.csv. Number of rows: 135086
Processing ce.data.65b.EducationAndHealthCare.AllEmployeeHoursAndEarnings.txt...
Saved processed data to ce_EducHlthAEHrsEarn_processed_data.csv. Number of rows: 91938
Process