In [2]:
import pandas as pd
import traceback

# Function to load and process the dictionaries
def load_dictionaries(ages_dict_path, class_dict_path, earn_dict_path, education_dict_path, fips_dict_path, indy_dict_path, lfst_dict_path, occupation_dict_path, pcts_dict_path, race_dict_path, seasonal_dict_path, sexs_dict_path, tdata_dict_path
):
    """Load thirteen tab-delimited lookup files into code -> label dictionaries.

    Each file is read without a header row; column 0 supplies the keys and
    column 1 the values. Keys and values are kept as text and stripped of
    surrounding whitespace.

    Args:
        ages_dict_path ... tdata_dict_path: Paths of the lookup files, one per
            code family.

    Returns:
        A 13-tuple of dictionaries in the same order as the parameters:
        (ages, class, earn, education, fips, indy, lfst, occupation, pcts,
        race, seasonal, sexs, tdata).

    Raises:
        FileNotFoundError: If any path does not exist.
        pandas.errors.ParserError: If a file cannot be parsed as tab-delimited.
        Exception: Any other error raised while loading is printed and re-raised.
    """

    def load_clean_dict(file_path, key_col, value_col):
        """Read one lookup file and return {stripped key: stripped value}."""
        try:
            # dtype=str stops pandas from turning codes such as "00" into the
            # integer 0 (which would later stringify as "0" and never match a
            # zero-padded code). keep_default_na=False keeps literal labels
            # like "N/A" or "None" instead of converting them to NaN.
            df = pd.read_csv(
                file_path,
                sep='\t',
                header=None,
                usecols=[key_col, value_col],
                dtype=str,
                keep_default_na=False,
            )
            # usecols does not honour the order it is given in, so pick the
            # key/value columns explicitly before renaming them.
            df = df[[key_col, value_col]]
            df.columns = ['key', 'value']
            df['key'] = df['key'].str.strip()
            df['value'] = df['value'].str.strip()
            return df.set_index('key')['value'].to_dict()
        except FileNotFoundError:
            print(f"FileNotFoundError: Could not find the file '{file_path}'. Please check the path.")
            print(traceback.format_exc())
            raise
        except pd.errors.ParserError:
            print(f"ParserError: Failed to parse the file '{file_path}'. Ensure it is tab-delimited and formatted correctly.")
            print(traceback.format_exc())
            raise
        except Exception as e:
            print(f"Unexpected error loading dictionary file '{file_path}': {e}")
            print(traceback.format_exc())
            raise

    # Order matters: callers unpack the result positionally.
    dictionary_paths = (
        ages_dict_path,
        class_dict_path,
        earn_dict_path,
        education_dict_path,
        fips_dict_path,
        indy_dict_path,
        lfst_dict_path,
        occupation_dict_path,
        pcts_dict_path,
        race_dict_path,
        seasonal_dict_path,
        sexs_dict_path,
        tdata_dict_path,
    )
    return tuple(load_clean_dict(path, 0, 1) for path in dictionary_paths)

# Function to process the series file
def process_series_file(series_file, ages_dict, class_dict, earn_dict, education_dict, fips_dict, indy_dict, lfst_dict, occupation_dict, pcts_dict, race_dict, seasonal_dict, sexs_dict, tdata_dict
):
    """Read the tab-delimited series file and decode its code columns.

    The file is narrowed to `series_id` plus the thirteen code columns. Every
    value is stripped, the '-' placeholder is turned into a missing value,
    codes are zero-padded to the width used for dictionary matching, and each
    code column is mapped through its dictionary into a new label column
    (`seasonal` is replaced in place by its label).

    Args:
        series_file: Path of the tab-delimited series file (with header row).
        ages_dict ... tdata_dict: Code -> label dictionaries, one per code
            column.

    Returns:
        The processed DataFrame, or None if required columns are missing or
        any error occurs (the error is printed, not raised).
    """
    # (code column, zero-pad width or None for no padding, output column, lookup)
    code_specs = [
        ('ages_code', 2, 'age', ages_dict),
        ('class_code', 2, 'class', class_dict),
        ('earn_code', 2, 'earnings_series', earn_dict),
        ('education_code', 2, 'education', education_dict),
        ('fips_code', 2, 'fips', fips_dict),
        ('indy_code', 4, 'industry', indy_dict),
        ('lfst_code', 2, 'labor_force_status', lfst_dict),
        ('occupation_code', 4, 'occupation', occupation_dict),
        ('pcts_code', 2, 'percentage', pcts_dict),
        ('race_code', 2, 'race', race_dict),
        ('seasonal', None, 'seasonal', seasonal_dict),
        ('sexs_code', 1, 'sex', sexs_dict),
        ('tdata_code', 2, 'data_type', tdata_dict),
    ]

    try:
        # Read everything as text: with inferred dtypes a single blank cell
        # turns an integer code column into floats ("0" -> "0.0"), which can
        # never match a dictionary key after padding.
        series_data = pd.read_csv(series_file, sep='\t', skipinitialspace=True, dtype=str)

        # Strip any whitespace characters from the column names
        series_data.columns = series_data.columns.str.strip()

        required_columns = ['series_id'] + [spec[0] for spec in code_specs]

        # Debugging: Check for missing columns before proceeding
        missing_columns = [col for col in required_columns if col not in series_data.columns]
        if missing_columns:
            print(f"Error: Missing columns in series file: {missing_columns}")
            print(f"Available columns: {list(series_data.columns)}")
            return None  # Return None explicitly if required columns are missing

        # Work on an independent copy so the assignments below never touch
        # (or warn about) a slice of the frame that was read from disk.
        series_data = series_data[required_columns].copy()

        # Strip surrounding whitespace from every retained value
        for column in required_columns:
            series_data[column] = series_data[column].str.strip()

        # Replace placeholder '-' with NA *before* padding; once zero-padded
        # the placeholder would no longer be equal to '-'.
        series_data = series_data.replace('-', pd.NA)

        # Force the number of digits the dictionaries use (restores leading 0s);
        # missing values stay missing.
        for code_column, width, _, _ in code_specs:
            if width is not None:
                series_data[code_column] = series_data[code_column].str.zfill(width)

        # Map dictionary values to their names
        for code_column, _, label_column, lookup in code_specs:
            series_data[label_column] = series_data[code_column].map(lookup)

        # Return processed DataFrame
        print("Series data processed successfully.")
        return series_data

    except Exception as e:
        print(f"Error in process_series_file: {e}")
        print(traceback.format_exc())
        return None

# Function to process the data file
def process_AllData_file(AllData_file, series_data, start_year):
    """Read the data file, filter it by year, and attach the decoded series labels.

    Args:
        AllData_file: Path of the tab-delimited data file. It must provide at
            least the `series_id`, `year`, `period` and `value` columns.
        series_data: DataFrame produced by `process_series_file`; joined on
            `series_id` with a left merge, so data rows without a matching
            series are kept with empty labels.
        start_year: Rows whose `year` is below this value (or not numeric) are
            dropped.

    Returns:
        A DataFrame with `series_id`, `year`, `period`, `value` and the
        thirteen label columns.

    Raises:
        TypeError: If `series_data` is None.
        KeyError: If an expected column is missing from the data file or from
            the merged result.
        Exception: Any other error is printed and re-raised.
    """
    if series_data is None:
        # The series step signals failure by returning None. Fail before the
        # (potentially large) data file is read, with a message that names the
        # real cause instead of a late merge error.
        raise TypeError(
            "series_data is None; the series file must be processed successfully before the data file."
        )

    # Tracked outside the try block so the KeyError handler can always report
    # the columns of the frame that was actually being accessed.
    data = None
    merged_data = None
    try:
        # Read the data file
        data = pd.read_csv(AllData_file, sep='\t', engine='python')

        # Strip any whitespace characters from the column names
        data.columns = data.columns.str.strip()

        # Explicitly clean the series_id column for consistency
        data['series_id'] = data['series_id'].astype(str).str.strip()

        # Convert the year column to numeric for filtering and filter rows based on the start_year
        data['year'] = pd.to_numeric(data['year'], errors='coerce')
        data = data[data['year'] >= start_year]

        # Merge with series data based on series_id
        merged_data = pd.merge(data, series_data, on='series_id', how='left')

        # Debugging: report data rows whose series_id has no series definition
        unmatched_ids = data[~data['series_id'].isin(series_data['series_id'])]
        if not unmatched_ids.empty:
            print(f"Warning: {len(unmatched_ids)} series_id values in the data file do not exist in the series file. Check these IDs:\n{unmatched_ids['series_id'].unique()}")

        # Keep only the relevant columns
        final_columns = ['series_id', 'year', 'period', 'value',
                         'earnings_series', 'age', 'class', 'education', 'fips',
                         'industry', 'labor_force_status', 'occupation',
                         'percentage', 'race', 'seasonal', 'sex', 'data_type']
        return merged_data[final_columns]

    except KeyError as e:
        # Report the columns of the most recent frame: the merged one if the
        # merge already happened, otherwise the raw data file.
        frame = merged_data if merged_data is not None else data
        available = list(frame.columns) if frame is not None else []
        print(f"KeyError: Missing column '{e.args[0]}'. Available columns: {available}")
        print(traceback.format_exc())
        raise
    except Exception as e:
        print(f"Unexpected error while processing AllData file: {e}")
        print(traceback.format_exc())
        raise

# Main function to execute the process
def main():
    """Run the pipeline: load dictionaries, decode the series file, export each data file.

    Progress and errors are printed. The function returns early (without
    raising) when a stage fails; only an exception outside the individual
    stages is re-raised.
    """
    try:
        # Define the start year for filtering to decrease output file size; change as needed (first available year: 1979)
        start_year = 2014

        # Dictionary file paths, in the positional order load_dictionaries expects
        dictionary_paths = [
            'dictionaries/le.ages.txt',
            'dictionaries/le.class.txt',
            'dictionaries/le.earn.txt',
            'dictionaries/le.education.txt',
            'dictionaries/le.fips.txt',
            'dictionaries/le.indy.txt',
            'dictionaries/le.lfst.txt',
            'dictionaries/le.occupation.txt',
            'dictionaries/le.pcts.txt',
            'dictionaries/le.race.txt',
            'dictionaries/le.seasonal.txt',
            'dictionaries/le.sexs.txt',
            'dictionaries/le.tdata.txt',
        ]

        # Define series input file
        series_file = 'le.series.txt'

        # Define data input file and corresponding output file
        input_output_files = [
            ('le.data.1.AllData.txt', 'le_processed_data.csv')
        ]

        # Load dictionaries
        print("Loading dictionaries...")
        try:
            dictionaries = load_dictionaries(*dictionary_paths)
        except Exception as e:
            print(f"Error loading dictionaries: {e}")
            traceback.print_exc()
            return  # Exit if there is an error loading the dictionaries

        # Process series file; the dictionaries come back in the same order
        # process_series_file takes them.
        print("Processing series file...")
        try:
            series_data = process_series_file(series_file, *dictionaries)
        except Exception as e:
            print(f"Error processing series file: {e}")
            traceback.print_exc()
            return  # Exit the function if error occurs in processing series file

        # process_series_file reports failure by returning None rather than
        # raising, so the except above never sees it; stop here instead of
        # carrying None into the merge step.
        if series_data is None:
            print("Error processing series file: no series data was produced.")
            return

        # Process data input file, save to CSV output file, and print the number of rows
        for input_file, output_file in input_output_files:
            print(f"Processing {input_file}...")
            try:
                final_data = process_AllData_file(input_file, series_data, start_year)
                final_data.to_csv(output_file, index=False)
                print(f"Saved processed data to {output_file}. Number of rows: {len(final_data)}")
            except Exception as e:
                print(f"Error processing {input_file}: {e}")
                traceback.print_exc()
                return

    except Exception as e:
        print(f"Unhandled exception in main(): {e}")
        traceback.print_exc()
        raise  # Re-raise the exception to halt execution

# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()
    

Loading dictionaries...
Processing series file...
Series data processed successfully.
Processing le.data.1.AllData.txt...
Saved processed data to le_processed_data.csv. Number of rows: 114977
