In [2]:
import pandas as pd

# Function to load and process the dictionaries
def load_dictionaries(sector_dict_path, measure_dict_path, duration_dict_path, period_dict_path):
    """Load the four tab-separated lookup files into ``{code: name}`` dicts.

    Every file is read without a header row. The sector, measure and duration
    files use column 0 as the code and column 1 as the name; the period file
    uses column 0 as the code and column 2 (the third column) as the name.

    Args:
        sector_dict_path: Path to the sector lookup file.
        measure_dict_path: Path to the measure lookup file.
        duration_dict_path: Path to the duration lookup file.
        period_dict_path: Path to the period lookup file.

    Returns:
        A tuple ``(sector_dict, measure_dict, duration_dict, period_dict)`` of
        plain dicts with whitespace-stripped string keys and values.
    """
    def load_clean_dict(file_path, key_col, value_col):
        """Read two columns (by position) of a TSV file into a dict."""
        # dtype=str keeps codes such as "0000" as text. Without it pandas
        # infers integers for all-numeric columns, which drops leading zeros
        # and makes the .str accessor below raise AttributeError.
        df = pd.read_csv(file_path, sep='\t', header=None,
                         usecols=[key_col, value_col], dtype=str)
        # Select by position label explicitly: usecols returns columns in
        # file order, so relying on column order would swap key and value
        # whenever key_col > value_col.
        keys = df[key_col].str.strip()
        values = df[value_col].str.strip()
        # NOTE: because header=None, a header line in the file (if present)
        # simply becomes one more key/value entry.
        # For duplicate keys the last occurrence wins.
        return dict(zip(keys, values))

    # Load all dictionaries
    sector_dict = load_clean_dict(sector_dict_path, 0, 1)
    measure_dict = load_clean_dict(measure_dict_path, 0, 1)
    duration_dict = load_clean_dict(duration_dict_path, 0, 1)
    period_dict = load_clean_dict(period_dict_path, 0, 2)  # name is in the 3rd column

    return sector_dict, measure_dict, duration_dict, period_dict


# Function to process the series file
def process_series_file(series_file, sector_dict, measure_dict, duration_dict, period_dict):
    """Read the series file and attach sector/measure/duration names.

    Args:
        series_file: Path to the tab-separated series file. It must contain
            the columns ``series_id``, ``sector_code``, ``measure_code`` and
            ``duration_code`` (column names may carry surrounding whitespace).
        sector_dict: Mapping of 4-character sector code to sector name.
        measure_dict: Mapping of 2-character measure code to measure name.
        duration_dict: Mapping of duration code to duration name.
        period_dict: Unused here; kept so the call signature stays unchanged.

    Returns:
        DataFrame with columns ``series_id``, ``sector_code``, ``sector_name``,
        ``measure_code``, ``measure_name``, ``duration_code``,
        ``duration_name``. Codes with no dictionary entry get a missing name.

    Raises:
        KeyError: If a required column is absent (a message is printed first).
    """
    try:
        # Read everything as text. With type inference a code column that
        # has a single missing value becomes float ("3100" -> "3100.0"),
        # which then never matches a dictionary key.
        series_data = pd.read_csv(series_file, sep='\t', skipinitialspace=True, dtype=str)

        # Strip any whitespace characters from the column names
        series_data.columns = series_data.columns.str.strip()

        series_data['series_id'] = series_data['series_id'].astype(str).str.strip()

        # Zero-pad widths per code column; 0 means "no padding".
        code_widths = {'sector_code': 4, 'measure_code': 2, 'duration_code': 0}

        # Strip via .str (not astype(str)) so missing codes stay missing
        # instead of turning into the literal text "nan".
        for column in code_widths:
            series_data[column] = series_data[column].str.strip()

        # Replace placeholder '-' with NA. This must happen before padding:
        # a padded '-' no longer equals '-' and would slip through.
        series_data.replace('-', pd.NA, inplace=True)

        for column, width in code_widths.items():
            if width:
                series_data[column] = series_data[column].str.zfill(width)

        # Map dictionary values to their names
        series_data['sector_name'] = series_data['sector_code'].map(sector_dict)
        series_data['measure_name'] = series_data['measure_code'].map(measure_dict)
        series_data['duration_name'] = series_data['duration_code'].map(duration_dict)

        # Keep only the necessary columns
        series_data = series_data[['series_id',
                                   'sector_code', 'sector_name',
                                   'measure_code', 'measure_name',
                                   'duration_code', 'duration_name']]

        return series_data

    except KeyError as e:
        print(f"KeyError encountered: {e}. Check column names or data.")
        raise  # Re-raise the error after logging the message


# Function to process the data file
def process_data_file(data_file, series_data, period_dict):
    """Read the observations file and enrich it with series and period names.

    Args:
        data_file: Path to the tab-separated data file. It must contain the
            columns ``series_id``, ``year``, ``period`` and ``value`` (column
            names may carry surrounding whitespace).
        series_data: DataFrame keyed by ``series_id`` that provides the
            sector/measure/duration code and name columns.
        period_dict: Mapping of period code to period name.

    Returns:
        DataFrame with one row per observation (left join on ``series_id``)
        and the columns ``series_id``, ``year``, ``period``, ``value``, the
        six series code/name columns, and ``period_name``.

    Raises:
        KeyError: If a required column is absent (a message is printed first).
    """
    try:
        # Read the data file
        data = pd.read_csv(data_file, sep='\t', engine='python')

        # Strip any whitespace characters from the column names
        data.columns = data.columns.str.strip()

        # Explicitly clean the series_id column for consistency
        data['series_id'] = data['series_id'].astype(str).str.strip()

        # Clean the period code the same way before the lookup; a padded
        # value such as "A01 " would otherwise silently map to no name.
        # .where(notna) keeps genuinely missing periods missing instead of
        # turning them into the text "nan".
        raw_period = data['period']
        data['period'] = raw_period.astype(str).str.strip().where(raw_period.notna())

        # Map period code to name using period_dict
        data['period_name'] = data['period'].map(period_dict)

        # Merge with series data based on series_id
        merged_data = pd.merge(data, series_data, on='series_id', how='left')

        # Keep only the relevant columns
        merged_data = merged_data[['series_id', 'year', 'period', 'value',
                                   'sector_code', 'sector_name',
                                   'measure_code', 'measure_name',
                                   'duration_code', 'duration_name',
                                   'period_name']]

        return merged_data

    except KeyError as e:
        print(f"KeyError encountered: {e}. Check column names or data.")
        raise  # Re-raise the error after logging the message


# Main function to execute the process
def main():
    """Run the whole pipeline: dictionaries -> series -> data -> CSV export.

    Each stage reports its start on stdout. If a stage raises, the error is
    printed and the function returns early without writing the output file.
    """
    # Lookup files, in the positional order load_dictionaries expects:
    # sector, measure, duration, period.
    dictionary_paths = (
        'dictionaries/mp.sector.txt',
        'dictionaries/mp.measure.txt',
        'dictionaries/mp.duration.txt',
        'dictionaries/mp.period.txt',
    )
    series_path = 'mp.series.txt'
    data_path = 'mp.data.1.AllData.txt'
    destination = 'mp_processed_data.csv'

    # Stage 1: lookup dictionaries.
    print("Loading dictionaries...")
    try:
        sectors, measures, durations, periods = load_dictionaries(*dictionary_paths)
    except Exception as err:
        print(f"Error loading dictionaries: {err}")
        return

    # Stage 2: series metadata with human-readable names.
    print("Processing series file...")
    try:
        series_frame = process_series_file(series_path, sectors, measures, durations, periods)
    except Exception as err:
        print(f"Error processing series file: {err}")
        return

    # Stage 3: observations joined to the series metadata.
    print("Processing data file...")
    try:
        observations = process_data_file(data_path, series_frame, periods)
    except Exception as err:
        print(f"Error processing data file: {err}")
        return

    # Final column layout of the exported file.
    export_columns = [
        'series_id', 'year', 'value',
        'sector_code', 'sector_name',
        'measure_code', 'measure_name',
        'duration_code', 'duration_name',
        'period', 'period_name',
    ]
    observations = observations[export_columns]

    # Write the result without the DataFrame index.
    observations.to_csv(destination, index=False)
    print("Data processing complete!")

# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


Loading dictionaries...
Processing series file...
Processing data file...
Data processing complete!
