In [1]:
import pandas as pd

# Lookup tables that translate the numeric codes carried by the series file
# into human-readable labels.
_SECTOR_NAME = {
    3000: "Manufacturing",
    3100: "Manufacturing, Durable Goods",
    3200: "Manufacturing, Nondurable Goods",
    8400: "Business",
    8500: "Nonfarm Business",
    8800: "Nonfinancial Corporations",
}
_CLASS_TEXT = {3: "Employees", 6: "All workers"}
_DURATION_TEXT = {
    1: "% Change same quarter 1 year ago",
    2: "% Change from previous quarter",
    3: "Index (2017=100)",
}
_MEASURE_TEXT = {
    1: "Employment",
    2: "Average weekly hours",
    3: "Hours worked",
    4: "Real value-added output",
    5: "Value-added output",
    6: "Labor compensation",
    8: "Nonlabor payments",
    9: "Labor productivity (output per hour)",
    10: "Hourly compensation",
    11: "Unit labor costs",
    12: "Unit nonlabor costs",
    13: "Unit nonlabor payments",
    14: "Value-added output price deflator",
    15: "Real hourly compensation",
    16: "Output per worker",
    17: "Labor share",
    18: "Profits",
    19: "Unit profits",
    20: "Unit combined input costs",
    21: "Real sectoral output",
    22: "Sectoral output price deflator",
    23: "Sectoral output",
}

# (code column, label column to create, lookup table), applied in this order.
_CODE_LABELS = (
    ('sector_code', 'sector_name', _SECTOR_NAME),
    ('class_code', 'class_text', _CLASS_TEXT),
    ('duration_code', 'duration_text', _DURATION_TEXT),
    ('measure_code', 'measure_text', _MEASURE_TEXT),
)

# Columns (and their order) written to the output CSV.
_FINAL_COLUMNS = [
    'series_id', 'year', 'period', 'value',
    'sector_code', 'sector_name',
    'class_code', 'class_text',
    'measure_code', 'measure_text',
    'duration_code', 'duration_text',
]


def _load_table(path, label):
    """Read a tab-separated file and print its shape and first rows."""
    frame = pd.read_csv(path, sep='\t', engine='python')
    print(f"{label} file loaded with shape: {frame.shape}")
    print(frame.head())
    return frame


def _clean_series_id(frame):
    """Return a new frame with padding stripped from column names and series_id."""
    cleaned = frame.rename(columns=str.strip)
    return cleaned.assign(series_id=cleaned['series_id'].str.strip())


def main(data_file='pr.data.1.AllData.txt',
         series_file='pr.series.txt',
         output_file='pr_processed_data.csv'):
    """Join the data file with the series file, label the codes, and save a CSV.

    Steps:
      1. Load both tab-separated files and print a preview of each.
      2. Strip whitespace from column names and from ``series_id`` so the
         join keys match (the raw headers/ids are space-padded).
      3. Left-merge the data rows with the series metadata on ``series_id``.
      4. Add a text column for each of the sector, class, duration and
         measure codes; codes missing from a lookup table map to NaN.
      5. Keep the columns listed in ``_FINAL_COLUMNS`` and write them to
         ``output_file`` without the index.

    Args:
        data_file: Path of the tab-separated observations file.
        series_file: Path of the tab-separated series metadata file.
        output_file: Path of the CSV file to write.

    Raises:
        KeyError: If a column this function reads (``series_id``, one of the
            ``*_code`` columns, or a column in ``_FINAL_COLUMNS``) is absent.
    """
    # Previews are printed from the raw frames, before any cleaning.
    data = _clean_series_id(_load_table(data_file, "Data"))
    series_info = _clean_series_id(_load_table(series_file, "Series"))

    # Left join keeps every data row even when no series metadata matches.
    merged_data = pd.merge(data, series_info, on='series_id', how='left')
    print(f"Merged data shape: {merged_data.shape}")
    print(merged_data.head())

    for code_col, text_col, lookup in _CODE_LABELS:
        merged_data[text_col] = merged_data[code_col].map(lookup)
        print(f"After applying {text_col} dictionary:")
        # Distinct (code, label) pairs make unmapped codes (NaN labels) visible.
        print(merged_data[[code_col, text_col]].drop_duplicates())

    final_data = merged_data[_FINAL_COLUMNS]
    print("Final DataFrame preview:")
    print(final_data.head())

    final_data.to_csv(output_file, index=False)
    # Report the path actually written (the old message named a different file).
    print(f"Processed data saved to '{output_file}'")

# Entry-point guard: run the pipeline only when this code executes as the
# top-level module (__name__ == "__main__"), not when it is imported.
if __name__ == "__main__":
    main()


Data file loaded with shape: (74891, 5)
   series_id          year period         value footnote_codes
0  PRS30006011        1988    Q01           1.9            NaN
1  PRS30006011        1988    Q02           2.2            NaN
2  PRS30006011        1988    Q03           1.9            NaN
3  PRS30006011        1988    Q04           1.1            NaN
4  PRS30006011        1988    Q05           1.8            NaN
Series file loaded with shape: (282, 12)
   series_id          sector_code  class_code  measure_code  duration_code  \
0  PRS30006011               3000           6             1              1   
1  PRS30006012               3000           6             1              2   
2  PRS30006013               3000           6             1              3   
3  PRS30006021               3000           6             2              1   
4  PRS30006022               3000           6             2              2   

  seasonal base_year  footnote_codes  begin_year begin_period  end_year 