In [1]:
from pathlib import Path

import pandas as pd

# List the sheets available in the workbook. The ExcelFile is opened inside a
# `with` block so its file handle is closed as soon as the sheet names have
# been read; the original left the handle open for the life of the kernel.
# NOTE(review): `excel_file` stays bound after the block but is closed; no
# later cell visible here reuses it (the sheet is re-read by path instead).
with pd.ExcelFile("../data/Beta Occupational Hazards Dataset_Dec2023.xlsx") as excel_file:
    sheet_names = excel_file.sheet_names
sheet_names

['Information', 'BOHD Dataset', 'Variable descriptions', 'Exclusions']

In [3]:
# Load the "Variable descriptions" sheet. skiprows=3 drops the first three rows
# of the sheet so that the next row supplies the column names (presumably those
# rows hold title/notes text -- confirm against the workbook).
variable_desc_df = pd.read_excel(
    "../data/Beta Occupational Hazards Dataset_Dec2023.xlsx",
    sheet_name="Variable descriptions",
    skiprows=3,
)
variable_desc_df.head()

Unnamed: 0,Variable,Type,Question asked of workers/employers/occupational experts,0,25,50,75,100
0,Electronic Mail,Communication,How often do you use electronic mail in this job?,Never,Once a year or more but not every month,Once a month or more but not every week,Once a week or more but not every day,Every day
1,Face-to-Face Discussions,Communication,How often do you have to have face-to-face dis...,Never,Once a year or more but not every month,Once a month or more but not every week,Once a week or more but not every day,Every day
2,Letters and Memos,Communication,How often does the job require written letters...,Never,Once a year or more but not every month,Once a month or more but not every week,Once a week or more but not every day,Every day
3,Public Speaking,Communication,How often do you have to perform public speaki...,Never,Once a year or more but not every month,Once a month or more but not every week,Once a week or more but not every day,Every day
4,Telephone,Communication,How often do you have telephone conversations ...,Never,Once a year or more but not every month,Once a month or more but not every week,Once a week or more but not every day,Every day


In [8]:
# Build meta_data_df as a new frame so variable_desc_df keeps the labels as
# loaded; only "Variable" is rewritten into lowercase, underscore-separated
# identifiers.
meta_data_df = variable_desc_df.assign(
    Variable=lambda frame: (
        frame["Variable"]
        .str.strip()
        .str.lower()
        .str.replace(" ", "_")
        # Everything that is not a letter, digit or underscore is removed
        # outright, so hyphenated words collapse ("face-to-face" -> "facetoface").
        .str.replace(r"[^a-zA-Z0-9_]", "", regex=True)
    )
)

In [9]:
# Preview the first rows to check that "Variable" now holds the normalized names.
meta_data_df.head(n=5)

Unnamed: 0,Variable,Type,Question asked of workers/employers/occupational experts,0,25,50,75,100
0,electronic_mail,Communication,How often do you use electronic mail in this job?,Never,Once a year or more but not every month,Once a month or more but not every week,Once a week or more but not every day,Every day
1,facetoface_discussions,Communication,How often do you have to have face-to-face dis...,Never,Once a year or more but not every month,Once a month or more but not every week,Once a week or more but not every day,Every day
2,letters_and_memos,Communication,How often does the job require written letters...,Never,Once a year or more but not every month,Once a month or more but not every week,Once a week or more but not every day,Every day
3,public_speaking,Communication,How often do you have to perform public speaki...,Never,Once a year or more but not every month,Once a month or more but not every week,Once a week or more but not every day,Every day
4,telephone,Communication,How often do you have telephone conversations ...,Never,Once a year or more but not every month,Once a month or more but not every week,Once a week or more but not every day,Every day


In [10]:
# The two public lists below hold normalized variable names (lowercase,
# underscore-separated) and are assembled from smaller ordered sub-lists.
# NOTE(review): the sub-list names are an editorial reading of the variable
# names, not categories taken from the dataset; element order is unchanged.

_time_spent_activities = [
    "spend_time_bending_or_twisting_the_body",
    "spend_time_climbing_ladders_scaffolds_or_poles",
    "spend_time_keeping_or_regaining_balance",
    "spend_time_kneeling_crouching_stooping_or_crawling",
    "spend_time_making_repetitive_motions",
    "spend_time_sitting",
    "spend_time_standing",
    "spend_time_walking_and_running",
    "spend_time_using_your_hands_to_handle_control_or_feel_objects_tools_or_controls",
]

_working_conditions = [
    "cramped_work_space_awkward_positions",
    "exposed_to_contaminants",
    "exposed_to_whole_body_vibration",
    "extremely_bright_or_inadequate_lighting",
    "sounds_noise_levels_are_distracting_or_uncomfortable",
    "very_hot_or_cold_temperatures",
]

_hazard_exposures = [
    "exposed_to_disease_or_infections",
    "exposed_to_hazardous_conditions",
    "exposed_to_hazardous_equipment",
    "exposed_to_high_places",
    "exposed_to_minor_burns_cuts_bites_or_stings",
    "exposed_to_radiation",
]

_protective_equipment = [
    "wear_common_protective_or_safety_equipment_such_as_safety_shoes_glasses_gloves_hearing_protection_hard_hats_or_life_jackets",
    "wear_specialized_protective_or_safety_equipment_such_as_breathing_apparatus_safety_harness_full_protection_suits_or_radiation_protection",
]

_vehicle_or_equipment_settings = [
    "in_an_enclosed_vehicle_or_equipment",
    "in_an_open_vehicle_or_equipment",
]

physical_hazard_cols = [
    *_time_spent_activities,
    *_working_conditions,
    *_hazard_exposures,
    *_protective_equipment,
    *_vehicle_or_equipment_settings,
]


_conflict_and_aggression = [
    "deal_with_physically_aggressive_people",
    "deal_with_unpleasant_or_angry_people",
    "frequency_of_conflict_situations",
]

_responsibility = [
    "responsibility_for_outcomes_and_results",
    "responsible_for_others_health_and_safety",
]

_interaction_with_others = [
    "coordinate_or_lead_others",
    "deal_with_external_customers",
    "work_with_work_group_or_team",
    "physical_proximity",
]

_stakes_and_decision_making = [
    "level_of_competition",
    "consequence_of_error",
    "freedom_to_make_decisions",
    "frequency_of_decision_making",
    "impact_of_decisions_on_coworkers_or_company_results",
]

_pace_and_scheduling = [
    "time_pressure",
    "pace_determined_by_speed_of_equipment",
    "work_schedules",
]

_task_structure = [
    "structured_versus_unstructured_work",
    "importance_of_repeating_same_tasks",
    "importance_of_being_exact_or_accurate",
    "degree_of_automation",
]

psychosocial_hazard_cols = [
    *_conflict_and_aggression,
    *_responsibility,
    *_interaction_with_others,
    *_stakes_and_decision_making,
    *_pace_and_scheduling,
    *_task_structure,
]

In [11]:
# Label every variable with the hazard list it appears in. The lookup table is
# built with the psychosocial names first so that a name present in both lists
# resolves to "Physical"; names found in neither list become "Other".
hazard_type_by_variable = {
    **dict.fromkeys(psychosocial_hazard_cols, "Psychosocial"),
    **dict.fromkeys(physical_hazard_cols, "Physical"),
}
meta_data_df["hazard_type"] = (
    meta_data_df["Variable"].map(hazard_type_by_variable).fillna("Other")
)

In [13]:
# Move "hazard_type" from wherever it currently sits to position 2 in the
# column order, leaving the relative order of every other column unchanged.
cols = list(meta_data_df.columns)
cols.remove("hazard_type")
cols.insert(2, "hazard_type")
meta_data_df = meta_data_df.loc[:, cols]

In [14]:
# Preview the first rows to check that "hazard_type" exists and follows "Type".
meta_data_df.head(n=5)

Unnamed: 0,Variable,Type,hazard_type,Question asked of workers/employers/occupational experts,0,25,50,75,100
0,electronic_mail,Communication,Other,How often do you use electronic mail in this job?,Never,Once a year or more but not every month,Once a month or more but not every week,Once a week or more but not every day,Every day
1,facetoface_discussions,Communication,Other,How often do you have to have face-to-face dis...,Never,Once a year or more but not every month,Once a month or more but not every week,Once a week or more but not every day,Every day
2,letters_and_memos,Communication,Other,How often does the job require written letters...,Never,Once a year or more but not every month,Once a month or more but not every week,Once a week or more but not every day,Every day
3,public_speaking,Communication,Other,How often do you have to perform public speaki...,Never,Once a year or more but not every month,Once a month or more but not every week,Once a week or more but not every day,Every day
4,telephone,Communication,Other,How often do you have telephone conversations ...,Never,Once a year or more but not every month,Once a month or more but not every week,Once a week or more but not every day,Every day


In [15]:
# Write the annotated variable metadata to CSV without the row index.
output_filename = "../output/meta_data.csv"
# to_csv does not create missing directories, so make sure the target folder
# exists first; otherwise the export fails on a checkout with no output/ folder.
Path(output_filename).parent.mkdir(parents=True, exist_ok=True)
meta_data_df.to_csv(output_filename, index=False)