# Imports

In [1]:
import warnings

import common_functions
import numpy as np
import pandas as pd
import utils

# Constructing the main dataframe

## Reading in workers data from E-waste recycling sheet

### Handling the RPE, PPE and related columns

In [2]:
# Path of the raw workbook; the later read_excel calls reuse this constant.
RAW_FILE_LOCATION = utils.Configuration.RAW_DATA_PATH.joinpath(
    "HBM4EU_E-waste_template_V3_all_data_2022_11_23.xlsx"
)

# Load the full worker sheet a single time; the loop below slices it by
# column name. UserWarnings raised during the read are suppressed.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=UserWarning)
    raw_df = pd.read_excel(
        RAW_FILE_LOCATION,
        sheet_name="E-waste recycling",
        skiprows=4,
        skipfooter=7,
    )

# Substrings selecting the column groups of interest. They are matched with
# `DataFrame.filter(like=...)`, i.e. anywhere inside the column name.
column_prefixes = [
    "Takes place at  the company site",
    "Use of other PPE",
    "Use of RPE",
    "Enclosed process",
    "Local exhaust ventilation",
    "Job",
]

# prefix -> transformed column group
result_dict = {}

for prefix in column_prefixes:
    filtered_data = raw_df.filter(like=prefix)
    # transform_data lives in common_functions; its output layout is not
    # visible from this notebook.
    transformed_data = common_functions.transform_data(filtered_data)

    if "Job" in prefix:
        # The "Job" group needs extra clean-up, done step by step:
        # 1. keep rows from index label 2 onward and renumber from 0
        job_rows = transformed_data.loc[2:, :].reset_index(drop=True)
        # 2. encode the yes/no answers as 1/0
        job_flags = job_rows.replace({"Yes": 1, "No": 0, "no": 0})
        # 3. shorten each header to its first six characters, minus ":" and " "
        job_short = job_flags.rename(
            columns=lambda x: x[:6].replace(":", "").replace(" ", "")
        )
        # 4. the header that shortens to "Operat" becomes the first job column
        transformed_data = job_short.rename(columns={"Operat": "Job1"})

    result_dict[prefix] = transformed_data

### Determining the number of years worked

In [3]:
# Read only the spreadsheet column range JK:KI of the worker sheet.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=UserWarning)
    workers_history_sheet = pd.read_excel(
        RAW_FILE_LOCATION,
        sheet_name="E-waste recycling",
        usecols="JK:KI",
        skiprows=5,
        skipfooter=7,
    )

# Keep the columns whose name contains "Year" and hand them to the
# col_name_changer helper (presumably swaps "." for "_" in the column
# names; the helper is defined in common_functions).
workers_year_columns = common_functions.col_name_changer(
    workers_history_sheet.filter(like="Year"), what=".", how="_"
)

# count_years_worked receives the whole frame; only its result column is kept,
# so `years_worked_workers` ends up as a single Series named "years_worked".
years_worked_workers = workers_year_columns.assign(
    years_worked=common_functions.count_years_worked
).loc[:, "years_worked"]

### Read in main workers dataframe and concat with RPE and years_worked

In [4]:
# Read the worker questionnaire columns (selected by spreadsheet column
# letter) from the "E-waste recycling" sheet. UserWarnings raised during the
# read are suppressed.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=UserWarning)
    workers_raw = pd.read_excel(
        RAW_FILE_LOCATION,
        sheet_name="E-waste recycling",
        usecols="B, H, G, GA, GB, GC, GE, GG, GK, GR: GU, GV:HA",
        skiprows=5,
        skipfooter=7,
    )

# Give the spreadsheet headers analysis-friendly names. The ".N" suffixes are
# presumably pandas' de-duplication of repeated header labels, so they depend
# on the sheet layout - verify if the template changes.
workers_raw = workers_raw.rename(
    columns={
        "ID": "companyID",
        "ID.1": "ID",
        "Number.30": "height",
        "Number.31": "weight",
        "Hours": "shift_duration",
        "Place.1": "home_location",
        "Yes / No.83": "industrial_plants_in_surroundings",
        "km": "how_many_km",
        "Density": "vehicular_traffic_density",
        "Yes / No.84": "cigarette_smoking",
        "Number.33": "cigarettes_per_day",
        "Duration": "years_smoked",
        "Years.5": "former_smoker_years_ago_stopped",
        "Per day": "former_smoker_cigatette_a_day",
        "Duration.1": "former_smoker_for_how_many_years",
    }
)

# Put the per-prefix column groups from `result_dict` (keyed by prefix) to the
# left of the questionnaire columns.
# NOTE(review): `years_worked_workers` is not part of this concat, whereas the
# controls frame gets its years_worked column appended - confirm the worker
# column is attached elsewhere.
workers_raw = pd.concat(
    [pd.concat(result_dict, axis="columns"), workers_raw], axis="columns"
)


workers_raw = workers_raw.assign(
    # Strip digits and literal dots from the category label. A character class
    # is used on purpose: a bare "." with regex=True is a wildcard on
    # pandas >= 2.0 and would delete every character.
    Category=lambda df_: df_.Category.str.replace(r"[0-9.]", "", regex=True),
    Age=lambda df_: df_.Age.astype(int),
    # Keep the first run of digits/dots of each entry; expand=False makes
    # str.extract return a Series rather than a one-column DataFrame.
    height=lambda df_: df_.height.astype(str)
    .str.extract(r"([0-9.]+)", expand=False)
    .astype(float),
    weight=lambda df_: df_.weight.astype(str)
    .str.extract(r"([0-9.]+)", expand=False)
    .astype(float),
    # weight / (height / 100)^2 - presumably kg and cm; units are not stated here.
    BMI=lambda df_: df_.weight.div((df_.height / 100) ** 2),
    # A cell holding exactly "-" counts as missing.
    how_many_km=lambda df_: df_.how_many_km.replace(r"-", np.nan).astype(float),
).assign(
    # Distances above 10 are discarded (set to NaN).
    how_many_km=lambda df_: df_.how_many_km.mask(df_.how_many_km > 10, np.nan),
    # Yes/No -> True/False. Rows without a usable distance take the value of
    # `column == False`, which is False for every non-boolean cell (e.g. the
    # original "Yes"/"No" strings).
    industrial_plants_in_surroundings=lambda df_: df_.industrial_plants_in_surroundings.replace(
        {"Yes": True, "No": False}
    ).mask(
        df_.how_many_km.isna(), df_.industrial_plants_in_surroundings == False
    ),
)

## Reading in controls data from the Controls sheet

In [7]:
# Read only the spreadsheet column range JK:KH of the "Controls" sheet.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=UserWarning)
    controls_history_sheet = pd.read_excel(
        RAW_FILE_LOCATION,
        sheet_name="Controls",
        usecols="JK:KH",
        skiprows=5,
    )

# Keep the columns whose name contains "Year" and hand them to the
# col_name_changer helper (presumably swaps "." for "_" in the column
# names; the helper is defined in common_functions).
controls_year_columns = common_functions.col_name_changer(
    controls_history_sheet.filter(like="Year"), what=".", how="_"
)

# count_years_worked receives the whole frame; only its result column is kept,
# so `years_worked_controls` ends up as a single Series named "years_worked".
years_worked_controls = controls_year_columns.assign(
    years_worked=common_functions.count_years_worked
).loc[:, "years_worked"]

In [8]:
# Read the control participants' questionnaire columns (selected by
# spreadsheet column letter) from the "Controls" sheet.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=UserWarning)
    controls_sheet = pd.read_excel(
        RAW_FILE_LOCATION,
        sheet_name="Controls",
        usecols="B, G, FZ, GA, GB, GD, GF, GJ,GQ : GZ",
        skiprows=5,
    )

# Spreadsheet header -> analysis column name.
controls_column_names = {
    "ID": "companyID",
    "ID.1": "ID",
    "Number.30": "height",
    "Number.31": "weight",
    "Hours": "shift_duration",
    "Place.1": "home_location",
    "Yes / No.83": "industrial_plants_in_surroundings",
    "km": "how_many_km",
    "Density": "vehicular_traffic_density",
    "Yes / No.84": "cigarette_smoking",
    "Number.33": "cigarettes_per_day",
    "Duration": "years_smoked",
    "Years.5": "former_smoker_years_ago_stopped",
    "Per day": "former_smoker_cigatette_a_day",
    "Duration.1": "former_smoker_for_how_many_years",
}
controls_named = controls_sheet.rename(columns=controls_column_names)

# Append the years_worked Series computed for the controls as a last column.
controls_joined = pd.concat(
    [controls_named, years_worked_controls], axis="columns"
)

# First pass: type conversions and the derived BMI column.
controls_typed = controls_joined.assign(
    Age=lambda frame: frame["Age"].astype(int),
    # weight / (height / 100)^2
    BMI=lambda frame: frame["weight"] / ((frame["height"] / 100) ** 2),
    # Keep the first run of digits/dots of each entry and parse it as a float.
    how_many_km=lambda frame: frame["how_many_km"]
    .astype(str)
    .str.extract(r"([0-9.]+)")
    .astype(float),
)

# Second pass: discard distances above 10, then derive the plant indicator
# from the cleaned distance.
distance_km = controls_typed["how_many_km"]
distance_km = distance_km.mask(distance_km > 10, np.nan)
plants_answer = controls_typed["industrial_plants_in_surroundings"]

controls_raw = controls_typed.assign(
    how_many_km=distance_km,
    # Yes/No -> True/False. Rows without a usable distance take the value of
    # `answer == False`, which is False for every non-boolean cell.
    industrial_plants_in_surroundings=plants_answer.replace(
        {"Yes": True, "No": False}
    ).mask(distance_km.isna(), plants_answer == False),
)

## Combining the workers_raw and controls_raw datasets