# Imports

In [None]:
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

from collections import defaultdict

import common_functions
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import secret
import statsmodels.api as sm
import statsmodels.formula.api as smf
import utils
from matplotlib.ticker import FormatStrFormatter
from scipy import stats
from statannotations.Annotator import Annotator
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Aim of this notebook  


1. Describe POP levels in settled dust and wristband
2. Investigate correlations among the measured matrices



In [None]:
# Excel workbook with the raw wristband results.
WB_DATA_PATH = (
    utils.Configuration.RAW_DATA_PATH
    / "e-Waste WB - Final Results (Results 2nd protocol).xlsx"
)

# Interim parquet table; read below to obtain sample IDs and their category.
PROCESSED_DATA_PATH = (
    utils.Configuration.INTERIM_DATA_PATH
    / "HBM4EU_E-waste_template_V3_all_data_INTERIM.parquet.gzip"
)

# Raw Excel template; the settled-dust sheet is read from this workbook.
RAW_DATA_PATH = (
    utils.Configuration.RAW_DATA_PATH
    / "HBM4EU_E-waste_template_V3_all_data_2022_11_23.xlsx"
)

## Read in wristband data (keep only workers whose IDs also have serum data)

In [None]:
# ID (taken from the index of the interim table) and category of every sample
# in the processed data set; used below to filter and label the wristbands.
UA_measured_samples = (
    pd.read_parquet(PROCESSED_DATA_PATH)
    .reset_index()
    .rename(columns={"index": "ID"})[["ID", "main_category"]]
)

wristband = (
    pd.read_excel(
        WB_DATA_PATH,
        usecols="A, F:R",  # sample label + the measured compounds
        skiprows=[1, 2],  # the two sheet rows directly below the header row
        nrows=118,  # NOTE(review): hard-coded number of data rows - confirm
    )
    .rename(columns={"Unnamed: 0": "ID"})
    # Drop the first two and the last four characters of the raw sample label
    # so that it can be compared with the IDs of the processed table.
    .assign(ID=lambda df: df.ID.str[2:-4])
    # Censored results ("<limit") are substituted with half of that limit.
    .replace({"<1": 0.5, "<10": 5})
    # Make the object -> numeric conversion explicit. `replace` used to
    # downcast silently, but pandas deprecated that behaviour (the
    # FutureWarning is suppressed in the import cell); without this step the
    # concentration columns could stay `object` and vanish from
    # `select_dtypes("number")` / `describe()` further down.
    .infer_objects()
    # Keep only wristbands whose ID is present in the processed table.
    .loc[lambda df: df.ID.isin(UA_measured_samples.ID), :]
    .assign(
        main_category=lambda df: df.ID.map(
            dict(zip(UA_measured_samples.ID, UA_measured_samples.main_category))
        ),
        # The first five characters of the ID are used as the company key.
        company_ID=lambda df: df.ID.str[:5],
    )
    .query("main_category == 'Worker'")
    .reset_index(drop=True)
)

# Persist the cleaned wristband table as an interim artefact.
wristband.to_parquet(
    utils.Configuration.INTERIM_DATA_PATH.joinpath("wristband.parquet.gzip"),
    compression="gzip",
)

#### Concentration distribution

In [None]:
# Sample count and quartiles of every wristband compound (one row per compound).
(
    wristband.set_index("ID")
    .drop(columns=["company_ID", "main_category"])
    .describe()
    .T.round(1)
    .filter(items=["count", "25%", "50%", "75%"], axis="columns")
)

#### Detection frequency

In [None]:
# Per-compound percentage computed as 100 * (1 - flagged share), where the
# flags come from `common_functions.is_detected(..., matrix="wristband")`
# applied to the values rounded to three decimals.
# NOTE(review): this is a detection frequency only if the helper flags the
# NON-detected samples - confirm against `common_functions`.
wristband_DF = (
    wristband.select_dtypes("number")
    .round(3)
    .apply(lambda values: common_functions.is_detected(values, matrix="wristband"))
    .sum()
    .div(len(wristband))
    .rsub(1)
    .mul(100)
    .round(1)
    .to_frame(name="Wristband")
)
wristband_DF

## Read in settled dust data

In [None]:
dust = (
    pd.read_excel(
        io=RAW_DATA_PATH,
        sheet_name="E-waste study Settled dust",
        usecols="A,BU : EV",
        skiprows=3,
        skipfooter=10,
    )
    # Discard the rows labelled 0-2 and every unnamed (header-less) column.
    .loc[3:, lambda df: ~df.columns.str.contains("Unnamed")]
    # Discard the free-text "other PCB" column of the template.
    .loc[3:, lambda df: ~df.columns.isin(["Other PCB (please, name it here)"])]
    .reset_index(drop=True)
    # Censored results ("<limit") are substituted with half of that limit.
    .replace(
        {
            "<0.1": 0.05,
            "<1": 0.5,
            "<0.2": 0.1,
            "<10": 5,
        }
    )
    # Make the object -> numeric conversion explicit. `replace` used to
    # downcast silently, but pandas deprecated that behaviour (the
    # FutureWarning is suppressed in the import cell); without this step the
    # concentration columns could stay `object` and vanish from
    # `select_dtypes("number")` / `describe()` further down.
    .infer_objects()
    .dropna(how="any", axis=0)
    # Sample ID without its last four characters; later cells use it as the
    # key that links a dust sample to a company.
    .assign(group=lambda df: df["Sample ID"].str[:-4])
    # Keep the compounds from "PCB 28" onwards; `group` was appended last, so
    # it survives this slice.
    .loc[:, "PCB 28":]
    # TODO(review): rows are individual dust samples. If a group has several
    # samples, the `dict(zip(dust.group, ...))` look-ups in later cells keep
    # only the last one - decide whether to aggregate per group (e.g. mean).
)

# Persist the cleaned dust table as an interim artefact.
dust.to_parquet(
    utils.Configuration.INTERIM_DATA_PATH.joinpath("dust.parquet.gzip"),
    compression="gzip",
)

#### Concentration distribution

In [None]:
# Sample count and quartiles of every dust compound (one row per compound).
(
    dust.describe()
    .T.round(1)
    .filter(items=["count", "25%", "50%", "75%"], axis="columns")
)

#### Detection frequency

In [None]:
# Per-compound percentage computed as 100 * (1 - flagged share), where the
# flags come from `common_functions.is_detected(..., matrix="dust")` applied
# to the values rounded to three decimals.
# NOTE(review): this is a detection frequency only if the helper flags the
# NON-detected samples - confirm against `common_functions`.
dust_DF = (
    dust.select_dtypes("number")
    .round(3)
    .apply(lambda values: common_functions.is_detected(values, matrix="dust"))
    .sum()
    .div(len(dust))
    .rsub(1)
    .mul(100)
    .round(1)
    .to_frame(name="Dust")
)
dust_DF

## Correlation analysis
### Serum and dust


In [None]:
# The serum data live in the same interim parquet file that PROCESSED_DATA_PATH
# (defined with the other input paths near the top of the notebook) points to.
# Reuse that constant instead of repeating the file name so the two cannot drift.
DATA_PATH = PROCESSED_DATA_PATH

serum = pd.read_parquet(DATA_PATH)

In [None]:
# Serum level of every worker next to the dust level of the matching group.
serum_dust = (
    # Compounds shared with the dust table, plus the two helper columns
    serum.loc[
        :,
        lambda df: df.columns.isin(
            [*dust.columns, "main_category", "companyID"]
        ),
    ]
    # Workers only
    .query("main_category == 'Worker'")
    # For each shared compound add "<compound>_dust": the dust value looked up
    # through the worker's companyID (dust.group -> dust[compound])
    .pipe(
        lambda workers: workers.assign(
            **{
                f"{compound}_dust": workers.companyID.map(
                    dict(zip(dust.group, dust[compound]))
                )
                for compound in workers.columns[workers.columns.isin(dust.columns)]
            }
        )
    )
    # Complete cases only
    .dropna(how="any")
    # Keep "PCB 28" and everything to its right
    .loc[:, "PCB 28":]
)

In [None]:
# Spearman rank correlation between each serum compound and its dust
# counterpart. `serum_dust` is paired by position: column i holds the serum
# values and column i + 15 the matching "<compound>_dust" values.
serum_dust_dict = {"correlation": [], "p_value": []}
for i in range(0, 15):
    pair = serum_dust.iloc[:, [i, i + 15]]
    print(pair.columns)

    # Guard the positional pairing: a shifted column would otherwise correlate
    # two unrelated compounds without any error.
    assert pair.columns[1] == f"{pair.columns[0]}_dust", (
        f"column {i} ({pair.columns[0]!r}) is not paired with its dust "
        f"counterpart (found {pair.columns[1]!r})"
    )

    # Run the test once and reuse both outputs (it was evaluated twice before).
    rho, p_value = stats.spearmanr(pair, nan_policy="omit")
    serum_dust_dict["correlation"].append(rho)
    serum_dust_dict["p_value"].append(p_value)

In [None]:
# Result table indexed by compound. p-values below 0.01 / 0.05 / 0.1 are
# replaced by "***" / "**" / "*"; the remaining ones are shown as text.
serum_dust_corr = (
    pd.DataFrame(serum_dust_dict, index=serum_dust.iloc[:, :15].columns)
    .round(4)
    .assign(
        p_value=lambda df: np.select(
            condlist=[df.p_value < 0.01, df.p_value < 0.05, df.p_value < 0.1],
            choicelist=["***", "**", "*"],
            # The choices are strings, so the default must be text as well:
            # np.select needs a common dtype, and promoting str with float is
            # only a (here suppressed) FutureWarning on older NumPy and, as far
            # as I know, a TypeError on newer releases.
            default=df.p_value.astype(str),
        )
    )
)
serum_dust_corr

### Serum and wristband

In [None]:
# Wristband and serum values of the same person side by side. Compounds present
# in both tables get the merge suffixes "_x" (wristband) and "_y" (serum),
# which are stripped at the end so that each pair shares one column name.
serum_wristband = (
    wristband.merge(serum, left_on="ID", right_index=True)
    # PCB / BDE columns only
    .filter(regex="PCB|BDE", axis="columns")
    .loc[:, lambda df: df.columns.unique()]
    .dropna(how="any")
    .drop(columns=["PCB 28", "PCB 52"])
    .rename(columns=lambda name: name.replace("_x", "").replace("_y", ""))
)

# Spearman rank correlation per compound. Selecting a (duplicated) column name
# returns every column with that name as a small frame, which is passed to
# `spearmanr` as a whole.
serum_wristband_dict = {"correlation": [], "p_value": []}

for i in serum_wristband.columns.unique():
    pair = serum_wristband.loc[:, i]
    print(pair.columns)
    # Run the test once and reuse both outputs (it was evaluated twice before).
    rho, p_value = stats.spearmanr(pair, nan_policy="omit")
    serum_wristband_dict["correlation"].append(rho)
    serum_wristband_dict["p_value"].append(p_value)

# Result table indexed by compound, sorted by p-value. p-values below
# 0.01 / 0.05 / 0.1 are replaced by "***" / "**" / "*"; the remaining ones are
# shown as text.
serum_wristband_corr = (
    pd.DataFrame(serum_wristband_dict, index=serum_wristband.columns.unique())
    .sort_values(by="p_value")
    .round(4)
    .assign(
        p_value=lambda df: np.select(
            condlist=[df.p_value < 0.01, df.p_value < 0.05, df.p_value < 0.1],
            choicelist=["***", "**", "*"],
            # The choices are strings, so the default must be text as well:
            # np.select needs a common dtype, and promoting str with float is
            # only a (here suppressed) FutureWarning on older NumPy and, as far
            # as I know, a TypeError on newer releases.
            default=df.p_value.astype(str),
        )
    )
)
serum_wristband_corr

### Wristband and dust

In [None]:
# Wristband level of every worker next to the dust level of the matching group.
# For each wristband column that also exists in `dust`, a "<column>_dust"
# column is added by looking up the worker's company_ID in a
# dust.group -> dust[column] dictionary. `col=col` freezes the loop variable
# for each lambda; without it every lambda would see the last `col`.
# NOTE(review): if a group occurs more than once in `dust`, the dictionary
# keeps only the last value of that group.
wristband_dust = (
    wristband.pipe(
        lambda df: df.assign(
            **{
                col
                + "_dust": lambda df, col=col: df.company_ID.map(
                    dict(zip(dust.group, dust[col]))
                )
                for col in df.loc[
                    :, lambda df: df.columns.isin(dust.columns.to_list())
                ].columns
            }
        )
    )
    # Numeric columns only (the concentrations and their dust counterparts)
    .select_dtypes("number")
    # Strip the suffix so that a wristband column and its dust column share
    # one (now duplicated) name
    .rename(columns=lambda x: x.replace("_dust", ""))
    # Selecting by the unique labels returns every column carrying each label,
    # so columns with the same name end up next to each other
    .loc[:, lambda df: df.columns.unique()]
    # Drop the last two columns by position
    # NOTE(review): presumably the compounds without a dust counterpart -
    # confirm, since this depends on the column order of the workbook
    .iloc[:, :-2]
    # Complete cases only
    .dropna(how="any")
)

# Spearman rank correlation per compound. Selecting a (duplicated) column name
# returns every column with that name as a small frame, which is passed to
# `spearmanr` as a whole.
wristband_dust_dict = {"correlation": [], "p_value": []}

for i in wristband_dust.columns.unique():
    pair = wristband_dust.loc[:, i]
    print(pair.columns)
    # Run the test once and reuse both outputs (it was evaluated twice before).
    rho, p_value = stats.spearmanr(pair, nan_policy="omit")
    wristband_dust_dict["correlation"].append(rho)
    wristband_dust_dict["p_value"].append(p_value)

# Result table indexed by compound, sorted by p-value. p-values below
# 0.01 / 0.05 / 0.1 are replaced by "***" / "**" / "*"; the remaining ones are
# shown as text.
wristband_dust_corr = (
    pd.DataFrame(wristband_dust_dict, index=wristband_dust.columns.unique())
    .sort_values(by="p_value")
    .assign(
        p_value=lambda df: np.select(
            condlist=[df.p_value < 0.01, df.p_value < 0.05, df.p_value < 0.1],
            choicelist=["***", "**", "*"],
            # The choices are strings, so the default must be text as well:
            # np.select needs a common dtype, and promoting str with float is
            # only a (here suppressed) FutureWarning on older NumPy and, as far
            # as I know, a TypeError on newer releases.
            default=df.p_value.astype(str),
        )
    )
    # NOTE(review): p_value already holds text at this point, so only the
    # correlation column is rounded - confirm that this is intended.
    .round(2)
)
wristband_dust_corr

In [None]:
# All three correlation tables side by side, aligned on the compound index.
# Each table contributes a "correlation" and a "p_value" column; `keys` adds a
# top column level so the three pairs can be told apart in the output.
pd.concat(
    [serum_dust_corr, serum_wristband_corr, wristband_dust_corr],
    axis=1,
    keys=["Serum vs dust", "Serum vs wristband", "Wristband vs dust"],
)