# Imports

In [1]:
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

from collections import defaultdict

import common_functions
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import secret
import statsmodels.api as sm
import statsmodels.formula.api as smf
import utils
from matplotlib.ticker import FormatStrFormatter
from scipy import stats
from statannotations.Annotator import Annotator
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Aim of this notebook  


1. Describe POP levels in settled dust and wristband samples (concentration distribution and detection frequency)



In [2]:
# Raw Excel workbook with the wristband results.
WB_DATA_PATH = (
    utils.Configuration.RAW_DATA_PATH
    / "e-Waste WB - Final Results (Results 2nd protocol).xlsx"
)

# Interim study data stored as a gzip-compressed parquet file.
PROCESSED_DATA_PATH = (
    utils.Configuration.INTERIM_DATA_PATH
    / "HBM4EU_E-waste_template_V3_all_data_INTERIM.parquet.gzip"
)

# Raw study template workbook; the settled-dust sheet is read from this file.
RAW_DATA_PATH = (
    utils.Configuration.RAW_DATA_PATH
    / "HBM4EU_E-waste_template_V3_all_data_2022_11_23.xlsx"
)

## Read in wristband data, only keep IDs we have serum data for (workers and controls)

In [3]:
# IDs (taken from the parquet index) and main_category of every participant
# present in the interim data set.
UA_measured_samples = (
    pd.read_parquet(PROCESSED_DATA_PATH)
    .reset_index()
    .rename(columns={"index": "ID"})
    .loc[:, ["ID", "main_category"]]
)

wristband = (
    pd.read_excel(WB_DATA_PATH, usecols="A, F:R", skiprows=[1, 2], nrows=118)
    .rename(columns={"Unnamed: 0": "ID"})
    # Drop the first 2 and last 4 characters of the raw label so that it can be
    # matched against UA_measured_samples.ID below.
    .assign(ID=lambda wb: wb["ID"].str.slice(2, -4))
    # Results reported as "<1" / "<10" are replaced by half of that number.
    .replace("<1", 0.5)
    .replace("<10", 5)
    .loc[lambda wb: wb["ID"].isin(UA_measured_samples["ID"])]
    .assign(
        main_category=lambda wb: wb["ID"].map(
            dict(
                zip(
                    UA_measured_samples["ID"],
                    UA_measured_samples["main_category"],
                )
            )
        ),
        # The first five characters of the ID are used as the company identifier.
        company_ID=lambda wb: wb["ID"].str.slice(stop=5),
    )
    # No filter on main_category is applied here: the category-specific summary
    # further down groups by it.
    .reset_index(drop=True)
)

#### Concentration distribution
##### Aggregated

In [4]:
# Count and quartiles of every compound, pooled over all wristbands.
(
    wristband.set_index("ID")
    .drop(columns=["company_ID", "main_category"])
    .describe()
    .round(1)
    .T.filter(items=["count", "25%", "50%", "75%"], axis="columns")
)

Unnamed: 0,count,25%,50%,75%
PCB 101,98.0,0.5,3.7,17.1
PCB 118,98.0,1.2,2.9,9.8
PCB 153,98.0,1.3,2.6,6.0
PCB 138,98.0,1.4,3.5,9.9
PCB 180,98.0,0.5,0.5,2.1
BDE 28,98.0,0.5,0.5,0.5
BDE 47,98.0,0.5,1.7,10.4
BDE 100,98.0,0.5,0.5,1.7
BDE 99,98.0,0.5,0.5,5.6
BDE 154,98.0,0.5,0.5,0.5


##### Main_category specific

In [5]:
# Quartiles of every compound, computed separately for each main_category.
(
    wristband.set_index("ID")
    .drop(columns="company_ID")
    .groupby("main_category")
    .quantile(q=[0.25, 0.5, 0.75])
    .T.round(1)
)

main_category,Control,Control,Control,Worker,Worker,Worker
Unnamed: 0_level_1,0.25,0.50,0.75,0.25,0.50,0.75
PCB 101,0.5,0.5,0.5,0.5,8.1,19.6
PCB 118,0.5,0.5,1.2,2.0,4.6,11.4
PCB 153,0.5,1.2,1.6,1.5,3.5,8.3
PCB 138,0.5,1.1,2.2,1.9,5.3,14.8
PCB 180,0.5,0.5,0.5,0.5,1.1,3.4
BDE 28,0.5,0.5,0.5,0.5,0.5,0.5
BDE 47,0.5,0.5,1.3,0.5,2.9,13.6
BDE 100,0.5,0.5,0.5,0.5,0.5,2.4
BDE 99,0.5,0.5,0.5,0.5,0.5,16.2
BDE 154,0.5,0.5,0.5,0.5,0.5,0.8


#### Detection frequency

In [6]:
# Detection frequency (%) per compound: 100 * (1 - share of samples flagged by
# common_functions.is_detected).
# NOTE(review): subtracting the flagged share from 1 suggests the helper marks
# non-detects despite its name — confirm against common_functions.
wristband_DF = (
    wristband.select_dtypes(include="number")
    .round(3)
    .apply(lambda values: common_functions.is_detected(values, matrix="wristband"))
    .sum()
    .div(len(wristband))
    .rsub(1)
    .mul(100)
    .to_frame(name="Wristband")
    .round(1)
)
wristband_DF

Unnamed: 0,Wristband
PCB 101,52.0
PCB 118,79.6
PCB 153,81.6
PCB 138,80.6
PCB 180,44.9
BDE 28,13.3
BDE 47,57.1
BDE 100,28.6
BDE 99,30.6
BDE 154,21.4


## Read in settled dust data

In [7]:
# Load the settled-dust measurements and average them per sample group.
dust = (
    pd.read_excel(
        io=RAW_DATA_PATH,
        sheet_name="E-waste study Settled dust",
        usecols="A,BU : EV",
        skiprows=3,
        skipfooter=10,
    )
    # Keep rows from label 3 onwards and drop, in one pass, the unnamed columns
    # and the free-text "other PCB" column (previously two consecutive .loc
    # calls that repeated the same row slice).
    .loc[
        3:,
        lambda df: ~(
            df.columns.str.contains("Unnamed")
            | df.columns.isin(["Other PCB (please, name it here)"])
        ),
    ]
    .reset_index(drop=True)
    # Results reported as "<x" are replaced by x / 2.
    .replace(
        {
            "<0.1": 0.05,
            "<1": 0.5,
            "<0.2": 0.1,
            "<10": 5,
        }
    )
    .dropna(how="any", axis=0)
    # Grouping key: the sample ID without its last four characters.
    .assign(group=lambda df: df["Sample ID"].str[:-4])
    .loc[:, "PCB 28":]
    .groupby("group")
    # The string alias gives the same result as np.mean but avoids the pandas
    # FutureWarning about passing NumPy callables to GroupBy.agg (that warning
    # is silenced globally at the top of the notebook, so it would go unnoticed).
    .agg("mean")
)

#### Concentration distribution

In [8]:
# Count and quartiles of the group-averaged dust concentrations, per compound.
(
    dust.describe()
    .T.round(1)
    .filter(items=["count", "25%", "50%", "75%"], axis="columns")
)

Unnamed: 0,count,25%,50%,75%
PCB 28,11.0,0.2,0.5,2.0
PCB 52,11.0,0.1,0.3,1.0
PCB 101,11.0,0.1,0.3,1.0
PCB 118,11.0,0.1,0.3,1.1
PCB 138,11.0,0.1,0.2,0.4
PCB 153,11.0,0.1,0.2,0.5
PCB 180,11.0,0.0,0.1,0.4
BDE 28,11.0,0.0,0.1,0.1
BDE 47,11.0,0.1,0.2,0.6
BDE 99,11.0,0.3,0.3,0.8


#### Detection frequency

In [9]:
# Detection frequency (%) per compound: 100 * (1 - share of dust groups flagged
# by common_functions.is_detected).
# NOTE(review): subtracting the flagged share from 1 suggests the helper marks
# non-detects despite its name — confirm against common_functions.
dust_DF = (
    dust.select_dtypes(include="number")
    .round(3)
    .apply(lambda values: common_functions.is_detected(values, matrix="dust"))
    .sum()
    .div(len(dust))
    .rsub(1)
    .mul(100)
    .to_frame(name="Dust")
    .round(1)
)
dust_DF

Unnamed: 0,Dust
PCB 28,81.8
PCB 52,81.8
PCB 101,72.7
PCB 118,72.7
PCB 138,72.7
PCB 153,72.7
PCB 180,63.6
BDE 28,54.5
BDE 47,81.8
BDE 99,90.9


## Correlation analysis
### Serum and dust


In [10]:
# The serum data are read from the same interim parquet file that
# PROCESSED_DATA_PATH (defined with the other input paths) already points to.
# Alias it instead of rebuilding the path so the file name lives in one place.
DATA_PATH = PROCESSED_DATA_PATH

serum = pd.read_parquet(DATA_PATH)

In [11]:
# Pair every worker's serum concentration with the mean dust concentration of
# the same compound; companyID is looked up in the index of `dust`.
serum_dust = (
    # Keep the compounds that were also measured in dust, plus the two key columns.
    serum.loc[:, serum.columns.isin([*dust.columns, "main_category", "companyID"])]
    .query("main_category == 'Worker'")
    .pipe(
        lambda workers: workers.assign(
            **{
                f"{compound}_dust": workers["companyID"].map(dust[compound].to_dict())
                for compound in workers.columns[workers.columns.isin(dust.columns)]
            }
        )
    )
    # Only rows without any missing value are kept.
    .dropna(how="any")
    # Columns from 'PCB 28' onwards: serum compounds followed by their
    # '<compound>_dust' counterparts.
    .loc[:, "PCB 28":]
)

In [12]:
# Spearman rank correlation between each serum compound and its dust
# counterpart (column "<compound>_dust").
serum_dust_dict = {"correlation": [], "p_value": []}

# Pair columns by name rather than by a fixed positional offset, so the loop
# keeps working if the number of shared compounds changes.
paired_compounds = [
    column for column in serum_dust.columns if f"{column}_dust" in serum_dust.columns
]

for compound in paired_compounds:
    pair = serum_dust[[compound, f"{compound}_dust"]]
    print(pair.columns)

    # Run the test once and reuse the result for both statistics
    # (index 0: correlation coefficient, index 1: p-value).
    result = stats.spearmanr(pair, nan_policy="omit")
    serum_dust_dict["correlation"].append(result[0])
    serum_dust_dict["p_value"].append(result[1])

Index(['PCB 28', 'PCB 28_dust'], dtype='object')
Index(['PCB 52', 'PCB 52_dust'], dtype='object')
Index(['PCB 101', 'PCB 101_dust'], dtype='object')
Index(['PCB 118', 'PCB 118_dust'], dtype='object')
Index(['PCB 138', 'PCB 138_dust'], dtype='object')
Index(['PCB 153', 'PCB 153_dust'], dtype='object')
Index(['PCB 180', 'PCB 180_dust'], dtype='object')
Index(['BDE 28', 'BDE 28_dust'], dtype='object')
Index(['BDE 47', 'BDE 47_dust'], dtype='object')
Index(['BDE 99', 'BDE 99_dust'], dtype='object')
Index(['BDE 100', 'BDE 100_dust'], dtype='object')
Index(['BDE 153', 'BDE 153_dust'], dtype='object')
Index(['BDE 154', 'BDE 154_dust'], dtype='object')
Index(['BDE 183', 'BDE 183_dust'], dtype='object')
Index(['BDE 209', 'BDE 209_dust'], dtype='object')




In [13]:
# Tabulate the correlations. The row labels are the leading serum_dust columns,
# taking as many as there are computed results instead of a hard-coded count.
pd.DataFrame(
    serum_dust_dict,
    index=serum_dust.columns[: len(serum_dust_dict["correlation"])],
).round(4)

Unnamed: 0,correlation,p_value
PCB 28,,
PCB 52,,
PCB 101,0.0838,0.4221
PCB 118,-0.1033,0.3218
PCB 138,-0.0485,0.6427
PCB 153,-0.0854,0.4133
PCB 180,-0.1391,0.1811
BDE 28,0.0607,0.5609
BDE 47,0.1657,0.1105
BDE 99,0.2696,0.0086
