# Imports

In [1]:
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

from collections import defaultdict

import common_functions
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import secret
import statsmodels.api as sm
import statsmodels.formula.api as smf
import utils
from matplotlib.ticker import FormatStrFormatter
from scipy import stats
from statannotations.Annotator import Annotator
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Aim of this notebook  


1. Describe POP levels in settled dust and wristbands
2. Investigate correlations among the matrices measured



In [2]:
# Directories come from the project configuration; only file names are fixed here.
raw_dir = utils.Configuration.RAW_DATA_PATH
interim_dir = utils.Configuration.INTERIM_DATA_PATH

# Raw HBM4EU e-waste template workbook (the settled-dust sheet is read from it).
RAW_DATA_PATH = raw_dir.joinpath(
    "HBM4EU_E-waste_template_V3_all_data_2022_11_23.xlsx"
)

# Raw wristband results workbook.
WB_DATA_PATH = raw_dir.joinpath(
    "e-Waste WB - Final Results (Results 2nd protocol).xlsx"
)

# Interim parquet file written by an earlier processing step.
PROCESSED_DATA_PATH = interim_dir.joinpath(
    "HBM4EU_E-waste_template_V3_all_data_INTERIM.parquet.gzip"
)

## Read in wristband data, keeping only workers whose IDs also have serum data

In [3]:
# IDs (with their main_category) of the samples present in the processed interim
# dataset; used below to restrict and annotate the wristband rows.
UA_measured_samples = (
    pd.read_parquet(PROCESSED_DATA_PATH)
    .reset_index()
    .rename(columns={"index": "ID"})[["ID", "main_category"]]
)

wristband = (
    pd.read_excel(
        WB_DATA_PATH,
        usecols="A, F:R",
        skiprows=[1, 2],
        nrows=118,
    )
    .rename(columns={"Unnamed: 0": "ID"})
    # Strip the first two and the last four characters of the raw sample label.
    .assign(ID=lambda df: df.ID.str[2:-4])
    # Censored results are substituted with half of the reported limit.
    .replace("<1", 0.5)
    .replace("<10", 5)
    # Columns that held "<..." strings are read as object dtype. Convert them to
    # numeric explicitly instead of relying on `replace` silently downcasting,
    # which pandas has deprecated (the FutureWarning is suppressed in this
    # notebook). Without numeric dtypes, describe() and select_dtypes("number")
    # further down would skip these analytes.
    .infer_objects()
    .loc[lambda df: df.ID.isin(UA_measured_samples.ID), :]
    .assign(
        main_category=lambda df: df.ID.map(
            dict(zip(UA_measured_samples.ID, UA_measured_samples.main_category))
        ),
        # The first five characters of the ID are used as the company key.
        company_ID=lambda df: df.ID.str[:5],
    )
    .query("main_category == 'Worker'")
    .reset_index(drop=True)
)

# Persist the cleaned worker wristband table next to the other interim files.
wristband.to_parquet(
    utils.Configuration.INTERIM_DATA_PATH.joinpath("wristband.parquet.gzip"),
    compression="gzip",
)

#### Concentration distribution

In [4]:
# Sample count and quartiles for every wristband analyte (one row per analyte).
(
    wristband.set_index("ID")
    .drop(columns=["company_ID", "main_category"])
    .describe()
    .round(1)
    .T.filter(items=["count", "25%", "50%", "75%"])
)

Unnamed: 0,count,25%,50%,75%
PCB 101,79.0,0.5,8.1,19.6
PCB 118,79.0,2.0,4.6,11.4
PCB 153,79.0,1.5,3.5,8.3
PCB 138,79.0,1.9,5.3,14.8
PCB 180,79.0,0.5,1.1,3.4
BDE 28,79.0,0.5,0.5,0.5
BDE 47,79.0,0.5,2.9,13.6
BDE 100,79.0,0.5,0.5,2.4
BDE 99,79.0,0.5,0.5,16.2
BDE 154,79.0,0.5,0.5,0.8


#### Detection frequency

In [5]:
# Share of samples flagged by the project helper, per numeric column.
# NOTE(review): the share is subtracted from 1 to obtain the detection frequency,
# so is_detected presumably flags NON-detects - confirm against common_functions.
wristband_flagged_share = (
    wristband.select_dtypes("number")
    .round(3)
    .apply(lambda col: common_functions.is_detected(col, matrix="wristband"))
    .sum()
    / len(wristband)
)

# Complement of the flagged share, as a percentage with one decimal.
wristband_DF = (
    ((1 - wristband_flagged_share) * 100).to_frame(name="Wristband").round(1)
)
wristband_DF

Unnamed: 0,Wristband
PCB 101,60.8
PCB 118,89.9
PCB 153,87.3
PCB 138,86.1
PCB 180,53.2
BDE 28,15.2
BDE 47,63.3
BDE 100,34.2
BDE 99,36.7
BDE 154,25.3


## Read in settled dust data

In [6]:
# Settled-dust results read from the raw HBM4EU template workbook.
dust = (
    pd.read_excel(
        io=RAW_DATA_PATH,
        sheet_name="E-waste study Settled dust",
        usecols="A,BU : EV",
        skiprows=3,
        skipfooter=10,
    )
    # Skip the first three rows below the header and discard unnamed columns.
    .loc[3:, lambda df: ~df.columns.str.contains("Unnamed")]
    # Free-text placeholder column of the template; not an analyte.
    .drop(columns=["Other PCB (please, name it here)"], errors="ignore")
    .reset_index(drop=True)
    # Censored results are substituted with half of the reported limit.
    .replace(
        {
            "<0.1": 0.05,
            "<1": 0.5,
            "<0.2": 0.1,
            "<10": 5,
        }
    )
    # Convert the object columns to numeric explicitly instead of relying on
    # `replace` silently downcasting, which pandas has deprecated (the
    # FutureWarning is suppressed in this notebook). describe() and
    # select_dtypes("number") below need numeric dtypes.
    .infer_objects()
    .dropna(how="any", axis=0)
    # Sample ID without its last four characters is used as the grouping key.
    .assign(group=lambda df: df["Sample ID"].str[:-4])
    # Keep the analyte columns from "PCB 28" onwards; "group" is last, so it stays.
    .loc[:, "PCB 28":]
    # NOTE(review): rows stay one per dust sample. A per-group mean was sketched
    # here earlier but is not applied - confirm whether aggregation is wanted.
)

# Persist the cleaned dust table next to the other interim files.
dust.to_parquet(
    utils.Configuration.INTERIM_DATA_PATH.joinpath("dust.parquet.gzip"),
    compression="gzip",
)

#### Concentration distribution

In [7]:
# Sample count and quartiles for every dust analyte (one row per analyte).
(
    dust.describe()
    .T.round(1)
    .filter(items=["count", "25%", "50%", "75%"])
)

Unnamed: 0,count,25%,50%,75%
PCB 28,43.0,0.0,0.1,1.4
PCB 52,43.0,0.0,0.1,0.9
PCB 101,43.0,0.0,0.1,0.5
PCB 118,43.0,0.0,0.1,0.5
PCB 138,43.0,0.0,0.0,0.3
PCB 153,43.0,0.0,0.0,0.4
PCB 180,43.0,0.0,0.0,0.1
BDE 28,43.0,0.0,0.0,0.0
BDE 47,43.0,0.0,0.2,0.5
BDE 99,43.0,0.0,0.3,0.5


#### Detection frequency

In [8]:
# Share of samples flagged by the project helper, per numeric column.
# NOTE(review): the share is subtracted from 1 to obtain the detection frequency,
# so is_detected presumably flags NON-detects - confirm against common_functions.
dust_flagged_share = (
    dust.select_dtypes("number")
    .round(3)
    .apply(lambda col: common_functions.is_detected(col, matrix="dust"))
    .sum()
    / len(dust)
)

# Complement of the flagged share, as a percentage with one decimal.
dust_DF = ((1 - dust_flagged_share) * 100).to_frame(name="Dust").round(1)
dust_DF

Unnamed: 0,Dust
PCB 28,58.1
PCB 52,55.8
PCB 101,53.5
PCB 118,53.5
PCB 138,41.9
PCB 153,44.2
PCB 180,34.9
BDE 28,16.3
BDE 47,60.5
BDE 99,72.1


## Correlation analysis
### Serum and dust


In [9]:
# The serum table is the processed interim dataset. Reuse the path defined in
# the configuration cell instead of repeating the file name; DATA_PATH is kept
# as an alias so the name stays defined.
DATA_PATH = PROCESSED_DATA_PATH

serum = pd.read_parquet(DATA_PATH)

In [10]:
# Serum columns that also exist in the dust table, plus the two grouping columns,
# restricted to workers.
wanted_columns = dust.columns.to_list() + ["main_category", "companyID"]
serum_workers = serum.loc[:, serum.columns.isin(wanted_columns)].query(
    "main_category == 'Worker'"
)

# Columns present in both tables, in serum column order.
shared_columns = serum_workers.columns[
    serum_workers.columns.isin(dust.columns.to_list())
]

# For every shared column build "<name>_dust": the dust value looked up through
# the worker's companyID.
# NOTE(review): dict(zip(...)) keeps only the LAST dust row for any repeated
# dust.group value - confirm this is intended if a company has several samples.
dust_by_company = {
    name + "_dust": serum_workers.companyID.map(dict(zip(dust.group, dust[name])))
    for name in shared_columns
}

serum_dust = (
    serum_workers.assign(**dust_by_company)
    # Complete cases only.
    .dropna(how="any")
    # Keep the columns from 'PCB 28' onwards.
    .loc[:, "PCB 28":]
)

In [11]:
serum_dust_dict = {"correlation": [], "p_value": []}

# Pair every "<analyte>_dust" column with its serum column by NAME rather than
# by a hard-coded positional offset, so the loop stays correct if the number of
# shared analytes changes.
dust_columns = [name for name in serum_dust.columns if name.endswith("_dust")]

for dust_column in dust_columns:
    serum_column = dust_column[: -len("_dust")]
    pair = serum_dust.loc[:, [serum_column, dust_column]]
    print(pair.columns)

    # Run the Spearman test once per pair; the result unpacks to
    # (correlation, p-value).
    rho, p_value = stats.spearmanr(pair, nan_policy="omit")
    serum_dust_dict["correlation"].append(rho)
    serum_dust_dict["p_value"].append(p_value)

Index(['PCB 28', 'PCB 28_dust'], dtype='object')
Index(['PCB 52', 'PCB 52_dust'], dtype='object')
Index(['PCB 101', 'PCB 101_dust'], dtype='object')
Index(['PCB 118', 'PCB 118_dust'], dtype='object')
Index(['PCB 138', 'PCB 138_dust'], dtype='object')
Index(['PCB 153', 'PCB 153_dust'], dtype='object')
Index(['PCB 180', 'PCB 180_dust'], dtype='object')
Index(['BDE 28', 'BDE 28_dust'], dtype='object')
Index(['BDE 47', 'BDE 47_dust'], dtype='object')
Index(['BDE 99', 'BDE 99_dust'], dtype='object')
Index(['BDE 100', 'BDE 100_dust'], dtype='object')
Index(['BDE 153', 'BDE 153_dust'], dtype='object')
Index(['BDE 154', 'BDE 154_dust'], dtype='object')
Index(['BDE 183', 'BDE 183_dust'], dtype='object')
Index(['BDE 209', 'BDE 209_dust'], dtype='object')




In [12]:
# Serum/dust Spearman results, one row per analyte. The row labels are the
# leading (serum) columns of serum_dust; their number is taken from the results
# instead of a hard-coded 15.
(
    pd.DataFrame(
        serum_dust_dict,
        index=serum_dust.columns[: len(serum_dust_dict["correlation"])],
    )
    .round(4)
    .assign(
        # Replace small p-values by significance stars; the first matching
        # threshold wins, anything >= 0.1 keeps its rounded value.
        p_value=lambda df: np.select(
            condlist=[df.p_value < 0.01, df.p_value < 0.05, df.p_value < 0.1],
            choicelist=["***", "**", "*"],
            default=df.p_value,
        )
    )
)

Unnamed: 0,correlation,p_value
PCB 28,,
PCB 52,,
PCB 101,0.1221,0.241
PCB 118,-0.0281,0.7882
PCB 138,-0.1238,0.2345
PCB 153,-0.2552,**
PCB 180,-0.1489,0.1519
BDE 28,0.156,0.1333
BDE 47,0.1047,0.3152
BDE 99,0.3164,***


### Serum and wristband

In [13]:
def _strip_merge_suffix(label):
    """Remove the "_x" / "_y" suffixes that the merge adds to shared column names."""
    return label.replace("_x", "").replace("_y", "")


# Join wristband rows to serum rows through the sample ID (serum is matched on
# its index).
wristband_serum_merged = wristband.merge(serum, left_on="ID", right_index=True)

serum_wristband = (
    # Analyte columns only.
    wristband_serum_merged.loc[
        :, wristband_serum_merged.columns.str.contains("PCB|BDE")
    ]
    .loc[:, lambda df: df.columns.unique()]
    # Complete cases only.
    .dropna(how="any")
    .drop(columns=["PCB 28", "PCB 52"])
    # After stripping the suffixes both matrices share the same column label.
    .rename(columns=_strip_merge_suffix)
)

serum_wristband_dict = {"correlation": [], "p_value": []}

for analyte in serum_wristband.columns.unique():
    # Both matrices share the column label, so selecting it returns a
    # two-column frame.
    pair = serum_wristband.loc[:, analyte]
    print(pair.columns)

    # Run the Spearman test once per analyte; the result unpacks to
    # (correlation, p-value).
    rho, p_value = stats.spearmanr(pair, nan_policy="omit")
    serum_wristband_dict["correlation"].append(rho)
    serum_wristband_dict["p_value"].append(p_value)

# Serum/wristband Spearman results, most significant first, rounded to 4 decimals.
serum_wristband_corr = (
    pd.DataFrame(serum_wristband_dict, index=serum_wristband.columns.unique())
    .sort_values(by="p_value")
    .round(4)
)
serum_wristband_p = serum_wristband_corr.p_value

# Replace small p-values by significance stars; the first matching threshold wins.
serum_wristband_corr.assign(
    p_value=np.select(
        condlist=[
            serum_wristband_p < 0.01,
            serum_wristband_p < 0.05,
            serum_wristband_p < 0.1,
        ],
        choicelist=["***", "**", "*"],
        default=serum_wristband_p,
    )
)

Index(['PCB 101', 'PCB 101'], dtype='object')
Index(['PCB 118', 'PCB 118'], dtype='object')
Index(['PCB 153', 'PCB 153'], dtype='object')
Index(['PCB 138', 'PCB 138'], dtype='object')
Index(['PCB 180', 'PCB 180'], dtype='object')
Index(['BDE 28', 'BDE 28'], dtype='object')
Index(['BDE 47', 'BDE 47'], dtype='object')
Index(['BDE 100', 'BDE 100'], dtype='object')
Index(['BDE 99', 'BDE 99'], dtype='object')
Index(['BDE 154', 'BDE 154'], dtype='object')
Index(['BDE 153', 'BDE 153'], dtype='object')
Index(['BDE 183', 'BDE 183'], dtype='object')
Index(['BDE 209', 'BDE 209'], dtype='object')




Unnamed: 0,correlation,p_value
BDE 153,-0.3231,***
PCB 101,0.3012,**
BDE 99,0.2447,**
PCB 153,-0.2373,**
BDE 100,0.1722,0.154
PCB 180,-0.1441,0.2341
BDE 154,-0.1415,0.2427
BDE 47,0.1352,0.2646
BDE 28,0.134,0.2688
BDE 209,0.0718,0.555


### Wristband and dust

In [14]:
# Build the paired wristband/dust table: every wristband column that also exists
# in `dust` gets a "<name>_dust" partner looked up through the worker's company_ID.
# NOTE(review): dict(zip(...)) keeps only the LAST dust row for any repeated
# dust.group value - confirm this is intended if a company has several samples.
wristband_dust = (
    wristband.pipe(
        lambda df: df.assign(
            **{
                col
                + "_dust": lambda df, col=col: df.company_ID.map(
                    dict(zip(dust.group, dust[col]))
                )
                for col in df.loc[
                    :, lambda df: df.columns.isin(dust.columns.to_list())
                ].columns
            }
        )
    )
    # Numeric columns only (drops ID, main_category, company_ID).
    .select_dtypes("number")
    # Strip the suffix so both matrices share one column label per analyte.
    .rename(columns=lambda x: x.replace("_dust", ""))
    # Select by the unique labels; presumably this places the two same-named
    # columns of each analyte next to each other - TODO confirm.
    .loc[:, lambda df: df.columns.unique()]
    # Positional drop of the last two columns. Judging by the recorded output
    # this removes the BDE 209 pair - TODO confirm and select by name instead.
    .iloc[:, :-2]
    # Complete cases only.
    .dropna(how="any")
)

wristband_dust_dict = {"correlation": [], "p_value": []}

for analyte in wristband_dust.columns.unique():
    # Both matrices share the column label, so selecting it returns a
    # two-column frame.
    pair = wristband_dust.loc[:, analyte]
    print(pair.columns)

    # Run the Spearman test once per analyte; the result unpacks to
    # (correlation, p-value).
    rho, p_value = stats.spearmanr(pair, nan_policy="omit")
    wristband_dust_dict["correlation"].append(rho)
    wristband_dust_dict["p_value"].append(p_value)

# Wristband/dust Spearman results, most significant first.
wristband_dust_corr = pd.DataFrame(
    wristband_dust_dict, index=wristband_dust.columns.unique()
).sort_values(by="p_value")
wristband_dust_p = wristband_dust_corr.p_value

# Replace small p-values by significance stars (first matching threshold wins),
# then round the remaining numeric columns to 2 decimals.
wristband_dust_corr.assign(
    p_value=np.select(
        condlist=[
            wristband_dust_p < 0.01,
            wristband_dust_p < 0.05,
            wristband_dust_p < 0.1,
        ],
        choicelist=["***", "**", "*"],
        default=wristband_dust_p,
    )
).round(2)

Index(['PCB 101', 'PCB 101'], dtype='object')
Index(['PCB 118', 'PCB 118'], dtype='object')
Index(['PCB 153', 'PCB 153'], dtype='object')
Index(['PCB 138', 'PCB 138'], dtype='object')
Index(['PCB 180', 'PCB 180'], dtype='object')
Index(['BDE 28', 'BDE 28'], dtype='object')
Index(['BDE 47', 'BDE 47'], dtype='object')
Index(['BDE 100', 'BDE 100'], dtype='object')
Index(['BDE 99', 'BDE 99'], dtype='object')
Index(['BDE 154', 'BDE 154'], dtype='object')
Index(['BDE 153', 'BDE 153'], dtype='object')
Index(['BDE 183', 'BDE 183'], dtype='object')




Unnamed: 0,correlation,p_value
BDE 154,0.76,***
PCB 180,0.71,***
PCB 138,0.69,***
BDE 99,0.65,***
BDE 100,0.63,***
BDE 183,0.63,***
PCB 153,0.6,***
BDE 47,0.57,***
BDE 153,0.5,***
PCB 118,0.42,***
