# Imports

In [None]:
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import font_manager
from matplotlib.ticker import FormatStrFormatter
from PIL import Image, ImageDraw, ImageFont
from statannotations.Annotator import Annotator

import common_functions
import utils

# Aim of this notebook  


1. **Describe Key Population Statistics:** Provide an overview of the primary variables related to population statistics.
2. **Explore Categorical Stratification:** Investigate the data using both main categorical stratification and sub-category based stratification.
3. **Analyze Serum POPs Distribution:** Describe the distribution of Persistent Organic Pollutants (POPs) in serum.
4. **Perform Statistical Analysis of Serum POP Concentrations:** Compare concentrations across the main-category and sub-category stratifications.



In [None]:
# Location of the interim dataset inside the configured interim-data directory.
INTERIM_FILE_NAME = "HBM4EU_E-waste_template_V3_all_data_INTERIM.parquet.gzip"
DATA_PATH = utils.Configuration.INTERIM_DATA_PATH.joinpath(INTERIM_FILE_NAME)


def _shorten_pcb_prefix(label):
    """Return the column label with every "PCB" occurrence shortened to "CB"."""
    return label.replace("PCB", "CB")


# Main analysis frame; congener columns are exposed as "CB ..." from here on.
df = pd.read_parquet(DATA_PATH).rename(columns=_shorten_pcb_prefix)

## Characterizing the population based on age, BMI, shift duration, number of individuals per group and number of years worked

In [None]:
# Per-group population summary: "mean (SD)" for age, BMI, shift duration and
# years worked, plus the head count split by sex, for every
# (main_category, sub_category) combination.
GROUP_KEYS = ["main_category", "sub_category"]

# Output column label -> source column in `df`.
SUMMARY_VARIABLES = {
    "Age": "Age",
    "BMI": "BMI",
    "Shift_duration": "shift_duration",
    "Years_worked": "years_worked",
}

# Mean and standard deviation per group, rounded to one decimal before formatting.
group_stats = (
    df.groupby(GROUP_KEYS)[list(SUMMARY_VARIABLES.values())]
    .agg(["mean", "std"])
    .round(1)
)

# One "mean (SD)" string column per summarised variable.
mean_sd_table = pd.DataFrame(
    {
        label: group_stats[(source, "mean")].astype(str)
        + " ("
        + group_stats[(source, "std")].astype(str)
        + ")"
        for label, source in SUMMARY_VARIABLES.items()
    }
)

# Head count per group and sex. `fill_value=0` (and the reindex onto the summary
# index) keeps the counts as integers when a group has no members of one sex;
# a plain `unstack()` would insert NaN there, turning every count into a float
# and rendering the total as "nan".
sex_counts = (
    df.groupby([*GROUP_KEYS, "Sex"])
    .size()
    .unstack(fill_value=0)
    .reindex(mean_sd_table.index, fill_value=0)
)

population_summary = (
    pd.concat([mean_sd_table, sex_counts], axis=1)
    .assign(
        Total_number=lambda table: (table.Female + table.Male).astype(str)
        + " ("
        + table.Female.astype(str)
        + "F/"
        + table.Male.astype(str)
        + "M)"
    )
    .drop(columns=["Male", "Female"])
    .sort_index(axis=1)
)
population_summary

# Describe Key Population Statistics

## Based on main_categories

In [None]:
# Numeric descriptors compared between groups, listed in plotting order.
DESCRIPTIVE_VARIABLES = [
    "Age",
    "BMI",
    "height",
    "weight",
    "years_worked",
    "shift_duration",
    "years_smoked",
    "cigarettes_per_day",
    "how_many_km",
]

# Selecting by explicit label fixes the column order, so no prior sort is needed.
# Restricting to numeric dtypes first means a listed column that is not numeric
# is reported as missing instead of being plotted.
numeric_columns_1 = df.select_dtypes("number").loc[:, DESCRIPTIVE_VARIABLES]


def _prettify(label):
    """Turn a snake_case label into a space-separated, title-cased one."""
    return label.replace("_", " ").title()


fig = plt.figure(figsize=(8, 4))
rows, cols = 2, 5

for idx, variable in enumerate(numeric_columns_1.columns):
    # Worker-vs-Control comparison for this variable (project helper).
    pairs, p_values = common_functions.get_pairs_values_for_mannwhitneyu(
        data=df, value_var=variable, id_vars=["Worker", "Control"], col="main_category"
    )

    ax = fig.add_subplot(rows, cols, idx + 1)

    # Frame with display-friendly column names and underscore-free values.
    plot_frame = (
        df[["main_category", variable]]
        .rename(columns=_prettify)
        .replace("_", " ", regex=True)
    )
    # `y` is taken from the original frame because the renamed frame no longer
    # carries the raw column name.
    common_functions.return_sns_box(
        df=plot_frame, x=plot_frame["Main Category"], y=df[variable], ax=ax
    )

    ax.set_title(_prettify(variable))
    ax.set_xlabel("")
    ax.set_ylabel("")
    ax.grid(False)
    plt.xticks(rotation=90)
    fig.tight_layout()

    # Only annotate panels whose first p-value is below 0.05.
    if not p_values[0] < 0.05:
        continue

    annotator = Annotator(
        ax, pairs=pairs, data=df, x="main_category", y=variable, verbose=False
    )
    annotator.configure(text_format="star", loc="inside", line_width=1, test=None)
    annotator.set_pvalues_and_annotate(p_values)

plt.suptitle("")

## Based on sub_categories

In [None]:
# One boxplot panel per descriptive variable, stratified by sub_category.
fig = plt.figure(figsize=(12, 7))
rows, cols = 2, 5

for idx, variable in enumerate(numeric_columns_1.columns):
    # Pairwise post-hoc comparisons between sub-categories (project helper).
    pairs, p_values = common_functions.get_pairs_values_for_posthoc_dunn(
        data=df, value_vars=variable, id_vars="sub_category"
    )

    ax = fig.add_subplot(rows, cols, idx + 1)
    panel_title = variable.replace("_", " ").title()

    # Display-friendly column names and underscore-free category values.
    plot_frame = (
        df[["sub_category", variable]]
        .rename(columns={"sub_category": "Sub Category", variable: panel_title})
        .replace("_", " ", regex=True)
    )
    # `y` comes from the original frame, which still has the raw column name.
    common_functions.return_sns_box(
        df=plot_frame, x=plot_frame["Sub Category"], y=df[variable], ax=ax
    )

    ax.set_title(panel_title)
    ax.set_xlabel("")
    ax.set_ylabel("")
    ax.grid(False)
    plt.xticks(rotation=90)
    fig.tight_layout()

    # Nothing to annotate when the helper returned no pairs.
    if len(pairs) < 1:
        continue

    annotator = Annotator(ax, pairs, data=df, x="sub_category", y=variable, verbose=False)
    annotator.configure(text_format="star", loc="inside", line_width=1, test=None)
    annotator.set_pvalues_and_annotate(p_values)

plt.suptitle("")

# Analyze Serum POPs Distribution

## Detection frequencies of serum POPs

### Aggregated

In [None]:
# Detection frequencies need the raw concentrations: the interim dataset replaced
# missing values with LOD/2, so non-detects can no longer be told apart there.

RAW_FILE_NAME = "HBM4EU_E-waste_template_V3_all_data_2022_11_23.xlsx"
RAW_DATA_PATH = utils.Configuration.RAW_DATA_PATH.joinpath(RAW_FILE_NAME)

# Columns retained from the sheet: the worker identifier plus the measured POPs.
columns_to_keep = [
    "Worker ID (without letter 'E')",
    "PCB 28",
    "PCB 52",
    "PCB 101",
    "PCB 118",
    "PCB 138",
    "PCB 153",
    "PCB 180",
    "BDE 28 [2,4,4′-Tribromodiphenyl ether]",
    "BDE 47 [2,2′,4,4′-tetrabromodiphenyl ether]",
    "BDE 99",
    "BDE 100",
    "BDE 153",
    "BDE 154",
    "BDE 183",
    "BDE 209",
    "Dechlorane",
]

# Long sheet headers -> short names used in the rest of the notebook.
SHORT_COLUMN_NAMES = {
    "BDE 28 [2,4,4′-Tribromodiphenyl ether]": "BDE 28",
    "BDE 47 [2,2′,4,4′-tetrabromodiphenyl ether]": "BDE 47",
    "Worker ID (without letter 'E')": "Worker_ID",
}

raw_sheet = pd.read_excel(
    RAW_DATA_PATH,
    sheet_name="E-waste study UA data",
    skiprows=3,
    nrows=172,
)

# Keep rows from index label 3 onwards and drop the auto-named "Unnamed" columns.
named_columns = ~raw_sheet.columns.str.contains("Unnamed")
trimmed_sheet = raw_sheet.loc[3:, named_columns].reset_index(drop=True)

# Restrict to the columns of interest and index the table by worker.
selected = trimmed_sheet.loc[:, trimmed_sheet.columns.isin(columns_to_keep)]
indexed = selected.rename(columns=SHORT_COLUMN_NAMES).set_index("Worker_ID")

# Anything that cannot be parsed as a number becomes NaN; the "PCB" prefix is
# then shortened to "CB" to match the column names of `df`.
UA_POP_raw = indexed.apply(pd.to_numeric, errors="coerce").rename(
    columns=lambda label: label.replace("PCB", "CB")
)
UA_POP_raw

In [None]:
# Attach the stratification labels to the raw concentrations by matching the
# two frames on their indexes; only index values present in both are kept.
category_labels = df[["main_category", "sub_category"]]
df_detection_frequency = pd.merge(
    UA_POP_raw,
    category_labels,
    how="inner",
    left_index=True,
    right_index=True,
)
df_detection_frequency.shape

In [None]:
# Overall detection frequency (%) per pollutant: the share of samples with a
# non-missing raw concentration.
# Columns up to and including "Dechlorane" are the pollutant measurements; the
# category labels added by the merge come after them.
pollutant_detected = df_detection_frequency.loc[:, :"Dechlorane"].notna()

# The denominator is the number of rows in the merged frame, i.e. the same rows
# that are counted in the numerator. Dividing by the size of the raw table
# instead would understate the frequency whenever the merge drops rows, and it
# would disagree with the per-category frequencies computed on the merged frame.
aggregated_detection_frequency = (
    pollutant_detected.sum()
    .div(len(df_detection_frequency))
    .mul(100)
    .round(1)
)
aggregated_detection_frequency

### Main-category specific

In [None]:
# Detection frequency (%) per pollutant within each main category.
main_category_frame = df_detection_frequency.drop(columns="sub_category")

# Every column except the last one (the category label) holds a pollutant;
# convert each of them into a detected / not-detected flag.
main_pollutant_columns = main_category_frame.columns[:-1]
main_detection_flags = main_category_frame.assign(
    **{column: main_category_frame[column].notna() for column in main_pollutant_columns}
)

# Detected samples per category divided by the category size.
main_category_sizes = df_detection_frequency.main_category.value_counts()
main_category_detection_frequency = (
    main_detection_flags.groupby("main_category")
    .sum()
    .div(main_category_sizes, axis=0)
    .transpose()
    .mul(100)
    .round(1)
)
main_category_detection_frequency

### Sub-category specific

In [None]:
# Detection frequency (%) per pollutant within each sub-category.
sub_category_frame = df_detection_frequency.drop(columns="main_category")

# Every column except the last one (the category label) holds a pollutant;
# convert each of them into a detected / not-detected flag.
sub_pollutant_columns = sub_category_frame.columns[:-1]
sub_detection_flags = sub_category_frame.assign(
    **{column: sub_category_frame[column].notna() for column in sub_pollutant_columns}
)

# Detected samples per sub-category divided by the sub-category size.
sub_category_sizes = df_detection_frequency.sub_category.value_counts()
sub_category_detection_frequency = (
    sub_detection_flags.groupby("sub_category")
    .sum()
    .div(sub_category_sizes, axis=0)
    .transpose()
    .mul(100)
    .round(1)
)
sub_category_detection_frequency

## Distribution of pollutants in serum

### Based on main_category stratification

In [None]:
# Quartiles (25th, 50th, 75th percentile) of every column from "main_category"
# onwards, per main category, shown with variables as rows.
MAIN_QUARTILES = [0.25, 0.5, 0.75]

main_category_values = df.loc[:, "main_category":].drop(columns="sub_category")
main_category_quartiles = main_category_values.groupby("main_category").quantile(
    MAIN_QUARTILES
)
main_category_quartiles.T.round(1)

### Based on sub_category stratification

In [None]:
# Quartiles (25th, 50th, 75th percentile) of every column from "sub_category"
# onwards, per sub-category, shown with variables as rows.
SUB_QUARTILES = [0.25, 0.5, 0.75]

sub_category_values = df.loc[:, "sub_category":]
sub_category_quartiles = sub_category_values.groupby("sub_category").quantile(
    SUB_QUARTILES
)
sub_category_quartiles.T.round(1)

## Statistical analysis of serum POP concentrations
### Based on main_category stratification

In [None]:
# Figure panel A: serum concentration per main category, one boxplot per
# pollutant that was detected in more than 50 % of the samples.
sns.set_context("paper", font_scale=1.4)

# Pollutants with an overall detection frequency above 50 % (also used by the
# sub-category figure below).
over50_DF = aggregated_detection_frequency[aggregated_detection_frequency > 50].index
pollutants = df.columns[df.columns.isin(over50_DF)]

# Black-and-white box styling; identical for every panel, so defined once.
PROPS = {
    "boxprops": {"facecolor": "white", "edgecolor": "black"},
    "medianprops": {"color": "black"},
    "whiskerprops": {"color": "black"},
    "capprops": {"color": "black"},
}

# Two columns and at least two rows (the original layout); extra rows are added
# when more than four pollutants qualify, otherwise add_subplot would fail.
cols = 2
rows = max(2, int(np.ceil(len(pollutants) / cols)))
fig = plt.figure(figsize=(4, 10))

for idx, i in enumerate(pollutants):
    # Worker-vs-Control comparison for this pollutant (project helper).
    pairs, p_values = common_functions.get_pairs_values_for_mannwhitneyu(
        data=df, value_var=i, id_vars=["Worker", "Control"], col="main_category"
    )

    ax = fig.add_subplot(rows, cols, idx + 1)
    sns.boxplot(
        data=df,
        x="main_category",
        y=i,
        width=0.5,
        showfliers=False,
        **PROPS,
        ax=ax,
    )

    ax.set_title(i, fontweight="bold", fontsize=14)
    ax.set_xlabel("")
    ax.set_ylabel("Concentration (ng/L)")
    ax.grid(False)
    # NOTE(review): "%.0d" formats tick values as integers, so non-integer tick
    # positions would be shown truncated - confirm this is intended.
    ax.yaxis.set_major_formatter(FormatStrFormatter("%.0d"))
    plt.xticks(rotation=90)
    fig.tight_layout()

    # Only annotate panels whose first p-value is below 0.05.
    if p_values[0] < 0.05:
        annotator = Annotator(
            ax, pairs=pairs, data=df, x="main_category", y=i, verbose=False
        )
        annotator.configure(text_format="star", loc="inside", line_width=1, test=None)
        annotator.set_pvalues_and_annotate(p_values)

plt.suptitle("")
plt.savefig(utils.Configuration.PLOTS.joinpath("img1.png"), dpi=600)

### Based on sub_category stratification

In [None]:
# Figure panel B: serum concentration per sub-category, one boxplot per
# pollutant that was detected in more than 50 % of the samples.
sns.set_context("paper", font_scale=1.4)

# Sub-categories shown in the figure, in plotting order, using the raw labels
# stored in `df`. Single source of truth for the row filter, the plot order and
# the annotation order.
SUB_CATEGORY_ORDER = [
    "Batteries",
    "Brown goods",
    "Metals and plastics",
    "Miscellaneous",
    "White goods",
    "outwith_CTR",
    "within_CTR",
]
# Raw label -> label shown on the x axis (only where the two differ).
SUB_CATEGORY_LABELS = {"outwith_CTR": "Outwith CTR", "within_CTR": "Within CTR"}
plot_order = [SUB_CATEGORY_LABELS.get(label, label) for label in SUB_CATEGORY_ORDER]

# Black-and-white box styling; identical for every panel, so defined once.
PROPS = {
    "boxprops": {"facecolor": "white", "edgecolor": "black"},
    "medianprops": {"color": "black"},
    "whiskerprops": {"color": "black"},
    "capprops": {"color": "black"},
}

# Rows of the selected sub-categories: once with raw labels (for the
# annotations) and once with display labels (for the boxplots). Both x and y of
# each boxplot come from the same filtered frame, so the plot no longer relies
# on index alignment between a filtered and an unfiltered series.
plotted_rows = df.loc[df["sub_category"].isin(SUB_CATEGORY_ORDER)]
plot_data = plotted_rows.replace({"sub_category": SUB_CATEGORY_LABELS})

pollutants = df.columns[df.columns.isin(over50_DF)]

# Two columns and at least two rows (the original layout); extra rows are added
# when more than four pollutants qualify, otherwise add_subplot would fail.
cols = 2
rows = max(2, int(np.ceil(len(pollutants) / cols)))
fig = plt.figure(figsize=(10, 12))

for idx, i in enumerate(pollutants):
    # NOTE(review): the post-hoc test is run on the full `df`, not only on the
    # plotted sub-categories. If `df` holds other sub-categories, the returned
    # pairs may reference groups that are not drawn - confirm this is intended.
    pairs, p_values = common_functions.get_pairs_values_for_posthoc_dunn(
        data=df, value_vars=i, id_vars="sub_category", p_adjust="fdr_bh"
    )

    ax = fig.add_subplot(rows, cols, idx + 1)
    sns.boxplot(
        data=plot_data,
        x="sub_category",
        y=i,
        showfliers=False,
        width=0.5,
        order=plot_order,
        **PROPS,
        ax=ax,
    )

    ax.set_title(i, fontweight="bold", fontsize=18)
    ax.set_xlabel("")
    ax.set_ylabel("Concentration (ng/L)")
    ax.grid(False)
    # NOTE(review): "%.0d" formats tick values as integers, so non-integer tick
    # positions would be shown truncated - confirm this is intended.
    ax.yaxis.set_major_formatter(FormatStrFormatter("%.0d"))
    plt.xticks(rotation=45, ha="right")
    fig.tight_layout()

    # Annotate only when the helper returned at least one pair.
    if len(pairs) >= 1:
        annotator = Annotator(
            ax,
            pairs,
            data=plotted_rows,
            x="sub_category",
            y=i,
            order=SUB_CATEGORY_ORDER,
            verbose=False,
        )
        annotator.configure(text_format="star", loc="inside", line_width=1, test=None)
        annotator.set_pvalues_and_annotate(p_values)

plt.suptitle("")
plt.savefig(utils.Configuration.PLOTS.joinpath("img2.png"), dpi=600)

In [None]:
# Load the two saved panels from the plots directory.
plots_dir = utils.Configuration.PLOTS
img1 = Image.open(plots_dir.joinpath("img1.png"))
img2 = Image.open(plots_dir.joinpath("img2.png"))

# Scale the second panel to the height of the first, preserving its aspect ratio.
target_height = img1.height
scaled_width = int(img2.width * target_height / img2.height)
img2 = img2.resize((scaled_width, target_height))

# Blank canvas wide enough to hold both panels side by side.
new_img = Image.new("RGB", (img1.width + img2.width, target_height))

# Place the panels left to right: the first at x=0, the second right after it.
for panel, x_offset in ((img1, 0), (img2, img1.width)):
    new_img.paste(panel, (x_offset, 0))

In [None]:
# Write the panel labels "A)" and "B)" onto the merged figure.
LABEL_FONT_SIZE = 162

draw = ImageDraw.Draw(new_img)

# Prefer the Arial file used originally. On machines where that Windows path
# does not exist, ask matplotlib for Arial and let it fall back to its default
# font, so the cell still runs instead of raising OSError.
try:
    font = ImageFont.truetype(r"C:\Windows\Fonts\Arial.ttf", LABEL_FONT_SIZE)
except OSError:
    font = ImageFont.truetype(font_manager.findfont("Arial"), LABEL_FONT_SIZE)

# NOTE(review): the pixel positions are hard-coded; they presumably match the
# panel sizes produced at dpi=600 above - re-check them if the figure sizes change.
draw.text((140, 100), "A)", fill="black", font=font)
draw.text((2500, 100), "B)", fill="black", font=font)
new_img.show()

In [None]:
# Write the combined two-panel figure next to the individual panels.
# NOTE(review): "meged" looks like a typo for "merged"; the file name is kept
# unchanged here because other material may already reference it.
merged_figure_path = utils.Configuration.PLOTS.joinpath("meged_figure.png")
new_img.save(merged_figure_path)