In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from src.config import CONFIG
from src.data import preprocess_annotations
from style import COLORS, color_palette

In [None]:
# Execute style.py inside the notebook namespace, then activate the project
# colour palette. The file is opened in a `with` block so its handle is closed
# deterministically instead of being left open until garbage collection.
# NOTE(review): `COLORS` and `color_palette` are already imported from `style`
# in the import cell, so this exec looks redundant — confirm nothing else
# defined in style.py is needed here before removing it.
with open("style.py") as style_file:
    exec(style_file.read())
color_palette()

In [None]:
# Load the train and test annotation tables (in that order) and pass each one
# through preprocess_annotations with normalize=False.
# NOTE(review): normalize=False presumably keeps the columns in their raw
# units for plotting — confirm against src.data.preprocess_annotations.
train_annotations, test_annotations = (
    preprocess_annotations(pd.read_csv(csv_path), normalize=False)
    for csv_path in (CONFIG.PATH_TRS_AN, CONFIG.PATH_TS_AN)
)

In [None]:
# Compare the train and test splits on three annotation columns:
# GENDER (grouped bars), LYMPH_COUNT and AGE (density histograms).
fig, axs = plt.subplots(1, 3, figsize=(20, 5))

# Parallel lists describing the two series drawn in every panel.
split_frames = [train_annotations, test_annotations]
split_labels = ["train", "test"]
split_colors = [COLORS[1], COLORS[2]]
bar_offsets = [-0.2, 0.2]  # shift each split's bars left/right of the tick

# Panel 0: per-split share of GENDER == 0 and GENDER == 1.
# NOTE(review): assumes GENDER is 0/1-encoded with 1 meaning "Female", as the
# tick labels imply — confirm against preprocess_annotations.
for frame, split_label, split_color, offset in zip(
    split_frames, split_labels, split_colors, bar_offsets
):
    share_of_ones = frame["GENDER"].to_numpy().sum() / len(frame)
    axs[0].bar(
        np.array([0, 1]) + offset,
        [1 - share_of_ones, share_of_ones],
        label=split_label,
        width=0.4,
        color=split_color,
    )
axs[0].set_xlabel("Gender")
axs[0].set_ylabel("Density")
axs[0].set_xticks([0, 1], ["Male", "Female"])
axs[0].set_ylim(0, 0.8)
axs[0].legend()

# Panels 1-2: side-by-side density histograms of a numeric column.
for ax, column, axis_label in [
    (axs[1], "LYMPH_COUNT", "Lymphocyte count"),
    (axs[2], "AGE", "Age"),
]:
    # Bin edges span the pooled range of BOTH splits. Edges derived from the
    # train split alone would make hist() ignore test samples outside that
    # range, and density=True would then renormalise over the remainder.
    pooled = pd.concat([frame[column] for frame in split_frames], ignore_index=True)
    bins = np.linspace(pooled.min(), pooled.max(), 20)  # 20 edges -> 19 bins
    ax.hist(
        [frame[column].to_numpy() for frame in split_frames],
        label=split_labels,
        bins=bins,
        color=split_colors,
        density=True,
    )
    ax.set_xlabel(axis_label)
    ax.set_ylabel("Density")
    ax.legend()

# Save through the figure handle so the right figure is written even if
# another figure has become the pyplot "current" one.
fig.savefig("report/figures/train_test_histograms.png", dpi=300)
plt.show()

In [None]:
# Compare positive (LABEL == 1) and negative (LABEL == 0) training samples on
# GENDER (grouped bars), LYMPH_COUNT and AGE (density histograms).
fig, axs = plt.subplots(1, 3, figsize=(20, 5))

train_positive = train_annotations[train_annotations["LABEL"] == 1]
train_negative = train_annotations[train_annotations["LABEL"] == 0]

# Parallel lists describing the two series drawn in every panel.
class_frames = [train_positive, train_negative]
class_labels = ["positive", "negative"]
class_colors = [COLORS[1], COLORS[2]]
bar_offsets = [-0.2, 0.2]  # shift each class's bars left/right of the tick

# Panel 0: per-class share of GENDER == 0 and GENDER == 1.
# NOTE(review): assumes GENDER is 0/1-encoded with 1 meaning "Female", as the
# tick labels imply — confirm against preprocess_annotations.
for frame, class_label, class_color, offset in zip(
    class_frames, class_labels, class_colors, bar_offsets
):
    share_of_ones = frame["GENDER"].to_numpy().sum() / len(frame)
    axs[0].bar(
        np.array([0, 1]) + offset,
        [1 - share_of_ones, share_of_ones],
        label=class_label,
        width=0.4,
        color=class_color,
    )
axs[0].set_xlabel("Gender")
axs[0].set_ylabel("Density")
axs[0].set_xticks([0, 1], ["Male", "Female"])
axs[0].set_ylim(0, 0.8)
axs[0].legend()

# Panels 1-2: side-by-side density histograms of a numeric column.
for ax, column, axis_label in [
    (axs[1], "LYMPH_COUNT", "Lymphocyte count"),
    (axs[2], "AGE", "Age"),
]:
    # Bin edges span the pooled range of BOTH classes. Edges derived from the
    # positive class alone would make hist() ignore negative samples outside
    # that range, and density=True would then renormalise over the remainder.
    pooled = pd.concat([frame[column] for frame in class_frames], ignore_index=True)
    bins = np.linspace(pooled.min(), pooled.max(), 20)  # 20 edges -> 19 bins
    ax.hist(
        [frame[column].to_numpy() for frame in class_frames],
        label=class_labels,
        bins=bins,
        color=class_colors,
        density=True,
    )
    ax.set_xlabel(axis_label)
    ax.set_ylabel("Density")
    ax.legend()

# Save through the figure handle so the right figure is written even if
# another figure has become the pyplot "current" one.
fig.savefig("report/figures/positive_negative_histograms.png", dpi=300)
plt.show()