# NASA Breath Diagnostics Challenge

## Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

In [None]:
# Folder the input CSVs are read from.
STAGING_DATA_FOLDER = "staging_data"
# Folder every figure saved by this notebook is written to.
FIGURES_FOLDER = "figures"
# Figure.savefig does not create missing parent directories, so make sure the
# output folder exists before any plotting cell tries to write into it.
os.makedirs(FIGURES_FOLDER, exist_ok=True)

In [None]:
# Load the two training CSVs from the staging folder into DataFrames.
readings_csv_path = os.path.join(STAGING_DATA_FOLDER, "train_readings.csv")
patients_csv_path = os.path.join(STAGING_DATA_FOLDER, "train_patients.csv")
train_readings_df = pd.read_csv(readings_csv_path)
train_patients_df = pd.read_csv(patients_csv_path)

In [None]:
# Show describe() summary statistics for train_readings_df.
train_readings_df.describe()

In [None]:
# Show describe() summary statistics for train_patients_df.
train_patients_df.describe()

In [None]:
# Count how many reading rows belong to each "Patient ID".
rows_per_patient = train_readings_df.groupby("Patient ID").size()
rows_per_patient.reset_index()

In [None]:
# Summary statistics with the rows of patients 20 and 53 left out.
excluded_patient_rows = train_readings_df["Patient ID"].isin([20, 53])
train_readings_df.loc[~excluded_patient_rows].describe()

In [None]:
# Keep only the rows whose "Time" is below 841.0, then re-check the summary.
time_below_cutoff = train_readings_df["Time"] < 841.0
train_readings_df = train_readings_df.loc[time_below_cutoff]
train_readings_df.describe()

In [None]:
# Re-count the rows per "Patient ID" (this cell comes after the Time < 841.0 filter).
rows_per_patient = train_readings_df.groupby("Patient ID").size()
rows_per_patient.reset_index()

In [None]:
# Histogram (with KDE overlay) of the "Time" column at four different bin
# widths, one per panel of a 2x2 grid, saved as a single PNG.
fig, ax = plt.subplots(2, 2, figsize=(16, 12))
# ax.flat walks the grid row by row: top-left, top-right, bottom-left, bottom-right.
for panel, bin_width in zip(ax.flat, (0.5, 1, 2, 2.26)):
    panel.set_title(f"Bin Width = {bin_width}")
    sns.histplot(train_readings_df, x="Time", binwidth=bin_width, kde=True, ax=panel)
time_histograms_path = os.path.join(FIGURES_FOLDER, "time_histograms.png")
fig.savefig(time_histograms_path, bbox_inches="tight")

In [None]:
# Patient IDs used to select the sample rows plotted in the cells that follow.
PATIENT_IDS = [1, 4, 10, 12]
# Patients 1 and 12 have COVID; patients 4 and 10 do not.

In [None]:
# Restrict the readings to the patients listed in PATIENT_IDS and show them.
is_sample_patient = train_readings_df["Patient ID"].isin(PATIENT_IDS)
sample_df = train_readings_df.loc[is_sample_patient]
sample_df

In [None]:
# One panel per column D1..D64 on an 8x8 grid, plotted against "Time" with one
# line colour per "Patient ID" of sample_df.
fig, ax = plt.subplots(8, 8, figsize=(64, 64))
# ax.flat visits the panels row by row, so panel k (1-based) shows column Dk.
for sensor_number, panel in enumerate(ax.flat, start=1):
    sns.lineplot(sample_df, x="Time", y=f"D{sensor_number}", hue="Patient ID", palette="tab10", ax=panel)
raw_timeline_path = os.path.join(FIGURES_FOLDER, "raw_samples_timeline.png")
fig.savefig(raw_timeline_path, bbox_inches="tight")
# Close the current figure after saving; presumably so the very large figure is
# not rendered inline (the saved PNG is embedded by the markdown cell below).
plt.close()

![Raw Samples on Timeline](figures/raw_samples_timeline.png)

In [None]:
# Per-patient mean of every column over the rows with Time < 301; the averaged
# "Time" column itself is discarded.
early_rows = sample_df.loc[sample_df["Time"] < 301]
patient_means = early_rows.groupby("Patient ID").mean()
patient_means = patient_means.reset_index().drop(columns="Time")
# Attach each patient's means to all of that patient's rows. Column names that
# exist on both sides get the "_sample" / "_mean" suffixes.
result = sample_df.merge(patient_means, on="Patient ID", suffixes=("_sample", "_mean"))
result

In [None]:
# Replace each D<k>_sample / D<k>_mean pair in `result` with a single D<k>
# column holding sample minus mean. The subtraction is done for all 64 columns
# in one vectorised step and the helper columns are removed in one drop call,
# instead of inserting one column and dropping two on every loop iteration.
sensor_cols = [f"D{i+1}" for i in range(64)]
sample_cols = [f"{name}_sample" for name in sensor_cols]
mean_cols = [f"{name}_mean" for name in sensor_cols]
detrended = pd.DataFrame(
    result[sample_cols].to_numpy() - result[mean_cols].to_numpy(),
    index=result.index,  # keep row labels so the concat below lines up row for row
    columns=sensor_cols,
)
# The remaining columns stay first and D1..D64 are appended after them, which
# is the same column order the per-column loop produced.
result = pd.concat([result.drop(columns=sample_cols + mean_cols), detrended], axis=1)

In [None]:
# Display the current contents of `result`.
result

In [None]:
# One panel per column D1..D64 of `result` on an 8x8 grid, plotted against
# "Time" with one line colour per "Patient ID".
fig, ax = plt.subplots(8, 8, figsize=(64, 64))
# ax.flat visits the panels row by row, so panel k (1-based) shows column Dk.
for sensor_number, panel in enumerate(ax.flat, start=1):
    sns.lineplot(result, x="Time", y=f"D{sensor_number}", hue="Patient ID", palette="tab10", ax=panel)
detrended_timeline_path = os.path.join(FIGURES_FOLDER, "raw_samples_timeline_trend_removed.png")
fig.savefig(detrended_timeline_path, bbox_inches="tight")
# Close the current figure after saving; presumably so the very large figure is
# not rendered inline (the saved PNG is embedded by the markdown cell below).
plt.close()

![Raw Samples on Timeline with Trend Removed](figures/raw_samples_timeline_trend_removed.png)