# MLHC Final Project - Data Cleaning and EDA
#### Team members: Hyemin Bang, Alenta Demissew, Rachel Moon

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Show complete DataFrames in notebook output: no limit on the number of rows
# or columns, no fixed display width, and no truncation of cell contents.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# None is the supported "no limit" value; -1 was deprecated in pandas 1.0 and
# is rejected by later releases.
pd.set_option('display.max_colwidth', None)

## 1. Data: medication status labels

We first load the datasets that contain patients' medication status information.

In [None]:
# Load the CIS-PD training label table and preview its first rows.
cis = pd.read_csv("BEATPD_data/cis-pd/data_labels/CIS-PD_Training_Data_IDs_Labels.csv")
cis.head()

Using the time series data, we aim to predict the `on_off` medication status. Dyskinesia and tremor severity are self-reported, so we did not think they would be informative labels to predict. We will not use the dyskinesia and tremor features in our model.

In [None]:
# Summary statistics of the label table (pandas describes numeric columns by default).
cis.describe()

In [None]:
# (rows, columns) of the label table.
cis.shape

We will exclude datapoints with missing medication status labels.

In [None]:
# Distinct on_off values; NaN shows up here if any label is missing.
cis['on_off'].unique()

In [None]:
# Keep only rows that have an on_off label, then report the remaining (rows, columns).
cis = cis.dropna(subset=['on_off'])
cis.shape

In [None]:
# Number of distinct subject_id values in the label table.
cis.subject_id.nunique()

In [None]:
# Histogram of the on_off labels in the CIS-PD label table.
plt.figure(figsize=(7,5))
plt.title("Distribution of medication status (on_off) labels, CIS-PD dataset", fontsize=15)
plt.xlabel("Label")
plt.ylabel("Frequency")

# Unit-width bins whose edges sit at -0.5, 0.5, ..., max + 0.5, so each bar is
# centred on a whole-number label value.
bins = np.arange(0, cis['on_off'].max() + 1.5) - 0.5
plt.hist(cis['on_off'], bins, edgecolor="black")
# One tick per bin centre. The last edge is dropped; otherwise an extra tick
# appears at max + 1, where there is no bar.
plt.xticks(bins[:-1] + 0.5)

plt.show()

For the CIS-PD dataset, `on_off` labels are ordinal (0 to 4), and their distribution is imbalanced.

In [None]:
# For each (subject_id, on_off) pair, count the non-null entries in every other column.
cis.groupby(["subject_id", "on_off"]).count()

## 2. Data: Time series mobile sensor data

We now look at what some patients' mobile sensor data look like. Each CSV file holds the sensor data for one 20-minute interval for one patient, and each patient has multiple 20-minute intervals.

In [None]:
# Load three sensor recordings from the CIS-PD training folder and preview the
# first ten rows of the first one.
sensor = pd.read_csv("BEATPD_data/cis-pd/training_data/004ed441-24db-4839-8b5d-7465e4ea2a0a.csv")
sensor2 = pd.read_csv("BEATPD_data/cis-pd/training_data/01e0355c-e1a7-4958-a45e-8ab87107fb2b.csv")
sensor3 = pd.read_csv("BEATPD_data/cis-pd/training_data/020b3f90-ea23-459a-993a-af979bc23d2f.csv")

sensor.head(10)

In [None]:
# Summary statistics of the first recording.
sensor.describe()

In [None]:
# (rows, columns) of the first recording.
sensor.shape

In [None]:
def plot_time_series(data):
    """Plot the X, Y and Z columns of one recording against its ``time`` column.

    Opens a new 15x7 figure, draws one semi-transparent line per axis, adds a
    legend, a title and axis labels, and shows the figure.

    Args:
        data: Object indexable by the column names "time", "X", "Y" and "Z"
            (the notebook passes pandas DataFrames).

    Returns:
        None.
    """
    plt.figure(figsize=(15, 7))

    # Read the shared x-values once and reuse them for all three lines.
    timestamps = data["time"]
    for axis_name in ("X", "Y", "Z"):
        plt.plot(timestamps, data[axis_name], label=axis_name, alpha=0.7)

    plt.title("Directional acceleration during 20-minute interval", fontsize=18)
    plt.xlabel("time (sec)")
    plt.ylabel("directional acceleration (gravitational units)")
    # The legend is built after plotting so that it picks up the three line labels.
    plt.legend(loc="best", fontsize=15)
    plt.show()

In [None]:
# For each of the three recordings: rename the "Timestamp" column to "time"
# (the column name plot_time_series reads) in place, then plot it.
for data in [sensor, sensor2, sensor3]:
    data.rename(columns={"Timestamp": "time"},inplace=True)
    plot_time_series(data)

## 3. Data: Patient demographics

Lastly, we also checked the demographics of the patients.

In [None]:
# Load the CIS-PD demographics table and show its (rows, columns).
cis_demo = pd.read_csv("BEATPD_data/cis-pd/clinical_data/CIS-PD_Demographics.csv")
cis_demo.shape

Only 21 patients in total participated in the study.

In [None]:
# Preview the first rows of the demographics table.
cis_demo.head()

In [None]:
# Histogram of the Age column of the CIS-PD demographics table.
plt.figure(figsize=(7,5))
plt.title("Age distribution of patients, CIS-PD", fontsize=18)
plt.hist(cis_demo.Age, label="CIS-PD")
# Disabled REAL-PD overlay and legend: real_demo is only loaded further down
# in this notebook, so these lines cannot run at this point.
# plt.hist(real_demo.Age, label="REAL-PD", alpha=0.7)
# plt.legend(loc="best", fontsize=13)
plt.xlabel("Age", fontsize=14)
plt.ylabel("frequency",fontsize=14)
plt.show()

In [None]:
# Mean of the Age column.
cis_demo["Age"].mean()

Patients in both datasets are generally in their 50s to 80s.

In [None]:
# Number of non-null subject_id entries for each Gender value.
cis_demo.groupby("Gender")['subject_id'].count()

The CIS-PD dataset is imbalanced in terms of gender: there are twice as many male as female patients.

## REAL-PD data exploration

In [None]:
# Load the REAL-PD training label table and keep only rows that have an
# on_off label.
real = pd.read_csv("BEATPD_data/real-pd/data_labels/REAL-PD_Training_Data_IDs_Labels.csv")
real = real.dropna(subset=['on_off'])

# The preview goes last: a notebook cell only displays its final expression,
# so a head() call placed before the assignment above is never shown.
real.head()

In [None]:
# Histogram of the on_off labels in the REAL-PD label table.
plt.figure(figsize=(7,5))
plt.title("Distribution of medication status (on_off) labels, REAL-PD dataset", fontsize=15)
plt.xlabel("Label")
plt.ylabel("Frequency")

# Unit-width bins whose edges sit at -0.5, 0.5, ..., max + 0.5, so each bar is
# centred on a whole-number label value.
bins = np.arange(0, real['on_off'].max() + 1.5) - 0.5
plt.hist(real['on_off'], bins, edgecolor="black")
# One tick per bin centre. The last edge is dropped; otherwise an extra tick
# appears at max + 1, where there is no bar.
plt.xticks(bins[:-1] + 0.5)
plt.show()

In [None]:
# For each (subject_id, on_off) pair, count the non-null entries in every other column.
real.groupby(["subject_id", "on_off"]).count()

In [None]:
# Load the REAL-PD demographics table.
real_demo = pd.read_csv("BEATPD_data/real-pd/clinical_data/REAL-PD_Demographics.csv")

In [None]:
# Number of non-null subject_id entries for each Gender value.
real_demo.groupby("Gender")['subject_id'].count()