In [1]:
# imports
import os
import sys

import pandas as pd

In [2]:
# Here we find our submodule directory, make sure it has notebooks, and add them to path and import

def findSubmoduleDir(path: str) -> str:
    """Locate the ``covidclinicaldata`` directory at or above ``path``.

    ``path`` is made absolute, then that directory and each of its ancestors
    (up to and including the filesystem root) is checked for a
    ``covidclinicaldata`` subdirectory.

    Parameters
    ----------
    path : str
        Directory where the upward search starts (may be relative).

    Returns
    -------
    str
        Absolute path of the first ``covidclinicaldata`` directory found.

    Raises
    ------
    FileNotFoundError
        If neither ``path`` nor any ancestor contains ``covidclinicaldata``.
        You can edit the CWPATH variable to point to the containing directory.
    """
    current = os.path.abspath(path)
    while True:
        candidate = os.path.join(current, 'covidclinicaldata')
        if os.path.isdir(candidate):
            return candidate
        # Step to the parent of the directory just checked. Re-resolving a
        # literal '..' would always yield the parent of the working
        # directory and the search would never progress.
        parent = os.path.dirname(current)
        if parent == current:
            # dirname() of a filesystem root is the root itself: nothing left to search.
            raise FileNotFoundError("Could not find the covidclinicaldata/ subdirectory")
        current = parent
    
def mustContainNotebooks(path: str):
    """Verify that ``path`` has a ``notebooks`` subdirectory.

    Parameters
    ----------
    path : str
        Directory expected to contain ``notebooks``.

    Raises
    ------
    FileNotFoundError
        If ``path + '/notebooks'`` is not an existing directory.
    """
    notebooks_dir = path + '/notebooks'
    if os.path.isdir(notebooks_dir):
        return
    raise FileNotFoundError("Could not find Jupyter notebooks in covidclinicaldata/- is it initialized?")
        
# Configuration: locate the covidclinicaldata/ directory and put its notebooks on sys.path.
#
# CWPATH is the directory where the upward search for covidclinicaldata/ starts.
# If you point it at a system-specific path locally, do not commit that change
# to the GitHub repo.
CWPATH = '.'
# NOTE(review): DATA_SUFFIX is not referenced in the cells visible here
# (open_data below hard-codes '.csv') -- confirm whether it is still needed.
DATA_SUFFIX = '.csv' # Assume any CSV is correct data
# Path of the covidclinicaldata/ directory found at or above CWPATH.
MOD_DIR = findSubmoduleDir(CWPATH)
# Fail early if covidclinicaldata/ has no notebooks/ subdirectory.
mustContainNotebooks(MOD_DIR)
# Directory that contains covidclinicaldata/.
PROJ_DIR = os.path.abspath(MOD_DIR + '/..')
# Put the notebooks directory first on the import path; presumably this is what
# lets the ipynb import below find data_processing -- TODO confirm.
sys.path.insert(0, MOD_DIR+'/notebooks')

from ipynb.fs.full.data_processing import (
    filter_patients,
    filter_pos,
    get_sym_severity,
    get_sym_severity_score,
    is_abnormal_cxr,
    open_data, #This is being overridden
    plot_fill_rates,
    print_data_info,
    SYMPTOMS,
    VITALS,
)

In [3]:
# Read Data

## We'd like to use the original open_data function, but it takes no PATH argument.
## This one also ignores indexes
def open_data(path: str) -> pd.DataFrame:
    '''Load every CSV file in `path` into a single DataFrame.

    Files are read in sorted filename order so that the row order of the
    result (and of anything derived from it, such as ``df.head(N)``) is the
    same on every run; ``os.listdir`` alone returns entries in arbitrary
    order. The per-file indexes are discarded and the result is re-indexed
    from 0.

    Parameters
    ----------
    path : str
        Directory containing the ``.csv`` files.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all CSV files found in `path`.

    Raises
    ------
    ValueError
        If `path` contains no ``.csv`` files.
    '''
    csv_names = sorted(
        name for name in os.listdir(path) if name.endswith('.csv')
    )
    if not csv_names:
        # pd.concat would also raise ValueError on an empty list, but without
        # saying which directory was searched.
        raise ValueError(f'No .csv files found in {path!r}')
    return pd.concat(
        [pd.read_csv(os.path.join(path, name)) for name in csv_names],
        ignore_index=True,
    )

# Read the CSV files under the data/ directory of MOD_DIR into one DataFrame.
df = open_data(f'{MOD_DIR}/data')

In [4]:
def normalize(column)-> pd.Series:
    """Min-max scale ``column`` so its values span 0 to 1.

    Parameters
    ----------
    column
        Object providing ``min()``/``max()`` and arithmetic, e.g. a
        ``pandas.Series``.

    Returns
    -------
    pandas.Series
        ``(column - min) / (max - min)``. When max equals min the input is
        returned unchanged (same object), which avoids dividing by zero.
    """
    minimum = column.min()
    span = column.max() - minimum
    if span != 0:
        return (column - minimum) / span
    # Constant column: there is no range to scale by, so hand it back as-is.
    return column

In [5]:
# Discard the listed columns; errors='ignore' skips any name that is not present in df.
df = df.drop(
    columns=[
        'batch_date',
        'test_name',
        'swab_type',
        'cxr_link',
        'cxr_findings',
        'cxr_impression',
    ],
    errors='ignore',
)

In [6]:
# Recode the listed string, boolean and missing values as numbers.
# The replacement is applied to df in place.
df.replace(
    to_replace={
        'Negative': 0,
        'Positive': 1,
        'Significant': 1,
        # Graded levels (presumably symptom severity -- TODO confirm).
        'Mild': 0,
        'Moderate': 0.5,
        'Severe': 1,
        # Booleans become 0/1.
        False: 0,
        True: 1,
        # Missing values are mapped to the midpoint value.
        pd.NA: 0.5,
    },
    inplace=True,
)

In [7]:
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Keep only the first 100 rows (change the head() argument to test on a
# different number of entries), then pass every column through normalize().
df = df.head(100).apply(normalize, axis=0)

In [8]:
# Scratch code: this function does nothing useful and is only kept for reference.
def ajs_normalize(df: pd.Series) -> pd.Series:
    """Map the ``covid19_test_results`` entry of ``df`` to a placeholder label.

    Parameters
    ----------
    df : pandas.Series
        Record indexable by ``'covid19_test_results'`` (despite the name, the
        annotation says this is a Series, not a DataFrame).

    Returns
    -------
    pandas.Series
        One-element Series: ``'Yerp'`` for ``"Positive"``, ``'Nerp'`` for
        ``"Negative"``, and ``'wut'`` for anything else.
    """
    outcome = df['covid19_test_results']
    # Checked in the same order as an if/elif chain: Positive first, then Negative.
    for expected, label in (("Positive", 'Yerp'), ("Negative", 'Nerp')):
        if outcome == expected:
            return pd.Series(label)
    return pd.Series('wut')
#norm_data = data.apply(ajs_normalize, axis=1)