### Helper Functions to:
- Recenter the `Day` values so that exposure starts at Day 0 (pre-exposure days become negative),
rather than the day numbers cycling
- Fill in the missing values for an individual's study
- Repeat the above for all subjects, and save the results to new CSVs in the `transformed_csvs` directory

In [14]:
import pandas as pd
import numpy as np
import os

# Do not truncate rows when a DataFrame is displayed.
pd.set_option('display.max_rows', None)

# Number of rows expected for each Day of a subject's study; days with
# fewer rows are padded up to this count by fill_missing_values().
# (Presumably one sample every 30 minutes -- confirm against the data.)
SAMPLING_FREQ = 48

def recenter_df(table, subject):
    """Recenter one subject's Day numbers so that exposure starts at Day 0.

    Rows with ``Y == 0`` (before exposure) are shifted so that the last
    of them lands on Day -1; every other row keeps its Day value.

    Keyword arguments:
    table -- CSV file (path or file-like object) to read information from
    subject -- The subject whose study's days will be rearranged

    Returns a new DataFrame holding only ``subject``'s rows, in file
    order, with a fresh 0..n-1 index. If the subject has no unexposed
    rows there is nothing to shift and the Day values are returned as
    read.
    """
    df = pd.read_csv(table)
    # Take an explicit copy so the assignment below never writes into a
    # filtered view of the full table.
    df = df[df['SubjectName'] == subject].copy()

    unexposed = df['Y'] == 0
    if unexposed.any():
        # Number of days by which to shift the unexposed Day numbers:
        # one past the Day of the last unexposed row.
        day_exposed = df.loc[unexposed, 'Day'].iloc[-1] + 1

        # Shift through the boolean mask instead of gluing the two
        # groups back together by position: the result is then correct
        # regardless of how exposed and unexposed rows are ordered in
        # the file, and rows whose Y is neither 0 nor 1 are left alone.
        df.loc[unexposed, 'Day'] = df.loc[unexposed, 'Day'] - day_exposed

    return df.reset_index(drop=True)


def fill_missing_values(table, subject):
    """Pad every Day of a subject's study up to SAMPLING_FREQ rows.

    The subject's rows are first recentered with recenter_df(). Each
    Day that has fewer than SAMPLING_FREQ rows then receives the
    missing number of filler rows, each holding that Day's mean of
    every numeric column. Days that already have SAMPLING_FREQ rows or
    more are kept unchanged.

    Keyword arguments:
    table -- CSV file to read information from
    subject -- The subject whose study's missing values will be
    filled with each variable's mean

    Returns a DataFrame with a fresh 0..n-1 index. Within a padded Day
    the filler rows come before the recorded rows.
    """
    df = recenter_df(table, subject)
    per_day = []

    # Iterate through each Day of study for subject
    for day in np.unique(df['Day']):
        df_day = df[df['Day'] == day]

        # A positive count means the day is under-sampled. A day with
        # more rows than expected yields a negative count and must not
        # reach Index.repeat(), which rejects negative repeats.
        num_missing = SAMPLING_FREQ - len(df_day)
        if num_missing > 0:
            # Mean of each numeric variable; numeric_only=True is
            # required because the frame also holds text columns
            # (e.g. SubjectName), which newer pandas refuses to average.
            filler = df_day.mean(numeric_only=True).to_frame().T

            # Repeat number of times equal to num_missing
            filler = filler.loc[filler.index.repeat(num_missing)]

            # NOTE: the filler frame only has the numeric columns and is
            # listed first, so for padded days those columns lead the
            # column order of the concatenated frame.
            df_day = pd.concat(
                [filler, df_day], sort=False).reset_index(drop=True)
        per_day.append(df_day)

    df_new = pd.concat(per_day, sort=False)

    # Filler rows have NaN in the non-numeric columns; restore the two
    # categorical ones that are constant for this subject.
    df_new['SubjectName'] = df_new['SubjectName'].fillna(subject)
    df_new['Study'] = df_new['Study'].fillna(df['Study'].iloc[0])
    return df_new.reset_index(drop=True)
            

def concat_subjects(table):
    """
    Execute fill_missing_values() for all subjects in the table and
    save the combined result.

    Keyword arguments:
    table -- CSV file whose missing values will be filled for
    each subject

    The combined frame is written to
    ./transformed_csvs/<Study>_transformed.csv, where <Study> is the
    'Study' value of the table's first row; the directory is created
    if it does not exist yet. The frame's index is written as the
    first CSV column (pandas' to_csv default). Returns the combined
    DataFrame.
    """
    df = pd.read_csv(table)
    frames = [
        fill_missing_values(table, subject)
        for subject in np.unique(df['SubjectName'])
    ]
    df_new = pd.concat(frames, sort=False)

    # Filler rows carry the Day as a float mean; cast back to int.
    df_new['Day'] = df_new['Day'].astype(int)
    df_new = df_new.reset_index(drop=True)

    # to_csv does not create missing directories, so make sure the
    # output directory exists before writing.
    out_dir = './transformed_csvs'
    os.makedirs(out_dir, exist_ok=True)
    df_new.to_csv(
        os.path.join(out_dir, df['Study'].iloc[0] + '_transformed.csv'))
    return df_new

### Fill in Missing Data/Fix Sampling Inconsistencies for the Rest of the CSV Files

In [21]:
def find_csvs(path, suffix=".csv"):
    """
    List the names of the files in a directory that end with a suffix.

    Keyword arguments:
    path -- The directory in which to look for CSV files
    suffix -- The file-name ending to match (default ".csv")

    Returns the matching entry names (not full paths) in the order
    os.listdir() reports them.
    """
    matches = []
    for entry in os.listdir(path):
        if entry.endswith(suffix):
            matches.append(entry)
    return matches

# Collect the CSV file names in the current working directory. The names
# are relative, so they resolve correctly only while the working
# directory stays the one that was listed.
csvs = find_csvs(os.getcwd())
# Skip the subject metadata table (presumably not a per-study data file --
# confirm). list.remove raises ValueError if this file is not present.
csvs.remove('SubjectMetaData.csv-Table 1.csv')
# Transform every remaining file; each call writes one CSV.
# NOTE(review): the loop variable `csv` would shadow the standard-library
# `csv` module if that module were imported in this notebook.
for csv in csvs:
    concat_subjects(csv)