# Data Derivatives Module
The purpose of this notebook is to act as a module that can be imported into the Master Notebook to derive certain features from raw data.

## Fine Particulate Matter (PM2.5) Derivatives
Below are functions that derive the following features:
1. Air Quality Index (AQI)
2. Concentration During Sleep

### Air Quality Index
The scale for the PM2.5 Air Quality Index (AQI) can be found [here](https://aqicn.org/calculator/) with an accompanying calculator.

In [9]:
def getAQI(concentration):
    '''
    Input:
        - concentration: numpy float array (or any iterable of numbers) holding the
          PM2.5 concentrations in ug/m^3
    Returns a list holding the PM2.5 concentrations as air quality index values.

    Each value is linearly interpolated inside its breakpoint band:
        AQI = I_lo + (C - C_lo) / (C_hi - C_lo) * (I_hi - I_lo)
    Concentrations above the last breakpoint (500.4) are extrapolated with the last
    band, so they produce an index above 500. NaN concentrations are passed through
    as NaN instead of raising while rounding.
    '''
    # (C_lo, C_hi, I_lo, I_hi) for every band. Every band after the first starts at
    # the index one above the previous band's top (51, 101, ...), so that the first
    # concentration of a band is reported in that band's category and not in the
    # previous one.
    breakpoints = (
        (0.0, 12.0, 0, 50),
        (12.1, 35.4, 51, 100),
        (35.5, 55.4, 101, 150),
        (55.5, 150.4, 151, 200),
        (150.5, 250.4, 201, 300),
        (250.5, 350.4, 301, 400),
        (350.5, 500.4, 401, 500),
    )

    aqi = []
    for C in concentration:
        value = float(C)
        if value != value:
            # NaN is the only value that is not equal to itself; it cannot be rounded
            aqi.append(float('nan'))
            continue

        # Pick the first band whose upper concentration covers the value. If no band
        # matches, the loop variables keep the last band, which is then used to
        # extrapolate.
        for c_lo, c_hi, i_lo, i_hi in breakpoints:
            if value <= c_hi:
                break

        aqi.append(round(i_lo + (value - c_lo) / (c_hi - c_lo) * (i_hi - i_lo)))

    return aqi

In [1]:
def getSleepConcentration(concentration, sleep_stages,visualize = False):
    '''
    Inputs:
        - concentration: dataframe of concentration measurements whose index, written
          as a string, starts with 'YYYY-MM-DD HH:MM'; the optional plot reads its
          'PM2.5' column
        - sleep_stages: dataframe of sleep stage records with the same kind of index
        - visualize: when True, plots the matched concentrations together with a
          marker at every sleep stage timestamp
    Returns a dataframe holding the concentrations at night: one row for every pair of
    sleep stage record and concentration record that share the same minute, indexed by
    that minute ('Time'). The input dataframes are left unmodified.
    '''
    # The minute-resolution key is added to copies so the callers' dataframes do not
    # gain a 'short_date' column as a side effect.
    stages_keyed = sleep_stages.copy()
    stages_keyed['short_date'] = [str(stamp)[0:16] for stamp in sleep_stages.index]

    concentration_keyed = concentration.copy()
    concentration_keyed['short_date'] = [str(stamp)[0:16] for stamp in concentration.index]

    # Inner merge: only the minutes present in both dataframes are kept
    sleep_concentration = stages_keyed.merge(concentration_keyed,left_on='short_date',right_on='short_date')

    # Creating a datetime index from the minute key
    sleep_concentration['Time'] = [datetime.strptime(key,'%Y-%m-%d %H:%M')
                                   for key in sleep_concentration['short_date']]
    sleep_concentration = sleep_concentration.set_index('Time')
    # drop() returns a new dataframe, so the result has to be kept for the helper
    # column to actually be removed
    sleep_concentration = sleep_concentration.drop('short_date',axis=1)

    if visualize:
        # Plotting to see if sleep concentration coincides with sleep time
        fig, ax = plt.subplots(figsize = (16,8))
        ax.scatter(sleep_concentration.index,sleep_concentration['PM2.5'].values)
        ax.scatter(sleep_stages.index,np.ones(len(sleep_stages)))

    return sleep_concentration

In [17]:
def getPMMetrics(concentration):
    '''
    Inputs:
        - concentration: time-indexed pandas series of concentrations, ordered in time
          (presumably the PM2.5 column of the night-time concentrations - confirm
          against the caller)
    Returns a dataframe indexed by 'Night' holding the 'Peak' and 'Median'
    concentration of every night. A new night starts wherever two consecutive
    samples are more than one hour apart, and it is labelled with the date of its
    first sample. Missing values (NaN) are ignored by both metrics. An empty input
    gives an empty dataframe.
    '''
    times = concentration.index
    n_samples = len(concentration)

    # Positions where the gap to the previous sample exceeds one hour
    breaks = [i + 1 for i in range(n_samples - 1)
              if times[i+1] - times[i] > timedelta(hours = 1)]
    edges = [0] + breaks + [n_samples]

    night = []
    peaks = []
    medians = []

    for start, stop in zip(edges[:-1], edges[1:]):
        if start == stop:
            # Only happens for an empty input, which has no first sample to label
            continue
        # Positional slicing keeps the segment a pandas object
        segment = concentration.iloc[start:stop]
        night.append(datetime.strptime(str(times[start])[0:10],'%Y-%m-%d'))
        # .max() skips NaN; the builtin max() gives NaN or an order-dependent value
        # as soon as a NaN is present
        peaks.append(segment.max())
        medians.append(segment.median())

    d = {'Night': night, 'Peak': peaks, 'Median': medians}
    df = pd.DataFrame(data=d)
    df = df.set_index('Night')
    return df

## Sleep Stages/Quality Derivatives
Below are functions that derive the following features:
1. Sleep Metrics including:
    - Sleep Latency
    - Sleep Efficiency
    - Time Spent Asleep
    - Percent of Time Spent Awake
    - Percent of Time Spent in REM
    - Percent of Time Spent in non-REM
    - Periodic Wake Episodes
2. Sleep Efficiency Grade

### Sleep Efficiency
Sleep efficiency is defined as the percentage of the time spent in bed during which the person is asleep.

In [7]:
def getSleepMetrics(sleep_stages):
    '''
    Inputs:
        - sleep_stages: Dataframe holding the time (as index) and sleep stage label
          ('Stage_Label' column), ordered in time, one row per 30-second epoch
    Returns a dataframe with the date and various sleep metrics. It is indexed by
    'Night' (date of the first record of the night) and has the columns:
        - Time_Asleep: length of the night's record in hours
        - Latency: hours of leading 'wake' records before the first other stage; for
          a night made only of 'wake' records this is the whole record length
        - Efficiency: percentage of records that are not 'wake'
        - Awake %, REM %, Non-REM %: percentage of records labelled 'wake', 'rem',
          and anything else
    '''
    stages = sleep_stages['Stage_Label']
    times = sleep_stages.index
    n_records = len(sleep_stages)

    # Parsing out the days by looking for timesteps greater than 5 minutes
    breaks = [i + 1 for i in range(n_records - 1)
              if times[i+1] - times[i] > timedelta(seconds = 300)]
    edges = [0] + breaks + [n_records]

    latency = []
    efficiency = []
    night = []
    time_asleep = []
    awake_percentage = []
    rem_percentage = []
    nonrem_percentage = []

    for start, stop in zip(edges[:-1], edges[1:]):
        # A plain list allows positional access, whatever index the series carries
        labels = list(stages.iloc[start:stop])
        total = len(labels)
        # Skipping nights with fewer than 120 records, i.e. less than one hour of
        # 30-second epochs.
        # NOTE(review): if a 2-hour minimum was intended the threshold should be 240 - confirm
        if total <= 119:
            continue

        night.append(datetime.strptime(str(times[start])[0:10],'%Y-%m-%d'))
        # NOTE(review): this counts every record of the night, 'wake' included -
        # confirm that this is the intended meaning of 'Time_Asleep'
        time_asleep.append(total*30/60/60)

        # Counting the leading 'wake' records; the bound keeps the scan inside the
        # night when the person never leaves the 'wake' stage
        n = 0
        while n < total and labels[n] == 'wake':
            n += 1
        latency.append((n*30)/60/60)

        wake_count = labels.count('wake')
        rem_count = labels.count('rem')
        # Every label that is neither 'wake' nor 'rem' is treated as non-REM sleep
        nonrem_count = total - wake_count - rem_count

        efficiency.append((1 - wake_count/total)*100)
        awake_percentage.append(wake_count/total*100)
        rem_percentage.append(rem_count/total*100)
        nonrem_percentage.append(nonrem_count/total*100)

    d = {'Night': night, 'Time_Asleep': time_asleep, 'Latency': latency, 'Efficiency': efficiency,
         'Awake %': awake_percentage,
         'REM %': rem_percentage,
         'Non-REM %': nonrem_percentage}
    df = pd.DataFrame(data=d)
    df = df.set_index('Night')
    return df

In [13]:
def getSleepEfficiencyGrade(sleep_metrics):
    '''
    Inputs:
        - sleep_metrics: a dataframe that holds different sleep metrics, indexed by the
          night; its 'Efficiency' column is read
    Returns the same dataframe with an added "Efficiency_Grade" column that holds the
    letter grade of the sleep efficiency: A for 90 and above, B from 80, C from 70,
    D from 60 and F for everything else (missing values included). The column is
    added to the dataframe passed in, it is not a copy.
    '''
    grade = [] # list holding the grade
    # Iterating over the values directly, so the grading does not depend on the kind
    # of index the dataframe carries
    for value in sleep_metrics['Efficiency'].values:
        if value >= 90:
            grade.append('A')
        elif value >= 80:
            grade.append('B')
        elif value >= 70:
            grade.append('C')
        elif value >= 60:
            grade.append('D')
        else:
            grade.append('F')

    sleep_metrics['Efficiency_Grade'] = grade
    return sleep_metrics