# Data Import Module
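This notebook acts as a module that can be imported into the Master Notebook. It provides one import function per data acquisition source (for example indoor environmental quality sensors, Fitbit, and Beiwe surveys), each of which reads the raw files for one participant and returns a dataframe.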

## Indoor Environmental Quality Data
Used for import of:
- Temperature
- Relative Humidity
- PM2.5 Concentration

In [1]:
def ieqImport(ID, sensor, name_list, column_list='all', starting='03/11/2019', ending='04/15/2019'):
    '''
    Reads every csv file logged by one sensor and returns the readings that fall inside the requested date range.

    Inputs:
        - ID: string representing the ID number; used to build the directory
            'Data/Student<ID>/beacon_data/bevo/<sensor>/'
        - sensor: string representing the sensor which corresponds to the directory to search through for data gathered
            from that sensor
        - name_list: a list of strings to use as the column labels in the dataframe; it must contain 'time', the column
            holding the timestamp in seconds since the Unix epoch (UTC)
        - column_list: a list of numbers that correspond to the columns to be imported, default is 'all'
        - starting: string ('mm/dd/YYYY') representing the first date to use in the data range
        - ending: string ('mm/dd/YYYY') representing the last date to use in the data range
    Returns a dataframe indexed by 'time_index' (the timestamp shifted from UTC by a fixed 5 hours) that contains the
    remaining columns of name_list. An empty dataframe is returned when there is no data to import.

    Date range rules:
        - starting == ending: every reading from that calendar day is kept
        - otherwise: readings with starting 00:00 <= time_index <= ending 00:00 are kept, i.e. the upper bound is midnight
            at the START of the ending date
    '''
    # Important variables
    df = pd.DataFrame() # Empty dataframe returned when there is nothing to import
    start_date = datetime.strptime(starting, '%m/%d/%Y') # converting input to datetime
    end_date = datetime.strptime(ending, '%m/%d/%Y') # converting input to datetime

    # Location of file
    DIR = 'Data/Student' + ID + '/beacon_data/bevo/' + sensor + '/'

    # The isinstance guard keeps the comparison well-defined when column_list is an array rather than a list
    read_all = isinstance(column_list, str) and column_list == 'all'

    # Looping through all the files in the sensor directory and collecting one dataframe per csv file;
    # they are concatenated once at the end instead of growing a dataframe inside the loop
    frames = []
    for file in os.listdir(DIR):
        if not file.endswith('csv'): # To ensure that we only read in csv files
            continue
        if read_all: # if no column numbers are specified, the default is to read them all
            frames.append(pd.read_csv(DIR + file,header=None,names=name_list))
        else:
            frames.append(pd.read_csv(DIR + file,header=None,names=name_list,usecols=column_list))

    ## A directory without csv files used to fail later with a KeyError on 'time'
    if not frames:
        print('\tNo data from this deployment range')
        return df

    raw_data = pd.concat(frames,axis=0,ignore_index=True)
    raw_data = raw_data.dropna() # Dropping any NaNs
    if raw_data.empty:
        print('\tNo data from this deployment range')
        return df

    # Creating the time index: epoch seconds (UTC) are truncated to whole seconds, converted in one vectorized call,
    # and shifted by the fixed 5-hour offset (no daylight-saving adjustment is made)
    utc_seconds = raw_data['time'].astype('int64')
    shifted_time = pd.to_datetime(utc_seconds, unit='s') - pd.Timedelta(hours=5)

    # Re-indexing and re-naming
    raw_data = raw_data.drop(['time'],axis=1)
    raw_data.index = pd.DatetimeIndex(shifted_time.values, name='time_index') # Setting time as the dataframe index
    raw_data = raw_data.sort_index()

    # Removing data from DF that isn't in the deployment range
    ## Checking to see if there is data in the range
    if raw_data.index[-1] < start_date:
        print('\tNo data from this deployment range')
        return df

    if start_date == end_date:
        ## Importing one day's worth of data: compare full calendar dates so the year is taken into account too
        in_range = raw_data.index.normalize() == pd.Timestamp(start_date)
    else:
        ## Boolean mask instead of positional slicing: the previous slice [0:end_index] always discarded the last
        ## reading that was still inside the range
        in_range = (raw_data.index >= start_date) & (raw_data.index <= end_date)

    # Storing the cleaned data to the final dataframe
    df = raw_data[in_range]
    print('\tNumber of datapoints: ' + str(len(df)))

    # Returning dataframe with cleaned data
    return df

## Fitbit Data
Used for import of:
- Sleep Stages

In [2]:
def fitbitImport(ID, file_name, name_list, column_list='all', starting='03/11/2019', ending='04/15/2019'):
    '''
    Reads one Fitbit csv export and returns the rows that fall inside the requested date range.

    Inputs:
        - ID: string representing the ID number; the file is looked up in 'Data/Student<ID>/'
        - file_name: a string that contains the csv file we are looking for
        - name_list: a list of strings to use as the column labels in the dataframe (they replace the header row of
            the file); it must contain 'Time', whose values are formatted like '03/11/2019 01:00:00 AM'
        - column_list: a list of numbers that correspond to the columns to be imported, default is 'all'
        - starting: string ('mm/dd/YYYY') representing the first date to use in the data range
        - ending: string ('mm/dd/YYYY') representing the last date to use in the data range
    Returns a dataframe indexed by 'Time' containing the measured variables that correspond to the file_name variable.
    An empty dataframe is returned when the file is missing, holds no rows, or ends before the starting date.

    Date range rules:
        - starting == ending: every row from that calendar day is kept
        - otherwise: rows with starting 00:00 <= Time <= ending 00:00 are kept, i.e. the upper bound is midnight at the
            START of the ending date
    '''
    # Important variables
    df = pd.DataFrame() # Empty dataframe returned when there is nothing to import
    start_date = datetime.strptime(starting, '%m/%d/%Y') # converting input to datetime
    end_date = datetime.strptime(ending, '%m/%d/%Y') # converting input to datetime

    ## Location of file
    DIR = 'Data/Student' + ID + '/'

    ## pandas reads every column when usecols is None
    if isinstance(column_list, str) and column_list == 'all':
        usecols = None
    else:
        usecols = column_list

    # Importing the data from file
    try:
        raw_data = pd.read_csv(DIR + file_name, header=0, names=name_list, usecols=usecols)
    except FileNotFoundError:
        print('No file found. Wrong path or no data available.')
        ## Returning here: execution used to continue and fail on the undefined raw_data
        return df

    if raw_data.empty:
        print('\tNo data from this deployment range')
        return df

    ## Converting the time column to datetime
    raw_data['Time'] = pd.to_datetime(raw_data['Time'], format="%m/%d/%Y %I:%M:%S %p")
    raw_data = raw_data.set_index('Time') # Setting time as the dataframe index
    raw_data = raw_data.sort_index()

    # Removing data from DF that isn't in the deployment range
    ## Checking to see if there is data in the range
    if raw_data.index[-1] < start_date:
        print('\tNo data from this deployment range')
        return df

    if start_date == end_date:
        ## Importing one day's worth of data: compare full calendar dates so the year is taken into account too
        in_range = raw_data.index.normalize() == pd.Timestamp(start_date)
    else:
        ## Boolean mask instead of positional slicing: the previous slice [0:end_index] always discarded the last
        ## row that was still inside the range
        in_range = (raw_data.index >= start_date) & (raw_data.index <= end_date)

    # Storing the cleaned data to the final dataframe
    df = raw_data[in_range]
    print('\tNumber of datapoints: ' + str(len(df)))

    # Returning dataframe with cleaned data
    return df

## Beiwe Survey Data

In [10]:
# Reading in the data file
def surveyImport(ID,starting='03/11/2019', ending='04/15/2019'):
    '''
    Reads the sleep survey csv files of one participant and converts the answers to numeric scores.

    Inputs:
        - ID: string representing the ID number; files are read from 'Data/Student<ID>/beiwe_data/sleep_surveys/'
        - starting: string ('mm/dd/YYYY'); only files dated strictly AFTER this date are imported
        - ending: string ('mm/dd/YYYY'); files dated up to and including this date are imported
    Returns a dataframe indexed by 'Night' (the date taken from the first 10 characters of the file name, 'YYYY-MM-DD')
    and sorted by date, with the columns:
        - Time_Asleep: average of the numbers in the first answer (e.g. '7-8' gives 7.5); 0 when the answer is missing,
            'NOT_PRESENTED', or contains no number
        - Restful: 0 (Not at all) to 3 (Very); -1 for any other answer
        - Refreshed: 0 (Not at all) to 3 (Very); -1 for any other answer
        - Aggregate: Time_Asleep + Restful + Refreshed
        - Refresh/Relax: Restful + Refreshed
    '''
    import re # imported locally because this function is the only user of it

    # Location of file
    DIR = 'Data/Student' + ID + '/beiwe_data/sleep_surveys/'

    # Answer text -> numeric score; anything not listed scores -1
    restful_lookup = {'Not at all restful': 0, 'Slightly restful': 1, 'Somewhat restful': 2, 'Very restful': 3}
    refresh_lookup = {'Not at all refreshed': 0, 'Slightly refreshed': 1, 'Somewhat refreshed': 2,
        'Very refreshed': 3}

    #Date Range
    start_date = datetime.strptime(starting, '%m/%d/%Y') # converting input to datetime
    end_date = datetime.strptime(ending, '%m/%d/%Y') # converting input to datetime

    records = [] # One dictionary per imported survey
    for file in os.listdir(DIR):
        # Checking to see if the file is a csv
        if not file.endswith('csv'):
            continue
        file_date = datetime.strptime(file[:10],'%Y-%m-%d')
        # Checking to make sure we stay in the date range
        # NOTE(review): the lower bound is exclusive, so a survey dated on the starting date is skipped - confirm intended
        if not (start_date < file_date <= end_date):
            continue

        raw_data = pd.read_csv(DIR + file,header=None,usecols=[2,4],skiprows=2,nrows=3,names=['Question','Answer'])
        answers = raw_data['Answer']

        ## Getting average number of hours slept
        hours_answer = answers[0]
        ## read_csv turns the text 'NaN' into a float NaN, so the missing case has to be detected with pd.isna
        if pd.isna(hours_answer) or hours_answer == 'NOT_PRESENTED':
            hours_slept = 0
        else:
            ## Extracting whole numbers rather than single characters so that answers like '9-10' are read correctly
            bounds = [float(value) for value in re.findall(r'\d+(?:\.\d+)?', str(hours_answer))][:2]
            hours_slept = sum(bounds)/len(bounds) if bounds else 0

        ## Getting numeric scores for restfulness and refreshedness
        restful = restful_lookup.get(answers[1], -1)
        refreshed = refresh_lookup.get(answers[2], -1)

        ## Getting Sleep Scores
        records.append({'Night': file_date, 'Time_Asleep': hours_slept, 'Restful': restful, 'Refreshed': refreshed,
            'Aggregate': hours_slept + restful + refreshed, # Sleep score based on summing all values
            'Refresh/Relax': restful + refreshed}) # Sleep score just based on refresh and restful

    # Sorting by day and returning
    ## Passing the column names explicitly keeps the 'Night' column available when no survey was imported
    df = pd.DataFrame(records, columns=['Night', 'Time_Asleep', 'Restful', 'Refreshed', 'Aggregate', 'Refresh/Relax'])
    df = df.set_index('Night')
    df = df.sort_index()
    print('\tNumber of surveys imported: ' + str(len(df)))
    return df