# Data Import Module
This notebook acts as a module: the Master Notebook imports it to read in data, with one import function per data acquisition source (IEQ beacons, Fitbit, and Beiwe).

## Indoor Environmental Quality Data
Used for import of:
- Temperature
- Relative Humidity

In [12]:
def ieqImport(student_no, directory, name_list, column_list='all', starting='03/11/2019', ending='04/15/2019'):
    '''
    Imports one IEQ variable for every beacon in the deployment and trims it to the deployment window.

    Inputs:
        - student_no: not used inside this function; kept so that existing calls keep working
        - directory: name of the sensor folder inside each beacon's 'bevo' folder where the csv files are stored
        - name_list: a list of strings for the columns to be used in the dataframe; one of them must be 'time'
            and hold the UTC unix timestamp (in seconds) of each reading
        - column_list: a list of numbers that correspond to the columns to be imported, default is 'all'
        - starting: first day of the deployment as 'mm/dd/YYYY'; readings before this day are removed
        - ending: last day of the deployment as 'mm/dd/YYYY'; readings from this day onward are removed
            because the last day is incomplete
    Besides its arguments, the function reads three notebook-level variables that must exist before the call:
        - deployment_info: a list whose first three entries (semester, deployment tag, deployment number)
            are used to build the path to the beacon data
        - beiwe_ids: a list of the beiwe IDs from this data set
        - beacons: a list of the beacon numbers from this data set, in the same order as beiwe_ids
    Returns:
        - df_byID: a Series object, indexed by beiwe ID, that contains one dataframe per participant; each
            dataframe is indexed by 'time_index' (timestamp shifted from UTC by a fixed 5 hours)
        - file_size: the total size in bytes of all the csv files that were read
    '''
    # Important variables
    df_byID = pd.Series(dtype=object) # Series to return, one dataframe per Beiwe ID
    file_size = 0 # Running total (bytes) of every csv file read
    start_date = datetime.strptime(starting, '%m/%d/%Y')
    end_date = datetime.strptime(ending, '%m/%d/%Y')
    read_all = isinstance(column_list, str) and column_list == 'all' # guards against comparing a list/array to a string

    # Importing Data
    for id_name, beacon in zip(beiwe_ids, beacons):
        ## Adding a leading zero to numbers < 10
        beacon_no = '{:02d}'.format(int(beacon))

        ## Output for visual confirmation
        print('\nReading for Beacon: ' + str(beacon_no))
        print('Reading for Beiwe ID: ' + str(id_name))

        ## Location of file
        DIR = str(deployment_info[0]) + '/' + str(deployment_info[1]) + '/Beacon_Data/beacon-d' + str(deployment_info[2]) + '-' + str(beacon_no) + '/bevo/' + directory + '/'

        ## Looping through all the files in the sensor directory
        frames = [] # One dataframe per csv file, combined once after the loop
        for file in os.listdir(DIR):
            if not file.endswith('csv'): # To ensure that we only read in the csv files
                continue
            try:
                if read_all: # if no column numbers are specified, the default is to read them all
                    temp = pd.read_csv(DIR + file, header=None, names=name_list)
                else:
                    temp = pd.read_csv(DIR + file, header=None, names=name_list, usecols=column_list)
            except FileNotFoundError:
                print('No file found - wrong path')
                continue # skip the file so that the previous file's data is not appended a second time
            frames.append(temp)

            ### Getting the file size
            file_size += os.path.getsize(DIR + file)

        if frames:
            raw_data = pd.concat(frames, axis=0, ignore_index=True)
        else:
            raw_data = pd.DataFrame(columns=name_list) # keeps the 'time' column so the steps below still work

        ## Creating a date index that converts the utc timestamp to Central Time (fixed 5 hour offset)
        raw_data = raw_data.dropna() # Dropping any NaNs
        local_time = pd.to_datetime(raw_data['time'].astype('int64'), unit='s') - pd.Timedelta(hours=5)

        ## Re-indexing and re-naming
        raw_data = raw_data.drop(['time'], axis=1)
        raw_data = raw_data.set_index(pd.DatetimeIndex(local_time, name='time_index'))
        raw_data = raw_data.sort_index()

        ## Removing data from DF that isn't in the deployment range
        ### Keeps start day <= time < end day; the end day is left out because it is incomplete
        in_window = (raw_data.index >= start_date) & (raw_data.index < end_date)
        raw_data = raw_data[in_window]

        ### Output of data length
        if len(raw_data) == 0:
            print('No data from this deployment')
        else:
            print('Number of datapoints: ' + str(len(raw_data)))

        ## Storing the cleaned data to the final Series
        df_byID[id_name] = raw_data

    # Returning Series with cleaned data and the amount of data read
    return df_byID, file_size

## Fitbit Data
Used for import of:
- Step Count (hourly and daily)
- Sleep Stages
- Calories Burned (hourly and daily)
- Intensities (hourly and daily)
- Metabolic Equivalents (METs)
- Heart Rate (1-minute and 15-minute)

In [13]:
def fitbitImport(deployment_info, beiwe_ids, record_ids, file_name, name_list, daily=True):
    '''
    Imports one Fitbit variable, splits it by participant, and trims it to the deployment window.

    Inputs:
        - deployment_info: a list that contains the semester, deployment tag, deployment number,
            starting date, and ending date of deployment (the dates may be datetime objects or date strings)
        - beiwe_ids: a list of string containing the beiwe IDs from this deployment's data set
        - record_ids: a list of strings containing the Fitbit IDs from this deployment's data set, in the
            same order as beiwe_ids
        - file_name: a string that contains the file name we are looking for
        - name_list: a list of strings for the columns to be used in the dataframe; must include 'ID' and 'Time'
        - daily: True when the 'Time' column holds dates only ('mm/dd/YYYY'), False when it also holds a
            12-hour clock time ('mm/dd/YYYY hh:mm:ss AM/PM')
    Returns a Series object, indexed by beiwe ID, that contains dataframes for the specified Fitbit variable.
    Each dataframe is indexed by 'Time' and keeps start day <= Time < end day. The Series is empty when the
    file cannot be found.
    '''
    # Important variables
    df_byID = pd.Series(dtype=object) # Series holding dataframes for each Beiwe ID
    start_date = pd.Timestamp(deployment_info[3])
    end_date = pd.Timestamp(deployment_info[4])
    DIR = deployment_info[0] + '/' + deployment_info[1] + '/Fitbit_Data/' # Directory where the file should be located

    # Importing the data from file
    try:
        raw_data = pd.read_csv(DIR + file_name, header=0, names=name_list)
    except FileNotFoundError:
        print('No file found - wrong path')
        return df_byID # nothing to parse, so hand back the empty Series instead of failing further down

    ## Converting the time column to datetime
    if daily:
        time_format = "%m/%d/%Y"
    else:
        time_format = "%m/%d/%Y %I:%M:%S %p"
    raw_data['Time'] = pd.to_datetime(raw_data['Time'], format=time_format)
    raw_data = raw_data.set_index('Time')

    # Converting to Series by Beiwe ID
    ## unique() keeps the IDs in the order they first appear in the file
    for fitbit_id in raw_data['ID'].unique():
        data_byFBID = raw_data[raw_data['ID'] == fitbit_id] # every row that belongs to this Fitbit (FB) ID
        for record_id, beiwe_id in zip(record_ids, beiwe_ids):
            if str(fitbit_id) != str(record_id):
                continue
            ## Removing data from DF that isn't in the deployment range
            ### The end day is left out because the last day is incomplete
            in_window = (data_byFBID.index >= start_date) & (data_byFBID.index < end_date)
            df_byID[str(beiwe_id)] = data_byFBID[in_window]

    return df_byID

## Beiwe Data
Used for import of:
- GPS
- Accelerometer
- Power State
- Reachability
- Survey Answers

In [6]:
def beiweImport(deployment_info, beiwe_ids, var):
    '''
    Imports one Beiwe variable for every participant that has a folder for it.

    Inputs:
        - deployment_info: a list of strings that contains the semester, deployment tag, deployment number,
            starting date, and ending date of deployment (only the first two are used, to build the path)
        - beiwe_ids: a list of string containing the beiwe IDs from this deployment's data set
        - var: string of the variable's directory name
    Returns:
        - df_byID: a Series object, indexed by beiwe ID, that contains one dataframe per participant with
            every file in that participant's variable folder stacked together; participants without the
            folder are reported with a printed message and left out
        - df_all: a single dataframe with the data from every participant stacked together
    '''
    base_dir = deployment_info[0] + '/' + deployment_info[1] + '/Beiwe_Data/'
    df_byID = pd.Series(dtype=object) # series that stores dataframes for each individual
    all_frames = [] # every file read from every participant, combined once at the end

    # Reading in the files
    for name in beiwe_ids:
        var_dir = base_dir + str(name) + '/' + var + '/'
        ## Covers both a missing participant folder and a missing variable folder
        if not os.path.isdir(var_dir):
            print('No', var, 'data found for ID', name)
            continue

        frames = [pd.read_csv(var_dir + file) for file in os.listdir(var_dir)] # data from each single file
        all_frames.extend(frames)
        ## Storing the data from each individual in the overall series
        if frames:
            df_byID[name] = pd.concat(frames, sort=False)
        else:
            df_byID[name] = pd.DataFrame() # folder exists but holds no files

    print(var.upper(), 'DATA IMPORTED')

    # dataframe that combines all the data from every participant
    if all_frames:
        df_all = pd.concat(all_frames, sort=False)
    else:
        df_all = pd.DataFrame()

    return df_byID, df_all