In [2]:
# imports
from sklearn import preprocessing 
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from dotenv import load_dotenv
from scipy import stats
import seaborn as sns
import pandas as pd
import numpy as np
import random
import os


In [3]:
# set working directory 
load_dotenv()
os.chdir(os.getenv('DEFAULT_PATH'))

print('In Predictive Model Folder:', os.getcwd())

In Predictive Model Folder: C:\Users\ashly\OneDrive\Documents\Education Material\ResearchProject\MaternalHealthResearch\predictive-model


Step 1: Create the HeartRate and METS merged dataset, filter null values and reduce the memory usage first

In [4]:
def filterDataset(df):
    print('Filter Dataset')

    # view details of the dataset
    print(df.head())
    print('Column Names:',df.dtypes)

    # print null values in the dataframe
    print('The sum of null values are:', df.isnull().sum())
    
    
    # drop rows with null values
    print('Count of cells BEFORE dropping null:', df.size,'\n')
    df = df.dropna() 
    print('Count of cells AFTER dropping null:', df.size, '\n')
    print('------------------------------------------------------------------')
    
    return df

In [5]:
def parseDateTime(df):
    # datetime split
    split_datetime = df['timestamp'].str.split(' ', expand=True)

    # Assign the date and time components to new columns
    df['date'] = split_datetime[0]
    df['time'] = split_datetime[1]
    df['date'] = pd.to_datetime(df['date'])
    df['time'] = df['time'].astype(str)

    df.drop(columns=['timestamp'], inplace=True)
    df[:3]
    print('------------------------------------------------------------------')
    return df

In [6]:
def reduceMemoryUsage(df, verbose=True):
    print('Reduce Memory')
    
    numerics = {
        np.int8: (np.iinfo(np.int8).min,np.iinfo(np.int8).max),
        np.int16: (np.iinfo(np.int16).min,np.iinfo(np.int16).max), 
        np.int32: (np.iinfo(np.int32).min,np.iinfo(np.int32).max), 
        np.int64: (np.iinfo(np.int64).min,np.iinfo(np.int64).max), 
        np.float16: (np.finfo(np.float16).min,np.finfo(np.float16).max), 
        np.float32: (np.finfo(np.float32).min,np.finfo(np.float32).max), 
        np.float64: (np.finfo(np.float64).min,np.finfo(np.float64).max)
        }
    types = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_memory_usage = df.memory_usage(deep=True).sum() / (1024 ** 2)
    print('Starting memory usage is {:5.5f}'.format(start_memory_usage))
    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in types: 
            c_min = df[col].min()
            c_max = df[col].max()
            for n_key, n_value in numerics.items(): 
                if c_min > n_value[0] and c_max < n_value[1]:
                    df[col] = df[col].astype(n_key)
                    break
    
    end_memory_usage = df.memory_usage(deep=True).sum() / (1024**2)
    if verbose: 
        print('Memory usage decreased to {:5.5f} Mb ({:.5f}% reduction)'.format(end_memory_usage, 100 * (start_memory_usage - end_memory_usage) / start_memory_usage))
    print('------------------------------------------------------------------')
        
    return df

In [7]:
def removeOutliers(group, target='bpm'):
   z_scores = stats.zscore(group[target])
   threshold = 3
   outlier_indices = group.index[abs(z_scores) > threshold]
   return group.drop(outlier_indices)

In [8]:
def normalizeMinMax(df,column_name):
    min_value = df[column_name].min()
    max_value = df[column_name].max()
    x = df[[column_name]].values
    x = x.reshape(-1, 1)
    min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
    x_scaled = min_max_scaler.fit_transform(x)
    df[column_name] = x_scaled
    return df


In [9]:
def createHRMetsDataset():
    ## Merge cleaned dataframes
    
    # read the heartrate and mets dataframe
    df_mets = pd.read_csv('data_raw/RAW-Fitabase Data 4.12.16-5.12.16/minuteMETsNarrow_merged.csv')
    df_heartrate = pd.read_csv('data_raw/RAW-Fitabase Data 4.12.16-5.12.16/heartrate_seconds_merged.csv')
    df_intensities = pd.read_csv('data_raw/RAW-Fitabase Data 4.12.16-5.12.16/minuteIntensitiesNarrow_merged.csv')
    
    # update columns names to be the same
    df_mets.columns = ['id', 'timestamp', 'mets']
    df_heartrate.columns = ['id', 'timestamp', 'bpm']
    df_intensities.columns = ['id','timestamp', 'intensity_level']
        
    # clean dataframes and reduce memory usage
    df_mets = filterDataset(df_mets)
    df_mets = reduceMemoryUsage(df_mets)
    # df_mets = normalizeMinMax(df_mets, 'mets')
    
    df_heartrate = filterDataset(df_heartrate)
    df_heartrate = reduceMemoryUsage(df_heartrate)
   
    df_intensities = filterDataset(df_intensities) 
    df_intensities = reduceMemoryUsage(df_intensities) 
    
    # merge dataframes using the column names and with an inner join
    df_merged_outer = pd.merge(df_mets, df_heartrate, on=['id', 'timestamp'], how='outer')
    df_merged_inner = pd.merge(df_mets, df_heartrate, on=['id', 'timestamp'], how='inner')
    df_merged_right = pd.merge(df_mets, df_heartrate, on=['id', 'timestamp'], how='right')
    
    df_merged_outer = pd.merge(df_intensities, df_merged_outer, on=['id', 'timestamp'], how='outer')
    df_merged_inner = pd.merge(df_intensities, df_merged_inner, on=['id', 'timestamp'], how='inner')
    df_merged_right = pd.merge(df_intensities, df_merged_right, on=['id', 'timestamp'], how='right')
    
    # remove outliers 
    g1 = df_merged_outer.groupby('intensity_level')
    g2 = df_merged_inner.groupby('intensity_level')
    g3 = df_merged_right.groupby('intensity_level')
    
    df_merged_outer = g1.apply(removeOutliers)
    df_merged_inner = g2.apply(removeOutliers) 
    df_merged_right = g3.apply(removeOutliers)
   
    df_merged_inner.to_csv('data_interim/heartrate_mets_intensities_merged_outer.csv', index=False)
    df_merged_inner.to_csv('data_interim/heartrate_mets_intensities_merged_inner.csv', index=False)
    df_merged_right.to_csv('data_interim/heartrate_mets_intensities_merged_right.csv', index=False)
    
    # create heartrate dataset of cleaned results
    df_heartrate.to_csv('data_interim/heartrate_fiveseconds_intervals.csv', index=False)
   
    print('Heartrate, METS, and minuteIntensity dataframes are merged')
    df_merged_inner[:3]
    df_merged_right[:3]

In [9]:
createHRMetsDataset()

Filter Dataset
           id              timestamp  mets
0  1503960366  4/12/2016 12:00:00 AM    10
1  1503960366  4/12/2016 12:01:00 AM    10
2  1503960366  4/12/2016 12:02:00 AM    10
3  1503960366  4/12/2016 12:03:00 AM    10
4  1503960366  4/12/2016 12:04:00 AM    10
Column Names: id            int64
timestamp    object
mets          int64
dtype: object
The sum of null values are: id           0
timestamp    0
mets         0
dtype: int64
Count of cells BEFORE dropping null: 3976740 

Count of cells AFTER dropping null: 3976740 

------------------------------------------------------------------
Reduce Memory
Starting memory usage is 107.41986
Memory usage decreased to 99.83484 Mb (7.06111% reduction)
------------------------------------------------------------------
Filter Dataset
           id             timestamp  bpm
0  2022484408  4/12/2016 7:21:00 AM   97
1  2022484408  4/12/2016 7:21:05 AM  102
2  2022484408  4/12/2016 7:21:10 AM  105
3  2022484408  4/12/2016 7:21:20 AM  10

  df_merged_outer = g1.apply(removeOutliers)
  df_merged_inner = g2.apply(removeOutliers)
  df_merged_right = g3.apply(removeOutliers)


Heartrate, METS, and minuteIntensity dataframes are merged


Step 2: Sleep & Actvity Tracking Dataset Cleaning (Select the most important attributes and convert each dataset into interim ones)

In [10]:
def processContextualDatasets():
    raw_data_path = f'{os.getcwd()}/data_raw/RAW-Fitabase Data 4.12.16-5.12.16/'
    
    ## Cleaning SleepDay Dataset
    df_sleepDay = pd.read_csv(raw_data_path+'sleepDay_merged.csv')
    df_sleepDay.columns = ['id', 'timestamp', 'total_sleep_records', 'total_minutes_asleep', 'total_time_inbed']
    df_sleepDay.drop(columns=['total_time_inbed'], inplace=True)
    
    df_sleepDay = filterDataset(df_sleepDay)
    df_sleepDay = reduceMemoryUsage(df_sleepDay)
    df_sleepDay = parseDateTime(df_sleepDay)
    df_sleepDay.drop(columns=['time'], inplace=True)
    
    # Write Dataset to csv 
    interim_data_path = f'{os.getcwd()}/data_interim/'
    df_sleepDay.to_csv(f'{interim_data_path}daily_sleep.csv', index=False)
    
    ## Cleaning Activity Dataset
    df_dailyActivity = pd.read_csv(f'{raw_data_path}dailyActivity_merged.csv')
    df_dailyActivity = df_dailyActivity.iloc[:,[0,1,2,3,10,11,12,13,14]]
    df_dailyActivity.columns = ['id', 'date', 'total_steps', 'total_distance_miles', 'very_active_minutes', 'fairly_active_minutes', 'lightly_active_minutes', 'sedentary_minutes', 'calories']

    df_dailyActivity = filterDataset(df_dailyActivity)
    df_dailyActivity = reduceMemoryUsage(df_dailyActivity)
    
    # merge sleep day and activity day
    df_merged = pd.merge(df_sleepDay, df_dailyActivity, on=['id', 'date'], how='inner')
    df_merged.to_csv(f'{interim_data_path}daily_sleep_activity.csv', index=False)
    
    print('Daily Activity and Sleep is Merged')
    df_merged[:3]

In [None]:
processContextualDatasets()

Step 3. Generate ManualInputDataset.csv

In [12]:
def generateManualInput():
    ids = [150390366, 1927972279, 2873212765, 4319703577, 4558609924, 5577150313, 6962181067, 8877689391]
    start_date = datetime(2016, 4, 12)
    end_date = datetime(2016, 5, 12)
    date_range = pd.date_range(start=start_date, end=end_date)

    data = []
    for _id in ids:
        initial_kg = random.uniform(50, 120)
        initial_bmi = random.uniform(18.5, 30)
        initial_fat = random.uniform(15, 35)
        direction_two = 0
        for date in date_range:
            weight_kg = initial_kg + direction_two
            weight_pounds = initial_kg * 2.20462
            bmi = initial_bmi + direction_two
            timestamp = date
            fat = initial_fat + direction_two
            calorie_consumption = random.randint(1000, 3000)
            doctor_visit = random.choice([True, False])
            symptom_code = random.choice(['Nausea and Vomiting','Fatigue','None','Headaches','Back Pain','Swelling in Extremities','Heartburn','Constipation','None','Frequent Urination','Braxton Hicks Contractions','Round Ligament Pain', 'None'])
            systolic_bp = random.randint(100, 180)
            diastolic_bp = random.randint(60, 100)
            glucose_morning = random.randint(70, 110)
            glucose_evening = random.randint(70, 110)
            mental_health_code = random.choice( ['None','Anxiety','Depression','None','Stress','Mood Swings','Insomnia','Postpartum Depression','Adjustment Disorder','None','Pregnancy-related OCD','Body Image Issues','Relationship Strain', 'Other', 'None'])

            data.append([_id, weight_kg, weight_pounds, bmi, timestamp, fat, calorie_consumption, 
                         doctor_visit, symptom_code, systolic_bp, diastolic_bp, glucose_morning, 
                         glucose_evening, mental_health_code])
            
            direction_two = random.choice([-2, -1, 0, 1, 2])

    df_manual_input = pd.DataFrame(data, columns=['id', 'weight_kg', 'weight_pounds', 'bmi', 'timestamp', 
                                     'fat', 'calorie_consumption', 'doctor_visit', 'symptom_code', 
                                     'systolic_bp', 'diastolic_bp','glucose_morning', 'glucose_evening', 
                                     'mental_health_code'])

    
    print(df_manual_input.head())
    df_manual_input.to_csv('data_interim/logged_input.csv', index=False)


In [13]:
generateManualInput()

          id  weight_kg  weight_pounds        bmi  timestamp        fat  \
0  150390366  82.430283     181.727451  23.681953 2016-04-12  27.944216   
1  150390366  80.430283     181.727451  21.681953 2016-04-13  25.944216   
2  150390366  83.430283     181.727451  24.681953 2016-04-14  28.944216   
3  150390366  84.430283     181.727451  25.681953 2016-04-15  29.944216   
4  150390366  84.430283     181.727451  25.681953 2016-04-16  29.944216   

   calorie_consumption  doctor_visit                symptom_code  systolic_bp  \
0                 1938          True  Braxton Hicks Contractions          174   
1                 1807          True  Braxton Hicks Contractions          105   
2                 1612          True                Constipation          170   
3                 2774         False          Frequent Urination          112   
4                 2531         False                   Heartburn          109   

   diastolic_bp  glucose_morning  glucose_evening     mental_h