In [8]:
# imports
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer
# from pyspark.sql import SparkSession
# from rdt.transformers import FloatFormatter
# from rdt.transformers import PseudoAnonymizedFaker 
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import pandas as pd
import numpy as np
import random
import sdv
import os


In [2]:
# set working directory // place commands in .env
new_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
os.chdir(new_dir)

print('In Predictive Model Folder:', os.getcwd())
print('sdv version=',sdv.version.public)

In Predictive Model Folder: c:\Users\ashly\OneDrive\Documents\Education Material\ResearchProject\MaternalHealthResearch\predictive-model
sdv version= 1.10.0


Phase 1 - Data Preprocessing 
1. Create Combination of Data for HeartRate to detect activity imbalances
2. Create ManualInput Dataset 
3. Clean all datasets


Step 1: Create the HeartRate and METS merged dataset, filter null values and reduce the memory usage first

In [3]:
def filterDataset(df):
    print('Filter Dataset')

    # view details of the dataset
    print(df.head())
    print('Column Names:',df.dtypes)

    # print null values in the dataframe
    print('The sum of null values are:', df.isnull().sum())
    
    
    # drop rows with null values
    print('Count of cells BEFORE dropping null:', df.size,'\n')
    df = df.dropna() 
    print('Count of cells AFTER dropping null:', df.size, '\n')
    print('------------------------------------------------------------------')
    
    return df

In [4]:
def parseDateTime(df):
    # datetime split
    split_datetime = df['timestamp'].str.split(' ', expand=True)

    # Assign the date and time components to new columns
    df['date'] = split_datetime[0]
    df['time'] = split_datetime[1]
    df['date'] = pd.to_datetime(df['date'])
    df['time'] = df['time'].astype(str)

    df.drop(columns=['timestamp'], inplace=True)
    df[:3]
    print('------------------------------------------------------------------')
    return df

In [5]:
def reduceMemoryUsage(df, verbose=True):
    print('Reduce Memory')
    
    numerics = {
        np.int8: (np.iinfo(np.int8).min,np.iinfo(np.int8).max),
        np.int16: (np.iinfo(np.int16).min,np.iinfo(np.int16).max), 
        np.int32: (np.iinfo(np.int32).min,np.iinfo(np.int32).max), 
        np.int64: (np.iinfo(np.int64).min,np.iinfo(np.int64).max), 
        np.float16: (np.finfo(np.float16).min,np.finfo(np.float16).max), 
        np.float32: (np.finfo(np.float32).min,np.finfo(np.float32).max), 
        np.float64: (np.finfo(np.float64).min,np.finfo(np.float64).max)
        }
    types = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_memory_usage = df.memory_usage(deep=True).sum() / (1024 ** 2)
    print('Starting memory usage is {:5.5f}'.format(start_memory_usage))
    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in types: 
            c_min = df[col].min()
            c_max = df[col].max()
            for n_key, n_value in numerics.items(): 
                if c_min > n_value[0] and c_max < n_value[1]:
                    df[col] = df[col].astype(n_key)
                    break
    
    end_memory_usage = df.memory_usage(deep=True).sum() / (1024**2)
    if verbose: 
        print('Memory usage decreased to {:5.5f} Mb ({:.5f}% reduction)'.format(end_memory_usage, 100 * (start_memory_usage - end_memory_usage) / start_memory_usage))
    print('------------------------------------------------------------------')
        
    return df

In [9]:
def removeOutliers(group, target='bpm'):
   z_scores = stats.zscore(group[target])
   threshold = 3
   outlier_indices = group.index[abs(z_scores) > threshold]
   return group.drop(outlier_indices)

In [11]:
def createHRMetsDataset():
    ## Merge cleaned dataframes
    
    # read the heartrate and mets dataframe
    df_mets = pd.read_csv('data_raw/RAW-Fitabase Data 4.12.16-5.12.16/minuteMETsNarrow_merged.csv')
    df_heartrate = pd.read_csv('data_raw/RAW-Fitabase Data 4.12.16-5.12.16/heartrate_seconds_merged.csv')
    df_intensities = pd.read_csv('data_raw/RAW-Fitabase Data 4.12.16-5.12.16/minuteIntensitiesNarrow_merged.csv')
    
    # update columns names to be the same
    df_mets.columns = ['id', 'timestamp', 'mets']
    df_heartrate.columns = ['id', 'timestamp', 'bpm']
    df_intensities.columns = ['id','timestamp', 'intensity_level']
        
    # clean dataframes and reduce memory usage
    df_mets = filterDataset(df_mets)
    df_mets = reduceMemoryUsage(df_mets)
    
    df_heartrate = filterDataset(df_heartrate)
    df_heartrate = reduceMemoryUsage(df_heartrate)
   
    df_intensities = filterDataset(df_intensities) 
    df_intensities = reduceMemoryUsage(df_intensities) 
    
    # merge dataframes using the column names and with an inner join
    df_merged_outer = pd.merge(df_mets, df_heartrate, on=['id', 'timestamp'], how='outer')
    df_merged_inner = pd.merge(df_mets, df_heartrate, on=['id', 'timestamp'], how='inner')
    df_merged_right = pd.merge(df_mets, df_heartrate, on=['id', 'timestamp'], how='right')
    
    df_merged_outer = pd.merge(df_intensities, df_merged_outer, on=['id', 'timestamp'], how='outer')
    df_merged_inner = pd.merge(df_intensities, df_merged_inner, on=['id', 'timestamp'], how='inner')
    df_merged_right = pd.merge(df_intensities, df_merged_right, on=['id', 'timestamp'], how='right')
    
    # remove outliers 
    g1 = df_merged_outer.groupby('intensity_level')
    g2 = df_merged_inner.groupby('intensity_level')
    g3 = df_merged_right.groupby('intensity_level')
    
    df_merged_outer = g1.apply(removeOutliers)
    df_merged_inner = g2.apply(removeOutliers) 
    df_merged_right = g3.apply(removeOutliers)
   
    df_merged_inner.to_csv('data_interim/heartrate_mets_intensities_merged_outer.csv', index=False)
    df_merged_inner.to_csv('data_interim/heartrate_mets_intensities_merged_inner.csv', index=False)
    df_merged_right.to_csv('data_interim/heartrate_mets_intensities_merged_right.csv', index=False)
    
    # create heartrate dataset of cleaned results
    df_heartrate.to_csv('data_interim/heartrate_fiveseconds_intervals.csv', index=False)
   
    print('Heartrate, METS, and minuteIntensity dataframes are merged')
    df_merged_inner[:3]
    df_merged_right[:3]

In [12]:
## create the heartrate_mets_merged.csv file with processed data
createHRMetsDataset()

Filter Dataset
           id              timestamp  mets
0  1503960366  4/12/2016 12:00:00 AM    10
1  1503960366  4/12/2016 12:01:00 AM    10
2  1503960366  4/12/2016 12:02:00 AM    10
3  1503960366  4/12/2016 12:03:00 AM    10
4  1503960366  4/12/2016 12:04:00 AM    10
Column Names: id            int64
timestamp    object
mets          int64
dtype: object
The sum of null values are: id           0
timestamp    0
mets         0
dtype: int64
Count of cells BEFORE dropping null: 3976740 

Count of cells AFTER dropping null: 3976740 

------------------------------------------------------------------
Reduce Memory
Starting memory usage is 117.53323
Memory usage decreased to 109.94820 Mb (6.45352% reduction)
------------------------------------------------------------------
Filter Dataset
           id             timestamp  bpm
0  2022484408  4/12/2016 7:21:00 AM   97
1  2022484408  4/12/2016 7:21:05 AM  102
2  2022484408  4/12/2016 7:21:10 AM  105
3  2022484408  4/12/2016 7:21:20 AM  1

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_merged_outer = g1.apply(removeOutliers)


Heartrate, METS, and minuteIntensity dataframes are merged


Normalize the HRMets Dataset - 
1. View the Minimum METS & Heart rate to determine activity patterns in comparison with the intensities dataset
2. look into the min and max values to see the best way to view these outliers and make sure the conditions are accurate --> add activity level column to filter when tracking anomalies

In [16]:
from sklearn.preprocessing import MinMaxScaler
def analyzeHRMetsDataset():
    interim_data_path = f'{os.getcwd()}/data_interim/'
    df_right = pd.read_csv(interim_data_path+'heartrate_mets_merged_right.csv')
    
    

    # Define the min and max values for BPM and METs
    bpm_min = df_right['bpm'].min()
    bpm_max = df_right['bpm'].max()
    mets_min = df_right['mets'].min()
    mets_max = df_right['mets'].max()

    print('BPM MIN:',bpm_min)
    print('BPM MAX:',bpm_max)
    
    print('METS MIN:',mets_min)
    print('METS MAX:',mets_max)

    # Create a MinMaxScaler object
    scaler = MinMaxScaler(feature_range=(0, 1))

    # Fit the scaler on the original data range
    scaler.fit([[bpm_min, mets_min], [bpm_max, mets_max]])

    # Transform the original data to the scaled range
    normalized_bpm_min, normalized_mets_min = scaler.transform([[bpm_min, mets_min]])[0]
    normalized_bpm_max, normalized_mets_max = scaler.transform([[bpm_max, mets_max]])[0]

    print("Normalized BPM MIN:", normalized_bpm_min)
    print("Normalized BPM MAX:", normalized_bpm_max)
    print("Normalized METS MIN:", normalized_mets_min)
    print("Normalized METS MAX:", normalized_mets_max)

 

In [17]:
analyzeHRMetsDataset()

BPM MIN: 36
BPM MAX: 203
METS MIN: 10.0
METS MAX: 144.0
Normalized BPM MIN: 0.0
Normalized BPM MAX: 1.0000000000000002
Normalized METS MIN: 0.0
Normalized METS MAX: 1.0


Step 2: Sleep & Actvity Tracking Dataset Cleaning (Select the most important attributes and convert each dataset into interim ones)

In [6]:
def processContextualDatasets():
    raw_data_path = f'{os.getcwd()}/data_raw/RAW-Fitabase Data 4.12.16-5.12.16/'
    
    ## Cleaning SleepDay Dataset
    df_sleepDay = pd.read_csv(raw_data_path+'sleepDay_merged.csv')
    df_sleepDay.columns = ['id', 'timestamp', 'total_sleep_records', 'total_minutes_asleep', 'total_time_inbed']
    df_sleepDay.drop(columns=['total_time_inbed'], inplace=True)
    
    df_sleepDay = filterDataset(df_sleepDay)
    df_sleepDay = reduceMemoryUsage(df_sleepDay)
    df_sleepDay = parseDateTime(df_sleepDay)
    df_sleepDay.drop(columns=['time'], inplace=True)
    
    # Write Dataset to csv 
    interim_data_path = f'{os.getcwd()}/data_interim/'
    df_sleepDay.to_csv(f'{interim_data_path}daily_sleep.csv', index=False)
    
    ## Cleaning Activity Dataset
    df_dailyActivity = pd.read_csv(f'{raw_data_path}dailyActivity_merged.csv')
    df_dailyActivity = df_dailyActivity.iloc[:,[0,1,2,3,10,11,12,13,14]]
    df_dailyActivity.columns = ['id', 'date', 'total_steps', 'total_distance_miles', 'very_active_minutes', 'fairly_active_minutes', 'lightly_active_minutes', 'sedentary_minutes', 'calories']

    df_dailyActivity = filterDataset(df_dailyActivity)
    df_dailyActivity = reduceMemoryUsage(df_dailyActivity)
    
    # merge sleep day and activity day
    df_merged = pd.merge(df_sleepDay, df_dailyActivity, on=['id', 'date'], how='inner')
    df_merged.to_csv(f'{interim_data_path}daily_sleep_activity.csv', index=False)
    
    print('Daily Activity and Sleep is Merged')
    df_merged[:3]

In [32]:
processContextualDatasets()

Filter Dataset
           id              timestamp  total_sleep_records  \
0  1503960366  4/12/2016 12:00:00 AM                    1   
1  1503960366  4/13/2016 12:00:00 AM                    2   
2  1503960366  4/15/2016 12:00:00 AM                    1   
3  1503960366  4/16/2016 12:00:00 AM                    2   
4  1503960366  4/17/2016 12:00:00 AM                    1   

   total_minutes_asleep  
0                   327  
1                   384  
2                   412  
3                   340  
4                   700  
Column Names: id                       int64
timestamp               object
total_sleep_records      int64
total_minutes_asleep     int64
dtype: object
The sum of null values are: id                      0
timestamp               0
total_sleep_records     0
total_minutes_asleep    0
dtype: int64
Count of cells BEFORE dropping null: 1652 

Count of cells AFTER dropping null: 1652 

------------------------------------------------------------------
Reduce Memo

Step 3: Create ManualInput Dataset
Columns = id, date, time, blood pressure, glucose_morning, glucose_evening weight, calorie consumption, symptoms, mental health - Data Preparation

In [32]:
def generateManualInputMetadata():
    # read the weightLogInfo_merged.csv file and keep the id, date, weight kg/pounds, fat, bmi
    raw_data_path = f'{os.getcwd()}/data_raw/RAW-Fitabase Data 4.12.16-5.12.16/'
    df_weight_log_info = pd.read_csv(raw_data_path+'weightLogInfo_merged.csv')
    df_weight_log_info.drop(columns=['IsManualReport', 'LogId','Fat'], inplace=True)

    # clean dataset 
    df_weight_log_info.rename(columns={'Date':'timestamp'},inplace=True)
    print('Head')
    df_weight_log_info.head()
    df_weight_log_info = filterDataset(df_weight_log_info)
    df_weight_log_info = reduceMemoryUsage(df_weight_log_info)
    df_weight_log_info = parseDateTime(df_weight_log_info)
    df_weight_log_info.columns=['id','weight_kg', 'weight_pounds', 'bmi','date','time']
    
    ## create final intended synthetic dataset
    df_manual_input = pd.DataFrame(columns = ['id', 'date', 'time', 'weight_kg', 'weight_pounds', 'fat', 'bmi', 'calorie_consumption','doctor_visit', 'symptom_code', 'blood_pressure', 'glucose_morning','glucose_evening','mental_health_code'])
    
    # populate manual_input with ids and date data
    start_date = '2016-04-12'
    end_date = '2016-05-12'
    num_days = (datetime(2016, 5, 12) - datetime(2016, 4, 12)).days 
    num_days += 1

    log_info = list(df_weight_log_info['id'].unique())
    print(log_info)
    df_manual_input = pd.DataFrame({
        'id': pd.Series(log_info).repeat(num_days),
        'date': pd.date_range(start=start_date, end=end_date, freq='D').repeat(len(log_info)),
        'fat': [0] * (num_days * len(log_info)), 
        'calorie_consumption': [0] * (num_days * len(log_info)),
        'doctor_visit': [False] * (num_days * len(log_info)), 
        'symptom_code': [0] * (num_days * len(log_info)), 
        'blood_pressure': [0] * (num_days * len(log_info)), 
        'glucose_morning': [0] * (num_days * len(log_info)),
        'glucose_evening': [0] * (num_days * len(log_info)),
        'mental_health_code': [0] * (num_days * len(log_info))
    })
    
    # aligning datatypes
    df_manual_input['date'] = pd.to_datetime(df_manual_input['date'])
    df_weight_log_info['date'] = pd.to_datetime(df_weight_log_info['date'])
    
    # Place cleaned weight_log_info into the manual_input df
    df_manual_input = pd.merge(df_weight_log_info,df_manual_input,on=['id','date'],how='outer')
    interim_data_path = f'{os.getcwd()}/data_interim/'
    df_manual_input.to_csv(f'{interim_data_path}manual_input.csv', index=False)
    print(df_manual_input.head())
    
    df_manual_input.info()
    ## Generate Synthetic Dataset
    # define constraints for dataset
    
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(df_manual_input) 
    
    # # dictionary corresponding to the metadata
    python_dict = metadata.to_dict()
    metadata.visualize(
        show_table_details='full',
        output_filepath='visualizations/manualInput.png'    
    )
    metadata.validate()
    metadata.validate_data(data=df_manual_input)
    
    # update columnn types
    metadata.update_column(
        column_name='id',
        sdtype='id',
    )   
    metadata.update_column(
        column_name='doctor_visit',
        sdtype='boolean',
    )   
    
    # update the datatype visualization
    metadata.visualize(
        show_table_details='full',
        output_filepath='visualizations/manualInput.png'    
    )
    
    # save as json for future use 
    metadata.save_to_json(filepath='metadata/metadata_manual_input.json')
    
    

In [33]:
generateManualInputMetadata()

Head
Filter Dataset
           Id              timestamp    WeightKg  WeightPounds        BMI
0  1503960366   5/2/2016 11:59:59 PM   52.599998    115.963147  22.650000
1  1503960366   5/3/2016 11:59:59 PM   52.599998    115.963147  22.650000
2  1927972279   4/13/2016 1:08:52 AM  133.500000    294.317120  47.540001
3  2873212765  4/21/2016 11:59:59 PM   56.700001    125.002104  21.450001
4  2873212765  5/12/2016 11:59:59 PM   57.299999    126.324875  21.690001
Column Names: Id                int64
timestamp        object
WeightKg        float64
WeightPounds    float64
BMI             float64
dtype: object
The sum of null values are: Id              0
timestamp       0
WeightKg        0
WeightPounds    0
BMI             0
dtype: int64
Count of cells BEFORE dropping null: 335 

Count of cells AFTER dropping null: 335 

------------------------------------------------------------------
Reduce Memory
Starting memory usage is 0.00711
Memory usage decreased to 0.00589 Mb (17.08496% reduction)

Step 4: Complete the Manual Input dataframe generation - Modeling Phase

In [24]:
def fillManualInputMetadata():
    metadata = SingleTableMetadata.load_from_json(filepath='metadata/metadata_manual_input.json')
    
    interim_data_path = f'{os.getcwd()}/data_interim/'
    df_manual_input = pd.read_csv(interim_data_path+'manual_input.csv')
    # anonymize digital data for sharing purposes
    # anonymized_metadata = metadata.anonymize()
    
    # create and train the synthesizer
    s = GaussianCopulaSynthesizer(
        metadata=metadata,
        enforce_min_max_values=True, 
        enforce_rounding=False,
        numerical_distributions={
        'id': 'norm',
        'date': 'norm',
        'time': 'norm',
        'weight_kg': 'uniform',
        'weight_pounds': 'uniform',
        'fat': 'norm',
        'bmi': 'norm',
        'calorie_consumption': 'norm',
        'doctor_visit': 'norm',
        'symptom_code': 'norm',
        'blood_pressure': 'norm',
        'glucose_morning': 'norm',
        'glucose_evening': 'norm',
        'mental_health_code': 'norm'
        },
        default_distribution='norm'
        )
    s.fit(df_manual_input)
    s_data = s.sample(num_rows=10) # test with 10 rows
    
    # get the metadata with s.get_metadata()
    # view the shape estimates based on the distribution
    s.get_learned_distributions()
    # # set constraints
    # constraints = [
    # {
    # 'constraint_class': 'ScalarInequality',
    # 'constraint_parameters': {
    #     'column_name': 'date',
    #     'relation': '>=',
    #     'value': '2016-04-12'
    #     }   
    # },
    # {
    # 'constraint_class': 'ScalarInequality',
    # 'constraint_parameters': {
    #     'column_name': 'date',
    #     'relation': '<=',
    #     'value': '2016-05-12'
    #     }   
    # },
    # {
    # 'constraint_class': 'FixedIncrements',
    # 'constraint_parameters': {
    #     'column_name': 'date',  # Adjust the column name accordingly
    #     'increment': '1 day'  # Adjust the increment as needed
    #     }  
    # },
    # {
    # 'constraint_class': 'ScalarInequality',
    # 'constraint_parameters': {
    #     'column_name': 'weight_pounds',  # Adjust the column name accordingly
    #     'relation': 'between',  # Define the relation as 'between'
    #     'low_value': -15,  # Set the lower bound as -15 pounds or kilograms
    #     'high_value': 15  # Set the upper bound as 15 pounds or kilograms
    #     }
    # },
    #     {
    # 'constraint_class': 'ScalarInequality',
    # 'constraint_parameters': {
    #     'column_name': 'weight_kg',  # Adjust the column name accordingly
    #     'relation': 'between',  # Define the relation as 'between'
    #     'low_value': -6.803885,  # Set the lower bound as -15 pounds or kilograms
    #     'high_value': -6.803885  # Set the upper bound as 15 pounds or kilograms
    #     }
    # },
    # {
    # 'constraint_class': 'ScalarRange',
    # 'constraint_parameters': {
    #     'column_name': 'weight_kg',
    #     'low_value': 54.4311,  # 120 lbs converted to kg
    #     'high_value': 136.0777  # 300 lbs converted to kg
    # }
    # },
    # {
    #     'constraint_class': 'ScalarRange',
    #     'constraint_parameters': {
    #         'column_name': 'weight_pounds',
    #         'low_value': 120,
    #         'high_value': 300
    #     }
    # },
    # {
    #     'constraint_class': 'ScalarRange',
    #     'constraint_parameters': {
    #         'column_name': 'blood_pressure',
    #         'low_value': 90,  # Typical diastolic pressure
    #         'high_value': 120  # Typical systolic pressure
    #     }
    # }

    # ]
    # s.add_constraints(constraints=constraints)
    s.auto_assign_transformers(df_manual_input)
    print(s.get_transformers())
    
    # preprocess data using the transformation
    processed_data = s.preprocess(df_manual_input)
    s.fit_processed_data(processed_data=processed_data)
    s.save(filepath='visualizations/manual_input.pkl')
    
    # save 300 samples to csv
    synthetic_data = s.sample(num_rows = 300)
    synthetic_data.to_csv('data_interim/synthetic_data.csv', index=False)
    
    
    # load the synthesizer
    # s = GaussianCopulaSynthesizer.load(filepath='synthesizer_manual_input.pkl')
    

In [25]:
fillManualInputMetadata()

{'id': AnonymizedFaker(function_name='bothify', function_kwargs={'text': '#####'}), 'weight_kg': FloatFormatter(enforce_min_max_values=True), 'weight_pounds': FloatFormatter(enforce_min_max_values=True), 'bmi': UniformEncoder(), 'date': UnixTimestampEncoder(datetime_format='%Y-%m-%d', enforce_min_max_values=True), 'time': UniformEncoder(), 'fat': UniformEncoder(), 'calorie_consumption': UniformEncoder(), 'doctor_visit': UniformEncoder(), 'symptom_code': UniformEncoder(), 'blood_pressure': UniformEncoder(), 'glucose_morning': UniformEncoder(), 'glucose_evening': UniformEncoder(), 'mental_health_code': UniformEncoder()}




Step 5. Generate ManualInputDataset.csv

In [18]:
def generateManualInput():
    ids = [150390366, 1927972279, 2873212765, 4319703577, 4558609924, 5577150313, 6962181067, 8877689391]
    start_date = datetime(2016, 4, 12)
    end_date = datetime(2016, 5, 12)
    date_range = pd.date_range(start=start_date, end=end_date)

    data = []
    for _id in ids:
        initial_kg = random.uniform(50, 120)
        initial_bmi = random.uniform(18.5, 30)
        initial_fat = random.uniform(15, 35)
        direction_two = 0
        for date in date_range:
            weight_kg = initial_kg + direction_two
            weight_pounds = initial_kg * 2.20462
            bmi = initial_bmi + direction_two
            timestamp = date
            fat = initial_fat + direction_two
            calorie_consumption = random.randint(1000, 3000)
            doctor_visit = random.choice([True, False])
            symptom_code = random.choice(['Nausea and Vomiting','Fatigue','None','Headaches','Back Pain','Swelling in Extremities','Heartburn','Constipation','None','Frequent Urination','Braxton Hicks Contractions','Round Ligament Pain', 'None'])
            blood_pressure = f"{random.randint(90, 120)}/{random.randint(60, 80)}"
            glucose_morning = random.randint(70, 110)
            glucose_evening = random.randint(70, 110)
            mental_health_code = random.choice( ['Anxiety','Depression','None','Stress','Mood Swings','Insomnia','Postpartum Depression','Adjustment Disorder','None','Pregnancy-related OCD','Body Image Issues','Relationship Strain', 'Other', 'None'])

            data.append([_id, weight_kg, weight_pounds, bmi, timestamp, fat, calorie_consumption, 
                         doctor_visit, symptom_code, blood_pressure, glucose_morning, 
                         glucose_evening, mental_health_code])
            
            direction_two = random.choice([-2, -1, 0, 1, 2])

    df_manual_input = pd.DataFrame(data, columns=['id', 'weight_kg', 'weight_pounds', 'bmi', 'timestamp', 
                                     'fat', 'calorie_consumption', 'doctor_visit', 'symptom_code', 
                                     'blood_pressure', 'glucose_morning', 'glucose_evening', 
                                     'mental_health_code'])

    
    print(df_manual_input.head())
    df_manual_input.to_csv('data_interim/manual_input.csv', index=False)


In [19]:
generateManualInput()

          id   weight_kg  weight_pounds       bmi  timestamp        fat  \
0  150390366  110.519697     243.653935  24.81896 2016-04-12  18.969158   
1  150390366  108.519697     243.653935  22.81896 2016-04-13  16.969158   
2  150390366  108.519697     243.653935  22.81896 2016-04-14  16.969158   
3  150390366  109.519697     243.653935  23.81896 2016-04-15  17.969158   
4  150390366  110.519697     243.653935  24.81896 2016-04-16  18.969158   

   calorie_consumption  doctor_visit                symptom_code  \
0                 2399         False                        None   
1                 1890          True          Frequent Urination   
2                 1250         False                        None   
3                 2741         False  Braxton Hicks Contractions   
4                 2659          True                        None   

  blood_pressure  glucose_morning  glucose_evening     mental_health_code  
0          92/78               78               93  Postpartum D