# Data Normalisation

Using Min-Max Normalisation



In [25]:
import pandas as pd

# import data
# Every backslash in the Windows path is escaped explicitly. The previous literal
# contained a bare `\D`, which is not a recognised escape sequence and makes
# Python emit a warning when the cell is compiled. The resulting string is
# unchanged. The trailing separator is kept because file names are appended to
# this prefix by plain string concatenation.
folder_path = 'C:\\Users\\aoife\\Documents\\Project\\filtered_data\\'

# Only the columns listed here are read from the CSV.
df = pd.read_csv(
    folder_path + '23_02_activity_and_survey_data.csv',
    usecols=[
        'participantId',
        'bodyMass_kg', 'height_m', 'bmi',
        'mean_hr/s', 'hr_duration_(s)', 'max_hr', 'min_hr',
        'totalSteps', 'steps_per_hour', 'totalDistance', 'dist_per_hour',
        'ssq_score',
        'alcohol_consumption', 'basic_expenses', 'caffeine', 'daily_activities',
        'daily_smoking', 'education', 'flexible_work_hours', 'gender', 'good_life',
        'hispanic', 'income', 'marital', 'race', 'smoking_status', 'menopause',
        'recent_births', 'current_pregnant', 'work_schedule',
        'alarm_dependency', 'driving_sleepy', 'falling_asleep', 'morning_person',
        'nap_duration', 'sleep_lost', 'sleep_needed', 'sleep_partner',
        'sleep_time_workday', 'sleep_time_weekend', 'wake_up_choices', 'wake_ups',
        'weekly_naps',
        'noise_light', 'stress_thinking', 'other_person', 'pain_discomfort',
        'nightmares', 'bathroom_urges', 'other_reasons',
    ],
)


In [27]:
# Lower and upper bound used to rescale each feature.
# These bounds are for versions of the dataset where 'unknown' and
# 'prefer not to say' answers have been removed.
# NOTE(review): 'race' is loaded by the read_csv cell but has no entry here, so
# it is left unscaled - confirm this is intended.
#
# Each feature is written once as (lower, upper) and expanded into the
# {'min': ..., 'max': ...} layout by the comprehension.
min_max_values = {
    column: {'min': lower, 'max': upper}
    for column, (lower, upper) in {
        'bodyMass_kg': (0, 635),
        'height_m': (0, 2.72),
        'bmi': (0, 204),
        'mean_hr/s': (0, 4),
        'hr_duration_(s)': (0, 86400),
        'max_hr': (0, 4),
        'min_hr': (0, 4),
        'totalSteps': (0, 238000),
        'steps_per_hour': (0, 31835),
        'totalDistance': (0, 160000),
        'dist_per_hour': (0, 21330),
        'ssq_score': (1, 5),
        'alcohol_consumption': (0, 5),
        'basic_expenses': (1, 4),
        'caffeine': (0, 60),
        'daily_activities': (1, 9),
        'daily_smoking': (1, 3),
        'education': (1, 6),
        'flexible_work_hours': (1, 2),
        'gender': (1, 2),
        'good_life': (1, 2),
        'hispanic': (1, 6),
        'income': (1, 7),
        'marital': (1, 6),
        'smoking_status': (1, 9),
        'menopause': (1, 3),
        'recent_births': (1, 4),
        'current_pregnant': (0, 2),
        'work_schedule': (1, 6),
        'alarm_dependency': (1, 4),
        'driving_sleepy': (1, 5),
        'falling_asleep': (1, 7),
        'morning_person': (1, 2),
        'nap_duration': (1, 7),
        'sleep_lost': (0, 1440),
        'sleep_needed': (0, 24),
        'sleep_partner': (1, 6),
        'sleep_time_workday': (0, 24),
        'sleep_time_weekend': (0, 24),
        'wake_up_choices': (1, 10),
        'wake_ups': (0, 30),
        'weekly_naps': (1, 5),
        'noise_light': (0, 1),
        'stress_thinking': (0, 1),
        'other_person': (0, 1),
        'pain_discomfort': (0, 1),
        'nightmares': (0, 1),
        'bathroom_urges': (0, 1),
        'other_reasons': (0, 1),
    }.items()
}




In [28]:
# function for min-max normalisation

def min_max_normalisation(series, min_val, max_val):
    """Linearly rescale values so that ``min_val`` maps to 0 and ``max_val`` to 1.

    Parameters
    ----------
    series
        Value(s) to rescale. Anything that supports subtraction of and
        division by a number works; this notebook passes DataFrame columns.
    min_val
        Value that should map to 0.
    max_val
        Value that should map to 1.

    Returns
    -------
    ``(series - min_val) / (max_val - min_val)``, with the same shape as
    ``series``. Inputs outside ``[min_val, max_val]`` are not clipped, so they
    map outside ``[0, 1]``.

    Raises
    ------
    ValueError
        If ``max_val`` equals ``min_val``. The scale is undefined in that case,
        and dividing a pandas Series by zero would quietly produce inf/NaN
        instead of failing.
    """
    value_range = max_val - min_val
    if value_range == 0:
        raise ValueError(
            "min_val and max_val must differ for min-max normalisation "
            f"(both are {min_val!r})"
        )
    return (series - min_val) / value_range

In [29]:
# Rescale every column that has an entry in min_max_values, using that entry's
# bounds. Each column of df is overwritten with its rescaled values, so running
# this cell a second time rescales values that are already rescaled.

for feature, bounds in min_max_values.items():
    df[feature] = min_max_normalisation(df[feature], bounds['min'], bounds['max'])

In [30]:
# Plain-text preview of the first ten rows (fewer if the frame is shorter)
print(df.iloc[:10])

                            participantId  bodyMass_kg  height_m       bmi  \
23   358d6f00-a8ab-458f-a8c1-12048908b966     0.112863  0.644338  0.114375   
24   358d6f00-a8ab-458f-a8c1-12048908b966     0.112863  0.644338  0.114375   
45   3ec95a6f-9e70-4343-996e-2de5e0d66b15     0.153578  0.672353  0.142935   
356  b2a4c45f-86cf-478a-8420-fc1e77905b0c     0.200009  0.625662  0.214970   

     mean_hr/s  hr_duration_(s)   max_hr   min_hr  totalSteps  steps_per_hour  \
23    0.318439         0.000949  0.53750  0.24575    0.053752        0.151231   
24    0.563466         0.003542  0.72075  0.25000    0.029542        0.121110   
45    0.340086         0.000671  0.48325  0.28325    0.030105        0.045081   
356   0.398250         0.000359  0.53750  0.27500    0.004080        0.054765   

     ...  wake_up_choices  wake_ups  weekly_naps  noise_light  \
23   ...         0.555556  0.000000         0.00          0.0   
24   ...         0.555556  0.000000         0.00          0.0   
45   ...

In [31]:
# Write the rescaled frame to the same folder the input was read from.
# index=False leaves the row index out of the file.

df.to_csv(path_or_buf=folder_path + 'minmax_normalised_data.csv', index=False)