# Data Normalisation

Using Min-Max Normalisation



In [2]:
import pandas as pd

# Import data.
# Every backslash in the Windows path is doubled so that no path segment is
# interpreted as an escape sequence (an un-doubled '\D' is an invalid escape
# and triggers a warning on newer Python versions).
folder_path = 'C:\\Users\\aoife\\Documents\\Project\\filtered_data\\'

# Only the columns listed in usecols are read from the CSV.
df = pd.read_csv(
    folder_path + '23_02_activity_and_survey_data.csv',
    usecols=[
        'participantId', 'bodyMass_kg', 'height_m', 'bmi',
        'mean_hr/s', 'hr_duration_(s)', 'max_hr', 'min_hr',
        'totalSteps', 'steps_per_hour', 'totalDistance', 'dist_per_hour',
        'ssq_score', 'alcohol_consumption', 'basic_expenses', 'caffeine',
        'daily_activities', 'daily_smoking', 'education', 'flexible_work_hours',
        'gender', 'good_life', 'hispanic', 'income',
        'marital', 'race', 'smoking_status', 'menopause',
        'recent_births', 'current_pregnant', 'work_schedule', 'alarm_dependency',
        'driving_sleepy', 'falling_asleep', 'morning_person', 'nap_duration',
        'sleep_lost', 'sleep_needed', 'sleep_partner', 'sleep_time_workday',
        'sleep_time_weekend', 'wake_up_choices', 'wake_ups', 'weekly_naps',
        'noise_light', 'stress_thinking', 'other_person', 'pain_discomfort',
        'nightmares', 'bathroom_urges', 'other_reasons',
    ],
)


In [3]:
# Fixed lower/upper bound for every feature that gets min-max normalised.
# The bounds are hard-coded constants, not computed from the loaded data.
# They apply to versions of the dataset where 'unknown' and
# 'prefer not to say' answers have been removed.
# NOTE(review): 'race' is read from the CSV but has no entry here, so it is
# left un-normalised -- confirm this is intentional.

min_max_values = {
    feature: {'min': lower, 'max': upper}
    for feature, lower, upper in [
        # (feature, min, max)
        ('bodyMass_kg', 0, 635),
        ('height_m', 0, 2.72),
        ('bmi', 0, 204),
        ('mean_hr/s', 0, 4),
        ('hr_duration_(s)', 0, 86400),
        ('max_hr', 0, 4),
        ('min_hr', 0, 4),
        ('totalSteps', 0, 238000),
        ('steps_per_hour', 0, 31835),
        ('totalDistance', 0, 160000),
        ('dist_per_hour', 0, 21330),
        ('ssq_score', 1, 5),
        ('alcohol_consumption', 0, 5),
        ('basic_expenses', 1, 4),
        ('caffeine', 0, 60),
        ('daily_activities', 1, 9),
        ('daily_smoking', 1, 3),
        ('education', 1, 6),
        ('flexible_work_hours', 1, 2),
        ('gender', 1, 2),
        ('good_life', 1, 2),
        ('hispanic', 1, 6),
        ('income', 1, 7),
        ('marital', 1, 6),
        ('smoking_status', 1, 9),
        ('menopause', 1, 3),
        ('recent_births', 1, 4),
        ('current_pregnant', 0, 2),
        ('work_schedule', 1, 6),
        ('alarm_dependency', 1, 4),
        ('driving_sleepy', 1, 5),
        ('falling_asleep', 1, 7),
        ('morning_person', 1, 2),
        ('nap_duration', 1, 7),
        ('sleep_lost', 0, 1440),
        ('sleep_needed', 0, 24),
        ('sleep_partner', 1, 6),
        ('sleep_time_workday', 0, 24),
        ('sleep_time_weekend', 0, 24),
        ('wake_up_choices', 1, 10),
        ('wake_ups', 0, 30),
        ('weekly_naps', 1, 5),
        ('noise_light', 0, 1),
        ('stress_thinking', 0, 1),
        ('other_person', 0, 1),
        ('pain_discomfort', 0, 1),
        ('nightmares', 0, 1),
        ('bathroom_urges', 0, 1),
        ('other_reasons', 0, 1),
    ]
}




In [4]:
# function for min-max normalisation

def min_max_normalisation(series, min_val, max_val):
    """Linearly rescale values so that min_val maps to 0 and max_val maps to 1.

    Parameters
    ----------
    series
        Value(s) to rescale. Anything that supports subtracting and dividing
        by a number works (a pandas Series/column or a plain number).
    min_val
        Value that should map to 0.
    max_val
        Value that should map to 1.

    Returns
    -------
    ``(series - min_val) / (max_val - min_val)``, in the same form as
    ``series``. Inputs outside ``[min_val, max_val]`` are not clipped, so
    they map outside ``[0, 1]``.

    Raises
    ------
    ValueError
        If ``max_val`` equals ``min_val``. The range would be zero and the
        division would otherwise fill a pandas column with inf/NaN silently.
    """
    value_range = max_val - min_val
    if value_range == 0:
        raise ValueError(
            f"max_val must differ from min_val (both are {min_val!r})"
        )
    return (series - min_val) / value_range

In [5]:
# Rescale every column that has an entry in min_max_values, overwriting the
# column in df with its normalised values.
# Note: this updates df in place, so running the cell again rescales the
# already-normalised values a second time.

for feature, bounds in min_max_values.items():
    df[feature] = min_max_normalisation(df[feature], bounds['min'], bounds['max'])

In [6]:
# Print the first ten rows of the frame after normalisation
print(df.head(n=10))

                          participantId  bodyMass_kg  height_m       bmi  \
0  00a55fb5-da33-4e2e-ae61-28f589fcc174     0.080718  0.578971  0.101313   
1  00fd4039-9b5e-4bbb-8295-4983a3f58371     0.094290  0.606985  0.107675   
2  02d5125e-684f-4166-a3b7-5df1bcfc1661     0.094290  0.625662  0.101342   
3  080292d4-a0b0-4dd7-a7dd-191c8ac71664     0.121435  0.663015  0.116226   
4  080bacc1-4661-4735-acca-7d27ad1a4192     0.142863  0.663015  0.136735   
5  0b79acd2-ea0b-406f-aa41-e48b98f19bb4     0.114291  0.616324  0.126591   
6  0f00f803-6e81-4b77-9efc-3be73b5bface     0.155357  0.681691  0.140657   
7  0f279ffd-a1b5-473f-9cfc-01e5510ea5a4     0.175723  0.663015  0.168185   
8  156fe69a-1c98-4337-b310-aee985e94279     0.095718  0.597647  0.112748   
9  1aaad641-a3b6-4e47-a010-e8cb01ce3d03     0.129150  0.644338  0.130879   

   mean_hr/s  hr_duration_(s)   max_hr   min_hr  totalSteps  steps_per_hour  \
0   0.359606         0.000544  0.44575  0.25425    0.018714        0.079119   
1   0

In [7]:
# Write the normalised frame to a CSV in the same folder as the input data;
# index=False leaves the row index out of the file.

output_path = folder_path + 'minmax_normalised_data.csv'
df.to_csv(output_path, index=False)