# Import and data type correction
> Importing libraries and correcting datatypes of the data columns


In [None]:
import os
import numpy as np
import pandas as pd
from google.colab import drive
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from datetime import datetime, timedelta
import pickle
import glob
from sklearn.impute import KNNImputer

!pip install reportlab pillow
from reportlab.platypus import SimpleDocTemplate, Paragraph, Image, Spacer, Table, TableStyle
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib import colors

# Mount Google Drive inside the Colab runtime; force_remount=True asks for a fresh mount
# even when the mount point is already in use.
drive.mount('/content/gdrive/', force_remount=True)
# Switch the working directory to the shared data folder so the relative CSV/TXT
# file names used by the cells below resolve against it.
os.chdir("/content/gdrive/Shareddrives/STA 221/data")

Mounted at /content/gdrive/


# Converting all activity columns (`activity-*`) in the log-transformed dataset to one-hot encoded format

## kNN Imputed dataset

In [None]:
# Load the kNN-imputed table for this section (per the section heading, the log-transformed variant).
# low_memory=False has pandas parse the file in one pass instead of in internal chunks.
# NOTE(review): the saved output further down shows an 'Unnamed: 0' column, which suggests the CSV
# was written with its index; consider index_col=0 here — confirm nothing downstream relies on it.
df = pd.read_csv("train_knn_imputed_final.csv", low_memory=False)
# (uncomment a bare `df` line here to preview the loaded frame)

In [None]:
# Build the activity vocabulary from activities.txt (one activity name per line).
with open('activities.txt', 'r') as file:
    # Iterate the file directly; trim surrounding whitespace, skip blank lines (a blank
    # line would otherwise become an empty activity name and later an `activity-*_`
    # column), and replace spaces with underscores.
    activities = [line.strip().replace(' ', '_') for line in file if line.strip()]

print(activities)

# Add the 'No_Activity' placeholder exactly once, at the end of the list, so the
# vocabulary never contains a duplicate label even if the file already lists it.
if 'No_Activity' not in activities:
    activities.append('No_Activity')

# print the updated list of activities
print("\nUpdated Activities:")
print(activities)
print(len(activities))


['Indoor_climbing', 'Run', 'Strength_training', 'Swim', 'Bike', 'Dancing', 'Stairclimber', 'Spinning', 'Walking', 'HIIT', 'Outdoor_Bike', 'Walk', 'Aerobic_Workout', 'Tennis', 'Workout', 'Hike', 'Zumba', 'Sport', 'Yoga', 'Swimming', 'Weights', 'Running']

Updated Activities:
['Indoor_climbing', 'Run', 'Strength_training', 'Swim', 'Bike', 'Dancing', 'Stairclimber', 'Spinning', 'Walking', 'HIIT', 'Outdoor_Bike', 'Walk', 'Aerobic_Workout', 'Tennis', 'Workout', 'Hike', 'Zumba', 'Sport', 'Yoga', 'Swimming', 'Weights', 'Running', 'No_Activity']
23


In [None]:
# Collect every `activity-*` column of the frame.
activity_columns = [col for col in df.columns if col.startswith('activity-')]

# One-hot encode each activity column against the fixed `activities` vocabulary so
# every column yields the same indicator columns in the same order.
one_hot_blocks = []
for col in activity_columns:
    # Normalise the raw labels to the vocabulary's spelling before comparing:
    #   * missing entries become 'No_Activity' (a missing value never equals any label,
    #     so without this the No_Activity indicator would always be 0);
    #   * spaces inside a label become underscores. `.str.replace` works on substrings,
    #     whereas `Series.replace(' ', '_')` only rewrites cells that are exactly ' ',
    #     which left multi-word activities such as "Strength training" unmatched.
    labels = (
        df[col]
        .astype(object)
        .where(df[col].notna(), 'No_Activity')
        .astype(str)
        .str.replace(' ', '_', regex=False)
    )

    # One 0/1 indicator column per vocabulary entry, aligned on the frame's index.
    one_hot_blocks.append(
        pd.DataFrame(
            {f"{col}_{activity}": (labels == activity).astype(int) for activity in activities},
            index=df.index,
        )
    )

# Replace the raw activity columns with their indicator blocks in a single concat.
# Concatenating inside the loop re-copies the ever-wider frame once per column; the
# resulting column order is the same (remaining columns first, then one block per
# activity column in its original order).
df = pd.concat([df.drop(columns=activity_columns), *one_hot_blocks], axis=1)

# print the updated dataFrame shape
print(f"DataFrame shape after one-hot encoding: {df.shape}")

# save the updated DataFrame (optional)
df.to_csv('train_knn_imputed_final_with_1hot_encoding.csv', index=False)
print("Updated dataset saved as 'train_knn_imputed_final_with_1hot_encoding.csv'")
df

DataFrame shape after one-hot encoding: (177024, 2093)
Updated dataset saved as 'train_knn_imputed_final_with_1hot_encoding.csv'


Unnamed: 0,id,p_num,time,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,...,activity-0:00_Tennis,activity-0:00_Workout,activity-0:00_Hike,activity-0:00_Zumba,activity-0:00_Sport,activity-0:00_Yoga,activity-0:00_Swimming,activity-0:00_Weights,activity-0:00_Running,activity-0:00_No_Activity
0,p01_0,p01,2024-11-25 06:10:00,2.076938,2.199444,2.312535,2.236445,2.195000,2.322388,2.362739,...,0,0,0,0,0,0,0,0,0,0
1,p01_1,p01,2024-11-25 06:25:00,2.306577,2.324347,2.322388,2.276241,2.399712,2.272126,2.362739,...,0,0,0,0,0,0,0,0,0,0
2,p01_2,p01,2024-11-25 06:40:00,1.827770,2.056685,2.272126,1.865629,2.069391,2.219203,2.362739,...,0,0,0,0,0,0,0,0,0,0
3,p01_3,p01,2024-11-25 06:55:00,1.827770,2.177022,2.219203,2.020222,2.188296,2.186051,2.362739,...,0,0,0,0,0,0,0,0,0,0
4,p01_4,p01,2024-11-25 07:10:00,2.403335,2.118662,2.186051,2.358965,2.165619,2.151762,2.362739,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177019,p12_25294,p12,2024-11-25 23:35:00,2.230014,2.261763,2.272126,2.292535,2.332144,2.370244,2.388763,...,0,0,0,0,0,0,0,0,0,0
177020,p12_25295,p12,2024-11-25 23:40:00,2.261763,2.272126,2.292535,2.332144,2.370244,2.388763,2.379546,...,0,0,0,0,0,0,0,0,0,0
177021,p12_25296,p12,2024-11-25 23:45:00,2.272126,2.292535,2.332144,2.370244,2.388763,2.379546,2.360854,...,0,0,0,0,0,0,0,0,0,0
177022,p12_25297,p12,2024-11-25 23:50:00,2.292535,2.332144,2.370244,2.388763,2.379546,2.360854,2.351375,...,0,0,0,0,0,0,0,0,0,0


## Forward/Backward Fill Imputed dataset

In [None]:
# Load the forward/backward-fill imputed table for this section (per the enclosing heading,
# the log-transformed variant). low_memory=False has pandas parse the file in one pass
# instead of in internal chunks.
# NOTE(review): the saved output further down shows an 'Unnamed: 0' column, which suggests the CSV
# was written with its index; consider index_col=0 here — confirm nothing downstream relies on it.
df = pd.read_csv("train_ffill_bfill_final.csv", low_memory=False)
# (uncomment a bare `df` line here to preview the loaded frame)

In [None]:
# Build the activity vocabulary from activities.txt (one activity name per line).
with open('activities.txt', 'r') as file:
    # Iterate the file directly; trim surrounding whitespace, skip blank lines (a blank
    # line would otherwise become an empty activity name and later an `activity-*_`
    # column), and replace spaces with underscores.
    activities = [line.strip().replace(' ', '_') for line in file if line.strip()]

print(activities)

# Add the 'No_Activity' placeholder exactly once, at the end of the list, so the
# vocabulary never contains a duplicate label even if the file already lists it.
if 'No_Activity' not in activities:
    activities.append('No_Activity')

# print the updated list of activities
print("\nUpdated Activities:")
print(activities)
print(len(activities))


['Indoor_climbing', 'Run', 'Strength_training', 'Swim', 'Bike', 'Dancing', 'Stairclimber', 'Spinning', 'Walking', 'HIIT', 'Outdoor_Bike', 'Walk', 'Aerobic_Workout', 'Tennis', 'Workout', 'Hike', 'Zumba', 'Sport', 'Yoga', 'Swimming', 'Weights', 'Running']

Updated Activities:
['Indoor_climbing', 'Run', 'Strength_training', 'Swim', 'Bike', 'Dancing', 'Stairclimber', 'Spinning', 'Walking', 'HIIT', 'Outdoor_Bike', 'Walk', 'Aerobic_Workout', 'Tennis', 'Workout', 'Hike', 'Zumba', 'Sport', 'Yoga', 'Swimming', 'Weights', 'Running', 'No_Activity']
23


In [None]:
# Collect every `activity-*` column of the frame.
activity_columns = [col for col in df.columns if col.startswith('activity-')]

# One-hot encode each activity column against the fixed `activities` vocabulary so
# every column yields the same indicator columns in the same order.
one_hot_blocks = []
for col in activity_columns:
    # Normalise the raw labels to the vocabulary's spelling before comparing:
    #   * missing entries become 'No_Activity' (a missing value never equals any label,
    #     so without this the No_Activity indicator would always be 0);
    #   * spaces inside a label become underscores. `.str.replace` works on substrings,
    #     whereas `Series.replace(' ', '_')` only rewrites cells that are exactly ' ',
    #     which left multi-word activities such as "Strength training" unmatched.
    labels = (
        df[col]
        .astype(object)
        .where(df[col].notna(), 'No_Activity')
        .astype(str)
        .str.replace(' ', '_', regex=False)
    )

    # One 0/1 indicator column per vocabulary entry, aligned on the frame's index.
    one_hot_blocks.append(
        pd.DataFrame(
            {f"{col}_{activity}": (labels == activity).astype(int) for activity in activities},
            index=df.index,
        )
    )

# Replace the raw activity columns with their indicator blocks in a single concat.
# Concatenating inside the loop re-copies the ever-wider frame once per column; the
# resulting column order is the same (remaining columns first, then one block per
# activity column in its original order).
df = pd.concat([df.drop(columns=activity_columns), *one_hot_blocks], axis=1)

# print the updated dataFrame shape
print(f"DataFrame shape after one-hot encoding: {df.shape}")

# save the updated DataFrame (optional)
df.to_csv('train_ffill_bfill_final_with_1hot_encoding.csv', index=False)
print("Updated dataset saved as 'train_ffill_bfill_final_with_1hot_encoding.csv'")
df

DataFrame shape after one-hot encoding: (177024, 2093)
Updated dataset saved as 'train_ffill_bfill_final_with_1hot_encoding.csv'


Unnamed: 0,id,p_num,time,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,...,activity-0:00_Tennis,activity-0:00_Workout,activity-0:00_Hike,activity-0:00_Zumba,activity-0:00_Sport,activity-0:00_Yoga,activity-0:00_Swimming,activity-0:00_Weights,activity-0:00_Running,activity-0:00_No_Activity
0,p01_0,p01,2024-11-25 06:10:00,2.163323,2.660260,2.312535,2.163323,2.631889,2.322388,2.163323,...,0,0,0,0,0,0,0,0,0,0
1,p01_1,p01,2024-11-25 06:25:00,2.163323,2.660260,2.322388,2.163323,2.631889,2.272126,2.163323,...,0,0,0,0,0,0,0,0,0,0
2,p01_2,p01,2024-11-25 06:40:00,2.163323,2.660260,2.272126,2.163323,2.631889,2.219203,2.163323,...,0,0,0,0,0,0,0,0,0,0
3,p01_3,p01,2024-11-25 06:55:00,2.163323,2.660260,2.219203,2.163323,2.631889,2.186051,2.163323,...,0,0,0,0,0,0,0,0,0,0
4,p01_4,p01,2024-11-25 07:10:00,2.163323,2.660260,2.186051,2.163323,2.631889,2.151762,2.163323,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177019,p12_25294,p12,2024-11-25 23:35:00,2.230014,2.261763,2.272126,2.292535,2.332144,2.370244,2.388763,...,0,0,0,0,0,0,0,0,0,0
177020,p12_25295,p12,2024-11-25 23:40:00,2.261763,2.272126,2.292535,2.332144,2.370244,2.388763,2.379546,...,0,0,0,0,0,0,0,0,0,0
177021,p12_25296,p12,2024-11-25 23:45:00,2.272126,2.292535,2.332144,2.370244,2.388763,2.379546,2.360854,...,0,0,0,0,0,0,0,0,0,0
177022,p12_25297,p12,2024-11-25 23:50:00,2.292535,2.332144,2.370244,2.388763,2.379546,2.360854,2.351375,...,0,0,0,0,0,0,0,0,0,0


# Converting all activity columns (`activity-*`) in the original dataset to one-hot encoded format

## kNN Imputed dataset

In [None]:
# Load the kNN-imputed table for this section (per the enclosing heading, the original,
# untransformed variant). low_memory=False has pandas parse the file in one pass
# instead of in internal chunks.
# NOTE(review): the saved output further down shows an 'Unnamed: 0' column, which suggests the CSV
# was written with its index; consider index_col=0 here — confirm nothing downstream relies on it.
df = pd.read_csv("train_knn_imputed.csv", low_memory=False)
# (uncomment a bare `df` line here to preview the loaded frame)

In [None]:
# Build the activity vocabulary from activities.txt (one activity name per line).
with open('activities.txt', 'r') as file:
    # Iterate the file directly; trim surrounding whitespace, skip blank lines (a blank
    # line would otherwise become an empty activity name and later an `activity-*_`
    # column), and replace spaces with underscores.
    activities = [line.strip().replace(' ', '_') for line in file if line.strip()]

print(activities)

# Add the 'No_Activity' placeholder exactly once, at the end of the list, so the
# vocabulary never contains a duplicate label even if the file already lists it.
if 'No_Activity' not in activities:
    activities.append('No_Activity')

# print the updated list of activities
print("\nUpdated Activities:")
print(activities)
print(len(activities))


['Indoor_climbing', 'Run', 'Strength_training', 'Swim', 'Bike', 'Dancing', 'Stairclimber', 'Spinning', 'Walking', 'HIIT', 'Outdoor_Bike', 'Walk', 'Aerobic_Workout', 'Tennis', 'Workout', 'Hike', 'Zumba', 'Sport', 'Yoga', 'Swimming', 'Weights', 'Running']

Updated Activities:
['Indoor_climbing', 'Run', 'Strength_training', 'Swim', 'Bike', 'Dancing', 'Stairclimber', 'Spinning', 'Walking', 'HIIT', 'Outdoor_Bike', 'Walk', 'Aerobic_Workout', 'Tennis', 'Workout', 'Hike', 'Zumba', 'Sport', 'Yoga', 'Swimming', 'Weights', 'Running', 'No_Activity']
23


In [None]:
# Collect every `activity-*` column of the frame.
activity_columns = [col for col in df.columns if col.startswith('activity-')]

# One-hot encode each activity column against the fixed `activities` vocabulary so
# every column yields the same indicator columns in the same order.
one_hot_blocks = []
for col in activity_columns:
    # Normalise the raw labels to the vocabulary's spelling before comparing:
    #   * missing entries become 'No_Activity' (a missing value never equals any label,
    #     so without this the No_Activity indicator would always be 0);
    #   * spaces inside a label become underscores. `.str.replace` works on substrings,
    #     whereas `Series.replace(' ', '_')` only rewrites cells that are exactly ' ',
    #     which left multi-word activities such as "Strength training" unmatched.
    labels = (
        df[col]
        .astype(object)
        .where(df[col].notna(), 'No_Activity')
        .astype(str)
        .str.replace(' ', '_', regex=False)
    )

    # One 0/1 indicator column per vocabulary entry, aligned on the frame's index.
    one_hot_blocks.append(
        pd.DataFrame(
            {f"{col}_{activity}": (labels == activity).astype(int) for activity in activities},
            index=df.index,
        )
    )

# Replace the raw activity columns with their indicator blocks in a single concat.
# Concatenating inside the loop re-copies the ever-wider frame once per column; the
# resulting column order is the same (remaining columns first, then one block per
# activity column in its original order).
df = pd.concat([df.drop(columns=activity_columns), *one_hot_blocks], axis=1)

# print the updated dataFrame shape
print(f"DataFrame shape after one-hot encoding: {df.shape}")

# save the updated DataFrame (optional)
df.to_csv('train_knn_imputed_original_with_1hot_encoding.csv', index=False)
print("Updated dataset saved as 'train_knn_imputed_original_with_1hot_encoding.csv'")
df

DataFrame shape after one-hot encoding: (177024, 2093)
Updated dataset saved as 'train_knn_imputed_original_with_1hot_encoding.csv'


Unnamed: 0,id,p_num,time,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,...,activity-0:00_Tennis,activity-0:00_Workout,activity-0:00_Hike,activity-0:00_Zumba,activity-0:00_Sport,activity-0:00_Yoga,activity-0:00_Swimming,activity-0:00_Weights,activity-0:00_Running,activity-0:00_No_Activity
0,p01_0,p01,2024-11-25 06:10:00,7.48,8.52,9.6,8.86,8.48,9.7,10.12,...,0,0,0,0,0,0,0,0,0,0
1,p01_1,p01,2024-11-25 06:25:00,9.54,9.72,9.7,9.24,10.52,9.2,10.12,...,0,0,0,0,0,0,0,0,0,0
2,p01_2,p01,2024-11-25 06:40:00,5.72,7.32,9.2,5.96,7.42,8.7,10.12,...,0,0,0,0,0,0,0,0,0,0
3,p01_3,p01,2024-11-25 06:55:00,5.72,8.32,8.7,7.04,8.42,8.4,10.12,...,0,0,0,0,0,0,0,0,0,0
4,p01_4,p01,2024-11-25 07:10:00,10.56,7.82,8.4,10.08,8.22,8.1,10.12,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177019,p12_25294,p12,2024-11-25 23:35:00,8.80,9.10,9.2,9.40,9.80,10.2,10.40,...,0,0,0,0,0,0,0,0,0,0
177020,p12_25295,p12,2024-11-25 23:40:00,9.10,9.20,9.4,9.80,10.20,10.4,10.30,...,0,0,0,0,0,0,0,0,0,0
177021,p12_25296,p12,2024-11-25 23:45:00,9.20,9.40,9.8,10.20,10.40,10.3,10.10,...,0,0,0,0,0,0,0,0,0,0
177022,p12_25297,p12,2024-11-25 23:50:00,9.40,9.80,10.2,10.40,10.30,10.1,10.00,...,0,0,0,0,0,0,0,0,0,0


## Forward/Backward Fill Imputed dataset

In [None]:
# Load the forward/backward-fill imputed table for this section (per the enclosing heading,
# the original, untransformed variant). low_memory=False has pandas parse the file in one
# pass instead of in internal chunks.
# NOTE(review): the other sections' saved outputs show an 'Unnamed: 0' column after the same kind
# of load, which suggests these CSVs were written with their index; consider index_col=0 — confirm.
df = pd.read_csv("train_ffill_bfill.csv", low_memory=False)
# (uncomment a bare `df` line here to preview the loaded frame)

In [None]:
# Build the activity vocabulary from activities.txt (one activity name per line).
with open('activities.txt', 'r') as file:
    # Iterate the file directly; trim surrounding whitespace, skip blank lines (a blank
    # line would otherwise become an empty activity name and later an `activity-*_`
    # column), and replace spaces with underscores.
    activities = [line.strip().replace(' ', '_') for line in file if line.strip()]

print(activities)

# Add the 'No_Activity' placeholder exactly once, at the end of the list, so the
# vocabulary never contains a duplicate label even if the file already lists it.
if 'No_Activity' not in activities:
    activities.append('No_Activity')

# print the updated list of activities
print("\nUpdated Activities:")
print(activities)
print(len(activities))


['Indoor_climbing', 'Run', 'Strength_training', 'Swim', 'Bike', 'Dancing', 'Stairclimber', 'Spinning', 'Walking', 'HIIT', 'Outdoor_Bike', 'Walk', 'Aerobic_Workout', 'Tennis', 'Workout', 'Hike', 'Zumba', 'Sport', 'Yoga', 'Swimming', 'Weights', 'Running']

Updated Activities:
['Indoor_climbing', 'Run', 'Strength_training', 'Swim', 'Bike', 'Dancing', 'Stairclimber', 'Spinning', 'Walking', 'HIIT', 'Outdoor_Bike', 'Walk', 'Aerobic_Workout', 'Tennis', 'Workout', 'Hike', 'Zumba', 'Sport', 'Yoga', 'Swimming', 'Weights', 'Running', 'No_Activity']
23


In [None]:
# Collect every `activity-*` column of the frame.
activity_columns = [col for col in df.columns if col.startswith('activity-')]

# One-hot encode each activity column against the fixed `activities` vocabulary so
# every column yields the same indicator columns in the same order.
one_hot_blocks = []
for col in activity_columns:
    # Normalise the raw labels to the vocabulary's spelling before comparing:
    #   * missing entries become 'No_Activity' (a missing value never equals any label,
    #     so without this the No_Activity indicator would always be 0);
    #   * spaces inside a label become underscores. `.str.replace` works on substrings,
    #     whereas `Series.replace(' ', '_')` only rewrites cells that are exactly ' ',
    #     which left multi-word activities such as "Strength training" unmatched.
    labels = (
        df[col]
        .astype(object)
        .where(df[col].notna(), 'No_Activity')
        .astype(str)
        .str.replace(' ', '_', regex=False)
    )

    # One 0/1 indicator column per vocabulary entry, aligned on the frame's index.
    one_hot_blocks.append(
        pd.DataFrame(
            {f"{col}_{activity}": (labels == activity).astype(int) for activity in activities},
            index=df.index,
        )
    )

# Replace the raw activity columns with their indicator blocks in a single concat.
# Concatenating inside the loop re-copies the ever-wider frame once per column; the
# resulting column order is the same (remaining columns first, then one block per
# activity column in its original order).
df = pd.concat([df.drop(columns=activity_columns), *one_hot_blocks], axis=1)

# print the updated dataFrame shape
print(f"DataFrame shape after one-hot encoding: {df.shape}")

# save the updated DataFrame (optional)
df.to_csv('train_ffill_bfill_original_with_1hot_encoding.csv', index=False)
print("Updated dataset saved as 'train_ffill_bfill_original_with_1hot_encoding.csv'")
df

DataFrame shape after one-hot encoding: (177024, 2093)
