In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


from src.features import calc_hr_zone, calc_calories_burning_intensity

### Import preprocessed dataset
*Dataset processed in preprocessing.ipynb*

In [3]:
df_activities = pd.read_csv("data_files/activities_silver.csv")

* Display data

In [4]:
df_activities

Unnamed: 0,activity_type,distance,calories,avg_hr,avg_speed,elapsed_time,indoor_activity
0,cycling,5.25,169,128.0,3.81,4956.0,0
1,cycling,7.08,183,107.0,5.44,4685.0,0
2,cycling,0.00,454,141.0,0.00,3310.0,1
3,running,7.02,437,161.0,9.55,2646.0,0
4,cycling,4.00,85,101.0,3.06,4701.0,0
...,...,...,...,...,...,...,...
259,walking,5.04,226,99.0,3.66,4951.0,1
260,cycling,17.08,304,132.0,12.70,4841.0,1
261,cycling,14.42,282,132.0,11.01,4716.0,1
262,walking,3.28,182,99.0,4.28,2756.0,1


* Create *hr_zone* colum, using *calc_hr_zone* function from src/features.py

In [5]:
df_activities['hr_zone'] = df_activities.apply(lambda row: calc_hr_zone(row['avg_hr']), axis=1)

In [6]:
df_activities

Unnamed: 0,activity_type,distance,calories,avg_hr,avg_speed,elapsed_time,indoor_activity,hr_zone
0,cycling,5.25,169,128.0,3.81,4956.0,0,2
1,cycling,7.08,183,107.0,5.44,4685.0,0,1
2,cycling,0.00,454,141.0,0.00,3310.0,1,3
3,running,7.02,437,161.0,9.55,2646.0,0,4
4,cycling,4.00,85,101.0,3.06,4701.0,0,1
...,...,...,...,...,...,...,...,...
259,walking,5.04,226,99.0,3.66,4951.0,1,1
260,cycling,17.08,304,132.0,12.70,4841.0,1,3
261,cycling,14.42,282,132.0,11.01,4716.0,1,3
262,walking,3.28,182,99.0,4.28,2756.0,1,1


* Create *burned_calories_per_minute* colum, using *calc_calories_burning_intensity* function from src/features.py

In [7]:
df_activities['burned_calories_per_minute'] = df_activities.apply(lambda row: calc_calories_burning_intensity(row['calories'], row['elapsed_time']), axis=1)

* Split dataset to features (X) and target value (y)

In [9]:
X_df_activities = df_activities[['activity_type', 'distance', 'avg_hr', 'avg_speed',
       'elapsed_time', 'indoor_activity', 'hr_zone',
       'burned_calories_per_minute']]
y_df_activities = df_activities['calories']

* One Hot Encoding - transform string type column to binary data 

In [10]:
X_df_activities = pd.get_dummies(X_df_activities, columns=['activity_type'], drop_first=True).astype(int)

In [11]:
X_df_activities

Unnamed: 0,distance,avg_hr,avg_speed,elapsed_time,indoor_activity,hr_zone,burned_calories_per_minute,activity_type_running,activity_type_walking
0,5,128,3,4956,0,2,2,0,0
1,7,107,5,4685,0,1,2,0,0
2,0,141,0,3310,1,3,8,0,0
3,7,161,9,2646,0,4,9,1,0
4,4,101,3,4701,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...
259,5,99,3,4951,1,1,2,0,1
260,17,132,12,4841,1,3,3,0,0
261,14,132,11,4716,1,3,3,0,0
262,3,99,4,2756,1,1,3,0,1


* Split data to train and test sets

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_df_activities, y_df_activities, test_size=0.2, random_state=42)

* Scale numeric columns 

In [13]:
cols_to_scale = ['distance', 'avg_hr', 'avg_speed', 'elapsed_time', 'burned_calories_per_minute']
scaler = StandardScaler()

* Train scaler to get standard deviation and mean, but only on X_train

In [14]:
scaler.fit(X_train[cols_to_scale])

0,1,2
,copy,True
,with_mean,True
,with_std,True


* Transform train and test datasets using scaler

In [15]:
X_train[cols_to_scale] = scaler.transform(X_train[cols_to_scale])
X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

* Export final datasets to CSV file

In [17]:
X_train.to_csv("data_files/X_train.csv", index=False)
X_test.to_csv("data_files/X_test.csv", index=False)
y_train.to_csv("data_files/y_train.csv", index=False)
y_test.to_csv("data_files/y_test.csv", index=False)