In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np

from tasrif.processing_pipeline import ProcessingPipeline
from tasrif.processing_pipeline.pandas import ConvertToDatetimeOperator, SortOperator, ReplaceOperator
from tasrif.processing_pipeline.pandas import DropDuplicatesOperator, DropNAOperator, DropFeaturesOperator
from tasrif.processing_pipeline.custom import OneHotEncoderOperator

from tasrif.data_readers.sleep_health import MyFamilyDataset


In [2]:
def col_stats(df):
    """Print the row count of ``df`` and missing-value statistics per column.

    For every column, one line is printed with the number of null entries,
    the number of non-null entries, the total number of rows and the
    percentage of null entries formatted with two decimals.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        None. The report is written to standard output.
    """
    n_rows = len(df)
    print('Some important stats:')
    print('\t- This dataset contains', n_rows, ' rows.')
    for col in df.columns:
        # Compute each statistic once and reuse it for both the count and the ratio.
        n_missing = df[col].isnull().sum()
        n_present = df[col].count()
        # A frame without rows has no meaningful ratio; report 0.00 instead of
        # dividing by zero (which would print "nan" and emit a RuntimeWarning).
        missing_ratio = n_missing / n_rows * 100 if n_rows else 0.0
        null_percentage = "{:.2f}".format(missing_ratio)
        print('\t - ``', col, '`` has', n_missing, 'NAs (', n_present, '/', n_rows, ') =',
              null_percentage, '%')

In [3]:
# Full MyFamilyDataset: build the reader with `pipeline=None` and work on a
# copy of its raw frame so the reader's own `raw_df` is left untouched.
# NOTE(review): presumably `pipeline=None` disables all processing — confirm
# against the MyFamilyDataset implementation.
mf = MyFamilyDataset(
    shc_folder="../../data/sleephealth/",
    pipeline=None,
)
df = mf.raw_df.copy()

# Print the dimensions, then let the notebook render the first rows.
print("Shape:", df.shape)
df.head()

Shape: (3003, 6)


Unnamed: 0,participantId,fam_history,family_size,language,underage_family,timestamp
0,6da7e848-daaa-410c-a83f-35b63498595c,15,3.0,1.0,2.0,2016-03-02T00:08:56-05:00
1,8fbc2a05-b7d2-41b6-a111-7b38006c86f5,125,2.0,1.0,1.0,2016-04-30T00:09:26-04:00
2,5f4851c0-149d-45ca-8c30-fafbfe2684dc,100,2.0,1.0,1.0,2016-03-04T08:56:13-06:00
3,bff5f14d-51b4-4162-9031-6132a7ffdf68,100,5.0,1.0,2.0,2016-03-04T07:01:05-08:00
4,58817d18-21a9-4f39-bd42-85af388b8eef,100,5.0,1.0,4.0,2016-03-06T19:40:43-05:00


In [4]:
# Report per-column null counts and percentages for the raw (unprocessed) frame.
col_stats(df)

Some important stats:
	- This dataset contains 3003  rows.
	 - `` participantId `` has 0 NAs ( 3003 / 3003 ) = 0.00 %
	 - `` fam_history `` has 15 NAs ( 2988 / 3003 ) = 0.50 %
	 - `` family_size `` has 7 NAs ( 2996 / 3003 ) = 0.23 %
	 - `` language `` has 4 NAs ( 2999 / 3003 ) = 0.13 %
	 - `` underage_family `` has 9 NAs ( 2994 / 3003 ) = 0.30 %
	 - `` timestamp `` has 0 NAs ( 3003 / 3003 ) = 0.00 %


In [5]:
# Default pipeline for MyFamilyDataset.
# Steps, in order: parse `timestamp` as UTC datetimes, sort by participant and
# timestamp, keep the last row per participant, replace selected answer codes
# with NaN, drop rows with NaN in any survey column, one-hot encode the survey
# columns, and finally drop the `fam_history=200` feature.
survey_features = ("fam_history", "family_size", "language", "underage_family")

# Answer codes that are turned into NaN so that DropNAOperator removes those rows.
# NOTE(review): presumably these codes stand for "don't know / no answer" —
# confirm against the survey codebook. Note that the `fam_history` key is the
# string "200" while the other columns use integer keys.
codes_to_nan = {
    "fam_history": {"200": np.nan},
    "family_size": {6: np.nan},
    "language": {5: np.nan},
    "underage_family": {6: np.nan},
}

pipeline_steps = [
    ConvertToDatetimeOperator(feature_names="timestamp", format="%Y-%m-%dT%H:%M:%S%z", utc=True),
    SortOperator(by=["participantId", "timestamp"]),
    DropDuplicatesOperator(subset="participantId", keep="last"),
    ReplaceOperator(to_replace=codes_to_nan),
    # Each operator receives its own list, exactly as in the inline version.
    DropNAOperator(subset=list(survey_features)),
    OneHotEncoderOperator(feature_names=list(survey_features), drop_first=False),
    # NOTE(review): "200" is already replaced with NaN above; verify whether
    # this column can still exist at this point.
    DropFeaturesOperator(["fam_history=200"]),
]
pipeline = ProcessingPipeline(pipeline_steps)

# Re-read the dataset with the pipeline attached and inspect the processed frame.
mypipe = MyFamilyDataset(
    shc_folder="../../data/sleephealth/",
    pipeline=pipeline,
)
df_piped = mypipe.processed_dataframe()

print("Shape:", df_piped.shape)
df_piped.head()

Shape: (2695, 21)


Unnamed: 0,participantId,timestamp,fam_history=0,fam_history=1,fam_history=100,fam_history=2,fam_history=5,family_size=1.0,family_size=2.0,family_size=3.0,...,family_size=5.0,language=1.0,language=2.0,language=3.0,language=4.0,underage_family=1.0,underage_family=2.0,underage_family=3.0,underage_family=4.0,underage_family=5.0
2341,0018174a-1f74-49f5-ba81-9cba70401b1f,2017-05-16 13:02:58+00:00,1,1,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
2927,00a3be71-6b4c-46c5-9e50-acf2db62f27b,2019-02-01 15:11:11+00:00,1,1,0,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
509,00a55fb5-da33-4e2e-ae61-28f589fcc174,2016-05-13 11:08:09+00:00,1,1,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
794,00c13261-dd38-4730-90c4-beb25ff35822,2016-07-24 16:20:08+00:00,0,1,0,0,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0
1446,00dbfe89-8c89-4933-9e84-bb8624787026,2016-03-29 13:55:35+00:00,0,0,1,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
