In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np

from tasrif.processing_pipeline import ProcessingPipeline
from tasrif.processing_pipeline.pandas import ConvertToDatetimeOperator, SortOperator, ReplaceOperator
from tasrif.processing_pipeline.pandas import DropDuplicatesOperator, DropNAOperator, DropFeaturesOperator
from tasrif.processing_pipeline.custom import OneHotEncoderOperator

from tasrif.data_readers.sleep_health import ResearchInterestDataset


In [2]:
def col_stats(df):
    """Print the row count and a per-column missing-value summary of ``df``.

    One line is printed per column showing the number of null entries, the
    non-null count over the total number of rows, and the percentage of rows
    that are null (formatted with two decimals).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. An empty frame is accepted; its columns are
        reported with a null percentage of ``0.00``.

    Returns
    -------
    None
        The summary is written to stdout only.
    """
    n_rows = len(df)
    print('Some important stats:')
    print('\t- This dataset contains', n_rows, ' rows.')
    for col in df.columns:
        column = df[col]
        # Compute each figure once so the count and the percentage can never disagree.
        n_null = column.isnull().sum()
        n_present = column.count()
        # With zero rows the ratio would be 0/0 (a RuntimeWarning and "nan");
        # report 0 % missing instead.
        null_fraction = n_null / n_rows if n_rows else 0.0
        null_percentage = "{:.2f}".format(null_fraction * 100)
        print('\t - ``', col, '`` has', n_null, 'NAs (', n_present, '/', n_rows, ') =',
              null_percentage, '%')

In [3]:
# Full ResearchInterestDataset, read without a processing pipeline (pipeline=None).
# NOTE(review): presumably pipeline=None makes the reader skip all processing — confirm
# against ResearchInterestDataset.
mf = ResearchInterestDataset(shc_folder="../../data/sleephealth/", pipeline=None)
# Explore a copy so nothing done to `df` below can alter the reader's own raw_df.
df = mf.raw_df.copy()
print("Shape:", df.shape)
# Last expression of the cell: shows the first rows of the raw frame.
df.head()

Shape: (2359, 9)


Unnamed: 0,participantId,contact_method,research_experience,two_surveys_perday,blood_sample,taking_medication,family_survey,hospital_stay,timestamp
0,6da7e848-daaa-410c-a83f-35b63498595c,12,1.0,2.0,2.0,2.0,2.0,2.0,2016-03-03T06:55:58-05:00
1,710f0db5-0f76-4cbf-a073-33522eafe998,1,2.0,2.0,1.0,1.0,1.0,1.0,2016-03-11T05:48:13+08:00
2,f2514967-9173-4834-96f7-0acdd0298e84,1,3.0,2.0,1.0,1.0,1.0,2.0,2016-03-03T07:21:59-08:00
3,cd9cf981-4a00-4915-8ca3-164bf6550b5b,12,1.0,2.0,2.0,2.0,2.0,2.0,2016-03-03T16:17:02-06:00
4,9ecd89b1-7a6b-4abc-8b76-7d5f884d8328,1,2.0,2.0,1.0,2.0,1.0,1.0,2016-03-20T19:41:07Z


In [4]:
# Per-column NA counts of the raw frame, before any cleaning is applied.
col_stats(df)

Some important stats:
	- This dataset contains 2359  rows.
	 - `` participantId `` has 0 NAs ( 2359 / 2359 ) = 0.00 %
	 - `` contact_method `` has 8 NAs ( 2351 / 2359 ) = 0.34 %
	 - `` research_experience `` has 4 NAs ( 2355 / 2359 ) = 0.17 %
	 - `` two_surveys_perday `` has 8 NAs ( 2351 / 2359 ) = 0.34 %
	 - `` blood_sample `` has 6 NAs ( 2353 / 2359 ) = 0.25 %
	 - `` taking_medication `` has 7 NAs ( 2352 / 2359 ) = 0.30 %
	 - `` family_survey `` has 10 NAs ( 2349 / 2359 ) = 0.42 %
	 - `` hospital_stay `` has 11 NAs ( 2348 / 2359 ) = 0.47 %
	 - `` timestamp `` has 0 NAs ( 2359 / 2359 ) = 0.00 %


In [5]:
# Number of distinct participantId values; comparing it with the total row count
# shows how many rows share a participantId with another row.
print("Shape after dropping duplicate participants:", df["participantId"].drop_duplicates().shape)

Shape after dropping duplicate participants: (2192,)


In [15]:
# Default Pipeline
# Survey answer columns that are NA-filtered and then one-hot encoded below.
survey_answer_cols = [
    'contact_method', 'research_experience', 'two_surveys_perday',
    'blood_sample', 'taking_medication', 'family_survey', 'hospital_stay',
]

# Parse the ISO-8601 timestamp strings into UTC datetimes.
parse_timestamp = ConvertToDatetimeOperator(
    feature_names="timestamp",
    format="%Y-%m-%dT%H:%M:%S%z",
    utc=True,
)
# Order rows per participant by time, then keep only the last row of each participant.
order_by_participant_time = SortOperator(by=["participantId", "timestamp"])
keep_last_per_participant = DropDuplicatesOperator(subset="participantId", keep="last")
# Replace research_experience == 3 with NaN so the NA filter below drops those rows.
# NOTE(review): presumably code 3 means "no answer" — confirm against the survey codebook.
blank_research_experience_3 = ReplaceOperator(
    to_replace={"research_experience": {3: np.nan}},
)
# Each operator receives its own copy of the column list.
drop_incomplete_rows = DropNAOperator(subset=list(survey_answer_cols))
encode_answers = OneHotEncoderOperator(
    feature_names=list(survey_answer_cols),
    drop_first=True,
)

pipeline = ProcessingPipeline([
    parse_timestamp,
    order_by_participant_time,
    keep_last_per_participant,
    blank_research_experience_3,
    drop_incomplete_rows,
    encode_answers,
])

# Re-read the dataset, this time with the pipeline attached, and fetch the processed frame.
mypipe = ResearchInterestDataset(shc_folder="../../data/sleephealth/", pipeline=pipeline)
df_piped = mypipe.processed_dataframe()
print("Shape:", df_piped.shape)
df_piped.head()




Shape: (2072, 21)


Unnamed: 0,participantId,timestamp,contact_method=2,contact_method=3,contact_method=4,contact_method=5,contact_method=6,contact_method=7,contact_method=8,contact_method=9,...,two_surveys_perday=2.0,two_surveys_perday=3.0,blood_sample=2.0,blood_sample=3.0,taking_medication=2.0,taking_medication=3.0,family_survey=2.0,family_survey=3.0,hospital_stay=2.0,hospital_stay=3.0
2279,00a3be71-6b4c-46c5-9e50-acf2db62f27b,2019-02-05 15:59:17+00:00,1,0,0,0,0,0,0,0,...,1,0,0,1,1,0,0,1,1,0
1120,00c13261-dd38-4730-90c4-beb25ff35822,2016-07-24 16:20:51+00:00,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,1,0,0,0
431,00d1fe00-fa24-4dcf-a8e8-baafab0cf945,2016-03-10 04:51:42+00:00,0,0,0,0,0,0,0,0,...,1,0,1,0,1,0,1,0,0,0
1185,00dbfe89-8c89-4933-9e84-bb8624787026,2016-03-31 01:13:50+00:00,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,1,0,0,0
2099,00fd4039-9b5e-4bbb-8295-4983a3f58371,2018-08-17 12:26:52+00:00,1,0,0,0,0,0,0,0,...,1,0,1,0,1,0,1,0,1,0
