In [1]:
# Add src folder to Python path so db_config can be imported
import sys
import os

# Resolve the src folder once and add it to the import path only when it is
# not already present, so re-running this cell does not accumulate duplicates.
SRC_DIR = os.path.abspath("../src")
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [2]:
# Used to execute SQL queries and show results
from sqlalchemy import text
from db_config import engine
import pandas as pd

# Table record counts


In [None]:

def record_count(table):
    """Return the number of rows in ``table``.

    Parameters
    ----------
    table : str
        Name of the table to count, optionally schema-qualified
        (``schema.table``).

    Returns
    -------
    The scalar value produced by ``SELECT COUNT(*)`` for that table.

    Raises
    ------
    ValueError
        If ``table`` is not a plain SQL identifier.
    """
    import re  # local import so this cell does not depend on the import cell

    # A table name cannot be sent as a bound parameter, so it has to be
    # interpolated into the statement. Restrict it to plain identifiers first
    # so that arbitrary SQL cannot be smuggled in through the argument.
    if not isinstance(table, str) or not re.fullmatch(
        r"[A-Za-z_][A-Za-z0-9_$]*(\.[A-Za-z_][A-Za-z0-9_$]*)?", table
    ):
        raise ValueError(f"Invalid table name: {table!r}")

    with engine.connect() as conn:
        return conn.execute(text(f"SELECT COUNT(*) FROM {table}")).scalar()

#### `record_count` is a reusable function that takes a table name as a parameter and returns the number of records in that table

In [13]:
# Spot-check the helper on a single table.
record_count('sleepday_merged')

413

In [None]:


def get_all_tables(db_name="fitbit_db"):
    """List the base tables of a database schema.

    Parameters
    ----------
    db_name : str, optional
        Schema to inspect in ``information_schema.tables``. Defaults to
        ``"fitbit_db"``, the value that was previously hard-coded.

    Returns
    -------
    list
        The first column (``table_name``) of every matching row; only rows
        whose ``table_type`` is ``'BASE TABLE'`` are selected.
    """
    query = text("""
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = :db_name
          AND table_type = 'BASE TABLE'
    """)
    with engine.connect() as conn:
        # The schema name is passed as a bound parameter, not interpolated.
        rows = conn.execute(query, {"db_name": db_name})
        # Build the list while the connection is still open.
        return [row[0] for row in rows]


#### The `get_all_tables` function returns all existing tables in the fitbit_db database

In [22]:
# Fetch the table names once so later cells can reuse the list.
tables=get_all_tables()

>* We have stored all table names in the variable `tables`.
>* We use a for loop to go through `tables` and print the record count of each table.

In [28]:
# Row count for every table, keyed by table name.
# Named `table_counts` rather than `dict` so the built-in `dict` type is not
# shadowed for the rest of the kernel session.
table_counts = {}
for table in tables:
    res = record_count(table)
    table_counts[table] = res
    print(f'{table} :  {res}')
    print('-'*40)



dailyactivity_merged :  940
----------------------------------------
dailycalories_merged :  940
----------------------------------------
dailyintensities_merged :  940
----------------------------------------
dailysteps_merged :  940
----------------------------------------
heartrate_seconds_merged :  2483658
----------------------------------------
hourlycalories_merged :  22099
----------------------------------------
hourlyintensities_merged :  22099
----------------------------------------
hourlysteps_merged :  22099
----------------------------------------
minutecaloriesnarrow_merged :  1325580
----------------------------------------
minutecalorieswide_merged :  21645
----------------------------------------
minuteintensitiesnarrow_merged :  1325580
----------------------------------------
minuteintensitieswide_merged :  21645
----------------------------------------
minutemetsnarrow_merged :  1325580
----------------------------------------
minutesleep_merged :  188521
--------

# Unique user counts

In [35]:
# Count the distinct user IDs present in the daily activity table.
unique_users_sql = text("SELECT COUNT(DISTINCT ID) FROM dailyactivity_merged")
with engine.connect() as conn:
    result = conn.execute(unique_users_sql).scalar()
print(f'Unique Users:{result}')


Unique Users:33


# Date coverage per table

In [38]:
# Earliest and latest ACTIVITYDATE values in the daily activity table.
date_range_sql = text("SELECT MIN(ACTIVITYDATE),MAX(ACTIVITYDATE)  FROM dailyactivity_merged")
with engine.connect() as conn:
    result = conn.execute(date_range_sql).fetchall()
result

[(datetime.date(2016, 4, 12), datetime.date(2016, 5, 12))]

#### We found that the data was collected from 2016-04-12 to 2016-05-12 (31 days)

# Distributions of steps, calories, sleep

In [56]:
# Mean daily calories per user.
# The rows are loaded into the DataFrame inside the `with` block: fetching
# after the block has exited is not guaranteed to work, because it depends on
# the driver having buffered the rows before the connection was released.
# The column alias is `mean_calories` (previously misspelled 'mean colories'),
# which also matches the snake_case aliases used by the other summary cells.
with engine.connect() as conn:
    result = conn.execute(
        text("SELECT id, avg(Calories) as mean_calories FROM dailyactivity_merged group by id")
    )
    df = pd.DataFrame(result.fetchall(), columns=result.keys())
df

Unnamed: 0,id,mean colories
0,1503960366,1816.4194
1,1624580081,1483.3548
2,1644430081,2811.3
3,1844505072,1573.4839
4,1927972279,2172.8065
5,2022484408,2509.9677
6,2026352035,1540.6452
7,2320127002,1724.1613
8,2347167796,2043.4444
9,2873212765,1916.9677


In [52]:
# Mean daily step count per user.
# The rows are loaded into the DataFrame inside the `with` block: fetching
# after the block has exited is not guaranteed to work, because it depends on
# the driver having buffered the rows before the connection was released.
with engine.connect() as conn:
    result = conn.execute(
        text("SELECT id,avg(TotalSteps) as mean_steps  FROM dailyactivity_merged group by id")
    )
    df = pd.DataFrame(result.fetchall(), columns=result.keys())
df

Unnamed: 0,id,mean_steps
0,1503960366,12116.7419
1,1624580081,5743.9032
2,1644430081,7282.9667
3,1844505072,2580.0645
4,1927972279,916.129
5,2022484408,11370.6452
6,2026352035,5566.871
7,2320127002,4716.871
8,2347167796,9519.6667
9,2873212765,7555.7742


In [57]:
# Display the table names gathered earlier, for reference.
tables

['dailyactivity_merged',
 'dailycalories_merged',
 'dailyintensities_merged',
 'dailysteps_merged',
 'heartrate_seconds_merged',
 'hourlycalories_merged',
 'hourlyintensities_merged',
 'hourlysteps_merged',
 'minutecaloriesnarrow_merged',
 'minutecalorieswide_merged',
 'minuteintensitiesnarrow_merged',
 'minuteintensitieswide_merged',
 'minutemetsnarrow_merged',
 'minutesleep_merged',
 'minutestepsnarrow_merged',
 'minutestepswide_merged',
 'sleepday_merged',
 'weightloginfo_merged']

In [59]:
# Mean TotalTimeInBed per user from the sleep table.
# The rows are loaded into the DataFrame inside the `with` block: fetching
# after the block has exited is not guaranteed to work, because it depends on
# the driver having buffered the rows before the connection was released.
with engine.connect() as conn:
    result = conn.execute(
        text("SELECT id,avg(TotalTimeInBed) as mean_TotalTimeInBed FROM sleepday_merged group by id")
    )
    df = pd.DataFrame(result.fetchall(), columns=result.keys())
df

Unnamed: 0,id,mean_TotalTimeInBed
0,1503960366,383.2
1,1644430081,346.0
2,1844505072,961.0
3,1927972279,437.8
4,2026352035,537.6429
5,2320127002,69.0
6,2347167796,491.3333
7,3977333714,461.1429
8,4020332650,379.75
9,4319703577,501.9615


# min and max steps per day by each user

In [12]:
# Days on which each user recorded their highest and lowest step totals.
# RANK() gives tied rows the same rank, so a user whose extreme value occurs
# on several days contributes one row per tied day.
# The rows are loaded into the DataFrame inside the `with` block: fetching
# after the block has exited is not guaranteed to work, because it depends on
# the driver having buffered the rows before the connection was released.
with engine.connect() as conn:
    result = conn.execute(
        text("""
            SELECT ID, ACTIVITYDAY, STEPTOTAL
            FROM (
                SELECT ID, ACTIVITYDAY, STEPTOTAL,
                       RANK() OVER (PARTITION BY ID ORDER BY STEPTOTAL DESC) AS MAX_STEPS,
                       RANK() OVER (PARTITION BY ID ORDER BY STEPTOTAL) AS MINI_STEPS
                FROM dailysteps_merged
            ) AS MAX_MIN
            WHERE MAX_STEPS = 1 OR MINI_STEPS = 1
            ORDER BY ID;
        """)
    )
    df = pd.DataFrame(result.fetchall(), columns=result.keys())
df

Unnamed: 0,ID,ACTIVITYDAY,STEPTOTAL
0,1503960366,2016-05-12,0
1,1503960366,2016-04-27,18134
2,1624580081,2016-04-15,1510
3,1624580081,2016-05-01,36019
4,1644430081,2016-04-21,1223
...,...,...,...
123,8792009665,2016-05-09,0
124,8792009665,2016-05-10,0
125,8792009665,2016-04-29,8360
126,8877689391,2016-05-02,4790


In [13]:
# One row per user: highest and lowest daily step totals and the day of each.
# ROW_NUMBER() picks exactly one row per extreme, and the conditional
# aggregates pivot those two rows into a single row per user.
# NOTE(review): the window ordering uses STEPTOTAL only, so when several days
# tie for an extreme the reported day is whichever row the database numbers
# first -- add ACTIVITYDAY to the ORDER BY if a deterministic day is required.
# The rows are loaded into the DataFrame inside the `with` block: fetching
# after the block has exited is not guaranteed to work, because it depends on
# the driver having buffered the rows before the connection was released.
with engine.connect() as conn:
    result = conn.execute(
        text("""
            SELECT ID,
                   MAX(CASE WHEN rn_max = 1 THEN STEPTOTAL END) AS MAX_STEPS,
                   MAX(CASE WHEN rn_max = 1 THEN ACTIVITYDAY END) AS MAX_DAY,
                   MAX(CASE WHEN rn_min = 1 THEN STEPTOTAL END) AS MIN_STEPS,
                   MAX(CASE WHEN rn_min = 1 THEN ACTIVITYDAY END) AS MIN_DAY
            FROM (
                SELECT *,
                       ROW_NUMBER() OVER (PARTITION BY ID ORDER BY STEPTOTAL DESC) AS rn_max,
                       ROW_NUMBER() OVER (PARTITION BY ID ORDER BY STEPTOTAL ASC)  AS rn_min
                FROM dailysteps_merged
            ) t
            GROUP BY ID;
        """)
    )
    df = pd.DataFrame(result.fetchall(), columns=result.keys())
df

Unnamed: 0,ID,MAX_STEPS,MAX_DAY,MIN_STEPS,MIN_DAY
0,1503960366,18134,2016-04-27,0,2016-05-12
1,1624580081,36019,2016-05-01,1510,2016-04-15
2,1644430081,18213,2016-04-30,1223,2016-04-21
3,1844505072,8054,2016-04-21,0,2016-04-24
4,1927972279,3790,2016-05-02,0,2016-04-16
5,2022484408,18387,2016-04-30,3292,2016-05-08
6,2026352035,12357,2016-04-23,254,2016-05-10
7,2320127002,10725,2016-04-12,772,2016-05-01
8,2347167796,22244,2016-04-16,42,2016-04-29
9,2873212765,9685,2016-04-16,2524,2016-04-17


In [29]:
# Days with at most this many steps count as low-step days.
LOW_STEP_THRESHOLD = 500
# Users with at least this many low-step days are labelled INCONSISTENT_USER.
INCONSISTENT_MIN_DAYS = 5

# Per user: number of days at or below the threshold, split into zero-step
# days and days with 1..threshold steps, plus a status label.
# Users with no day at or below the threshold are filtered out by the WHERE
# clause and therefore do not appear in the result.
# The status labels are single-quoted string literals so the query does not
# depend on double quotes being treated as strings by the server's SQL mode.
# The rows are loaded into the DataFrame inside the `with` block: fetching
# after the block has exited is not guaranteed to work, because it depends on
# the driver having buffered the rows before the connection was released.
with engine.connect() as conn:
    result = conn.execute(
        text("""
            WITH LOWSTEPS AS (
                SELECT ID,
                       COUNT(*) AS TOTAL_LOW_STEPS_DAYS,
                       SUM(CASE WHEN TOTALSTEPS = 0 THEN 1 ELSE 0 END) AS ZERO_STEPS,
                       SUM(CASE WHEN TOTALSTEPS <= :low_steps AND TOTALSTEPS > 0 THEN 1 ELSE 0 END) AS LOW_STEPS
                FROM dailyactivity_merged
                WHERE TOTALSTEPS <= :low_steps
                GROUP BY ID
            )
            SELECT *,
                   CASE WHEN TOTAL_LOW_STEPS_DAYS >= :min_days
                        THEN 'INCONSISTENT_USER' ELSE 'NORMAL_USER' END AS USER_STATUS
            FROM LOWSTEPS;
        """),
        {"low_steps": LOW_STEP_THRESHOLD, "min_days": INCONSISTENT_MIN_DAYS},
    )
    df = pd.DataFrame(result.fetchall(), columns=result.keys())
df

Unnamed: 0,ID,TOTAL_LOW_STEPS_DAYS,ZERO_STEPS,LOW_STEPS,USER_STATUS
0,1503960366,1,1,0,NORMAL_USER
1,1844505072,14,10,4,INCONSISTENT_USER
2,1927972279,18,14,4,INCONSISTENT_USER
3,2026352035,1,0,1,NORMAL_USER
4,2347167796,1,0,1,NORMAL_USER
5,4020332650,18,14,4,INCONSISTENT_USER
6,4057192912,1,1,0,NORMAL_USER
7,4319703577,2,0,2,NORMAL_USER
8,4702921684,1,1,0,NORMAL_USER
9,5577150313,2,2,0,NORMAL_USER


>* Here we have extracted the number of low-step and zero-step days per user.
>* A day is counted as a low-step day when TotalSteps is less than or equal to 500; zero-step days are included in that total and also reported separately.
>* We have further classified users as NORMAL_USER or INCONSISTENT_USER.
>* An INCONSISTENT_USER is a user whose total number of low-step days is greater than or equal to 5.

# **calories**

In [33]:
# One row per user: highest and lowest daily calories and the day of each.
# ROW_NUMBER() picks exactly one row per extreme, and the conditional
# aggregates pivot those two rows into a single row per user.
# The rows are loaded into the DataFrame inside the `with` block: fetching
# after the block has exited is not guaranteed to work, because it depends on
# the driver having buffered the rows before the connection was released.
with engine.connect() as conn:
    result = conn.execute(
        text("""
            SELECT
                ID,
                MAX(CASE WHEN rn_max = 1 THEN CALORIES END) AS MAX_CALORIES,
                MAX(CASE WHEN rn_max = 1 THEN ActivityDate END) AS MAX_DAY,
                MAX(CASE WHEN rn_min = 1 THEN CALORIES END) AS MIN_CALORIES,
                MAX(CASE WHEN rn_min = 1 THEN ActivityDate END) AS MIN_DAY
            FROM (
                SELECT *,
                       ROW_NUMBER() OVER (PARTITION BY ID ORDER BY CALORIES DESC) AS rn_max,
                       ROW_NUMBER() OVER (PARTITION BY ID ORDER BY CALORIES ASC)  AS rn_min
                FROM dailyactivity_merged
            ) t
            GROUP BY ID;
        """)
    )
    df = pd.DataFrame(result.fetchall(), columns=result.keys())
df


Unnamed: 0,ID,MAX_CALORIES,MAX_DAY,MIN_CALORIES,MIN_DAY
0,1503960366,2159,2016-04-27,0,2016-05-12
1,1624580081,2690,2016-05-01,1002,2016-05-12
2,1644430081,3846,2016-04-30,1276,2016-05-11
3,1844505072,2130,2016-04-14,665,2016-05-12
4,1927972279,2638,2016-04-26,1383,2016-05-12
5,2022484408,3158,2016-04-21,1848,2016-05-08
6,2026352035,1926,2016-05-05,1141,2016-05-10
7,2320127002,2124,2016-04-12,1125,2016-05-12
8,2347167796,2670,2016-04-16,403,2016-04-29
9,2873212765,2241,2016-04-22,1431,2016-05-12


>* We have created a table that shows each user's minimum and maximum calories and the corresponding days.

# **Time in bed**

In [38]:
# One row per user: longest and shortest TotalTimeInBed and the day of each.
# ROW_NUMBER() picks exactly one row per extreme, and the conditional
# aggregates pivot those two rows into a single row per user.
# The rows are loaded into the DataFrame inside the `with` block: fetching
# after the block has exited is not guaranteed to work, because it depends on
# the driver having buffered the rows before the connection was released.
with engine.connect() as conn:
    result = conn.execute(
        text("""
            SELECT
                ID,
                MAX(CASE WHEN rn_max = 1 THEN TotalTimeInBed END) AS MAX_TimeInBed,
                MAX(CASE WHEN rn_max = 1 THEN SleepDay END) AS MAX_DAY,
                MAX(CASE WHEN rn_min = 1 THEN TotalTimeInBed END) AS MIN_TimeInBed,
                MAX(CASE WHEN rn_min = 1 THEN SleepDay END) AS MIN_DAY
            FROM (
                SELECT *,
                       ROW_NUMBER() OVER (PARTITION BY ID ORDER BY TotalTimeInBed DESC) AS rn_max,
                       ROW_NUMBER() OVER (PARTITION BY ID ORDER BY TotalTimeInBed ASC)  AS rn_min
                FROM sleepday_merged
            ) t
            GROUP BY ID;
        """)
    )
    df = pd.DataFrame(result.fetchall(), columns=result.keys())
df


Unnamed: 0,ID,MAX_TimeInBed,MAX_DAY,MIN_TimeInBed,MIN_DAY
0,1503960366,712,2016-04-17,264,2016-05-05
1,1644430081,961,2016-05-02,127,2016-04-29
2,1844505072,961,2016-04-15,961,2016-04-15
3,1927972279,775,2016-04-12,178,2016-04-28
4,2026352035,607,2016-04-30,380,2016-05-10
5,2320127002,69,2016-04-23,69,2016-04-23
6,2347167796,602,2016-04-17,386,2016-04-23
7,3977333714,626,2016-05-01,305,2016-04-20
8,4020332650,541,2016-04-12,77,2016-04-16
9,4319703577,722,2016-04-23,65,2016-04-21


>* We have created a table that shows each user's minimum and maximum time in bed and the corresponding days.