In [217]:
import pandas as pd

import warnings
# NOTE(review): this silences every warning category for the whole session, which also
# hides pandas' SettingWithCopyWarning / FutureWarning — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [218]:
# Load the raw sessions; 'Unnamed: 0' is dropped (presumably a leftover index column
# from an earlier export — confirm against the CSV).
df = pd.read_csv('data/sample_data.csv').drop(columns=['Unnamed: 0'])

# Filter extreme cases: keep only logical smartphone interactions (< 30 minutes).
# `.copy()` makes df_duration an independent frame, so the columns added to it in later
# cells are written to df_duration itself instead of to a slice of `df`.
df_duration = df[df['duration_min'] < 30].copy()

# Quantile classes based on duration 

In [219]:
# Cut points for the duration classes: 33rd, 66th and 98th percentiles plus the maximum.
quantiles = df_duration.duration_min.quantile(q=[0.33, 0.66, 0.98, 1])

In [220]:
# Display the duration cut points so the class boundaries can be read off.
quantiles

0.33     0.333
0.66     1.167
0.98    12.683
1.00    29.533
Name: duration_min, dtype: float64

In [221]:
# Labeling of groups: give every session a duration class based on the quantile cut points.
# Start from the widest class and overwrite with progressively narrower ones, so each row
# ends with the lowest class whose upper bound it satisfies.
#
# `.loc[mask, column] = value` assigns in a single indexing step. The previous
# `frame[column][mask] = value` form is chained assignment: pandas does not guarantee that
# it writes back to the frame, and under copy-on-write it never does.
df_duration['duration_group'] = 'extreme'
df_duration.loc[df_duration['duration_min'] <= quantiles[.98], 'duration_group'] = 'high'
df_duration.loc[df_duration['duration_min'] <= quantiles[.66], 'duration_group'] = 'mid'
df_duration.loc[df_duration['duration_min'] <= quantiles[.33], 'duration_group'] = 'low'
df_duration.head(10)

Unnamed: 0,user_id,date,year,month,day,hour,minute,second,hour_period,duration_min,start_time,end_time,session,duration_group
0,8953,2016-12-24,2016,Dec,Saturday,6,11,31,06:00-06:30,1.5,2016-12-24 06:11:31,2016-12-24 06:13:01,1,high
1,3633,2017-02-08,2017,Feb,Wednesday,16,22,53,16:00-16:30,0.333,2017-02-08 16:22:53,2017-02-08 16:23:13,1,low
2,3633,2016-12-09,2016,Dec,Friday,17,36,52,17:30-18:00,0.167,2016-12-09 17:36:52,2016-12-09 17:37:02,1,low
3,3633,2018-06-03,2018,Jun,Sunday,21,20,16,21:00-21:30,0.167,2018-06-03 21:20:16,2018-06-03 21:20:26,1,low
4,3633,2016-09-16,2016,Sep,Friday,9,16,39,09:00-09:30,0.167,2016-09-16 09:16:39,2016-09-16 09:16:49,1,low
5,3633,2018-08-16,2018,Aug,Thursday,22,25,49,22:00-22:30,1.167,2018-08-16 22:25:49,2018-08-16 22:26:59,1,mid
6,3633,2016-08-06,2016,Aug,Saturday,13,0,21,13:00-13:30,0.667,2016-08-06 13:00:21,2016-08-06 13:01:01,1,mid
7,3633,2017-04-13,2017,Apr,Thursday,14,6,17,14:00-14:30,1.667,2017-04-13 14:06:17,2017-04-13 14:07:57,1,high
8,3633,2018-04-01,2018,Apr,Sunday,14,35,26,14:30-15:00,1.167,2018-04-01 14:35:26,2018-04-01 14:36:36,1,mid
9,3633,2017-11-04,2017,Nov,Saturday,14,39,36,14:30-15:00,0.167,2017-11-04 14:39:36,2017-11-04 14:39:46,1,low


# Quantile classes based on Daily Sessions. [Idea: classify the sessions not only per day but also per 30-minute period]

In [222]:
# One row per user, date and 30-minute period, holding the number of (non-null)
# duration entries recorded in that period.
df_session = (
    df_duration
    .groupby(['user_id', 'date', 'hour_period'])['duration_min']
    .count()
    .reset_index(name='30min sessions')
)
df_session.head(5)

Unnamed: 0,user_id,date,hour_period,30min sessions
0,389,2017-06-03,23:00-23:30,1
1,389,2017-06-04,12:00-12:30,1
2,389,2017-06-04,13:30-14:00,1
3,389,2017-06-04,14:00-14:30,1
4,389,2017-06-04,15:30-16:00,1


In [223]:
# Cut points for the per-period session counts (replaces the duration cut points held
# in `quantiles`); the trailing expression displays them.
quantiles = df_session['30min sessions'].quantile(q=[0.33, 0.66, 0.98, 1])
quantiles

0.33    1.0
0.66    1.0
0.98    2.0
1.00    4.0
Name: 30min sessions, dtype: float64

In [224]:
# Labeling of groups: classify every (user, date, 30-minute period) row by its session count.
# Widest class first, then overwrite with narrower ones so the lowest matching class wins.
#
# `.loc[mask, column] = value` replaces the chained `frame[column][mask] = value` form,
# which pandas does not guarantee to write back (and never does under copy-on-write).
df_session['session_group'] = 'extreme'
df_session.loc[df_session['30min sessions'] <= quantiles[.98], 'session_group'] = 'high'
df_session.loc[df_session['30min sessions'] <= quantiles[.66], 'session_group'] = 'mid'
df_session.loc[df_session['30min sessions'] <= quantiles[.33], 'session_group'] = 'low'

## Stacked bar chart for daily sessions

In [225]:
# Inspect the classified 30-minute periods of user 389 on 2017-07-02.
df_session.loc[
    (df_session['user_id'] == 389) & (df_session['date'] == '2017-07-02')
]

Unnamed: 0,user_id,date,hour_period,30min sessions,session_group
23,389,2017-07-02,04:00-04:30,2,high
24,389,2017-07-02,10:30-11:00,1,low
25,389,2017-07-02,12:30-13:00,1,low
26,389,2017-07-02,13:00-13:30,1,low
27,389,2017-07-02,14:00-14:30,2,high
28,389,2017-07-02,15:30-16:00,1,low
29,389,2017-07-02,17:30-18:00,1,low


In [226]:
import plotly.express as px

# Bar chart of the per-period session counts of user 389 on 2017-07-02,
# coloured by session class.
bar_chart_sessions_389 = df_session.loc[
    (df_session['user_id'] == 389) & (df_session['date'] == '2017-07-02')
]

fig = px.bar(
    bar_chart_sessions_389,
    x="hour_period",
    y="30min sessions",
    color="session_group",
    title="User 389 - 2017-07-02",
)
fig.show()

In [227]:
# Weekday order used by the per-weekday charts below (passed to plotly via `category_orders`).
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# The ordered-Categorical conversion of bar_chart_durations_389['day'] that used to live
# here was removed: that frame is only created further down the notebook, so the line
# raised NameError on a fresh kernel (Restart & Run All), and the frame is rebuilt from
# scratch before it is plotted, so the conversion never reached a chart.

In [228]:
import plotly.express as px

user = 389

# For every date of this user, count how many 30-minute periods fall in each session class.
bar_chart_daily_sessions_389 = (
    df_session[df_session['user_id'] == user]
    .groupby(['date', 'session_group'])['30min sessions']
    .count()
    .reset_index(name='Daily Sessions')
)

# The unused `long_df = px.data.medals_long()` (plotly's demo dataset) was dropped:
# nothing in the notebook reads it.

# Only rows 10-29 are plotted (presumably to keep the chart readable).
fig = px.bar(
    bar_chart_daily_sessions_389[10:30],
    x="date",
    y="Daily Sessions",
    color="session_group",
    title=f"User {user} - Daily Charts",
)
fig.show()

In [229]:
import plotly.express as px

user = 389

# Total per date across all session classes.
# The column is selected *before* summing: `.groupby('date').sum()` on the whole frame
# also aggregates the string column `session_group`, which is wasted work and depends on
# how the installed pandas version treats non-numeric columns.
bar_chart_daily_sessions_389_total_days = (
    bar_chart_daily_sessions_389
    .groupby('date')['Daily Sessions']
    .sum()
    .reset_index(name='Total Sessions')
)

# Attach the per-date total to every (date, class) row and turn counts into percentages.
bar_chart_daily_sessions_389_100pec = pd.merge(
    bar_chart_daily_sessions_389,
    bar_chart_daily_sessions_389_total_days,
    how='left',
    on='date',
)
bar_chart_daily_sessions_389_100pec['Percentage'] = (
    bar_chart_daily_sessions_389_100pec['Daily Sessions']
    / bar_chart_daily_sessions_389_100pec['Total Sessions']
) * 100

# NOTE(review): the positional slice [10:30] can cut a date's classes in half at either
# end, in which case that date's bar will not add up to 100 % — confirm this is acceptable.
fig = px.bar(
    bar_chart_daily_sessions_389_100pec[10:30],
    x="date",
    y="Percentage",
    color="session_group",
    title=f"User {user} - Daily Charts based on Session Classifications",
)
fig.show()

In [230]:
# do the same for days of week

In [231]:
# Hour labels handed to plotly's `category_orders` for the 'hour_period' axis.
# NOTE(review): these are one-hour labels, while the 'hour_period' values shown in the
# outputs above are 30-minute labels (e.g. '23:00-23:30') — confirm which is intended.
cats = [
    '00:00-01:00', '01:00-02:00', '02:00-03:00', '03:00-04:00', '04:00-05:00','05:00-06:00', '06:00-07:00', '07:00-08:00',
    '08:00-09:00', '09:00-10:00', '10:00-11:00','11:00-12:00', '12:00-13:00', '13:00-14:00','14:00-15:00', '15:00-16:00', 
    '16:00-17:00', '17:00-18:00', '18:00-19:00', '19:00-20:00','20:00-21:00', '21:00-22:00', '22:00-23:00','23:00-24:00'
    ]

# The Categorical conversion of df_session_daily['hour_period'] that used to follow was
# removed: df_session_daily is only created in the next cell, so the line raised NameError
# on a fresh kernel, and casting values to categories they do not match turns them into NaN.

In [232]:
import plotly.express as px

# Number of (non-null) duration entries per user, weekday and 30-minute period,
# pooled over all dates.
df_session_daily = df_duration.groupby(['user_id', 'day', 'hour_period'])['duration_min'].count().reset_index(name='30min sessions')
quantiles = df_session_daily['30min sessions'].quantile([.33, .66, .98, 1])

# Labeling of groups: widest class first, then overwrite with narrower ones.
# `.loc[mask, column] = value` replaces the chained `frame[column][mask] = value` form,
# which pandas does not guarantee to write back (and never does under copy-on-write).
df_session_daily['session_group'] = 'extreme'
df_session_daily.loc[df_session_daily['30min sessions'] <= quantiles[.98], 'session_group'] = 'high'
df_session_daily.loc[df_session_daily['30min sessions'] <= quantiles[.66], 'session_group'] = 'mid'
df_session_daily.loc[df_session_daily['30min sessions'] <= quantiles[.33], 'session_group'] = 'low'

# Monday profile of user 389.
bar_chart_sessions_389 = df_session_daily[(df_session_daily['user_id'] == 389) & (df_session_daily['day'] == 'Monday')]

fig = px.bar(
    bar_chart_sessions_389,
    x='hour_period',
    y="30min sessions",
    color="session_group",
    category_orders={"hour_period": cats, 'session_group': ['low', 'mid', 'high', 'extreme']},
    title="User 389 - Monday chart classification based on Session",
)
# Sort the x axis by label; this takes precedence over the 'hour_period' order above.
fig.update_xaxes(categoryorder='category ascending')
fig.show()

In [233]:
import plotly.express as px

# Number of (non-null) duration entries per user, weekday and 30-minute period,
# pooled over all dates.
df_session_daily = df_duration.groupby(['user_id', 'day', 'hour_period'])['duration_min'].count().reset_index(name='30min sessions')
quantiles = df_session_daily['30min sessions'].quantile([.33, .66, .98, 1])

# Labeling of groups: widest class first, then overwrite with narrower ones.
# `.loc[mask, column] = value` replaces the chained `frame[column][mask] = value` form,
# which pandas does not guarantee to write back (and never does under copy-on-write).
df_session_daily['session_group'] = 'extreme'
df_session_daily.loc[df_session_daily['30min sessions'] <= quantiles[.98], 'session_group'] = 'high'
df_session_daily.loc[df_session_daily['30min sessions'] <= quantiles[.66], 'session_group'] = 'mid'
df_session_daily.loc[df_session_daily['30min sessions'] <= quantiles[.33], 'session_group'] = 'low'

# All weekdays of user 389.
bar_chart_sessions_389 = df_session_daily[(df_session_daily['user_id'] == 389)]

# Sessions per weekday and session class. Only the plotted column is summed; summing the
# whole frame would also aggregate user_id and the string column hour_period.
test2 = (
    bar_chart_sessions_389
    .groupby(['day', 'session_group'])['30min sessions']
    .sum()
    .reset_index()
)

# The title used to say "Monday chart" although the x axis covers every weekday.
fig = px.bar(
    test2,
    x='day',
    y="30min sessions",
    color="session_group",
    category_orders={'day': days, 'session_group': ['low', 'mid', 'high', 'extreme']},
    title="User 389 - Weekly chart classification based on Session",
)
fig.show()

In [234]:
# Removed: this cell computed per-weekday duration percentages from
# bar_chart_durations_389, but that frame (and its 'day' column) is only created in the
# "Stacked bar chart for daily durations" section further down, so the cell raised
# NameError on a fresh kernel (Restart & Run All). Both variables it assigned are rebuilt
# by the next cell before anything reads them, and the same computation is performed, in
# the right place, by the last cell of the notebook.


In [235]:
import plotly.express as px

# Number of (non-null) duration entries per user, weekday and 30-minute period,
# pooled over all dates.
df_session_daily = df_duration.groupby(['user_id', 'day', 'hour_period'])['duration_min'].count().reset_index(name='30min sessions')
quantiles = df_session_daily['30min sessions'].quantile([.33, .66, .98, 1])

# Labeling of groups: widest class first, then overwrite with narrower ones.
# `.loc[mask, column] = value` replaces the chained `frame[column][mask] = value` form,
# which pandas does not guarantee to write back (and never does under copy-on-write).
df_session_daily['session_group'] = 'extreme'
df_session_daily.loc[df_session_daily['30min sessions'] <= quantiles[.98], 'session_group'] = 'high'
df_session_daily.loc[df_session_daily['30min sessions'] <= quantiles[.66], 'session_group'] = 'mid'
df_session_daily.loc[df_session_daily['30min sessions'] <= quantiles[.33], 'session_group'] = 'low'

# All weekdays of user 389.
bar_chart_sessions_389 = df_session_daily[(df_session_daily['user_id'] == 389)]

# Total sessions per weekday. The column is selected before summing so the string
# columns (hour_period, session_group) are not aggregated along the way.
bar_chart_daily_sessions_389_total_sessions = (
    bar_chart_sessions_389
    .groupby('day')['30min sessions']
    .sum()
    .reset_index(name='30min sessions')
)

# Both frames carry a '30min sessions' column, so the merge suffixes them:
# `_x` is the per-period count, `_y` the per-weekday total.
bar_chart_daily_sessions_389_100pec = pd.merge(
    bar_chart_sessions_389,
    bar_chart_daily_sessions_389_total_sessions,
    how='left',
    on='day',
)
bar_chart_daily_sessions_389_100pec['Percentage'] = (
    bar_chart_daily_sessions_389_100pec['30min sessions_x']
    / bar_chart_daily_sessions_389_100pec['30min sessions_y']
) * 100

# Share of each session class within every weekday (only the plotted column is summed).
test2 = (
    bar_chart_daily_sessions_389_100pec
    .groupby(['day', 'session_group'])['Percentage']
    .sum()
    .reset_index()
)

# The title used to say "Monday chart" although the x axis covers every weekday.
fig = px.bar(
    test2,
    x='day',
    y="Percentage",
    color="session_group",
    category_orders={'day': days, 'session_group': ['low', 'mid', 'high', 'extreme']},
    title="User 389 - Weekly chart classification based on Session",
)
fig.show()

## Stacked bar chart for daily durations

In [236]:
# Sum of duration_min per user, date and 30-minute period.
# NOTE(review): the result column is called '30min sessions' although it holds summed
# durations, not a session count.
df_duration_agg = (
    df_duration
    .groupby(['user_id', 'date', 'hour_period'])['duration_min']
    .sum()
    .reset_index(name='30min sessions')
)
df_duration_agg.head(5)

Unnamed: 0,user_id,date,hour_period,30min sessions
0,389,2017-06-03,23:00-23:30,0.333
1,389,2017-06-04,12:00-12:30,0.167
2,389,2017-06-04,13:30-14:00,0.167
3,389,2017-06-04,14:00-14:30,0.667
4,389,2017-06-04,15:30-16:00,0.55


In [237]:
# How many sessions of each duration class a user had in every 30-minute period of every
# date (count of non-null `session` values).
df_duration_group_agg = (
    df_duration
    .groupby(['user_id', 'date', 'hour_period', 'duration_group'])['session']
    .count()
    .reset_index(name='Times_In_Duration_Group')
)
df_duration_group_agg.head(5)

Unnamed: 0,user_id,date,hour_period,duration_group,Times_In_Duration_Group
0,389,2017-06-03,23:00-23:30,low,1
1,389,2017-06-04,12:00-12:30,low,1
2,389,2017-06-04,13:30-14:00,low,1
3,389,2017-06-04,14:00-14:30,mid,1
4,389,2017-06-04,15:30-16:00,mid,1


In [238]:
# Attach the per-class session counts to the per-period duration totals
# (left join on user, date and 30-minute period).
united = df_duration_agg.merge(
    df_duration_group_agg,
    how='left',
    on=['user_id', 'date', 'hour_period'],
)
united.head(5)

Unnamed: 0,user_id,date,hour_period,30min sessions,duration_group,Times_In_Duration_Group
0,389,2017-06-03,23:00-23:30,0.333,low,1
1,389,2017-06-04,12:00-12:30,0.167,low,1
2,389,2017-06-04,13:30-14:00,0.167,low,1
3,389,2017-06-04,14:00-14:30,0.667,mid,1
4,389,2017-06-04,15:30-16:00,0.55,mid,1


In [239]:

import plotly.express as px

# Per-period counts of each duration class for user 389 on 2017-07-02.
bar_chart_durations_389 = united.loc[
    (united['user_id'] == 389) & (united['date'] == '2017-07-02')
]

fig = px.bar(
    bar_chart_durations_389,
    x="hour_period",
    y="Times_In_Duration_Group",
    color="duration_group",
    title="User 389 - 2017-07-02",
)
fig.show()

In [240]:
import plotly.express as px

user = 389

# Number of sessions per date and duration class.
# `united` has one row per (date, 30-minute period, duration class) and
# Times_In_Duration_Group already holds the number of sessions in that row, so the daily
# figure is the SUM of that column. The previous `.count()` counted rows (periods) and
# under-reported any period that contained several sessions of the same class; the
# weekday percentage cells further down sum this column as well.
bar_chart_daily_sessions_389 = (
    united[united['user_id'] == user]
    .groupby(['date', 'duration_group'])['Times_In_Duration_Group']
    .sum()
    .reset_index(name='Daily Sessions')
)

# The unused `long_df = px.data.medals_long()` (plotly's demo dataset) was dropped:
# nothing in the notebook reads it.

# Only rows 10-32 are plotted (presumably to keep the chart readable).
fig = px.bar(
    bar_chart_daily_sessions_389[10:33],
    x="date",
    y="Daily Sessions",
    color="duration_group",
    title=f"User {user} - Daily Charts",
)
fig.show()

In [241]:
import plotly.express as px

user = 389

# Total per date across all duration classes.
# The column is selected *before* summing: `.groupby('date').sum()` on the whole frame
# also aggregates the string column `duration_group`.
bar_chart_daily_sessions_389_total_days = (
    bar_chart_daily_sessions_389
    .groupby('date')['Daily Sessions']
    .sum()
    .reset_index(name='Total Sessions')
)

# Attach the per-date total to every (date, class) row and turn counts into percentages.
bar_chart_daily_sessions_389_100pec = pd.merge(
    bar_chart_daily_sessions_389,
    bar_chart_daily_sessions_389_total_days,
    how='left',
    on='date',
)
bar_chart_daily_sessions_389_100pec['Percentage'] = (
    bar_chart_daily_sessions_389_100pec['Daily Sessions']
    / bar_chart_daily_sessions_389_100pec['Total Sessions']
) * 100

# NOTE(review): the positional slice [10:33] can cut a date's classes in half at either
# end, in which case that date's bar will not add up to 100 % — confirm this is acceptable.
# The title used to say "Session Classifications" although the bars are coloured by
# duration_group.
fig = px.bar(
    bar_chart_daily_sessions_389_100pec[10:33],
    x="date",
    y="Percentage",
    color="duration_group",
    title=f"User {user} - Daily Charts based on Duration Classifications",
)
fig.show()

In [242]:
# ability to choose hour range 

In [243]:
import plotly.express as px

# Sum of duration_min per user, weekday and 30-minute period (all dates pooled).
df_duration_agg = df_duration.groupby(['user_id', 'day', 'hour_period'])['duration_min'].sum().reset_index(name='30min sessions')

# Number of sessions of each duration class per user, weekday and 30-minute period.
df_duration_group_agg = df_duration.groupby(['user_id', 'day', 'hour_period', 'duration_group'])['session'].count().reset_index(name='Times_In_Duration_Group')

united = pd.merge(df_duration_agg, df_duration_group_agg, how='left', on=['user_id', 'day', 'hour_period'])

# Monday profile of user 389.
bar_chart_durations_389 = united[(united['user_id'] == 389) & (united['day'] == 'Monday')]

# The title used to read "User 389 - 2017-07-02" (copied from the single-date chart),
# but this frame is the Monday aggregate over all dates.
fig = px.bar(
    bar_chart_durations_389,
    x="hour_period",
    y="Times_In_Duration_Group",
    color="duration_group",
    title="User 389 - Monday sessions by duration group",
)
# Sort the x axis by label.
fig.update_xaxes(categoryorder='category ascending')
fig.show()

In [244]:
import plotly.express as px

# `user` keeps the value assigned in an earlier cell; it is only used in the title.

# Total number of sessions per 30-minute period, all duration classes together.
# The column is selected before summing so the string columns (day, duration_group)
# are not aggregated along the way.
bar_chart_daily_sessions_389_total_sessions = (
    bar_chart_durations_389
    .groupby('hour_period')['Times_In_Duration_Group']
    .sum()
    .reset_index(name='30min sessions')
)

# Both frames carry a '30min sessions' column, so the merge suffixes them:
# `_x` comes from bar_chart_durations_389, `_y` is the per-period total computed above.
bar_chart_daily_sessions_389_100pec = pd.merge(
    bar_chart_durations_389,
    bar_chart_daily_sessions_389_total_sessions,
    how='left',
    on='hour_period',
)
bar_chart_daily_sessions_389_100pec['Percentage'] = (
    bar_chart_daily_sessions_389_100pec['Times_In_Duration_Group']
    / bar_chart_daily_sessions_389_100pec['30min sessions_y']
) * 100

fig = px.bar(
    bar_chart_daily_sessions_389_100pec,
    x="hour_period",
    y="Percentage",
    color="duration_group",
    title=f"User {user} - Monday chart classification based on Duration",
    category_orders={'duration_group': ['low', 'mid', 'high', 'extreme']},
)
# Sort the x axis by label.
fig.update_xaxes(categoryorder='category ascending')
fig.show()

In [250]:
# Sum of duration_min per user, weekday and 30-minute period.
df_duration_agg = (
    df_duration
    .groupby(['user_id', 'day', 'hour_period'])['duration_min']
    .sum()
    .reset_index(name='30min sessions')
)

# Number of sessions of each duration class per user, weekday and 30-minute period.
df_duration_group_agg = (
    df_duration
    .groupby(['user_id', 'day', 'hour_period', 'duration_group'])['session']
    .count()
    .reset_index(name='Times_In_Duration_Group')
)

# Left join of the two aggregates on user, weekday and period.
united = df_duration_agg.merge(
    df_duration_group_agg,
    how='left',
    on=['user_id', 'day', 'hour_period'],
)

# Every weekday of user 389 (no weekday filter).
bar_chart_durations_389 = united[united['user_id'] == 389]

In [251]:
# import plotly.express as px

# # long_df = px.data.medals_long()

# fig = px.bar(bar_chart_durations_389, x="day", y="Times_In_Duration_Group", color="duration_group", title=f"User 389 - 2017-07-02", category_orders={'day': days, 'duration_group':['low', 'mid', 'high', 'extreme']})
# #fig.update_xaxes(categoryorder='category ascending')
# fig.show()

In [252]:
import plotly.express as px

# `user` keeps the value assigned in an earlier cell; it is only used in the title.

# Total number of sessions per weekday, all duration classes together.
# The column is selected before summing so the string columns (hour_period,
# duration_group) are not aggregated along the way.
bar_chart_daily_sessions_389_total_sessions = (
    bar_chart_durations_389
    .groupby('day')['Times_In_Duration_Group']
    .sum()
    .reset_index(name='30min sessions')
)

# Both frames carry a '30min sessions' column, so the merge suffixes them:
# `_x` comes from bar_chart_durations_389, `_y` is the per-weekday total computed above.
bar_chart_daily_sessions_389_100pec = pd.merge(
    bar_chart_durations_389,
    bar_chart_daily_sessions_389_total_sessions,
    how='left',
    on='day',
)
bar_chart_daily_sessions_389_100pec['Percentage'] = (
    bar_chart_daily_sessions_389_100pec['Times_In_Duration_Group']
    / bar_chart_daily_sessions_389_100pec['30min sessions_y']
) * 100

# Share of each duration class within every weekday (only the plotted column is summed).
test = (
    bar_chart_daily_sessions_389_100pec
    .groupby(['day', 'duration_group'])['Percentage']
    .sum()
    .reset_index()
)

# The title used to say "Monday chart" although the x axis covers every weekday.
fig = px.bar(
    test,
    x="day",
    y="Percentage",
    color="duration_group",
    title=f"User {user} - Weekly chart classification based on Duration",
    category_orders={'day': days, 'duration_group': ['low', 'mid', 'high', 'extreme']},
)
fig.show()