In [2]:
import pandas as pd

import warnings
# NOTE(review): with no category argument this hides every warning for the rest of the
# notebook, pandas warnings about chained assignment included. Consider narrowing the
# filter to the specific categories that are known to be noise.
warnings.filterwarnings("ignore")

In [3]:
# Interactions of 30 minutes or longer are treated as extreme cases and dropped,
# keeping only plausible smartphone interactions.
MAX_DURATION_MIN = 30

df = pd.read_csv('data/sample_data.csv').drop(columns=['Unnamed: 0'])
# .copy() makes df_duration an independent frame. Columns are added to it further down,
# and writing to a boolean-mask slice of df is what triggers pandas' copy/view warnings.
df_duration = df[df['duration_min'] < MAX_DURATION_MIN].copy()

# Quantile classes based on duration 

In [4]:
# Cut-offs for the duration classes: the 33rd, 66th and 98th percentiles plus the maximum.
quantile_levels = [.33, .66, .98, 1]
quantiles = df_duration['duration_min'].quantile(quantile_levels)

In [5]:
# Show the duration cut-offs that the labelling cell below compares against.
quantiles

0.33     0.333
0.66     1.167
0.98    12.683
1.00    29.533
Name: duration_min, dtype: float64

In [6]:
# Labeling of groups: classify every interaction by its duration.
# Each row starts as 'extreme'; progressively lower cut-offs then overwrite the label,
# so the order high -> mid -> low matters (a row ends with the tightest class it fits).
#
# .loc[mask, column] assigns in a single indexing step. The earlier chained form
# df[column][mask] = value writes to an intermediate object and does not update the
# frame when pandas Copy-on-Write is enabled.
duration_min = df_duration['duration_min']
df_duration['duration_group'] = 'extreme'
for level, label in [(.98, 'high'), (.66, 'mid'), (.33, 'low')]:
    df_duration.loc[duration_min <= quantiles[level], 'duration_group'] = label
df_duration.head(10)

Unnamed: 0,user_id,date,year,month,day,hour,minute,second,hour_period,duration_min,start_time,end_time,session,duration_group
0,8953,2016-12-24,2016,Dec,Saturday,6,11,31,06:00-06:30,1.5,2016-12-24 06:11:31,2016-12-24 06:13:01,1,high
1,3633,2017-02-08,2017,Feb,Wednesday,16,22,53,16:00-16:30,0.333,2017-02-08 16:22:53,2017-02-08 16:23:13,1,low
2,3633,2016-12-09,2016,Dec,Friday,17,36,52,17:30-18:00,0.167,2016-12-09 17:36:52,2016-12-09 17:37:02,1,low
3,3633,2018-06-03,2018,Jun,Sunday,21,20,16,21:00-21:30,0.167,2018-06-03 21:20:16,2018-06-03 21:20:26,1,low
4,3633,2016-09-16,2016,Sep,Friday,9,16,39,09:00-09:30,0.167,2016-09-16 09:16:39,2016-09-16 09:16:49,1,low
5,3633,2018-08-16,2018,Aug,Thursday,22,25,49,22:00-22:30,1.167,2018-08-16 22:25:49,2018-08-16 22:26:59,1,mid
6,3633,2016-08-06,2016,Aug,Saturday,13,0,21,13:00-13:30,0.667,2016-08-06 13:00:21,2016-08-06 13:01:01,1,mid
7,3633,2017-04-13,2017,Apr,Thursday,14,6,17,14:00-14:30,1.667,2017-04-13 14:06:17,2017-04-13 14:07:57,1,high
8,3633,2018-04-01,2018,Apr,Sunday,14,35,26,14:30-15:00,1.167,2018-04-01 14:35:26,2018-04-01 14:36:36,1,mid
9,3633,2017-11-04,2017,Nov,Saturday,14,39,36,14:30-15:00,0.167,2017-11-04 14:39:36,2017-11-04 14:39:46,1,low


# Quantile classes based on session counts. [Idea: classify sessions not only per day but also per 30-minute period]

In [7]:
# Count the interactions each user had in every half-hour slot of every day
# (count() tallies the non-null duration_min values per slot).
slot_keys = ['user_id', 'date', 'hour_period']
df_session = (
    df_duration
    .groupby(slot_keys)['duration_min']
    .count()
    .reset_index(name='30min sessions')
)
df_session.head(5)

Unnamed: 0,user_id,date,hour_period,30min sessions
0,389,2017-06-03,23:00-23:30,1
1,389,2017-06-04,12:00-12:30,1
2,389,2017-06-04,13:30-14:00,1
3,389,2017-06-04,14:00-14:30,1
4,389,2017-06-04,15:30-16:00,1


In [8]:
# Same percentile levels as for durations, now taken over the per-slot session counts.
# Note: this rebinds `quantiles`, replacing the duration cut-offs computed earlier.
slot_session_counts = df_session['30min sessions']
quantiles = slot_session_counts.quantile([.33, .66, .98, 1])
quantiles

0.33    1.0
0.66    1.0
0.98    2.0
1.00    4.0
Name: 30min sessions, dtype: float64

In [9]:
# Labeling of groups: classify every half-hour slot by how many sessions it contains.
# Each row starts as 'extreme'; progressively lower cut-offs then overwrite the label,
# so the order high -> mid -> low matters. When two cut-offs coincide (in the recorded
# run both the .33 and .66 quantiles are 1.0) the later label wins, i.e. such rows end
# up 'low' and no row keeps 'mid'.
#
# .loc[mask, column] assigns in a single indexing step. The earlier chained form
# df[column][mask] = value writes to an intermediate object and does not update the
# frame when pandas Copy-on-Write is enabled.
slot_sessions = df_session['30min sessions']
df_session['session_group'] = 'extreme'
for level, label in [(.98, 'high'), (.66, 'mid'), (.33, 'low')]:
    df_session.loc[slot_sessions <= quantiles[level], 'session_group'] = label

## Stacked bar chart for daily sessions

In [10]:
# Inspect one sample day of one user to sanity-check the session labels.
is_user_389 = df_session['user_id'] == 389
is_sample_day = df_session['date'] == '2017-07-02'
df_session[is_user_389 & is_sample_day]

Unnamed: 0,user_id,date,hour_period,30min sessions,session_group
23,389,2017-07-02,04:00-04:30,2,high
24,389,2017-07-02,10:30-11:00,1,low
25,389,2017-07-02,12:30-13:00,1,low
26,389,2017-07-02,13:00-13:30,1,low
27,389,2017-07-02,14:00-14:30,2,high
28,389,2017-07-02,15:30-16:00,1,low
29,389,2017-07-02,17:30-18:00,1,low


In [11]:
import plotly.express as px

# The user and day are named once and reused for both the filter and the chart title,
# so the two cannot drift apart.
user = 389
day = '2017-07-02'

# Half-hour session counts for one user on one day, coloured by session class.
bar_chart_sessions_389 = df_session[(df_session['user_id'] == user) & (df_session['date'] == day)]

fig = px.bar(bar_chart_sessions_389, x="hour_period", y="30min sessions", color="session_group", title=f"User {user} - {day}")
fig.show()

In [12]:
# Per user and day: how many half-hour slots fall into each session class.
(
    df_session
    .groupby(['user_id', 'date', 'session_group'])['30min sessions']
    .count()
    .reset_index(name='Daily Sessions')
    .head(10)
)

Unnamed: 0,user_id,date,session_group,Daily Sessions
0,389,2017-06-03,low,1
1,389,2017-06-04,low,5
2,389,2017-06-05,high,1
3,389,2017-06-05,low,2
4,389,2017-06-06,low,2
5,389,2017-06-07,high,1
6,389,2017-06-07,low,1
7,389,2017-06-08,low,2
8,389,2017-06-29,high,1
9,389,2017-06-30,low,1


In [13]:
# The same per-day breakdown, restricted to user 389.
sessions_user_389 = df_session[df_session['user_id'] == 389]
(
    sessions_user_389
    .groupby(['date', 'session_group'])['30min sessions']
    .count()
    .reset_index(name='Daily Sessions')
    .head(10)
)

Unnamed: 0,date,session_group,Daily Sessions
0,2017-06-03,low,1
1,2017-06-04,low,5
2,2017-06-05,high,1
3,2017-06-05,low,2
4,2017-06-06,low,2
5,2017-06-07,high,1
6,2017-06-07,low,1
7,2017-06-08,low,2
8,2017-06-29,high,1
9,2017-06-30,low,1


In [14]:
import plotly.express as px

user = 389

# Per day, how many of this user's half-hour slots fall into each session class.
bar_chart_daily_sessions_389 = (
    df_session[df_session['user_id'] == user]
    .groupby(['date', 'session_group'])['30min sessions']
    .count()
    .reset_index(name='Daily Sessions')
)

# Rows 10-29 keep the chart readable. The window is positional over (date, session_group)
# rows, so a day at either edge of the window can be included only partially.
fig = px.bar(bar_chart_daily_sessions_389[10:30], x="date", y="Daily Sessions", color="session_group", title=f"User {user} - Daily Charts")
fig.show()

In [15]:
import plotly.express as px

user = 389

# Builds on bar_chart_daily_sessions_389 as it is currently bound in the kernel
# (one row per date and class, with a 'Daily Sessions' count).
# Select the count column before summing, so the string label column is not aggregated too.
bar_chart_daily_sessions_389_total_days = (
    bar_chart_daily_sessions_389
    .groupby('date')['Daily Sessions']
    .sum()
    .reset_index(name='Total Sessions')
)

# Attach each day's total to its rows and express every class as a share of that day.
bar_chart_daily_sessions_389_100pec = pd.merge(bar_chart_daily_sessions_389, bar_chart_daily_sessions_389_total_days, how='left', on='date')
bar_chart_daily_sessions_389_100pec['Percentage'] = (bar_chart_daily_sessions_389_100pec['Daily Sessions'] / bar_chart_daily_sessions_389_100pec['Total Sessions']) * 100

# Positional row window; a day at either edge may be shown with only part of its classes,
# in which case its visible bars do not reach 100%.
fig = px.bar(bar_chart_daily_sessions_389_100pec[10:30], x="date", y="Percentage", color="session_group", title=f"User {user} - Daily Charts based on Session Classifications")
fig.show()

In [16]:
# TODO: repeat the charts above aggregated by day of the week.

## Stacked bar chart for daily durations

In [55]:
# df_duration.groupby(['user_id', 'date', 'hour_period', 'duration_group'])['session'].count().reset_index(name='Times_In_Duration_Group').head(15)
# # df_duration_agg.head(5)

In [56]:
# df_duration.groupby(['user_id', 'date', 'hour_period', 'duration_group']).agg({'session' : 'count', 'duration_min': 'sum'}).reset_index().head(15) #.reset_index(name=['Times_In_Duration_Group', '30min sessions']).head(15)


In [57]:
# Total minutes of interaction per user, day and half-hour slot.
# NOTE(review): the result column is called '30min sessions' although it holds summed
# duration_min values, not a session count; confirm whether a rename is wanted.
slot_keys = ['user_id', 'date', 'hour_period']
df_duration_agg = (
    df_duration
    .groupby(slot_keys)['duration_min']
    .sum()
    .reset_index(name='30min sessions')
)
df_duration_agg.head(5)

Unnamed: 0,user_id,date,hour_period,30min sessions
0,389,2017-06-03,23:00-23:30,0.333
1,389,2017-06-04,12:00-12:30,0.167
2,389,2017-06-04,13:30-14:00,0.167
3,389,2017-06-04,14:00-14:30,0.667
4,389,2017-06-04,15:30-16:00,0.55


In [58]:
# For every half-hour slot, count how many interactions fall into each duration class
# (count() tallies the non-null values of the 'session' column).
slot_and_group_keys = ['user_id', 'date', 'hour_period', 'duration_group']
df_duration_group_agg = (
    df_duration
    .groupby(slot_and_group_keys)['session']
    .count()
    .reset_index(name='Times_In_Duration_Group')
)
df_duration_group_agg.head(5)

Unnamed: 0,user_id,date,hour_period,duration_group,Times_In_Duration_Group
0,389,2017-06-03,23:00-23:30,low,1
1,389,2017-06-04,12:00-12:30,low,1
2,389,2017-06-04,13:30-14:00,low,1
3,389,2017-06-04,14:00-14:30,mid,1
4,389,2017-06-04,15:30-16:00,mid,1


In [60]:
# Combine the per-slot totals with the per-slot class counts. A slot that contains
# several duration classes appears once per class, each row repeating the slot total.
merge_keys = ['user_id', 'date', 'hour_period']
united = df_duration_agg.merge(df_duration_group_agg, how='left', on=merge_keys)
united.head(5)

Unnamed: 0,user_id,date,hour_period,30min sessions,duration_group,Times_In_Duration_Group
0,389,2017-06-03,23:00-23:30,0.333,low,1
1,389,2017-06-04,12:00-12:30,0.167,low,1
2,389,2017-06-04,13:30-14:00,0.167,low,1
3,389,2017-06-04,14:00-14:30,0.667,mid,1
4,389,2017-06-04,15:30-16:00,0.55,mid,1


In [61]:
# df_duration.groupby(['user_id', 'date', 'hour_period', 'duration_group']).sum().reset_index(name='30min sessions')

In [62]:
# united[(united['user_id'] == 389) & (united['date'] == '2017-07-02')]

In [63]:
# df_duration_agg[(df_duration_agg['user_id'] == 389) & (df_duration_agg['date'] == '2017-07-02')]

In [64]:

import plotly.express as px

# The user and day are named once and reused for both the filter and the chart title,
# so the two cannot drift apart.
user = 389
day = '2017-07-02'

# Per half-hour slot of one day: how many interactions fall into each duration class.
bar_chart_durations_389 = united[(united['user_id'] == user) & (united['date'] == day)]

fig = px.bar(bar_chart_durations_389, x="hour_period", y="Times_In_Duration_Group", color="duration_group", title=f"User {user} - {day}")
fig.show()

In [66]:
# Re-display the merged frame for reference before the per-day aggregation below.
united.head(5)

Unnamed: 0,user_id,date,hour_period,30min sessions,duration_group,Times_In_Duration_Group
0,389,2017-06-03,23:00-23:30,0.333,low,1
1,389,2017-06-04,12:00-12:30,0.167,low,1
2,389,2017-06-04,13:30-14:00,0.167,low,1
3,389,2017-06-04,14:00-14:30,0.667,mid,1
4,389,2017-06-04,15:30-16:00,0.55,mid,1


In [67]:
# Per day for the selected user: number of rows of `united` in each duration class,
# i.e. how many half-hour slots contain at least one interaction of that class.
united_for_user = united[united['user_id'] == user]
(
    united_for_user
    .groupby(['date', 'duration_group'])['Times_In_Duration_Group']
    .count()
    .reset_index(name='Daily Sessions')
)

Unnamed: 0,date,duration_group,Daily Sessions
0,2017-06-03,low,1
1,2017-06-04,high,1
2,2017-06-04,low,2
3,2017-06-04,mid,2
4,2017-06-05,low,2
...,...,...,...
1098,2018-12-20,high,1
1099,2018-12-20,low,3
1100,2018-12-21,high,2
1101,2018-12-21,low,3


In [73]:
import plotly.express as px

user = 389

# Note: this rebinds bar_chart_daily_sessions_389, which earlier held the breakdown by
# session class; from here on it holds the breakdown by duration class.
# NOTE(review): .count() tallies rows of `united` (half-hour slots in which a class
# occurs), not interactions. Summing Times_In_Duration_Group would give the number of
# interactions per class; confirm which one 'Daily Sessions' is meant to show.
bar_chart_daily_sessions_389 = (
    united[united['user_id'] == user]
    .groupby(['date', 'duration_group'])['Times_In_Duration_Group']
    .count()
    .reset_index(name='Daily Sessions')
)

# Rows 10-32 keep the chart readable. The window is positional over (date, duration_group)
# rows, so a day at either edge of the window can be included only partially.
fig = px.bar(bar_chart_daily_sessions_389[10:33], x="date", y="Daily Sessions", color="duration_group", title=f"User {user} - Daily Charts")
fig.show()

In [74]:
import plotly.express as px

user = 389

# Builds on bar_chart_daily_sessions_389 as it is currently bound in the kernel
# (one row per date and class, with a 'Daily Sessions' count).
# Select the count column before summing, so the string label column is not aggregated too.
bar_chart_daily_sessions_389_total_days = (
    bar_chart_daily_sessions_389
    .groupby('date')['Daily Sessions']
    .sum()
    .reset_index(name='Total Sessions')
)

# Attach each day's total to its rows and express every class as a share of that day.
bar_chart_daily_sessions_389_100pec = pd.merge(bar_chart_daily_sessions_389, bar_chart_daily_sessions_389_total_days, how='left', on='date')
bar_chart_daily_sessions_389_100pec['Percentage'] = (bar_chart_daily_sessions_389_100pec['Daily Sessions'] / bar_chart_daily_sessions_389_100pec['Total Sessions']) * 100

# Positional row window; a day at either edge may be shown with only part of its classes,
# in which case its visible bars do not reach 100%.
fig = px.bar(bar_chart_daily_sessions_389_100pec[10:33], x="date", y="Percentage", color="duration_group", title=f"User {user} - Daily Charts based on Session Classifications")
fig.show()

In [75]:
# Spot-check a single day: the class percentages for one date should add up to 100.
daily_pct = bar_chart_daily_sessions_389_100pec
daily_pct[daily_pct['date'] == '2017-07-06']

Unnamed: 0,date,duration_group,Daily Sessions,Total Sessions,Percentage
28,2017-07-06,high,2,5,40.0
29,2017-07-06,low,2,5,40.0
30,2017-07-06,mid,1,5,20.0
