In [1]:
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('data/sample_data.csv').drop(columns=['Unnamed: 0'])
df_duration = df[df['duration_min'] < 30] #filter extreme cases. keep only logical smartphone interactions

# Quantile classes based on duration 

In [3]:
quantiles = df_duration['duration_min'].quantile([.33, .66, .98, 1])

In [4]:
quantiles

0.33     0.333
0.66     1.167
0.98    12.683
1.00    29.533
Name: duration_min, dtype: float64

In [5]:
#labeling of groups
df_duration['duration_group'] = 'extreme'
df_duration['duration_group'][df_duration['duration_min'] <= quantiles[.98]] = 'high'
df_duration['duration_group'][df_duration['duration_min'] <= quantiles[.66]] = 'mid'
df_duration['duration_group'][df_duration['duration_min'] <= quantiles[.33]] = 'low'
df_duration.head(10)

Unnamed: 0,user_id,date,year,month,day,hour,minute,second,hour_period,duration_min,start_time,end_time,session,duration_group
0,8953,2016-12-24,2016,Dec,Saturday,6,11,31,06:00-06:30,1.5,2016-12-24 06:11:31,2016-12-24 06:13:01,1,high
1,3633,2017-02-08,2017,Feb,Wednesday,16,22,53,16:00-16:30,0.333,2017-02-08 16:22:53,2017-02-08 16:23:13,1,low
2,3633,2016-12-09,2016,Dec,Friday,17,36,52,17:30-18:00,0.167,2016-12-09 17:36:52,2016-12-09 17:37:02,1,low
3,3633,2018-06-03,2018,Jun,Sunday,21,20,16,21:00-21:30,0.167,2018-06-03 21:20:16,2018-06-03 21:20:26,1,low
4,3633,2016-09-16,2016,Sep,Friday,9,16,39,09:00-09:30,0.167,2016-09-16 09:16:39,2016-09-16 09:16:49,1,low
5,3633,2018-08-16,2018,Aug,Thursday,22,25,49,22:00-22:30,1.167,2018-08-16 22:25:49,2018-08-16 22:26:59,1,mid
6,3633,2016-08-06,2016,Aug,Saturday,13,0,21,13:00-13:30,0.667,2016-08-06 13:00:21,2016-08-06 13:01:01,1,mid
7,3633,2017-04-13,2017,Apr,Thursday,14,6,17,14:00-14:30,1.667,2017-04-13 14:06:17,2017-04-13 14:07:57,1,high
8,3633,2018-04-01,2018,Apr,Sunday,14,35,26,14:30-15:00,1.167,2018-04-01 14:35:26,2018-04-01 14:36:36,1,mid
9,3633,2017-11-04,2017,Nov,Saturday,14,39,36,14:30-15:00,0.167,2017-11-04 14:39:36,2017-11-04 14:39:46,1,low


# Quantile classes based on Daily Sessions. [Thinking we do classify the sessions not daily but also every 30 mins]

In [6]:
df_session = df_duration.groupby(['user_id', 'date', 'hour_period'])['duration_min'].count().reset_index(name='30min sessions')
df_session.head(5)

Unnamed: 0,user_id,date,hour_period,30min sessions
0,389,2017-06-03,23:00-23:30,1
1,389,2017-06-04,12:00-12:30,1
2,389,2017-06-04,13:30-14:00,1
3,389,2017-06-04,14:00-14:30,1
4,389,2017-06-04,15:30-16:00,1


In [7]:
quantiles = df_session['30min sessions'].quantile([.33, .66, .98, 1])
quantiles

0.33    1.0
0.66    1.0
0.98    2.0
1.00    4.0
Name: 30min sessions, dtype: float64

In [8]:
#labeling of groups
df_session['session_group'] = 'extreme'
df_session['session_group'][df_session['30min sessions'] <= quantiles[.98]] = 'high'
df_session['session_group'][df_session['30min sessions'] <= quantiles[.66]] = 'mid'
df_session['session_group'][df_session['30min sessions'] <= quantiles[.33]] = 'low'

## Stacked bar chart for daily sessions

In [9]:
df_session[(df_session['user_id'] == 389) & (df_session['date'] == '2017-07-02')]

Unnamed: 0,user_id,date,hour_period,30min sessions,session_group
23,389,2017-07-02,04:00-04:30,2,high
24,389,2017-07-02,10:30-11:00,1,low
25,389,2017-07-02,12:30-13:00,1,low
26,389,2017-07-02,13:00-13:30,1,low
27,389,2017-07-02,14:00-14:30,2,high
28,389,2017-07-02,15:30-16:00,1,low
29,389,2017-07-02,17:30-18:00,1,low


In [10]:
bar_chart_sessions_389 = df_session[(df_session['user_id'] == 389) & (df_session['date'] == '2017-07-02')]

import plotly.express as px

# long_df = px.data.medals_long()

fig = px.bar(bar_chart_sessions_389, x="hour_period", y="30min sessions", color="session_group", title=f"User 389 - 2017-07-02")
fig.show()

In [11]:
df_session.groupby(['user_id', 'date', 'session_group'])['30min sessions'].count().reset_index(name='Daily Sessions').head(10)

Unnamed: 0,user_id,date,session_group,Daily Sessions
0,389,2017-06-03,low,1
1,389,2017-06-04,low,5
2,389,2017-06-05,high,1
3,389,2017-06-05,low,2
4,389,2017-06-06,low,2
5,389,2017-06-07,high,1
6,389,2017-06-07,low,1
7,389,2017-06-08,low,2
8,389,2017-06-29,high,1
9,389,2017-06-30,low,1


In [12]:
df_session[df_session['user_id'] == 389].groupby(['date', 'session_group'])['30min sessions'].count().reset_index(name='Daily Sessions').head(10)

Unnamed: 0,date,session_group,Daily Sessions
0,2017-06-03,low,1
1,2017-06-04,low,5
2,2017-06-05,high,1
3,2017-06-05,low,2
4,2017-06-06,low,2
5,2017-06-07,high,1
6,2017-06-07,low,1
7,2017-06-08,low,2
8,2017-06-29,high,1
9,2017-06-30,low,1


In [13]:
user = 389

bar_chart_daily_sessions_389 = df_session[df_session['user_id'] == user].groupby(['date', 'session_group'])['30min sessions'].count().reset_index(name='Daily Sessions')

import plotly.express as px

long_df = px.data.medals_long()

fig = px.bar(bar_chart_daily_sessions_389[10:30], x="date", y="Daily Sessions", color="session_group", title=f"User {user} - Daily Charts")
# fig.update_layout(barmode='relative')
fig.show()

In [14]:
user = 389

# bar_chart_daily_sessions_389 = df_session[df_session['user_id'] == user].groupby(['date', 'session_group'])['30min sessions'].count().reset_index(name='Daily Sessions')
bar_chart_daily_sessions_389_total_days = bar_chart_daily_sessions_389.groupby('date').sum()['Daily Sessions'].reset_index(name='Total Sessions') #.reset_index(name='Total Sessions')
bar_chart_daily_sessions_389_100pec = pd.merge(bar_chart_daily_sessions_389, bar_chart_daily_sessions_389_total_days, how='left', left_on='date', right_on='date')
bar_chart_daily_sessions_389_100pec['Percentage'] = (bar_chart_daily_sessions_389_100pec['Daily Sessions'] / bar_chart_daily_sessions_389_100pec['Total Sessions']) * 100


import plotly.express as px

# long_df = px.data.medals_long()

fig = px.bar(bar_chart_daily_sessions_389_100pec[10:30], x="date", y="Percentage", color="session_group", title=f"User {user} - Daily Charts based on Session Classifications")
# fig.update_layout(barmode='relative')
fig.show()

## Stacked bar chart for daily durations

In [15]:
df_duration.head(10)

Unnamed: 0,user_id,date,year,month,day,hour,minute,second,hour_period,duration_min,start_time,end_time,session,duration_group
0,8953,2016-12-24,2016,Dec,Saturday,6,11,31,06:00-06:30,1.5,2016-12-24 06:11:31,2016-12-24 06:13:01,1,high
1,3633,2017-02-08,2017,Feb,Wednesday,16,22,53,16:00-16:30,0.333,2017-02-08 16:22:53,2017-02-08 16:23:13,1,low
2,3633,2016-12-09,2016,Dec,Friday,17,36,52,17:30-18:00,0.167,2016-12-09 17:36:52,2016-12-09 17:37:02,1,low
3,3633,2018-06-03,2018,Jun,Sunday,21,20,16,21:00-21:30,0.167,2018-06-03 21:20:16,2018-06-03 21:20:26,1,low
4,3633,2016-09-16,2016,Sep,Friday,9,16,39,09:00-09:30,0.167,2016-09-16 09:16:39,2016-09-16 09:16:49,1,low
5,3633,2018-08-16,2018,Aug,Thursday,22,25,49,22:00-22:30,1.167,2018-08-16 22:25:49,2018-08-16 22:26:59,1,mid
6,3633,2016-08-06,2016,Aug,Saturday,13,0,21,13:00-13:30,0.667,2016-08-06 13:00:21,2016-08-06 13:01:01,1,mid
7,3633,2017-04-13,2017,Apr,Thursday,14,6,17,14:00-14:30,1.667,2017-04-13 14:06:17,2017-04-13 14:07:57,1,high
8,3633,2018-04-01,2018,Apr,Sunday,14,35,26,14:30-15:00,1.167,2018-04-01 14:35:26,2018-04-01 14:36:36,1,mid
9,3633,2017-11-04,2017,Nov,Saturday,14,39,36,14:30-15:00,0.167,2017-11-04 14:39:36,2017-11-04 14:39:46,1,low


In [16]:
df_duration.groupby(['user_id', 'date', 'hour_period', 'duration_group'])['session'].count().reset_index(name='Times_In_Duration_Group').head(15)
# df_duration_agg.head(5)

Unnamed: 0,user_id,date,hour_period,duration_group,Times_In_Duration_Group
0,389,2017-06-03,23:00-23:30,low,1
1,389,2017-06-04,12:00-12:30,low,1
2,389,2017-06-04,13:30-14:00,low,1
3,389,2017-06-04,14:00-14:30,mid,1
4,389,2017-06-04,15:30-16:00,mid,1
5,389,2017-06-04,20:30-21:00,high,1
6,389,2017-06-05,12:30-13:00,mid,1
7,389,2017-06-05,13:00-13:30,low,2
8,389,2017-06-05,13:30-14:00,low,1
9,389,2017-06-06,15:30-16:00,low,1


In [17]:
df_duration_agg = df_duration.groupby(['user_id', 'date', 'hour_period'])['duration_min'].sum().reset_index(name='30min sessions')
df_duration_agg.head(5)

Unnamed: 0,user_id,date,hour_period,30min sessions
0,389,2017-06-03,23:00-23:30,0.333
1,389,2017-06-04,12:00-12:30,0.167
2,389,2017-06-04,13:30-14:00,0.167
3,389,2017-06-04,14:00-14:30,0.667
4,389,2017-06-04,15:30-16:00,0.55


In [18]:

bar_chart_sessions_389 = df_duration_agg[(df_duration_agg['user_id'] == 389) & (df_duration_agg['date'] == '2017-06-04')]

import plotly.express as px

# long_df = px.data.medals_long()

fig = px.bar(bar_chart_sessions_389, x="hour_period", y="30min sessions", color="session_group", title=f"User 389 - 2017-07-02")
fig.show()

ValueError: Value of 'color' is not the name of a column in 'data_frame'. Expected one of ['user_id', 'date', 'hour_period', '30min sessions'] but received: session_group

# Creating a highlight visualization

In [None]:
extreme = df_duration[df_duration.duration_group == 'extreme']

In [None]:
df_duration[df_duration.duration_group == 'extreme']

Unnamed: 0,user_id,date,year,month,day,hour,minute,second,hour_period,duration_min,start_time,end_time,session,group
80,3633,2017-09-18,2017,Sep,Monday,11,33,56,11:30-12:00,23.017,2017-09-18 11:33:56,2017-09-18 11:56:57,1,extreme
99,8953,2017-01-15,2017,Jan,Sunday,13,30,6,13:30-14:00,25.067,2017-01-15 13:30:06,2017-01-15 13:55:10,1,extreme
387,3633,2016-12-12,2016,Dec,Monday,20,29,12,20:00-20:30,14.017,2016-12-12 20:29:12,2016-12-12 20:43:13,1,extreme
456,3633,2017-08-15,2017,Aug,Tuesday,21,6,15,21:00-21:30,17.850,2017-08-15 21:06:15,2017-08-15 21:24:06,1,extreme
493,3633,2016-09-13,2016,Sep,Tuesday,7,34,14,07:30-08:00,17.183,2016-09-13 07:34:14,2016-09-13 07:51:25,1,extreme
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9407,906,2017-09-26,2017,Sep,Tuesday,4,43,38,04:30-05:00,25.350,2017-09-26 04:43:38,2017-09-26 05:08:59,1,extreme
9436,906,2017-11-10,2017,Nov,Friday,10,4,8,10:00-10:30,21.533,2017-11-10 10:04:08,2017-11-10 10:25:40,1,extreme
9480,906,2017-10-18,2017,Oct,Wednesday,10,57,38,10:30-11:00,18.183,2017-10-18 10:57:38,2017-10-18 11:15:49,1,extreme
9722,8774,2018-07-06,2018,Jul,Friday,22,25,10,22:00-22:30,21.800,2018-07-06 22:25:10,2018-07-06 22:46:58,1,extreme


In [None]:
extreme = extreme.groupby(['hour_period'])['duration_min'].mean().reset_index(name='Avg duration')
extreme

Unnamed: 0,hour_period,Avg duration
0,01:30-02:00,26.867
1,02:00-02:30,15.017
2,02:30-03:00,25.517
3,03:00-03:30,15.061333
4,03:30-04:00,14.3915
5,04:00-04:30,15.955667
6,04:30-05:00,18.966714
7,05:00-05:30,16.297571
8,05:30-06:00,14.722333
9,06:00-06:30,17.052833


In [None]:
df_3633 = df[(df.user_id == 3633) & (df.year == 2017)].sort_values(['date', 'hour_period'])
df_3633

Unnamed: 0,user_id,date,year,month,day,hour,minute,second,hour_period,duration_min,start_time,end_time,session
303,3633,2017-01-03,2017,Jan,Tuesday,19,19,14,19:00-19:30,1.667,2017-01-03 19:19:14,2017-01-03 19:20:54,1
700,3633,2017-01-04,2017,Jan,Wednesday,9,1,52,09:00-09:30,1.067,2017-01-04 09:01:52,2017-01-04 09:02:56,1
482,3633,2017-01-04,2017,Jan,Wednesday,19,4,1,19:00-19:30,1.450,2017-01-04 19:04:01,2017-01-04 19:05:28,1
200,3633,2017-01-09,2017,Jan,Monday,20,57,16,20:30-21:00,0.150,2017-01-09 20:57:16,2017-01-09 20:57:25,1
205,3633,2017-01-10,2017,Jan,Tuesday,14,30,5,14:30-15:00,0.050,2017-01-10 14:30:05,2017-01-10 14:30:08,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
354,3633,2017-12-26,2017,Dec,Tuesday,13,8,7,13:00-13:30,11.817,2017-12-26 13:08:07,2017-12-26 13:19:56,1
113,3633,2017-12-29,2017,Dec,Friday,16,56,7,16:30-17:00,0.167,2017-12-29 16:56:07,2017-12-29 16:56:17,1
168,3633,2017-12-29,2017,Dec,Friday,17,43,37,17:30-18:00,0.167,2017-12-29 17:43:37,2017-12-29 17:43:47,1
562,3633,2017-12-30,2017,Dec,Saturday,19,25,56,19:00-19:30,3.667,2017-12-30 19:25:56,2017-12-30 19:29:36,1


In [None]:
import plotly.graph_objects as go

# point

fig = go.Figure(data=go.Scatter(x=df_3633['date'].astype(dtype=str)+df_3633['hour_period'].astype(dtype=str), 
                                y=df_3633['duration_min'],
                                marker_color='black', text="counts"))

fig.add_vrect()

fig.update_layout({"title": 'Tweets about Malioboro from Jan 2020 to Jan 2021 Day by Day',
                   "xaxis": {"title":"Time"},
                   "yaxis": {"title":"Total tweets"},
                   "showlegend": False})
fig.show()

TypeError: add_vrect() missing 2 required positional arguments: 'x0' and 'x1'

In [None]:
# imports
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import datetime

pd.set_option('display.max_rows', None)

df_3633 = df[df.user_id == 3633]

# plotly setup
fig = px.line(df, x='date', y='duration_min')
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='rgba(0,0,255,0.1)')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='rgba(0,0,255,0.1)')


# function to set background color for a
# specified variable and a specified level
def highLights(fig, variable, level, mode, fillcolor, layer):
    """
    Set a specified color as background for given
    levels of a specified variable using a shape.
    
    Keyword arguments:
    ==================
    fig -- plotly figure
    variable -- column name in a pandas dataframe
    level -- int or float
    mode -- set threshold above or below
    fillcolor -- any color type that plotly can handle
    layer -- position of shape in plotly fiugre, like "below"
    
    """
    
    if mode == 'above':
        m = df[variable].gt(level)
    
    if mode == 'below':
        m = df[variable].lt(level)
        
    df1 = df[m].groupby((m).cumsum())['date'].agg(['first','last'])

    for index, row in df1.iterrows():
        #print(row['first'], row['last'])
        fig.add_shape(type="rect",
                        xref="x",
                        yref="paper",
                        x0=row['first'],
                        y0=0,
                        x1=row['last'],
                        y1=1,
                        line=dict(color="rgba(0,0,0,0)",width=3,),
                        fillcolor=fillcolor,
                        layer=layer) 
    return(fig)

fig = highLights(fig = fig, variable = 'duration_min', level = 5, mode = 'above',
               fillcolor = 'rgba(200,0,200,0.2)', layer = 'below')

fig.update_layout(template = 'plotly_dark')

fig.show()