In [None]:
# Find total time spent by each user per day on an app where we have events login, logout, like
# consecutive login or logout events are to be treated as single with earliest time stamp

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_json("event.json",lines=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   event    11 non-null     object
 1   time     11 non-null     object
 2   user_id  11 non-null     int64 
dtypes: int64(1), object(2)
memory usage: 392.0+ bytes


In [5]:
df = pd.read_json("event.json",lines=True,convert_dates=['time'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   event    11 non-null     object        
 1   time     11 non-null     datetime64[ns]
 2   user_id  11 non-null     int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 392.0+ bytes


In [6]:
# Extract "date" column from timestamp
import datetime as dt
#df['time'] = df['time'].apply(lambda x : pd.to_datetime(str(x)))
df['date'] = df['time'].dt.date
df

Unnamed: 0,event,time,user_id,date
0,login,2019-11-20 00:14:46,978699,2019-11-20
1,logout,2019-11-20 00:14:46,992210,2019-11-20
2,login,2019-11-20 00:14:46,823323,2019-11-20
3,like,2019-11-20 00:14:47,978699,2019-11-20
4,logout,2019-11-20 00:14:48,978699,2019-11-20
5,logout,2019-11-20 00:14:47,823323,2019-11-20
6,login,2019-11-20 00:14:50,978699,2019-11-20
7,logout,2019-11-20 00:14:57,978699,2019-11-20
8,logout,2019-11-20 00:14:58,978699,2019-11-20
9,login,2019-11-21 00:14:50,978699,2019-11-21


In [7]:
# Select only login and logout events
df3 = df[(df['event']=="login") | (df['event']=="logout")]
df3

Unnamed: 0,event,time,user_id,date
0,login,2019-11-20 00:14:46,978699,2019-11-20
1,logout,2019-11-20 00:14:46,992210,2019-11-20
2,login,2019-11-20 00:14:46,823323,2019-11-20
4,logout,2019-11-20 00:14:48,978699,2019-11-20
5,logout,2019-11-20 00:14:47,823323,2019-11-20
6,login,2019-11-20 00:14:50,978699,2019-11-20
7,logout,2019-11-20 00:14:57,978699,2019-11-20
8,logout,2019-11-20 00:14:58,978699,2019-11-20
9,login,2019-11-21 00:14:50,978699,2019-11-21
10,logout,2019-11-21 00:14:57,978699,2019-11-21


In [8]:
# Sort by user_id, date
df4 = df3.sort_values(by=['user_id','date']).reset_index(drop=True)
df4

Unnamed: 0,event,time,user_id,date
0,login,2019-11-20 00:14:46,823323,2019-11-20
1,logout,2019-11-20 00:14:47,823323,2019-11-20
2,login,2019-11-20 00:14:46,978699,2019-11-20
3,logout,2019-11-20 00:14:48,978699,2019-11-20
4,login,2019-11-20 00:14:50,978699,2019-11-20
5,logout,2019-11-20 00:14:57,978699,2019-11-20
6,logout,2019-11-20 00:14:58,978699,2019-11-20
7,login,2019-11-21 00:14:50,978699,2019-11-21
8,logout,2019-11-21 00:14:57,978699,2019-11-21
9,logout,2019-11-20 00:14:46,992210,2019-11-20


In [9]:
# There can be multiple entries for login or logout in a session; we will consider first timestamp
# Find if previous event was same as current event
df4['prev_event']=df4['event'].shift()
df4

Unnamed: 0,event,time,user_id,date,prev_event
0,login,2019-11-20 00:14:46,823323,2019-11-20,
1,logout,2019-11-20 00:14:47,823323,2019-11-20,login
2,login,2019-11-20 00:14:46,978699,2019-11-20,logout
3,logout,2019-11-20 00:14:48,978699,2019-11-20,login
4,login,2019-11-20 00:14:50,978699,2019-11-20,logout
5,logout,2019-11-20 00:14:57,978699,2019-11-20,login
6,logout,2019-11-20 00:14:58,978699,2019-11-20,logout
7,login,2019-11-21 00:14:50,978699,2019-11-21,logout
8,logout,2019-11-21 00:14:57,978699,2019-11-21,login
9,logout,2019-11-20 00:14:46,992210,2019-11-20,logout


In [10]:
# Filter rows where current event is not same as previous event
df5 = df4[['user_id','date','time']][df4.event != df4.prev_event]
df5

Unnamed: 0,user_id,date,time
0,823323,2019-11-20,2019-11-20 00:14:46
1,823323,2019-11-20,2019-11-20 00:14:47
2,978699,2019-11-20,2019-11-20 00:14:46
3,978699,2019-11-20,2019-11-20 00:14:48
4,978699,2019-11-20,2019-11-20 00:14:50
5,978699,2019-11-20,2019-11-20 00:14:57
7,978699,2019-11-21,2019-11-21 00:14:50
8,978699,2019-11-21,2019-11-21 00:14:57


In [11]:
# Find time difference in seconds between previous event and current event
delta = df5.groupby(['user_id','date'])['time'].diff()/pd.Timedelta(seconds=1)
delta

0    NaN
1    1.0
2    NaN
3    2.0
4    2.0
5    7.0
7    NaN
8    7.0
Name: time, dtype: float64

In [12]:
# Add time difference with previous event column
df5["time_spent"] = delta
df5

Unnamed: 0,user_id,date,time,time_spent
0,823323,2019-11-20,2019-11-20 00:14:46,
1,823323,2019-11-20,2019-11-20 00:14:47,1.0
2,978699,2019-11-20,2019-11-20 00:14:46,
3,978699,2019-11-20,2019-11-20 00:14:48,2.0
4,978699,2019-11-20,2019-11-20 00:14:50,2.0
5,978699,2019-11-20,2019-11-20 00:14:57,7.0
7,978699,2019-11-21,2019-11-21 00:14:50,
8,978699,2019-11-21,2019-11-21 00:14:57,7.0


In [13]:
# Find total time spent by each user per day
df6 = df5.groupby(['user_id','date']).sum()  
df6

Unnamed: 0_level_0,Unnamed: 1_level_0,time_spent
user_id,date,Unnamed: 2_level_1
823323,2019-11-20,1.0
978699,2019-11-20,11.0
978699,2019-11-21,7.0
