In [None]:
import pandas as pd

# Load the merged web-event data; keep the raw frame untouched and work on a copy.
web_merged = pd.read_csv("../Data/Processed/df_final_web_merged.csv")
df = web_merged.copy()

# Build one timestamp per event from the separate "date" and "time" columns.
# errors="coerce" turns anything unparseable (including missing values, which
# astype(str) renders as "nan") into NaT instead of raising.
df["date_time"] = pd.to_datetime(
    df["date"].astype(str) + " " + df["time"].astype(str),
    errors="coerce"
)

# Make the coercion visible. NaT rows are skipped by min/max below, so they can
# shorten a visit's measured duration, and a visit with no valid timestamp at
# all gets a NaT month and is then left out of the monthly groupby entirely
# (groupby excludes NA keys by default).
n_bad_timestamps = int(df["date_time"].isna().sum())
if n_bad_timestamps:
    print(
        f"WARNING: {n_bad_timestamps} of {len(df)} rows have an unparseable "
        "date/time and are ignored in the usage figures"
    )

# Active time per visit = last event timestamp minus first event timestamp.
visit_level = (
    df.groupby(["client_id", "visit_id"])["date_time"]
      .agg(visit_start="min", visit_end="max")
      .reset_index()
)

# A visit with a single event has start == end and therefore 0 seconds.
# fillna(0) covers visits whose start/end are NaT.
visit_level["visit_active_seconds"] = (
    visit_level["visit_end"] - visit_level["visit_start"]
).dt.total_seconds().fillna(0)

# Assign each visit to the calendar month it started in, stored as the
# first day of that month.
visit_level["month"] = visit_level["visit_start"].dt.to_period("M").dt.to_timestamp()

# Aggregate to one row per client_id x month.
df_client_usage = (
    visit_level.groupby(["client_id", "month"], as_index=False)
      .agg(
          visits=("visit_id", "nunique"),
          total_active_seconds=("visit_active_seconds", "sum")
      )
)

# Month label in a Tableau-friendly format, e.g. 01-Apr-2017.
# "month" is already datetime64 at this point, so it is formatted directly.
df_client_usage["month_label"] = df_client_usage["month"].dt.strftime("%d-%b-%Y")

# Total active time as mm:ss. Minutes are not rolled over into hours, so a
# total of an hour or more shows as e.g. "75:30"; zfill(2) only pads to a
# minimum of two digits.
secs = df_client_usage["total_active_seconds"].round().astype("int64")
df_client_usage["total_active_mmss"] = (
    (secs // 60).astype(str).str.zfill(2) + ":" + (secs % 60).astype(str).str.zfill(2)
)

# Save the client x month usage table as a new dataset.
df_client_usage.to_csv("../Data/Processed/client_usage.csv", index=False)

print(df_client_usage.head())


   client_id      month  visits  total_active_seconds  month_label  \
0        169 2017-04-01       1                 213.0  01-Apr-2017   
1        336 2017-06-01       1                 948.0  01-Jun-2017   
2        546 2017-06-01       1                 133.0  01-Jun-2017   
3        555 2017-04-01       1                 158.0  01-Apr-2017   
4        647 2017-04-01       1                 377.0  01-Apr-2017   

  total_active_mmss  
0             03:33  
1             15:48  
2             02:13  
3             02:38  
4             06:17  


In [None]:
# Spot-check how clients with more than one visit in total (summed across
# all months) appear in the client x month usage table.
visits_per_client = df_client_usage.groupby("client_id")["visits"].sum()
repeat_clients = visits_per_client[visits_per_client > 1].index

# Keep only the rows of those clients, ordered by client and month, and
# show the first 20.
is_repeat = df_client_usage["client_id"].isin(repeat_clients)
(
    df_client_usage.loc[is_repeat]
    .sort_values(["client_id", "month"])
    .head(20)
)


Unnamed: 0,client_id,month,visits,total_active_seconds,month_label,total_active_mmss
7,805,2017-06-01,3,218.0,01-Jun-2017,03:38
12,1104,2017-06-01,2,0.0,01-Jun-2017,00:00
13,1186,2017-04-01,2,22.0,01-Apr-2017,00:22
17,1336,2017-05-01,2,293.0,01-May-2017,04:53
23,1516,2017-04-01,2,1199.0,01-Apr-2017,19:59
26,1621,2017-06-01,2,509.0,01-Jun-2017,08:29
27,1643,2017-04-01,2,133.0,01-Apr-2017,02:13
28,1643,2017-06-01,1,688.0,01-Jun-2017,11:28
29,1677,2017-03-01,1,143.0,01-Mar-2017,02:23
30,1677,2017-05-01,1,732.0,01-May-2017,12:12
