In [None]:
import pandas as pd

# Load the merged web dataset
df = pd.read_csv("../Data/Processed/df_final_web_merged.csv")

# Work on a deep copy so the frame loaded above (`df`) is never modified
df_work = df.copy(deep=True)

# Per-row timestamp: "date" and "time" are cast to str, stripped, joined with
# a single space and parsed. errors="coerce" turns unparseable values into NaT.
df_work["visit_ts"] = pd.to_datetime(
    df_work["date"].astype(str).str.strip() + " " + df_work["time"].astype(str).str.strip(),
    errors="coerce"
)

# Calendar month of each row as a "YYYY-MM" string.
# strftime keeps rows whose timestamp is NaT as missing values; casting a
# Period column to str would instead yield the literal string "NaT", which
# nunique() below would count as one more "active month".
df_work["visit_month"] = df_work["visit_ts"].dt.strftime("%Y-%m")

# Usage per client (all clients in the web_merged dataframe)
df_client_usage = (
    df_work
    .groupby("client_id", as_index=False)
    .agg(
        num_visits=("visit_id", "nunique"),            # distinct visit_id values
        num_active_months=("visit_month", "nunique"),  # distinct months; missing months are ignored
        first_visit=("visit_ts", "min"),               # earliest parsed timestamp
        last_visit=("visit_ts", "max")                 # latest parsed timestamp
    )
)

# Persist the client-level table.
# NOTE(review): later cells in this notebook write a differently shaped table
# to this same path, so the file on disk reflects whichever cell ran last.
df_client_usage.to_csv("../Data/Processed/df_client_usage.csv",index=False)

In [14]:
import pandas as pd

# Read the merged web dataset, then derive a working copy that carries a parsed
# per-row timestamp ("date" + " " + "time"; unparseable values become NaT).
df = pd.read_csv("../Data/Processed/df_final_web_merged.csv")
df_work = df.copy(deep=True).assign(
    visit_ts=lambda frame: pd.to_datetime(
        frame["date"].astype(str).str.strip() + " " + frame["time"].astype(str).str.strip(),
        errors="coerce",
    )
)

# One row per (client_id, visit_id) pair with its earliest and latest timestamp;
# each visit_id is treated as one session.
visit_durations = df_work.groupby(["client_id", "visit_id"], as_index=False).agg(
    visit_start=pd.NamedAgg(column="visit_ts", aggfunc="min"),
    visit_end=pd.NamedAgg(column="visit_ts", aggfunc="max"),
)

# Session length in minutes: (latest - earliest timestamp) in seconds, divided by 60
visit_durations["visit_duration_minutes"] = (
    visit_durations["visit_end"]
    .sub(visit_durations["visit_start"])
    .dt.total_seconds()
    .div(60)
)

# Client-level roll-up: number of visits, summed session minutes, and the
# first/last visit rendered as MM-YY strings. Only the reporting columns are kept.
df_client_usage = (
    visit_durations
    .groupby("client_id", as_index=False)
    .agg(
        num_visits=pd.NamedAgg(column="visit_id", aggfunc="nunique"),
        total_active_time_minutes=pd.NamedAgg(column="visit_duration_minutes", aggfunc="sum"),
        first_visit=pd.NamedAgg(column="visit_start", aggfunc="min"),
        last_visit=pd.NamedAgg(column="visit_end", aggfunc="max"),
    )
    .assign(
        first_visit_month=lambda usage: pd.to_datetime(usage["first_visit"]).dt.strftime("%m-%y"),
        last_visit_month=lambda usage: pd.to_datetime(usage["last_visit"]).dt.strftime("%m-%y"),
    )
    .loc[
        :,
        [
            "client_id",
            "num_visits",
            "first_visit_month",
            "last_visit_month",
            "total_active_time_minutes",
        ],
    ]
)

# Write the client-level table to disk
df_client_usage.to_csv("../Data/Processed/df_client_usage.csv", index=False)

# Quick look at the result: first rows and total row count
print(df_client_usage.head())
print("Rows:", len(df_client_usage))


   client_id  num_visits first_visit_month last_visit_month  \
0        169           1             04-17            04-17   
1        336           1             06-17            06-17   
2        546           1             06-17            06-17   
3        555           1             04-17            04-17   
4        647           1             04-17            04-17   

   total_active_time_minutes  
0                   3.550000  
1                  15.800000  
2                   2.216667  
3                   2.633333  
4                   6.283333  
Rows: 120157


In [20]:
import pandas as pd

# Read the merged web dataset and keep an independent working copy
df = pd.read_csv("../Data/Processed/df_final_web_merged.csv")
df_work = df.copy(deep=True)

# Parsed timestamp per row: stripped "date" text, a space, stripped "time" text.
# Anything that cannot be parsed is coerced to NaT.
df_work["visit_ts"] = (
    df_work["date"].astype(str).str.strip()
    .add(" ")
    .add(df_work["time"].astype(str).str.strip())
    .pipe(pd.to_datetime, errors="coerce")
)

# Session table: one row per (client_id, visit_id) with first/last timestamp
# and the elapsed minutes between them (each visit_id counts as one session).
visit_durations = (
    df_work
    .groupby(["client_id", "visit_id"], as_index=False)
    .agg(**{
        "visit_start": ("visit_ts", "min"),
        "visit_end": ("visit_ts", "max"),
    })
)
visit_durations = visit_durations.assign(
    visit_duration_minutes=lambda visits: (
        (visits["visit_end"] - visits["visit_start"]).dt.total_seconds() / 60
    )
)

# Every month ("YYYY-MM") in which a client has at least one parsed timestamp,
# sorted and joined into a single comma-separated string per client.
# Rows without a client_id or without a parsed timestamp are left out.
client_months = (
    df_work
    .dropna(subset=["client_id", "visit_ts"])
    .assign(visit_month=lambda rows: rows["visit_ts"].dt.to_period("M").astype(str))
    .groupby("client_id")["visit_month"]
    .apply(lambda months: ", ".join(sorted(set(months))))
    .reset_index(name="months_visited")
)

# Client-level roll-up: visit count, summed session minutes, first/last visit
# as MM-YY strings, plus the months_visited string (left join on client_id).
df_client_usage = (
    visit_durations
    .groupby("client_id", as_index=False)
    .agg(**{
        "num_visits": ("visit_id", "nunique"),
        "total_active_time_minutes": ("visit_duration_minutes", "sum"),
        "first_visit": ("visit_start", "min"),
        "last_visit": ("visit_end", "max"),
    })
    .assign(
        first_visit_month=lambda usage: pd.to_datetime(usage["first_visit"]).dt.strftime("%m-%y"),
        last_visit_month=lambda usage: pd.to_datetime(usage["last_visit"]).dt.strftime("%m-%y"),
    )
    .merge(client_months, on="client_id", how="left")
    .loc[
        :,
        [
            "client_id",
            "num_visits",
            "first_visit_month",
            "last_visit_month",
            "months_visited",
            "total_active_time_minutes",
        ],
    ]
)

# Write the client-level table to disk
df_client_usage.to_csv("../Data/Processed/df_client_usage.csv", index=False)

# Sanity checks: preview, row count and the overall timestamp range of the data
print(df_client_usage.head())
print("Rows:", len(df_client_usage))
print("Min timestamp:", df_work["visit_ts"].min())
print("Max timestamp:", df_work["visit_ts"].max())


   client_id  num_visits first_visit_month last_visit_month months_visited  \
0        169           1             04-17            04-17        2017-04   
1        336           1             06-17            06-17        2017-06   
2        546           1             06-17            06-17        2017-06   
3        555           1             04-17            04-17        2017-04   
4        647           1             04-17            04-17        2017-04   

   total_active_time_minutes  
0                   3.550000  
1                  15.800000  
2                   2.216667  
3                   2.633333  
4                   6.283333  
Rows: 120157
Min timestamp: 2017-03-15 00:03:03
Max timestamp: 2017-06-20 23:59:57


In [22]:
# Keep the clients whose months_visited string contains a comma, i.e. more
# than one month is listed; missing values are treated as "no match".
multi_month_clients = df_client_usage.loc[
    lambda usage: usage["months_visited"].str.contains(",", na=False)
]

print("Number of clients visiting in more than one month:", len(multi_month_clients))
print(multi_month_clients.loc[:, ["client_id", "months_visited"]].head(30))


Number of clients visiting in more than one month: 11032
     client_id             months_visited
27        1643           2017-04, 2017-06
28        1677           2017-03, 2017-05
29        1680           2017-03, 2017-06
30        1702           2017-04, 2017-06
32        1755           2017-03, 2017-04
48        3647           2017-04, 2017-06
49        3741           2017-04, 2017-06
58        4603           2017-03, 2017-04
61        4717           2017-04, 2017-05
64        4955           2017-03, 2017-04
71        5459           2017-03, 2017-04
74        5845           2017-04, 2017-05
97        7561           2017-03, 2017-04
126       9767           2017-04, 2017-05
138      11070           2017-03, 2017-04
162      13831           2017-03, 2017-05
173      14638           2017-03, 2017-04
183      15543           2017-03, 2017-06
187      15708           2017-03, 2017-04
198      16291           2017-03, 2017-04
200      16549  2017-03, 2017-04, 2017-05
201      16679     

In [18]:
# Show the (rows, columns) shape of the client-level usage table.
# NOTE(review): the recorded output reports 5 columns and this cell's execution
# count (18) is lower than the cell that adds "months_visited" (20), so the
# output predates that column; re-run the notebook top to bottom to refresh it.
df_client_usage.shape

(120157, 5)

In [19]:
# List the column labels of the client-level usage table.
# NOTE(review): the recorded output has no "months_visited" column and this
# cell's execution count (19) is lower than the cell that adds it (20), so the
# output is from the earlier 5-column version of df_client_usage.
df_client_usage.columns

Index(['client_id', 'num_visits', 'first_visit_month', 'last_visit_month',
       'total_active_time_minutes'],
      dtype='object')

In [15]:
# Preview the first ten rows of the client-level usage table.
# NOTE(review): the recorded output shows the 5-column version (execution
# count 15, before "months_visited" was added by the In [20] cell).
df_client_usage.head(10)

Unnamed: 0,client_id,num_visits,first_visit_month,last_visit_month,total_active_time_minutes
0,169,1,04-17,04-17,3.55
1,336,1,06-17,06-17,15.8
2,546,1,06-17,06-17,2.216667
3,555,1,04-17,04-17,2.633333
4,647,1,04-17,04-17,6.283333
5,722,1,04-17,04-17,9.983333
6,786,1,06-17,06-17,5.2
7,805,3,06-17,06-17,3.633333
8,832,1,06-17,06-17,2.233333
9,934,1,04-17,04-17,2.366667
