In [73]:
import pandas as pd
import numpy as np

In [74]:
# Load the raw daily usage table and take a first look at its size and schema.
df2 = pd.read_csv("data/dataset2.csv")

# Shape first, then the column names, each on its own line.
print(df2.shape, df2.columns.tolist(), sep="\n")

# Last expression of the cell: rich display of the first five rows.
df2.head(5)


(1098000, 5)
['customer_id', 'date', 'logins', 'feature_events', 'session_minutes']


Unnamed: 0,customer_id,date,logins,feature_events,session_minutes
0,C100000,2024-01-01,5.0,0.0,32.23
1,C100000,2024-01-02,1.0,1.0,18.08
2,C100000,2024-01-03,3.0,1.0,9.1
3,C100000,2024-01-04,3.0,1.0,37.6
4,C100000,2024-01-05,5.0,2.0,25.2


In [75]:
# Parse the raw `date` column into datetimes. With errors="coerce" any value that
# cannot be parsed becomes NaT instead of raising, so it can be counted below.
parsed_dates = pd.to_datetime(df2["date"], errors="coerce")
df2["date_dt"] = parsed_dates

n_invalid_dates = int(df2["date_dt"].isna().sum())
print("Invalid date rows:", n_invalid_dates)
# To inspect offending rows, uncomment:
#df2.loc[df2["date_dt"].isna(), ["customer_id","date"]].head(20)


Invalid date rows: 0


In [76]:
# Two duplicate checks: fully identical rows, and repeated (customer, day) keys.
n_dup_rows = int(df2.duplicated().sum())
n_dup_keys = int(df2.duplicated(subset=["customer_id", "date_dt"]).sum())

print("Duplicate full rows:", n_dup_rows)
print("Duplicate (customer_id, date):", n_dup_keys)


Duplicate full rows: 0
Duplicate (customer_id, date): 0


In [77]:
# Overall NaN count per usage metric.
usage_cols = ["logins", "feature_events", "session_minutes"]
is_nan = df2[usage_cols].isna()
print(is_nan.sum())

# Calendar month label ("YYYY-MM") derived from the parsed date.
df2["month"] = df2["date_dt"].dt.to_period("M").astype(str)

# Share of NaN values per month and metric (mean of the boolean NaN indicator).
miss_by_month = is_nan.groupby(df2["month"]).mean()
print(miss_by_month)


logins             56010
feature_events     56010
session_minutes    56010
dtype: int64
           logins  feature_events  session_minutes
month                                             
2024-01  0.000000        0.000000         0.000000
2024-02  0.000000        0.000000         0.000000
2024-03  0.000000        0.000000         0.000000
2024-04  0.000000        0.000000         0.000000
2024-05  0.000000        0.000000         0.000000
2024-06  0.000000        0.000000         0.000000
2024-07  0.000000        0.000000         0.000000
2024-08  0.000000        0.000000         0.000000
2024-09  0.622333        0.622333         0.622333
2024-10  0.000000        0.000000         0.000000
2024-11  0.000000        0.000000         0.000000
2024-12  0.000000        0.000000         0.000000


In [78]:
# Attach the per-customer `is_eu` flag from the cleaned customer table, then break
# down the September 2024 missingness by that flag.
df1 = pd.read_csv("cleanedData/cleaned_dataset1.csv")

# Re-run safety: if this cell already ran, df2 carries `is_eu`; merging again would
# create is_eu_x / is_eu_y and break every later `df2["is_eu"]` lookup.
# validate="many_to_one" makes pandas raise if df1 holds a customer_id more than
# once, instead of silently duplicating daily rows in df2.
df2 = df2.drop(columns=["is_eu"], errors="ignore").merge(
    df1[["customer_id", "is_eu"]],
    on="customer_id",
    how="left",
    validate="many_to_one",
)

# Customers absent from df1 get is_eu = NaN; groupby drops NaN keys by default and
# `is_eu == 1` is False for them, so report how many rows are affected.
print("Rows without an is_eu match:", int(df2["is_eu"].isna().sum()))

metric_cols = ["logins", "feature_events", "session_minutes"]
sept = df2[df2["month"] == "2024-09"].copy()
print("Sep missing counts:", sept[metric_cols].isna().sum())
print("Sep missing rate by is_eu:")
print(sept.groupby("is_eu")[metric_cols].apply(lambda g: g.isna().mean()))


Sep missing counts: logins             56010
feature_events     56010
session_minutes    56010
dtype: int64
Sep missing rate by is_eu:
       logins  feature_events  session_minutes
is_eu                                         
0         0.0             0.0              0.0
1         1.0             1.0              1.0


In [79]:
# Impute the usage metrics that are missing for EU customers in September 2024.
# Fill priority per metric: customer+weekday mean -> customer mean -> EU weekday mean,
# all computed from complete EU rows in August and October 2024.
# The cell is written to be safe to re-run on an already-imputed df2.
METRICS = ["logins", "feature_events", "session_minutes"]
HELPER_SUFFIXES = ("_cw_mean", "_c_mean", "_w_mean")


def _reference_means(frame, keys, suffix):
    """Return the mean of every metric in `frame` per `keys` group, with `suffix` appended to the column names."""
    return frame.groupby(keys)[METRICS].mean().add_suffix(suffix)


# 0) Re-run safety: DataFrame.join raises when the joined columns already exist,
#    so drop helper mean columns left behind by a previous execution of this cell.
stale_helpers = [col for col in df2.columns if str(col).endswith(HELPER_SUFFIXES)]
df2 = df2.drop(columns=stale_helpers)

# 1) Create the EU-September corruption mask (these are the rows to impute)
eu_rows = df2["is_eu"] == 1
mask_missing = (
    eu_rows &
    (df2["month"] == "2024-09") &
    df2[METRICS].isna().any(axis=1)
)
# After a first run the NaNs are gone, so a recomputed mask would be all False and
# erase the audit trail; keep rows that an earlier run already flagged.
if "eu_sept_2024_corrupted" in df2.columns:
    mask_missing = mask_missing | df2["eu_sept_2024_corrupted"].astype(bool)
df2["eu_sept_2024_corrupted"] = mask_missing
print("Rows to impute (EU Sep 2024):", int(mask_missing.sum()))

# 2) Add weekday (0=Mon,...,6=Sun)
df2["weekday"] = df2["date_dt"].dt.dayofweek

# 3) Build reference window: EU customers in Aug + Oct with complete metrics
ref = df2[
    eu_rows &
    df2["month"].isin(["2024-08", "2024-10"]) &
    df2[METRICS].notna().all(axis=1)
].copy()
print("Reference rows (EU Aug+Oct complete):", ref.shape[0])

# 4) Compute customer+weekday means (best signal)
cw_means = _reference_means(ref, ["customer_id", "weekday"], "_cw_mean")
df2 = df2.join(cw_means, on=["customer_id", "weekday"])

# 5) Fallback 1: customer overall means (if customer-weekday is missing)
c_means = _reference_means(ref, "customer_id", "_c_mean")
df2 = df2.join(c_means, on="customer_id")

# 6) Fallback 2: EU weekday means (if customer has too little history)
w_means = _reference_means(ref, "weekday", "_w_mean")
df2 = df2.join(w_means, on="weekday")

# 7) Impute missing EU-September metrics in priority order.
#    Only cells that are still NaN are written, so values that were present (or
#    filled by an earlier run) are never overwritten.
for metric in METRICS:
    fill_values = (
        df2[f"{metric}_cw_mean"]
          .fillna(df2[f"{metric}_c_mean"])
          .fillna(df2[f"{metric}_w_mean"])
    )
    to_fill = mask_missing & df2[metric].isna()
    if to_fill.any():
        df2.loc[to_fill, metric] = fill_values[to_fill]

# Audit flag: which rows were imputed
df2["eu_sept_2024_was_imputed"] = mask_missing

# 8) Enforce valid ranges/types after imputation.
#    The clip applies to every row, not just imputed ones, so any later range check
#    will see zero violations. Count out-of-range values *before* clipping so that
#    invalid raw data is reported rather than silently corrected.
clipped_counts = {}
for c in ["logins", "feature_events"]:
    df2[c] = pd.to_numeric(df2[c], errors="coerce")
    clipped_counts[c] = int((df2[c] < 0).sum())
    df2[c] = df2[c].clip(lower=0).round().astype("Int64")

df2["session_minutes"] = pd.to_numeric(df2["session_minutes"], errors="coerce")
minutes_present = df2["session_minutes"].notna()
clipped_counts["session_minutes"] = int(
    (minutes_present & ~df2["session_minutes"].between(0, 1439)).sum()
)
df2["session_minutes"] = df2["session_minutes"].clip(lower=0, upper=1439)
print("Out-of-range values clipped:", clipped_counts)

# 9) Report remaining missing values in September after imputation
#    (a non-zero EU rate means no reference mean existed for some rows)
sept_after = df2[df2["month"] == "2024-09"]
print("Remaining missing in Sep after impute:",
      sept_after[METRICS].isna().sum())

print("Remaining missing rate by is_eu after impute:")
print(sept_after.groupby("is_eu")[METRICS].apply(lambda g: g.isna().mean()))

# 10) Helper mean columns are intentionally left in df2 so the imputed values can be
#     compared against their sources; step 0 removes them if this cell is run again.
# # Optional cleanup: drop helper mean columns
# mean_cols = [c for c in df2.columns if c.endswith("_cw_mean") or c.endswith("_c_mean") or c.endswith("_w_mean")]
# df2.drop(columns=mean_cols, inplace=True)

Rows to impute (EU Sep 2024): 56010
Reference rows (EU Aug+Oct complete): 115754
Remaining missing in Sep after impute: logins             0
feature_events     0
session_minutes    0
dtype: int64
Remaining missing rate by is_eu after impute:
       logins  feature_events  session_minutes
is_eu                                         
0         0.0             0.0              0.0
1         0.0             0.0              0.0


In [80]:
# Remove audit columns left over from a previous pass (if any are present), then
# display the September 2024 rows that carry the imputation flag.
drop_cols = ["logins_invalid","feature_events_invalid","session_minutes_invalid","logic_no_login_but_usage"]
df2 = df2.drop(columns=drop_cols, errors="ignore").copy()

in_september = df2["month"] == "2024-09"
was_imputed = df2["eu_sept_2024_was_imputed"]
df2[in_september & was_imputed]


Unnamed: 0,customer_id,date,logins,feature_events,session_minutes,date_dt,month,is_eu,eu_sept_2024_corrupted,weekday,logins_cw_mean,feature_events_cw_mean,session_minutes_cw_mean,logins_c_mean,feature_events_c_mean,session_minutes_c_mean,logins_w_mean,feature_events_w_mean,session_minutes_w_mean,eu_sept_2024_was_imputed
610,C100001,2024-09-01,1,0,16.662350,2024-09-01,2024-09,1,True,6,1.375000,0.375000,16.662350,1.000000,0.322581,14.277096,3.081749,1.080878,16.665403,True
611,C100001,2024-09-02,1,0,15.539151,2024-09-02,2024-09,1,True,0,1.000000,0.250000,15.539151,1.000000,0.322581,14.277096,3.082217,1.091189,16.699508,True
612,C100001,2024-09-03,1,0,13.789360,2024-09-03,2024-09,1,True,1,0.888889,0.222222,13.789360,1.000000,0.322581,14.277096,3.060346,1.067547,16.637325,True
613,C100001,2024-09-04,1,0,12.449634,2024-09-04,2024-09,1,True,2,1.000000,0.444444,12.449634,1.000000,0.322581,14.277096,3.030768,1.068024,16.568418,True
614,C100001,2024-09-05,1,0,9.107177,2024-09-05,2024-09,1,True,3,1.000000,0.500000,9.107177,1.000000,0.322581,14.277096,3.059829,1.067167,16.758722,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1097537,C102998,2024-09-26,5,2,26.980043,2024-09-26,2024-09,1,True,3,4.800000,1.900000,26.980043,4.693548,1.838710,26.881754,3.059829,1.067167,16.758722,True
1097538,C102998,2024-09-27,4,1,24.241003,2024-09-27,2024-09,1,True,4,3.555556,1.000000,24.241003,4.693548,1.838710,26.881754,3.101172,1.080878,16.858277,True
1097539,C102998,2024-09-28,6,2,31.555792,2024-09-28,2024-09,1,True,5,5.888889,2.444444,31.555792,4.693548,1.838710,26.881754,3.096352,1.083854,16.801373,True
1097540,C102998,2024-09-29,5,2,23.320300,2024-09-29,2024-09,1,True,6,4.625000,1.750000,23.320300,4.693548,1.838710,26.881754,3.081749,1.080878,16.665403,True


In [81]:
# Make sure the three usage metrics are numeric; unparseable entries become NaN.
for c in ["logins","feature_events","session_minutes"]:
    df2[c] = pd.to_numeric(df2[c], errors="coerce")

# A count is flagged invalid when it is present and negative.
for count_col in ("logins", "feature_events"):
    observed = df2[count_col]
    df2[f"{count_col}_invalid"] = observed.notna() & (observed < 0)

# auto-logout => max 1439 minutes
minutes = df2["session_minutes"]
df2["session_minutes_invalid"] = minutes.notna() & ~minutes.between(0, 1439)

# One summary line per metric.
for metric_name in ("logins", "feature_events", "session_minutes"):
    print(f"{metric_name} invalid:", int(df2[f"{metric_name}_invalid"].sum()))

# Preview of the rows whose session length is out of range (empty when there are none).
bad_minutes = df2["session_minutes_invalid"]
df2.loc[bad_minutes, ["customer_id","date","session_minutes"]].head(20)


logins invalid: 0
feature_events invalid: 0
session_minutes invalid: 0


Unnamed: 0,customer_id,date,session_minutes


In [82]:
# Flag days that show activity (session minutes or feature events) without any
# login. Missing values are treated as 0 on every side of the comparison.
no_login = df2["logins"].fillna(0).eq(0)
has_minutes = df2["session_minutes"].fillna(0).gt(0)
has_events = df2["feature_events"].fillna(0).gt(0)
df2["logic_no_login_but_usage"] = no_login & (has_minutes | has_events)

print("No login but usage:", int(df2["logic_no_login_but_usage"].sum()))

# Preview of the first flagged rows.
preview_cols = ["customer_id","date","logins","session_minutes","feature_events"]
df2.loc[df2["logic_no_login_but_usage"], preview_cols].head(20)


No login but usage: 103870


Unnamed: 0,customer_id,date,logins,session_minutes,feature_events
29,C100000,2024-01-30,0,16.78,0
66,C100000,2024-03-07,0,11.79,0
113,C100000,2024-04-23,0,11.03,0
135,C100000,2024-05-15,0,14.22,0
213,C100000,2024-08-01,0,2.065431,0
245,C100000,2024-09-02,0,12.593178,0
281,C100000,2024-10-08,0,19.133523,0
300,C100000,2024-10-27,0,6.52719,0
314,C100000,2024-11-10,0,0.799851,0
357,C100000,2024-12-23,0,12.739052,0


In [83]:
# Build the cleaned frame: an independent copy of df2 without the
# no-login-but-usage flag (the column is skipped if it does not exist).
drop_cols = ["logic_no_login_but_usage"]
df2_clean = df2.drop(columns=drop_cols, errors="ignore").copy()

# Export is currently disabled; uncomment to write the cleaned table to disk.
# df2_clean.to_csv("dataset2_cleaned.csv", index=False)
# print("Saved dataset2_cleaned.csv")
