In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# --- STEP 1: LOAD AND CLEAN TRAINING DATA ---
# Read the raw training event log and convert `event_timestamp` to datetime values
# in the same expression, so later sorting and date arithmetic operate on real timestamps.
df = pd.read_csv('dat_train1.csv').assign(
    event_timestamp=lambda events: pd.to_datetime(events['event_timestamp'])
)



In [None]:
# Drop repeated rows so each (customer, event, timestamp) triple is counted once
# when features are aggregated; the first occurrence of each triple is kept.
dedupe_keys = ['customer_id', 'event_name', 'event_timestamp']
duplicate_mask = df.duplicated(subset=dedupe_keys, keep='first')
# .copy() gives df_clean its own data, so columns added to it later do not touch df.
df_clean = df.loc[~duplicate_mask].copy()

# --- STEP 2: CREATE LABELS (SUCCESS VS. LAPSE) ---
# Successful customers: anyone with at least one 'order_shipped' event.
success_ids = set(df_clean.loc[df_clean['event_name'] == 'order_shipped', 'customer_id'])

# Inputs for the lapse rule (no order + inactive for 60 days): the newest timestamp
# in the data set, and each customer's most recent event row.
max_date = df_clean['event_timestamp'].max()
last_event = (
    df_clean
    # Missing timestamps (NaT) sort first, so they are never picked as a customer's "latest" event.
    .sort_values('event_timestamp', na_position='first')
    .groupby('customer_id')
    .tail(1)
    # tail() returns a slice of df_clean; copy it so a label column can be added
    # later without raising SettingWithCopyWarning.
    .copy()
)

def get_label(row):
    """Classify one customer's most recent event row.

    Relies on the notebook-level `success_ids` and `max_date`.

    Returns:
        1  if the row's customer_id is in `success_ids` (Success),
        0  if the row's event_timestamp is at least 60 days older than `max_date` (Lapse),
        -1 otherwise (Active/Exclude).
    """
    if row['customer_id'] in success_ids:
        return 1
    idle_time = max_date - row['event_timestamp']
    return 0 if idle_time >= pd.Timedelta(days=60) else -1

# Label every customer's last event. `assign` builds a new frame rather than writing a
# column into what may be a slice of df_clean, which avoids SettingWithCopyWarning and
# keeps the cell safe to re-run.
last_event = last_event.assign(label=last_event.apply(get_label, axis=1))
# Keep only customers with a definite outcome (1 = Success, 0 = Lapse); -1 rows are excluded.
labels = last_event.loc[last_event['label'] != -1, ['customer_id', 'label']]



In [None]:
# --- STEP 3: FLATTEN FEATURES FOR TRAINING ---
# One 0/1 indicator column per tracked event type: {indicator column: event_name value}.
EVENT_FLAGS = {
    'is_appl_submit': 'application_web_submit',
    'is_add_to_cart': 'add_to_cart',
    'is_browse': 'browse_products',
}
# Columns that identify a repeated event row (same key as the training de-duplication).
EVENT_KEY = ['customer_id', 'event_name', 'event_timestamp']


def _add_event_flags(events):
    """Add the EVENT_FLAGS indicator columns to `events` in place and return it."""
    for flag_col, event_name in EVENT_FLAGS.items():
        events[flag_col] = (events['event_name'] == event_name).astype(int)
    return events


def _customer_features(events):
    """Collapse an event log (with indicator columns) to one feature row per customer_id.

    Feature names are kept unchanged for compatibility; what each one measures:
    - total_actions: number of events with a non-null event_name
    - has_applied: 1 if the customer has any 'application_web_submit' event, else 0
    - max_items_in_cart: number of 'add_to_cart' events (a count, not a maximum)
    - num_unique_products: number of 'browse_products' events (not distinct products)
    """
    return events.groupby('customer_id').agg(
        total_actions=('event_name', 'count'),
        has_applied=('is_appl_submit', 'max'),
        max_items_in_cart=('is_add_to_cart', 'sum'),
        num_unique_products=('is_browse', 'sum')
    ).reset_index()


df_train_raw = _customer_features(_add_event_flags(df_clean))

# Merge features with labels to create the final df_train.
# Inner join: customers without a definite label are dropped. Both sides hold one row
# per customer, which `validate` enforces.
df_train = df_train_raw.merge(labels, on='customer_id', how='inner', validate='one_to_one')

# --- STEP 4: PREPARE TEST DATA ---
df_open_raw = pd.read_csv('open_journeys1.csv', parse_dates=['event_timestamp'])
_add_event_flags(df_open_raw)
# Apply the same de-duplication as the training data so the count features are
# computed identically for train and test.
df_open_clean = df_open_raw.drop_duplicates(subset=EVENT_KEY, keep='first')

X_test_safe = _customer_features(df_open_clean)

