In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# --- STEP 1: LOAD AND CLEAN TRAINING DATA ---
# Read the raw training event log from disk.
df = pd.read_csv('dat_train1.csv')
# Parse the timestamp column so later cells can do date arithmetic on it.
df['event_timestamp'] = df['event_timestamp'].pipe(pd.to_datetime)



In [2]:
# Remove duplicates to ensure clean feature counts.
# Two rows are the same event when customer, event name and timestamp all match;
# the first occurrence is the one that is kept.
dedupe_keys = ['customer_id', 'event_name', 'event_timestamp']
duplicate_mask = df.duplicated(subset=dedupe_keys, keep='first')
# Independent copy so later column additions do not touch the raw frame.
df_clean = df.loc[~duplicate_mask].copy()



In [3]:
# --- STEP 2: CREATE LABELS (SUCCESS VS. LAPSE) ---
# Successful customers: anyone with at least one 'order_shipped' event.
shipped_rows = df_clean['event_name'] == 'order_shipped'
success_ids = set(df_clean.loc[shipped_rows, 'customer_id'])

# Lapse is judged against the newest timestamp in the cleaned log,
# using the latest event row of each customer.
max_date = df_clean['event_timestamp'].max()
last_event = (
    df_clean
    .sort_values('event_timestamp')
    .groupby('customer_id')
    .tail(1)
)

def get_label(row):
    """Classify one customer's last-event row.

    Reads the notebook globals ``success_ids`` and ``max_date``.

    Returns:
        1  if the row's customer_id is in ``success_ids`` (success),
        0  if the row's event_timestamp is at least 60 days before ``max_date`` (lapse),
        -1 otherwise (still active; excluded from training).
    """
    if row['customer_id'] in success_ids:
        return 1
    idle_time = max_date - row['event_timestamp']
    return 0 if idle_time >= pd.Timedelta(days=60) else -1

# Label every customer's last-event row in one vectorized pass instead of a
# Python-level apply over each row.
is_success = last_event['customer_id'].isin(success_ids)
is_lapsed = (max_date - last_event['event_timestamp']) >= pd.Timedelta(days=60)

# np.select takes the first matching condition, so success wins over lapse --
# the same precedence as the branches in get_label. Everything else is -1.
# .assign builds a new frame rather than writing a column into the slice that
# groupby().tail() returned, which avoids the chained-assignment warning.
last_event = last_event.assign(
    label=np.select([is_success, is_lapsed], [1, 0], default=-1)
)

# Keep only confirmed outcomes (drop the -1 "active" customers).
labels = last_event.loc[last_event['label'] != -1, ['customer_id', 'label']]

In [4]:
# --- STEP 2: CREATE LABELS (SUCCESS VS. LAPSE) ---
# NOTE(review): this cell repeats the preceding labeling cell; running it
# rebuilds the same objects from df_clean.
# Successful customers: anyone with at least one 'order_shipped' event.
shipped_rows = df_clean['event_name'] == 'order_shipped'
success_ids = set(df_clean.loc[shipped_rows, 'customer_id'])

# Lapse is judged against the newest timestamp in the cleaned log,
# using the latest event row of each customer.
max_date = df_clean['event_timestamp'].max()
last_event = (
    df_clean
    .sort_values('event_timestamp')
    .groupby('customer_id')
    .tail(1)
)

def get_label(row):
    """Classify one customer's last-event row.

    Reads the notebook globals ``success_ids`` and ``max_date``.
    NOTE(review): the same function is also defined in the preceding cell.

    Returns:
        1  if the row's customer_id is in ``success_ids`` (success),
        0  if the row's event_timestamp is at least 60 days before ``max_date`` (lapse),
        -1 otherwise (still active; excluded from training).
    """
    if row['customer_id'] in success_ids:
        return 1
    idle_time = max_date - row['event_timestamp']
    return 0 if idle_time >= pd.Timedelta(days=60) else -1

# Label every customer's last-event row in one vectorized pass instead of a
# Python-level apply over each row.
is_success = last_event['customer_id'].isin(success_ids)
is_lapsed = (max_date - last_event['event_timestamp']) >= pd.Timedelta(days=60)

# np.select takes the first matching condition, so success wins over lapse --
# the same precedence as the branches in get_label. Everything else is -1.
# .assign builds a new frame rather than writing a column into the slice that
# groupby().tail() returned, which avoids the chained-assignment warning.
last_event = last_event.assign(
    label=np.select([is_success, is_lapsed], [1, 0], default=-1)
)

# Keep only confirmed outcomes (drop the -1 "active" customers).
labels = last_event.loc[last_event['label'] != -1, ['customer_id', 'label']]


In [5]:
# --- STEP 3: FLATTEN FEATURES FOR TRAINING ---
# One 0/1 indicator column per event type of interest, written onto df_clean.
event_flags = {
    'is_appl_submit': 'application_web_submit',
    'is_add_to_cart': 'add_to_cart',
    'is_browse': 'browse_products',
}
for flag_col, flagged_event in event_flags.items():
    df_clean[flag_col] = df_clean['event_name'].eq(flagged_event).astype(int)

# Collapse the event log to one row per customer.
# NOTE(review): despite their names, max_items_in_cart and num_unique_products
# are plain counts of add_to_cart and browse_products events respectively.
per_customer = df_clean.groupby('customer_id')
df_train_raw = per_customer.agg(
    total_actions=('event_name', 'count'),
    has_applied=('is_appl_submit', 'max'),
    max_items_in_cart=('is_add_to_cart', 'sum'),
    num_unique_products=('is_browse', 'sum'),
).reset_index()

# Inner-join the features with the labels to create the final df_train;
# customers without a label row are dropped by the join.
df_train = pd.merge(df_train_raw, labels, on='customer_id')

In [6]:
def extract_enhanced_features(df_input):
    """Aggregate an event log into one feature row per customer.

    Args:
        df_input: DataFrame with at least the columns ``customer_id``,
            ``event_name`` and ``event_timestamp`` (datetime-like). It is
            NOT modified; the flag columns are built on a private copy.

    Returns:
        DataFrame with one row per ``customer_id`` and the columns
        ``total_actions``, ``has_applied``, ``total_carts``, ``total_browses``,
        ``duration_min``, ``action_velocity``, ``cart_per_browse`` and
        ``actions_to_app``.
    """
    # Work on a narrow copy so the caller's frame never gains the helper columns.
    events = df_input[['customer_id', 'event_name', 'event_timestamp']].assign(
        is_appl_submit=lambda d: d['event_name'].eq('application_web_submit').astype(int),
        is_add_to_cart=lambda d: d['event_name'].eq('add_to_cart').astype(int),
        is_browse=lambda d: d['event_name'].eq('browse_products').astype(int),
    )

    features = events.groupby('customer_id').agg(
        total_actions=('event_name', 'count'),
        has_applied=('is_appl_submit', 'max'),
        total_carts=('is_add_to_cart', 'sum'),
        total_browses=('is_browse', 'sum'),
        first_action=('event_timestamp', 'min'),
        last_action=('event_timestamp', 'max')
    ).reset_index()

    # Journey length in minutes; the +0.1 keeps single-event journeys from
    # producing a zero denominator in the velocity ratio below.
    elapsed = features['last_action'] - features['first_action']
    features['duration_min'] = elapsed.dt.total_seconds() / 60.0 + 0.1

    # --- THE KEY RATIOS ---
    # Velocity: actions per minute of observed journey.
    features['action_velocity'] = features['total_actions'] / features['duration_min']

    # Cart consistency: add-to-cart events per browse event (+1 avoids div by zero).
    features['cart_per_browse'] = features['total_carts'] / (features['total_browses'] + 1)

    # Equals total_actions for customers who applied and 0 otherwise. Note this
    # is the customer's whole action count, not the count up to the application.
    features['actions_to_app'] = features['total_actions'] * features['has_applied']

    return features.drop(columns=['first_action', 'last_action'])

In [7]:
# --- PRE-STEP: SETUP FLAGS & IDENTIFY SUCCESS ---
# (Re)create the 0/1 event-type indicators on df_clean so that any frame cut
# or copied from it afterwards carries them.
evt = df_clean['event_name']
df_clean['is_appl_submit'] = evt.eq('application_web_submit').astype(int)
df_clean['is_add_to_cart'] = evt.eq('add_to_cart').astype(int)
df_clean['is_browse'] = evt.eq('browse_products').astype(int)

# Identify Success IDs (Global Truth): unique ids of customers with an
# 'order_shipped' event. This rebinds success_ids to an array of unique values.
success_ids = df_clean.loc[evt == 'order_shipped', 'customer_id'].unique()

In [8]:
# --- STEP 1: DEFINE THE "END BOUNDARY" FOR EVERYONE ---
# For Success: The journey effectively ends at 'order_shipped'
# For Failure: The journey effectively ends at their last observed event
# reset_index makes customer_id an ordinary column so the merges below (and the
# later column selection on journey_bounds) do not depend on index-level joins.
journey_bounds = df_clean.groupby('customer_id').agg(
    start_time=('event_timestamp', 'min'),
    last_seen=('event_timestamp', 'max')
).reset_index()

# Isolate the timestamp of success. A customer can have more than one
# 'order_shipped' row; merging those as-is would duplicate the customer in
# journey_bounds and multiply their events in every later join. Keep exactly
# one row per customer: the earliest shipment.
shipped_events = df_clean[df_clean['event_name'] == 'order_shipped']
success_events = (
    shipped_events
    .groupby('customer_id', as_index=False)['event_timestamp']
    .min()
    .rename(columns={'event_timestamp': 'success_time'})
)

# Merge success times into bounds; validate guards the one-row-per-customer invariant.
journey_bounds = journey_bounds.merge(
    success_events, on='customer_id', how='left', validate='one_to_one'
)

# Define 'effective_end': Use success_time if available, else use last_seen
journey_bounds['effective_end'] = journey_bounds['success_time'].fillna(journey_bounds['last_seen'])


In [9]:
# --- STEP 2: GENERATE RANDOM CUT TIMES ---
np.random.seed(42)

# Integer view of the journey bounds (assumes nanosecond-resolution timestamps,
# as the column names imply -- TODO confirm for the pandas version in use).
journey_bounds['start_ns'] = journey_bounds['start_time'].astype('int64')
journey_bounds['end_ns'] = journey_bounds['effective_end'].astype('int64')

# Draw one offset per customer in a single vectorized call instead of a
# row-wise apply. randint needs low < high, so journey lengths are clamped to
# at least 1: a zero-length journey can then only draw offset 0, i.e. the cut
# falls on start_time. dtype is explicit because the platform default integer
# may be too narrow for nanosecond spans.
span_ns = (journey_bounds['end_ns'] - journey_bounds['start_ns']).to_numpy()
offset_ns = np.random.randint(0, np.maximum(span_ns, 1), dtype=np.int64)
journey_bounds['cut_ns'] = journey_bounds['start_ns'].to_numpy() + offset_ns

# Build cut_time by adding the offset to start_time so it keeps the same dtype
# and timezone-awareness as the event timestamps it is later compared with.
journey_bounds['cut_time'] = journey_bounds['start_time'] + pd.to_timedelta(
    journey_bounds['cut_ns'] - journey_bounds['start_ns'], unit='ns'
)

In [10]:
# --- STEP 3: APPLY THE CUT (FILTERING) ---
# Attach each customer's cut_time to every one of their events. This cell only
# performs the join; the actual timestamp filter is applied in a later cell.
cut_lookup = journey_bounds[['customer_id', 'cut_time']]
df_aug = pd.merge(df_clean, cut_lookup, how='inner', on='customer_id')


In [None]:
def extract_enhanced_features(df_input):
    """Aggregate an event log into one feature row per customer.

    NOTE(review): a function with this name is also defined in an earlier
    cell; on a top-to-bottom run this later definition is the one in effect.
    Keep a single definition.

    Args:
        df_input: DataFrame with at least the columns ``customer_id``,
            ``event_name`` and ``event_timestamp`` (datetime-like). It is
            NOT modified; the flag columns are built on a private copy.

    Returns:
        DataFrame with one row per ``customer_id`` and the columns
        ``total_actions``, ``has_applied``, ``total_carts``, ``total_browses``,
        ``duration_min``, ``action_velocity``, ``cart_per_browse`` and
        ``actions_to_app``.
    """
    # Work on a narrow copy so the caller's frame never gains the helper columns.
    events = df_input[['customer_id', 'event_name', 'event_timestamp']].assign(
        is_appl_submit=lambda d: d['event_name'].eq('application_web_submit').astype(int),
        is_add_to_cart=lambda d: d['event_name'].eq('add_to_cart').astype(int),
        is_browse=lambda d: d['event_name'].eq('browse_products').astype(int),
    )

    features = events.groupby('customer_id').agg(
        total_actions=('event_name', 'count'),
        has_applied=('is_appl_submit', 'max'),
        total_carts=('is_add_to_cart', 'sum'),
        total_browses=('is_browse', 'sum'),
        first_action=('event_timestamp', 'min'),
        last_action=('event_timestamp', 'max')
    ).reset_index()

    # Journey length in minutes; the +0.1 keeps single-event journeys from
    # producing a zero denominator in the velocity ratio below.
    elapsed = features['last_action'] - features['first_action']
    features['duration_min'] = elapsed.dt.total_seconds() / 60.0 + 0.1

    # --- THE KEY RATIOS ---
    # Velocity: actions per minute of observed journey.
    features['action_velocity'] = features['total_actions'] / features['duration_min']

    # Cart consistency: add-to-cart events per browse event (+1 avoids div by zero).
    features['cart_per_browse'] = features['total_carts'] / (features['total_browses'] + 1)

    # Equals total_actions for customers who applied and 0 otherwise. Note this
    # is the customer's whole action count, not the count up to the application.
    features['actions_to_app'] = features['total_actions'] * features['has_applied']

    return features.drop(columns=['first_action', 'last_action'])

In [20]:
# --- STEP 1: GENERATE BLINDED TRAINING DATA ---
# Keep only the events at or before each customer's random cut_time, so every
# training journey looks like a journey observed part-way through.
before_cut = df_aug['event_timestamp'] <= df_aug['cut_time']
df_train_truncated = df_aug.loc[before_cut].copy()

# --- STEP 2: AGGREGATE USING ENHANCED FEATURES ---
# One feature row per customer, including the velocity and ratio features.
df_train_enhanced = extract_enhanced_features(df_train_truncated)

# --- STEP 3: FILTER OUT "IN-BETWEEN" USERS ---
# Only customers carrying a 0 (lapse) or 1 (success) label are kept.
label_values = labels['label']
final_lapsed_ids = labels.loc[label_values == 0, 'customer_id']
final_success_ids = labels.loc[label_values == 1, 'customer_id']
valid_ids = pd.concat([final_lapsed_ids, final_success_ids])

has_outcome = df_train_enhanced['customer_id'].isin(valid_ids)
df_train_final = df_train_enhanced.loc[has_outcome].copy()

# --- STEP 4: ATTACH LABELS ---
# 1 when the customer is in success_ids, 0 otherwise.
df_train_final['label'] = df_train_final['customer_id'].isin(success_ids).astype(int)

# --- STEP 5: PREPARE THE TEST SET ---
# Same feature extraction for the open journeys.
# NOTE(review): df_open_raw is not defined in any cell visible here -- confirm
# where it is loaded before relying on Restart & Run All.
X_test_enhanced = extract_enhanced_features(df_open_raw)

# Training feature columns are everything except the id and the target;
# the test matrix is restricted to exactly those columns, NaNs replaced by 0.
non_feature_cols = ('customer_id', 'label')
features_list = [name for name in df_train_final.columns if name not in non_feature_cols]
X_test = X_test_enhanced[features_list].fillna(0)

In [18]:
# --- STEP 5: FILTER AND LABEL ---

# 1. Collect the ids that have a confirmed outcome (label 0 = lapse, 1 = success)
label_values = labels['label']
final_lapsed_ids = labels.loc[label_values == 0, 'customer_id']
final_success_ids = labels.loc[label_values == 1, 'customer_id']
valid_ids = pd.concat([final_lapsed_ids, final_success_ids])

# 2. Restrict the aggregated features to those customers
has_outcome = df_train_final['customer_id'].isin(valid_ids)
df_train_final = df_train_final.loc[has_outcome].copy()

# 3. Target column: 1 when the customer is in success_ids, 0 otherwise
df_train_final['label'] = df_train_final['customer_id'].isin(success_ids).astype(int)

# --- STEP 6: FINAL CHECK ---
# Report the sample count and the class balance of the target.
print(f"Final Training Samples: {len(df_train_final)}")
print(f"Target Distribution:\n{df_train_final['label'].value_counts(normalize=True)}")

Final Training Samples: 1235180
Target Distribution:
label
0    0.774735
1    0.225265
Name: proportion, dtype: float64


In [14]:
# Feature columns are every training column other than the id and the target.
non_feature_cols = {'customer_id', 'label'}
features_list = [name for name in df_train_final.columns if name not in non_feature_cols]

# Build X_test from the same columns, in the same order, replacing any NaNs
# with 0 (just in case).
# NOTE(review): X_test_safe is not defined in any cell visible here -- confirm
# where it is created.
X_test = X_test_safe.loc[:, features_list].fillna(0)

In [15]:
import xgboost as xgb
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

# Define X and y
X = df_train_final[features_list]
y = df_train_final['label']

# Hold out 20% (stratified on the label) to validate before submitting
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 1. The Base Model: a shallow, slowly-learning gradient-boosted classifier
xgb_model = xgb.XGBClassifier(
    max_depth=3,           # Shallow trees
    learning_rate=0.05,    # Small step size
    n_estimators=300,
    objective='binary:logistic',
    random_state=42,
    subsample=0.8,
    colsample_bytree=0.8
)

# 2. The Calibration: isotonic calibration with 5-fold CV on the training split,
# so the predicted probabilities are adjusted to be more "honest"
calibrated_xgb = CalibratedClassifierCV(xgb_model, method='isotonic', cv=5)
calibrated_xgb.fit(X_train, y_train)

# 3. Validate on the hold-out split. Without this step the split above is
# never used and the model goes to submission unchecked.
val_probs = calibrated_xgb.predict_proba(X_val)[:, 1]
val_logloss = log_loss(y_val, val_probs, labels=[0, 1])
print(f"Validation LogLoss: {val_logloss:.5f}")

# 4. Predict Probabilities
# [:, 1] gets the probability of "Success"
test_probs = calibrated_xgb.predict_proba(X_test)[:, 1]

In [16]:
# Load the submission template; it fixes the id order expected in the output.
sub_template = pd.read_csv('open_journeys1_flattened_all0.csv')

# customer_id -> predicted success probability, paired by position.
pred_dict = {
    cust_id: prob
    for cust_id, prob in zip(X_test_safe['customer_id'], test_probs)
}

# Look up each template id; ids with no prediction fall back to the mean of
# the training target (the global success rate).
mapped_probs = sub_template['id'].map(pred_dict)
sub_template['order_shipped'] = mapped_probs.fillna(y.mean())

sub_template.to_csv('submission_v1_calibrated.csv', index=False)

print("Submission file ready!")

Submission file ready!
