In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import gc

import pandas as pd
import numpy as np
import gc

# 1. DEFINE A SCHEMA
# Column dtypes handed to pd.read_csv by load_and_clean: 'id' is read as a
# plain string and 'event_name' as a pandas categorical (intended to reduce
# memory for a column with few distinct values).
dtypes = dict(
    id='str',
    event_name='category',
)

# 2. LOAD INDIVIDUALLY AND CLEAN
def load_and_clean(file_path):
    """Read one event-log CSV and add parsed-timestamp and customer-id columns.

    Parameters
    ----------
    file_path
        Location of the CSV; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents read with the module-level ``dtypes`` schema, where
        ``event_timestamp`` has been converted to UTC datetimes and a
        ``customer_id`` column holds the leading run of non-whitespace
        characters of ``id``.
    """
    events = pd.read_csv(file_path, dtype=dtypes)

    # errors='coerce' turns values that cannot be parsed into NaT instead of raising.
    parsed_ts = pd.to_datetime(events['event_timestamp'], utc=True, errors='coerce')
    events['event_timestamp'] = parsed_ts

    # Everything in `id` up to (not including) the first whitespace character.
    id_prefix = events['id'].str.extract(r'^([^\s]+)', expand=False)
    events['customer_id'] = id_prefix

    gc.collect()  # release temporaries created while parsing / extracting
    return events

# Load the two event logs one after the other, collecting garbage in between.
df_train2 = load_and_clean('dat_train2.csv')
gc.collect()

df_test2 = load_and_clean('open_journeys2.csv')
gc.collect()

# 3. IDENTIFY SUCCESS
# A journey id counts as a success when at least one of its rows is an
# 'order_shipped' event.
success_ids = df_train2['id'][df_train2['event_name'] == 'order_shipped'].unique()
success_set = set(success_ids)  # set membership is used for the later isin() check

print(f"Setup Complete. Training rows: {len(df_train2)}")
gc.collect()  # drop the temporary boolean mask created above

# --- STEP 2: VECTORIZED LABELING ---

# Latest event timestamp per journey id (one entry per id, not per event).
journey_end_times = df_train2.groupby('id')['event_timestamp'].max()

# The newest timestamp in the whole training log is treated as "now".
max_date = df_train2['event_timestamp'].max()

# One row per journey id, in order of first appearance in the log.
train_labels = pd.DataFrame({'id': df_train2['id'].unique()})

# Time elapsed between each journey's last event and "now" (a Timedelta column).
train_labels['last_event'] = train_labels['id'].map(journey_end_times)
train_labels['days_since_last'] = max_date - train_labels['last_event']

# Journey-level conditions.
is_success = train_labels['id'].isin(success_set)
is_lapsed = train_labels['days_since_last'] >= pd.Timedelta(days=60)

# Label encoding: 1 = success, 0 = lapsed (no event for 60+ days), -1 = neither.
# Success is tested first, so it wins when both conditions hold.
train_labels['label'] = np.where(is_success, 1, np.where(is_lapsed, 0, -1))

# Keep only the decided journeys (label 0 or 1) and the two columns used later.
train_labels = train_labels.loc[train_labels['label'] != -1, ['id', 'label']].copy()

print(f"Labeling complete. Found {len(train_labels)} valid journeys for training.")
print(f"Success Rate: {train_labels['label'].mean():.2%}")

# The per-id end times are not needed after this point.
del journey_end_times
gc.collect()

In [None]:
# Random cutoff logic: truncate every training journey at a random instant
# between its first and last event, so features are built from a partial
# history rather than from the complete journey.

# 1. IDENTIFY JOURNEY BOUNDARIES
# First and last event timestamp per journey id.
journey_bounds = df_train2.groupby('id')['event_timestamp'].agg(['min', 'max']).reset_index()
journey_bounds.columns = ['id', 'start_time', 'end_time']

# 2. GENERATE RANDOM CUTOFFS
np.random.seed(42)  # fixed seed so the sampled cutoffs are reproducible

# Timestamps as integer nanoseconds so a uniform fraction of the journey
# length can be added to the start.
start_ns = journey_bounds['start_time'].astype('int64')
end_ns = journey_bounds['end_time'].astype('int64')

journey_bounds['cut_ns'] = start_ns + (np.random.rand(len(journey_bounds)) * (end_ns - start_ns)).astype('int64')
journey_bounds['cut_time'] = pd.to_datetime(journey_bounds['cut_ns'], utc=True)

# 3. APPLY THE CUTOFF TO THE DATA
# Record the size before truncation so the summary below reports the real
# number instead of a hard-coded figure.
rows_before = len(df_train2)

# Attach each row's cutoff through an id -> cut_time lookup. This adds a
# single column in place rather than building a merged copy of the event log.
cut_time_by_id = journey_bounds.set_index('id')['cut_time']
df_train2['cut_time'] = df_train2['id'].map(cut_time_by_id)

# Keep events at or before the cutoff. Rows whose timestamp or cutoff is NaT
# compare False and are dropped.
# NOTE(review): `<=` keeps an event that falls exactly on the cutoff; for a
# journey with a single event the cutoff equals that event's timestamp, so the
# event is always kept - confirm this is intended.
df_train_truncated = df_train2[df_train2['event_timestamp'] <= df_train2['cut_time']].copy()

# 4. CLEAN UP
# The full event log and the helper objects are no longer needed.
del df_train2, journey_bounds, cut_time_by_id
gc.collect()

print(f"Truncation complete. Reduced rows from {rows_before} to {len(df_train_truncated)}")

Truncation complete. Reduced rows from 59M to 39244358


In [4]:
def extract_elite_features_v4(df_input, decay_rate=0.04, cart_event='add_to_cart'):
    """Collapse an event log into one row of recency-weighted features per journey.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Event log with columns ``id``, ``event_name`` and ``event_timestamp``.
        If a ``cut_time`` column is present it is used as each row's reference
        time; otherwise the reference time is the latest ``event_timestamp``
        of the row's ``id``. The input frame is not modified.
    decay_rate : float, default 0.04
        Exponential decay per hour: an event ``h`` hours before the reference
        time gets weight ``exp(-decay_rate * h)``.
    cart_event : str, default 'add_to_cart'
        Value of ``event_name`` that is counted as a cart event.

    Returns
    -------
    pandas.DataFrame
        One row per ``id`` with the columns
        ``id``, ``recency_score`` (sum of weights), ``weighted_carts`` (sum of
        weights over cart events), ``total_acts`` (event count),
        ``duration_hrs`` (largest hours-before-reference), ``event_variety``
        (distinct event names), ``velocity`` (= total_acts / (duration_hrs + 0.1))
        and ``cart_density`` (= weighted_carts / (total_acts + 1)).

    Raises
    ------
    KeyError
        If a required column is missing from ``df_input``.

    Notes
    -----
    NOTE(review): with a ``cut_time`` column the newest event of a journey
    usually lies some time before the reference, whereas without it the newest
    event is always exactly at the reference (0 hours ago). Feature tables
    built the two ways therefore do not share the same time origin - confirm
    this is intended when one is used for training and the other for scoring.
    """
    # 1. Reference time per row (the cutoff, or the journey's own last event).
    if 'cut_time' in df_input.columns:
        reference_time = df_input['cut_time']
    else:
        reference_time = df_input.groupby('id')['event_timestamp'].transform('max')

    # 2. Hours between each event and its reference time, and the decay weight.
    hrs_ago = (reference_time - df_input['event_timestamp']).dt.total_seconds() / 3600.0
    weight = np.exp(-decay_rate * hrs_ago)

    # 3. Cart-intent flag (1 for cart events, 0 otherwise).
    is_cart = (df_input['event_name'] == cart_event).astype(int)

    # Only the two columns the aggregation needs are taken from the input;
    # the derived columns are added to that narrow frame, so the caller's
    # (potentially very wide / long) frame is never copied in full.
    per_event = df_input[['id', 'event_name']].assign(
        hrs_ago=hrs_ago,
        weight=weight,
        w_cart=is_cart * weight,
    )

    # 4. Aggregate to one row per journey.
    features = per_event.groupby('id').agg(
        recency_score=('weight', 'sum'),
        weighted_carts=('w_cart', 'sum'),
        total_acts=('event_name', 'count'),
        duration_hrs=('hrs_ago', 'max'),
        event_variety=('event_name', 'nunique')
    ).reset_index()

    # 5. Ratio features; the +0.1 and +1 keep the denominators away from zero.
    features['velocity'] = features['total_acts'] / (features['duration_hrs'] + 0.1)
    features['cart_density'] = features['weighted_carts'] / (features['total_acts'] + 1)

    return features

# --- EXECUTION ---
# Build the per-journey feature table from the truncated training log and
# attach the labels; the inner join keeps only journeys present in both tables.
print("Flattening Truncated Training Data...")
X_train_full = extract_elite_features_v4(df_train_truncated)
df_final_train = pd.merge(X_train_full, train_labels, how='inner', on='id')

# Same feature extraction for the test log.
print("Flattening Test Data...")
X_test_final = extract_elite_features_v4(df_test2)

# The unlabeled training features are not needed once merged.
del X_train_full
gc.collect()

print(f"Training Matrix Shape: {df_final_train.shape}")
print(f"Test Matrix Shape: {X_test_final.shape}")

Flattening Truncated Training Data...
Flattening Test Data...
Training Matrix Shape: (1450231, 9)
Test Matrix Shape: (158325, 8)


In [5]:
import xgboost as xgb

# 1. Feature columns fed to the model (every column except 'id' and 'label').
features_list = [
    'recency_score',
    'weighted_carts',
    'total_acts',
    'duration_hrs',
    'event_variety',
    'velocity',
    'cart_density',
]

X = df_final_train[features_list]
y = df_final_train['label']

# 2. Heavily regularised XGBoost classifier: shallow trees plus large
# gamma / reg_lambda / min_child_weight values.
xgb_params = dict(
    n_estimators=200,
    max_depth=3,
    learning_rate=0.05,
    gamma=10,
    reg_lambda=50,
    min_child_weight=20,
    objective='binary:logistic',
    random_state=42,
    n_jobs=-1,
)
model = xgb.XGBClassifier(**xgb_params)

print("Training XGBoost Model...")
model.fit(X, y)

# 3. Second column of predict_proba, i.e. the probability of label 1.
test_probs_raw = model.predict_proba(X_test_final[features_list])[:, 1]

# 4. Post-hoc adjustment of the raw probabilities.
# Raising values in [0, 1] to the power 1.5 pushes them towards zero.
test_probs_squashed = np.power(test_probs_raw, 1.5)

# Rescale so the mean of the squashed probabilities equals target_mean.
target_mean = 0.035
rescale_factor = target_mean / test_probs_squashed.mean()
final_probs = test_probs_squashed * rescale_factor

# Bound the result to [0.0001, 0.15]. The clip runs after the rescale, so the
# mean of final_probs can differ from target_mean.
final_probs = np.clip(final_probs, 0.0001, 0.15)

Training XGBoost Model...


In [6]:
# 1. Read the submission template; its 'id' column is what the predictions are
# mapped onto when the submission is built.
df_kaggle_template = pd.read_csv(filepath_or_buffer='open_journeys2_flattened_all0.csv')


In [7]:
# 2. Lookup table: journey id (as string) -> adjusted probability.
preds_series = pd.Series(final_probs, index=X_test_final['id'].astype(str))

# 3. Map predictions onto the template.
# The template ids are cast to str as well, so both sides of the lookup use
# the same key type even if read_csv inferred a non-string dtype for 'id'.
fallback_prob = 0.015  # value used for template ids that have no prediction
mapped_probs = df_kaggle_template['id'].astype(str).map(preds_series)

# Report how many rows rely on the fallback instead of filling them silently.
n_unmatched = int(mapped_probs.isna().sum())
print(f"Template ids without a prediction (filled with {fallback_prob}): "
      f"{n_unmatched} of {len(df_kaggle_template)}")

df_kaggle_template['order_shipped'] = mapped_probs.fillna(fallback_prob)

# 4. Final Verification
print(f"Submission Mean: {df_kaggle_template['order_shipped'].mean():.4f}")
print(f"Max Prob: {df_kaggle_template['order_shipped'].max():.4f}")
print(f"Min Prob: {df_kaggle_template['order_shipped'].min():.4f}")

# 5. Save the file (only the id and the predicted probability, no index column).
submission_name = 'xgboost_comp2_v1.csv'
df_kaggle_template[['id', 'order_shipped']].to_csv(submission_name, index=False)

print(f"\nSuccessfully saved to {submission_name}")

Submission Mean: 0.0161
Max Prob: 0.1500
Min Prob: 0.0011

Successfully saved to xgboost_comp2_v1.csv


In [1]:
import matplotlib.pyplot as plt

# Importance score per feature as reported by the fitted model, largest first.
# Requires `model` and `features_list` from the training cell to exist.
importance = model.feature_importances_
feat_imp = pd.Series(importance, index=features_list).sort_values(ascending=False)

# Horizontal bar chart on an explicitly created figure / axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
feat_imp.plot.barh(ax=ax, color='teal')
ax.set_title('Which Features are Driving the 0.0407 Score?')
plt.show()

NameError: name 'model' is not defined