# Intellectra 2025 – Next-Buy Prediction

End-to-end workflow to maximise **Balanced Accuracy**.

In [30]:
# Cell 0 – Install (run once, skip if libs already installed)
!pip install -q lightgbm optuna pandas numpy scikit-learn


[notice] A new release of pip is available: 24.1.2 -> 25.1.1
[notice] To update, run: C:\Users\User\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [31]:
# IMPROVED SOLUTION FOR BETTER SCORE
# ===========================================

import gc
import os
import warnings
from datetime import datetime, timedelta
from pathlib import Path

import lightgbm as lgb
import numpy as np
import optuna
import pandas as pd
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Silence every warning category so cell output stays readable.
warnings.filterwarnings('ignore')

# Folder that holds the competition CSV files. Set the INTELLECTRA_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset, the original local
# path is used.
DATA_DIR = Path(os.environ.get("INTELLECTRA_DATA_DIR", r"F:/lomba/intelectra/dataset"))

In [32]:
# ================================
# 1. ENHANCED DATA LOADING
# ================================
print("🔄 Loading and preprocessing data...")

# Columns that read_csv should parse as datetimes
_member_date_cols = ['JoinDate', 'DateOfBirth', 'EldestKidDOB', 'YoungestKidDOB']
_trx_date_cols = ['TransactionDatetime']

# Member master data
member = pd.read_csv(DATA_DIR / 'member_data.csv', parse_dates=_member_date_cols)

# Product catalogue, read with explicit column names.
# NOTE(review): passing `names` without `header=0` makes pandas treat the first row of
# the file as data — confirm product_data.csv really has no header row.
_product_col_names = ['productID', 'ProductName', 'ProductCategory', 'ProductLevel']
product = pd.read_csv(DATA_DIR / 'product_data.csv', names=_product_col_names)

# Program lookup table
program = pd.read_csv(DATA_DIR / 'program_data.csv')

# Transaction histories for the train and test members
train_trx = pd.read_csv(DATA_DIR / 'train_transaction_data.csv', parse_dates=_trx_date_cols)
test_trx = pd.read_csv(DATA_DIR / 'test_transaction_data.csv', parse_dates=_trx_date_cols)

# Training labels and the submission template
train_lb = pd.read_csv(DATA_DIR / 'train_label_data.csv')
sample = pd.read_csv(DATA_DIR / 'sample_submission.csv')

🔄 Loading and preprocessing data...


## 1 – Data cleaning helpers

In [33]:
# ================================
# 2. ADVANCED DATA PREPROCESSING
# ================================

def preprocess_transactions(df):
    """Clean a raw transaction frame and add per-row amount and calendar columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``PricePerUnit``, ``Qty``, ``FK_PRODUCT_ID``, ``FK_PROD_GRAM_ID``
        and a datetime-typed ``TransactionDatetime`` column (``.dt`` is used on it).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (the input is not modified) in which

        * missing ``PricePerUnit`` values are filled with the median price of the same
          product, then with the overall median;
        * ``Qty`` is numeric, defaults to 1 when unparsable, is limited to [1, 50] and
          stored as int;
        * ``Amount`` = ``Qty`` * ``PricePerUnit``;
        * both foreign-key columns are strings;
        * ``Hour``, ``DayOfWeek``, ``Month`` and ``Quarter`` are derived from
          ``TransactionDatetime``.
    """
    df = df.copy()

    def _id_as_str(ids):
        # A key column holding NaN is read as float, so astype(str) alone yields "12.0",
        # which never matches the "12" produced from an integer lookup column. Strip the
        # purely-zero decimal part so both sides of the later merges agree.
        return ids.astype(str).str.replace(r'^(-?\d+)\.0+$', r'\1', regex=True)

    # Fill missing prices: per-product median first, global median as fallback
    product_median = df.groupby('FK_PRODUCT_ID')['PricePerUnit'].transform('median')
    df['PricePerUnit'] = df['PricePerUnit'].fillna(product_median)
    df['PricePerUnit'] = df['PricePerUnit'].fillna(df['PricePerUnit'].median())

    # Fix quantity issues. Clipping before the int cast gives the same result as
    # cast-then-clip for finite values, but also keeps +/-inf from crashing astype(int).
    qty = pd.to_numeric(df['Qty'], errors='coerce').fillna(1)
    df['Qty'] = qty.clip(lower=1, upper=50).astype(int)

    # Calculate amount
    df['Amount'] = df['Qty'] * df['PricePerUnit']

    # Convert IDs to string for consistent merging
    df['FK_PRODUCT_ID'] = _id_as_str(df['FK_PRODUCT_ID'])
    df['FK_PROD_GRAM_ID'] = _id_as_str(df['FK_PROD_GRAM_ID'])

    # Extract time features
    timestamps = df['TransactionDatetime'].dt
    df['Hour'] = timestamps.hour
    df['DayOfWeek'] = timestamps.dayofweek
    df['Month'] = timestamps.month
    df['Quarter'] = timestamps.quarter

    return df

# Run the shared cleaning routine on both transaction tables (train first, then test)
train_trx, test_trx = (preprocess_transactions(trx) for trx in (train_trx, test_trx))

# The lookup keys must be strings as well, to match the string IDs in the transactions
product['productID'] = product['productID'].astype(str)
program['prodgramID'] = program['prodgramID'].astype(str)


def _attach_lookups(trx):
    """Left-join product attributes, then program attributes, onto a transaction frame."""
    with_product = trx.merge(product, left_on='FK_PRODUCT_ID', right_on='productID', how='left')
    return with_product.merge(program, left_on='FK_PROD_GRAM_ID', right_on='prodgramID', how='left')


# Enrich every transaction row with its product and program information
train_trx = _attach_lookups(train_trx)
test_trx = _attach_lookups(test_trx)

print(f"✅ Data preprocessed - Train: {train_trx.shape}, Test: {test_trx.shape}")


✅ Data preprocessed - Train: (130854, 21), Test: (21098, 21)


## 2 – Join product & program info to transactions

In [34]:
# ================================
# 3. COMPREHENSIVE FEATURE ENGINEERING  ✅ FIXED
# ================================

def create_advanced_features(df, prefix=""):
    """Aggregate transaction rows into one feature row per ``MemberID``.

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction-level data. Required columns: ``MemberID``, ``TransactionID``,
        ``Qty``, ``Amount``, ``PricePerUnit`` and a datetime ``TransactionDatetime``.
        ``FK_PRODUCT_ID``, ``Source``, ``Hour``, ``DayOfWeek``, ``Month``,
        ``ProductCategory`` and ``ProductLevel`` are used when present.
    prefix : str, default ""
        Text prepended to every generated feature name.

    Returns
    -------
    pandas.DataFrame
        One row per member with ``MemberID`` as a regular column. Missing and
        infinite feature values are replaced by 0.
    """
    print("🔄 Creating advanced features ...")

    # ===== Basic aggregations =====
    basic_aggs = {
        'TransactionID': 'nunique',
        'Qty'         : ['sum', 'mean', 'std', 'max', 'min'],
        'Amount'      : ['sum', 'mean', 'std', 'max', 'min'],
        'PricePerUnit': ['mean', 'std', 'max', 'min'],
        'FK_PRODUCT_ID': 'nunique',
        'Source'       : 'nunique',
        'Hour'         : ['mean', 'std'],
        'DayOfWeek'    : ['mean', 'std'],
        'Month'        : 'nunique',
        'TransactionDatetime': ['min', 'max', 'count']
    }
    # These columns feed no derived feature below, so a frame lacking one of them can
    # still be processed. This matches the `in df.columns` guards used further down,
    # which previously could never trigger for 'Source' because agg() failed first.
    optional_cols = ('FK_PRODUCT_ID', 'Source', 'Hour', 'DayOfWeek', 'Month')
    basic_aggs = {
        col: funcs for col, funcs in basic_aggs.items()
        if col not in optional_cols or col in df.columns
    }

    features = df.groupby('MemberID').agg(basic_aggs)
    # Flatten the (column, aggregation) MultiIndex into "<prefix><column>_<aggregation>"
    features.columns = [
        f"{prefix}{col[0]}_{col[1]}" if isinstance(col, tuple) else f"{prefix}{col}"
        for col in features.columns
    ]

    # ===== Time-based features =====
    # Recency is measured against the latest transaction in *this* frame
    max_date = df['TransactionDatetime'].max()
    first_seen = features[f'{prefix}TransactionDatetime_min']
    last_seen = features[f'{prefix}TransactionDatetime_max']
    features[f'{prefix}recency_days'] = (max_date - last_seen).dt.days
    features[f'{prefix}span_days']    = (last_seen - first_seen).dt.days
    # +1 keeps single-day histories from dividing by zero
    features[f'{prefix}frequency']    = features[f'{prefix}TransactionDatetime_count'] / (
                                         features[f'{prefix}span_days'] + 1)

    # ===== Behavioral features =====
    features[f'{prefix}avg_basket_size'] = features[f'{prefix}Amount_sum'] / features[f'{prefix}TransactionID_nunique']
    features[f'{prefix}price_consistency'] = features[f'{prefix}PricePerUnit_std'] / (features[f'{prefix}PricePerUnit_mean'] + 1)
    features[f'{prefix}qty_consistency']   = features[f'{prefix}Qty_std'] / (features[f'{prefix}Qty_mean'] + 1)

    # ===== Category-level features (if available) =====
    if 'ProductCategory' in df.columns:
        cat_feat = df.groupby('MemberID')['ProductCategory'].agg(['nunique']).add_prefix(f'{prefix}category_')
        features = features.join(cat_feat)

    if 'ProductLevel' in df.columns:
        lvl_feat = df.groupby('MemberID')['ProductLevel'].agg(['nunique']).add_prefix(f'{prefix}level_')
        features = features.join(lvl_feat)

    # ===== Source diversity =====
    if 'Source' in df.columns:
        def _entropy(values):
            # Shannon entropy of the value shares; the epsilon guards log(0).
            shares = values.value_counts(normalize=True)
            return -np.sum(shares * np.log(shares + 1e-10))

        source_entropy = df.groupby('MemberID')['Source'].apply(_entropy).rename(f'{prefix}source_entropy')
        features = features.join(source_entropy)

    # Drop intermediate datetime cols, then neutralise NaN (e.g. std of a single row)
    # and +/-inf (e.g. a ratio whose denominator is 0) so only finite values remain.
    datetime_cols = [c for c in features.columns if 'TransactionDatetime' in c]
    features = (
        features.drop(columns=datetime_cols)
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
    )

    print(f"✅ Created {features.shape[1]} features for {features.shape[0]} members")
    return features.reset_index()

# ======= 🔧 FIX: use the SAME (empty) prefix for train & test so both get identical column names =======
train_features, test_features = (
    create_advanced_features(trx, prefix="") for trx in (train_trx, test_trx)
)

print(f"✅ Feature shapes aligned. Train: {train_features.shape}, Test: {test_features.shape}")


🔄 Creating advanced features ...
✅ Created 31 features for 40020 members
🔄 Creating advanced features ...
✅ Created 31 features for 6381 members
✅ Feature shapes aligned. Train: (40020, 32), Test: (6381, 32)


## 3 – Feature engineering (transaction → member level)

In [35]:
# ================================
# 4. ENHANCED MEMBER DEMOGRAPHICS
# ================================

def create_member_features(member_df, reference_date=None):
    """Derive numeric demographic features for every member.

    Parameters
    ----------
    member_df : pandas.DataFrame
        Must contain ``MemberID``, ``City``, ``NoOfChild`` and the datetime columns
        ``DateOfBirth`` and ``JoinDate``.
    reference_date : datetime-like, optional
        Date against which age and seniority are measured. Defaults to 2020-07-01,
        the value this notebook has always used.

    Returns
    -------
    pandas.DataFrame
        ``MemberID`` plus ``MemberAge``, ``MemberSeniority_days``,
        ``MemberSeniority_months``, ``NoOfChild``, ``HasChildren``, ``CityFrequency``,
        ``City_encoded``, ``AgeGroup_encoded`` and ``CityTier_encoded``; remaining
        missing values are replaced by 0. The input frame is not modified.
    """
    print("🔄 Creating enhanced member features...")

    df = member_df.copy()
    if reference_date is None:
        reference_date = pd.Timestamp('2020-07-01')
    else:
        reference_date = pd.Timestamp(reference_date)

    # Age in years, limited to [0, 100]. The fill value is the median of the *clipped*
    # ages, so an implausible raw median can never be written back out of range.
    age = ((reference_date - df['DateOfBirth']).dt.days / 365.25).clip(0, 100)
    df['MemberAge'] = age.fillna(age.median())

    # Membership duration
    df['MemberSeniority_days'] = (reference_date - df['JoinDate']).dt.days
    df['MemberSeniority_months'] = df['MemberSeniority_days'] / 30.44

    # Children features
    df['NoOfChild'] = df['NoOfChild'].fillna(0).astype(int)
    df['HasChildren'] = (df['NoOfChild'] > 0).astype(int)

    # Age group categorization. include_lowest keeps age 0 (what the clip above turns
    # negative ages into) inside the first bin instead of leaving it unlabelled.
    df['AgeGroup'] = pd.cut(df['MemberAge'],
                           bins=[0, 25, 35, 45, 55, 100],
                           labels=['Young', 'YoungAdult', 'MiddleAge', 'Mature', 'Senior'],
                           include_lowest=True)

    # City encoding with frequency. The top tier is open-ended so that a city with
    # more than 10000 members is still a 'MegaCity' rather than unlabelled.
    city_counts = df['City'].value_counts()
    df['CityFrequency'] = df['City'].map(city_counts)
    df['CityTier'] = pd.cut(df['CityFrequency'],
                           bins=[0, 100, 500, 1000, np.inf],
                           labels=['SmallCity', 'MediumCity', 'LargeCity', 'MegaCity'])

    def _encode(labels):
        # Integer code = position of the label among the sorted distinct labels. These
        # are the same codes a freshly fitted label encoder assigns, without creating
        # encoder objects that are discarded straight away.
        return pd.factorize(labels, sort=True)[0]

    # Encode categorical variables
    df['City_encoded'] = _encode(df['City'].fillna('Unknown'))
    df['AgeGroup_encoded'] = _encode(df['AgeGroup'].astype(str))
    df['CityTier_encoded'] = _encode(df['CityTier'].astype(str))

    # Select final features
    feature_cols = ['MemberID', 'MemberAge', 'MemberSeniority_days', 'MemberSeniority_months',
                   'NoOfChild', 'HasChildren', 'CityFrequency', 'City_encoded',
                   'AgeGroup_encoded', 'CityTier_encoded']

    return df[feature_cols].fillna(0)

# Build the member-level demographic feature table from the raw member data.
member_features = create_member_features(member)

🔄 Creating enhanced member features...


In [36]:
# ================================
# 5. DATASET ASSEMBLY
# ================================

print("🔄 Assembling final dataset...")


def _join_member_tables(base, transaction_features):
    """Left-join transaction features, then demographics, onto ``base``; fill gaps with 0."""
    joined = base.merge(transaction_features, on='MemberID', how='left')
    joined = joined.merge(member_features, on='MemberID', how='left')
    return joined.fillna(0)


# Training rows come from the label file, test rows from the submission template
train_df = _join_member_tables(train_lb, train_features)
test_df = _join_member_tables(sample[['MemberID']], test_features)

# Separate the target from the model inputs; MemberID is an identifier, not a feature
y = train_df['next_buy']
X = train_df.drop(columns=['MemberID', 'next_buy'])
test_X = test_df.drop(columns=['MemberID'])

print("✅ Final dataset shapes:")
print(f"Training: {X.shape}, Target: {y.shape}")
print(f"Test: {test_X.shape}")
print(f"Target distribution: {y.value_counts().to_dict()}")

🔄 Assembling final dataset...
✅ Final dataset shapes:
Training: (40020, 40), Target: (40020,)
Test: (6381, 40)
Target distribution: {0: 37582, 1: 2438}


In [37]:
# ================================
# 6. ADVANCED MODEL TRAINING
# ================================

def objective(trial):
    """Optuna objective: mean 5-fold balanced accuracy of a LightGBM binary classifier.

    For each stratified fold a model is trained on the module-level ``X`` / ``y`` with
    early stopping on the validation fold, and the fold score is the best balanced
    accuracy over a grid of probability thresholds. Because the threshold is picked on
    the same fold it is scored on, the returned value is an optimistic estimate; it is
    used only to rank hyperparameter settings.

    After every fold except the last, the running mean is reported to ``trial`` so that
    the study's pruner can stop unpromising trials early.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial from which the hyperparameters are sampled.

    Returns
    -------
    float
        Mean of the per-fold best balanced-accuracy scores.

    Raises
    ------
    optuna.TrialPruned
        When the pruner decides the trial should stop.
    """
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 16, 512),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 300),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'lambda_l1': trial.suggest_float('lambda_l1', 0, 10.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0, 10.0),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'verbosity': -1,
        'random_state': 42,
        'n_jobs': -1,
        'is_unbalance': True  # Handle class imbalance
    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    last_fold = skf.get_n_splits() - 1
    thresholds = np.arange(0.1, 0.9, 0.05)  # same grid for every fold
    scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        train_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

        model = lgb.train(
            params,
            train_data,
            num_boost_round=2000,
            valid_sets=[val_data],
            callbacks=[lgb.early_stopping(100, verbose=False)]
        )

        # Use probability predictions for better threshold optimization
        y_pred_proba = model.predict(X_val)

        # Best balanced accuracy over the threshold grid for this fold
        best_score = 0
        for thresh in thresholds:
            y_pred = (y_pred_proba > thresh).astype(int)
            best_score = max(best_score, balanced_accuracy_score(y_val, y_pred))

        scores.append(best_score)

        # Without intermediate reports the pruner has nothing to compare and never
        # fires. The final fold is not checked: the trial is complete by then, so
        # pruning it would only throw away a finished score.
        if fold < last_fold:
            trial.report(float(np.mean(scores)), step=fold)
            if trial.should_prune():
                raise optuna.TrialPruned()

    return np.mean(scores)

print("🚀 Starting enhanced hyperparameter optimization...")

# Create study with better configuration
study = optuna.create_study(
    direction='maximize',
    study_name='enhanced_lgb_optimization',
    # Seeded sampler so the sequence of suggested hyperparameters is reproducible
    sampler=optuna.samplers.TPESampler(seed=42),
    # Pruning steps are CV folds (0-4). The former n_warmup_steps=30 could never be
    # reached, which disabled pruning entirely; 1 means "never prune on the first fold".
    pruner=optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=1)
)

# Run optimization
study.optimize(objective, n_trials=100, show_progress_bar=True)

print("✅ Optimization completed!")
print(f"🏆 Best score: {study.best_value:.4f}")
print(f"🎯 Best params: {study.best_params}")

[I 2025-07-01 22:25:30,362] A new study created in memory with name: enhanced_lgb_optimization


🚀 Starting enhanced hyperparameter optimization...


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-07-01 22:25:42,945] Trial 0 finished with value: 0.8660650614733528 and parameters: {'learning_rate': 0.11051272527046793, 'num_leaves': 201, 'min_data_in_leaf': 10, 'feature_fraction': 0.736992007312699, 'bagging_fraction': 0.7913662737877938, 'bagging_freq': 2, 'lambda_l1': 1.329086346108448, 'lambda_l2': 7.762039690223464, 'min_gain_to_split': 0.15429203422932836, 'max_depth': 11}. Best is trial 0 with value: 0.8660650614733528.
[I 2025-07-01 22:25:45,240] Trial 1 finished with value: 0.8369284858504678 and parameters: {'learning_rate': 0.14146911497724704, 'num_leaves': 127, 'min_data_in_leaf': 149, 'feature_fraction': 0.6339084207099448, 'bagging_fraction': 0.8268094334899073, 'bagging_freq': 2, 'lambda_l1': 1.657554723183745, 'lambda_l2': 9.674601250480897, 'min_gain_to_split': 0.9604402669005908, 'max_depth': 8}. Best is trial 0 with value: 0.8660650614733528.
[I 2025-07-01 22:25:47,410] Trial 2 finished with value: 0.871227174006321 and parameters: {'learning_rate': 0.0

In [38]:
# ================================
# 7. FINAL MODEL TRAINING WITH OPTIMAL THRESHOLD
# ================================

# Tuned hyperparameters from the study plus the fixed training settings
best_params = {
    **study.best_params,
    'objective': 'binary',
    'metric': 'binary_logloss',
    'verbosity': -1,
    'random_state': 42,
    'n_jobs': -1,
    'is_unbalance': True,
}

print("🔄 Training final ensemble with optimal threshold...")


def _scan_thresholds(y_true, proba, grid):
    """Return (threshold, balanced accuracy) of the first grid value with the top score.

    Falls back to threshold 0.5 with score 0 if no grid value scores above 0.
    """
    top_threshold, top_score = 0.5, 0
    for cutoff in grid:
        current = balanced_accuracy_score(y_true, (proba > cutoff).astype(int))
        if current > top_score:
            top_threshold, top_score = cutoff, current
    return top_threshold, top_score


# 7-fold stratified CV: out-of-fold predictions for scoring, one model per fold for the ensemble
skf = StratifiedKFold(n_splits=7, shuffle=True, random_state=2025)
oof_predictions = np.zeros(len(X))
test_predictions, models, optimal_thresholds = [], [], []

for fold_no, (train_idx, val_idx) in enumerate(skf.split(X, y), start=1):
    print(f"Training fold {fold_no}/{skf.n_splits}...")

    fold_train = lgb.Dataset(X.iloc[train_idx], label=y.iloc[train_idx])
    X_holdout, y_holdout = X.iloc[val_idx], y.iloc[val_idx]
    fold_valid = lgb.Dataset(X_holdout, label=y_holdout, reference=fold_train)

    booster = lgb.train(
        best_params,
        fold_train,
        num_boost_round=5000,
        valid_sets=[fold_valid],
        callbacks=[lgb.early_stopping(200, verbose=False)],
    )
    models.append(booster)

    # Out-of-fold probabilities for the held-out rows
    holdout_proba = booster.predict(X_holdout)
    oof_predictions[val_idx] = holdout_proba

    # Threshold that maximises balanced accuracy on this fold (grid step 0.01)
    fold_threshold, fold_score = _scan_thresholds(y_holdout, holdout_proba, np.arange(0.1, 0.9, 0.01))
    optimal_thresholds.append(fold_threshold)
    print(f"  Fold {fold_no} - Best threshold: {fold_threshold:.3f}, Score: {fold_score:.4f}")

    # This fold's contribution to the test-set ensemble
    test_predictions.append(booster.predict(test_X))

    gc.collect()

# The final cut-off is the average of the per-fold optima, evaluated on all OOF predictions
final_threshold = np.mean(optimal_thresholds)
oof_binary = (oof_predictions > final_threshold).astype(int)
final_oof_score = balanced_accuracy_score(y, oof_binary)

print("\n✅ Cross-validation completed!")
print(f"🎯 Optimal threshold: {final_threshold:.4f}")
print(f"🏆 Final OOF Balanced Accuracy: {final_oof_score:.4f}")
print(f"📊 OOF predictions distribution: {np.bincount(oof_binary)}")

🔄 Training final ensemble with optimal threshold...
Training fold 1/7...
  Fold 1 - Best threshold: 0.250, Score: 0.8871
Training fold 2/7...
  Fold 2 - Best threshold: 0.210, Score: 0.8867
Training fold 3/7...
  Fold 3 - Best threshold: 0.200, Score: 0.8854
Training fold 4/7...
  Fold 4 - Best threshold: 0.210, Score: 0.8856
Training fold 5/7...
  Fold 5 - Best threshold: 0.170, Score: 0.8722
Training fold 6/7...
  Fold 6 - Best threshold: 0.210, Score: 0.8702
Training fold 7/7...
  Fold 7 - Best threshold: 0.220, Score: 0.8894

✅ Cross-validation completed!
🎯 Optimal threshold: 0.2100
🏆 Final OOF Balanced Accuracy: 0.8821
📊 OOF predictions distribution: [33286  6734]


## 4 – Final predictions & submission

In [39]:
# ================================
# 8. FINAL PREDICTIONS AND SUBMISSION
# ================================

print("🔄 Generating final test predictions...")

# Average the per-fold probabilities, then binarise with the cross-validated threshold
final_test_predictions = np.mean(test_predictions, axis=0)
final_test_binary = (final_test_predictions > final_threshold).astype(int)

# Same rows as the submission template, with `next_buy` set to our predictions
submission = sample.assign(next_buy=final_test_binary)

predicted_counts = submission['next_buy'].value_counts().to_dict()
print(f"📊 Test predictions distribution: {predicted_counts}")

# Write the submission file into the current working directory
submission.to_csv('improved_submission.csv', index=False)
print("✅ Improved submission saved to 'improved_submission.csv'")

🔄 Generating final test predictions...
📊 Test predictions distribution: {0: 5328, 1: 1053}
✅ Improved submission saved to 'improved_submission.csv'


## 5 – Feature importance analysis

In [40]:
# ================================
# 9. FEATURE IMPORTANCE ANALYSIS
# ================================

print("\n📊 Feature importance analysis...")

# Gain-based importance of every feature, averaged over the fold models
gain_per_model = [booster.feature_importance(importance_type='gain') for booster in models]
feature_importance = (
    pd.DataFrame({'feature': X.columns, 'importance': np.mean(gain_per_model, axis=0)})
    .sort_values('importance', ascending=False)
)

print("🔝 Top 15 most important features:")
top_features = feature_importance.head(15)
for rank, (name, gain) in enumerate(zip(top_features['feature'], top_features['importance']), start=1):
    print(f"{rank:2d}. {name}: {gain:.2f}")

# Closing summary, printed line by line
closing_messages = (
    "\n🎉 Model training completed!",
    "📈 Expected significant improvement in score!",
    "💡 Key improvements made:",
    "   - Enhanced feature engineering with behavioral patterns",
    "   - Advanced hyperparameter optimization",
    "   - Optimal threshold search",
    "   - Increased cross-validation folds",
    "   - Better handling of class imbalance",
    "   - More sophisticated member demographics",
)
for message in closing_messages:
    print(message)


📊 Feature importance analysis...
🔝 Top 15 most important features:
 1. recency_days: 673854.74
 2. PricePerUnit_max: 63266.31
 3. Amount_sum: 37768.80
 4. span_days: 21083.97
 5. PricePerUnit_mean: 18145.23
 6. MemberSeniority_days: 12078.41
 7. frequency: 10965.93
 8. City_encoded: 8042.70
 9. CityFrequency: 7939.57
10. TransactionID_nunique: 7869.04
11. PricePerUnit_min: 4764.17
12. Amount_max: 3088.03
13. Amount_min: 2934.44
14. PricePerUnit_std: 2537.90
15. Hour_mean: 2486.39

🎉 Model training completed!
📈 Expected significant improvement in score!
💡 Key improvements made:
   - Enhanced feature engineering with behavioral patterns
   - Advanced hyperparameter optimization
   - Optimal threshold search
   - Increased cross-validation folds
   - Better handling of class imbalance
   - More sophisticated member demographics
