# 🏓 Table Tennis Match Prediction - Gold (Baseline)

**Version**: Gold - Clean Baseline

**Performance**: 
- Private Score: 0.3574 ✅
- Public Score: 0.3205
- Delta: +0.0369 (good generalization)

**Key Improvements over V6**:
1. ✅ No information leakage (removed `rally_serve_action/point`)
2. ✅ Cleaner features (removed `is_deuce`, `is_server`)
3. ✅ Balanced ensemble (50-50 LGBM + CatBoost)
4. ✅ No sample weighting (better generalization)
5. ✅ Added `prev_action_phase` (contextual feature without leakage)

**Note**: This version provides a solid baseline with better generalization than V6.

## 1. Configuration & Setup

In [None]:
# =========================================================
# 🔧 SECTION 1: Configuration & Setup
# =========================================================

# Global configuration shared by the cells below.
USE_GPU = True      # When True, GPU device settings are added to the LightGBM and CatBoost params; set to False on CPU-only machines
N_FOLDS = 5          # Intended number of cross-validation folds
RANDOM_SEED = 42     # Seed handed to LightGBM (random_state) and CatBoost (random_seed)

# Dependency installation is opt-in: the message below is printed regardless,
# but nothing is installed unless the pip line is uncommented.
print("[1/8] Installing Libraries...")
# Uncomment the next line if running in Colab or need to install packages
# !pip -q install lightgbm catboost pandas numpy scikit-learn

# Import libraries
import pandas as pd
import numpy as np
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, roc_auc_score
import warnings
import sys
import gc

# Install a global "ignore" filter so library warnings do not flood cell output.
# NOTE(review): this hides every warning category, including deprecation
# notices from the ML libraries.
warnings.filterwarnings('ignore')

# Record the runtime environment and key settings alongside the results.
print(f"Python Version: {sys.version.split(' ')[0]}")
print(f"LightGBM Version: {lgb.__version__}")
print(f"GPU Enabled: {USE_GPU}")
print(f"Random Seed: {RANDOM_SEED}")

## 2. Load Data

In [None]:
# =========================================================
# 📊 SECTION 2: Data Loading
# =========================================================

print("\n[2/8] Loading data...")

# Candidate data directories, tried in order, each paired with the message
# printed on success: the parent-level data/ (when running from notebooks/)
# first, then a data/ directory next to the working directory.
_data_locations = (
    ("../data", "‚úì Data loaded from ../data/ directory"),
    ("data", "‚úì Data loaded from data/ directory"),
)

for _data_dir, _loaded_msg in _data_locations:
    try:
        train_df = pd.read_csv(_data_dir + "/train.csv")
        test_df = pd.read_csv(_data_dir + "/test.csv")
        submission_df = pd.read_csv(_data_dir + "/sample_submission.csv")
    except FileNotFoundError:
        # At least one of the three files is missing here; try the next place.
        continue
    print(_loaded_msg)
    break
else:
    # No location provided all three files: report and stop the run.
    print("‚ùå Error: Data files not found.")
    print("Please ensure train.csv, test.csv, and sample_submission.csv are in data/ directory.")
    sys.exit(1)

for _label, _frame in (("Train", train_df), ("Test", test_df), ("Submission", submission_df)):
    print(f"  {_label} shape: {_frame.shape}")

## 3. Feature Engineering (Gold Baseline)

In [None]:
# =========================================================
# 🔨 SECTION 3: Feature Engineering (Gold Version)
# =========================================================

# Progress marker for the feature-engineering stage.
print("\n[3/8] Feature Engineering (Gold Version)...")

def get_rally_phase(n):
    """Map a stroke number to a coarse rally-phase code.

    Returns 0 for the serve (n == 1), 1 for the return (n == 2), 2 for any
    other value up to and including 4 (early rally), and 3 for everything
    else (extended rally). Values that fail every comparison, such as NaN,
    therefore land in phase 3.
    """
    if n == 1:
        return 0
    if n == 2:
        return 1
    return 2 if n <= 4 else 3

def create_features_gold(df):
    """Build the Gold feature set for every stroke row of *df*.

    Works on a copy of the input and appends, in this order: the rally phase,
    a coarse action type, the score difference (only when both score columns
    exist), lag-1 and lag-2 features computed inside each ``rally_uid``, and
    three string "tactical" combinations. Gaps in the lag/combination columns
    are then replaced by sentinels ('None' for text columns, -999 otherwise).

    Compared with V6 (rationale as stated by the notebook author):
    - rally_serve_action/point is not built (to avoid information leakage)
    - is_deuce is not built (too specific)
    - is_server is not built (overfits to the training distribution)
    - prev_action_phase is added (better generalization)

    NOTE(review): the lags follow row order within each rally, so rows are
    presumably already sorted by stroke -- confirm against the data files.
    """
    feats = df.copy()

    # --- 1. Basic features -------------------------------------------------
    feats['rally_phase'] = feats['strickNumber'].apply(get_rally_phase)

    # actionId ranges that share one coarse label; unknown ids become 'Zero'.
    ids_by_label = {
        'Attack': range(1, 8),
        'Control': range(8, 12),
        'Defensive': range(12, 15),
        'Serve': range(15, 19),
        'Zero': (0, -1),
    }
    action_lookup = {
        action_id: label
        for label, action_ids in ids_by_label.items()
        for action_id in action_ids
    }
    feats['action_type'] = feats['actionId'].map(action_lookup).fillna('Zero')

    if all(score_col in feats.columns for score_col in ('scoreSelf', 'scoreOther')):
        feats['score_diff'] = feats['scoreSelf'] - feats['scoreOther']

    # --- 2 & 3. Lag features (previous and second-previous stroke) ----------
    lag_specs = (
        ('prev_', 1, ['strickId', 'handId', 'strengthId', 'spinId', 'pointId',
                      'actionId', 'positionId', 'action_type']),
        ('prev2_', 2, ['actionId', 'pointId', 'action_type']),
    )
    for prefix, periods, source_cols in lag_specs:
        for source in source_cols:
            feats[prefix + source] = feats.groupby('rally_uid')[source].shift(periods)

    # --- 4. Tactical combinations (string concatenations) -------------------
    prev_action_text = feats['prev_actionId'].astype(str)
    feats['prev_hand_spin'] = (
        feats['prev_handId'].astype(str) + '_' + feats['prev_spinId'].astype(str)
    )
    feats['prev_action_point'] = prev_action_text + '_' + feats['prev_pointId'].astype(str)
    # 🌟 Gold's key feature: previous action combined with the current rally phase
    feats['prev_action_phase'] = prev_action_text + '_ph' + feats['rally_phase'].astype(str)

    # --- 5. Missing values --------------------------------------------------
    # Strings produced by stringifying missing values are turned back into
    # NaN for every column whose name contains 'prev'; only columns whose name
    # starts with 'prev' are then filled. Combination strings that are not in
    # this list (e.g. 'nan_ph0') are left as they are.
    stringified_missing = ['nan_nan', 'nan', '<NA>', '<NA>_<NA>']
    for name in feats.columns:
        if 'prev' not in name:
            continue
        cleaned = feats[name].replace(stringified_missing, np.nan)
        if name.startswith('prev'):
            sentinel = 'None' if cleaned.dtype == 'object' else -999
            cleaned = cleaned.fillna(sentinel)
        feats[name] = cleaned

    return feats

# Run the same feature builder on train and test so both frames get the
# engineered columns from identical logic.
train_feats_df = create_features_gold(train_df)
test_feats_df = create_features_gold(test_df)

# NOTE: the count printed below is the total number of DataFrame columns
# (raw input columns included), not only the engineered ones.
print(f"‚úì Created {len(train_feats_df.columns)} features")
print(f"  Train features shape: {train_feats_df.shape}")
print(f"  Test features shape: {test_feats_df.shape}")

## 4. Prepare Training Data

In [None]:
# =========================================================
# 🎯 SECTION 4: Data Preparation
# =========================================================

print("\n[4/8] Preparing Datasets...")

# Targets: the actionId / pointId of the NEXT row within the same rally
# (shift(-1) per rally_uid), so the last row of every rally gets NaN.
# The rally outcome target is a plain copy of serverGetPoint.
train_feats_df['next_actionId'] = train_feats_df.groupby('rally_uid')['actionId'].shift(-1)
train_feats_df['next_pointId'] = train_feats_df.groupby('rally_uid')['pointId'].shift(-1)
train_feats_df['rally_outcome'] = train_feats_df['serverGetPoint']

# Keep only rows that have a next stroke. This is a snapshot taken BEFORE the
# label encoding below, so its feature columns are un-encoded; further down
# only its index, rally_uid and target columns are used.
train_next_df = train_feats_df.dropna(subset=['next_actionId', 'next_pointId']).copy()

# Identifier and target columns that must not be used as model inputs.
drop_cols = [
    'rally_uid', 'serverGetPoint', 'gamePlayerId', 'gamePlayerOtherId',
    'match_id', 'next_actionId', 'next_pointId', 'rally_outcome', 'match', 'rally_id'
]
# Every remaining training column is a feature.
# NOTE(review): test_feats_df is assumed to contain all of these columns;
# nothing here checks it.
features = [col for col in train_feats_df.columns if col not in drop_cols]

# A feature is treated as categorical when it is text-typed or when its name
# contains 'Id' or 'phase'.
categorical_features = []
for col in features:
    if train_feats_df[col].dtype == 'object' or 'Id' in col or 'phase' in col:
        categorical_features.append(col)

# Integer-encode the categorical features. Values are cast to str first and
# the encoder is fitted on train and test together, so a value that occurs
# only in test still has a code.
print(f"  Encoding {len(categorical_features)} categorical features...")
for col in categorical_features:
    le = LabelEncoder()
    train_feats_df[col] = train_feats_df[col].astype(str)
    test_feats_df[col] = test_feats_df[col].astype(str)
    le.fit(pd.concat([train_feats_df[col], test_feats_df[col]]))
    train_feats_df[col] = le.transform(train_feats_df[col])
    test_feats_df[col] = le.transform(test_feats_df[col])

# Next-stroke task: encoded features are pulled from train_feats_df using the
# row labels of the pre-encoding snapshot; groups keep a rally in one fold.
X_next = train_feats_df.loc[train_next_df.index, features]
groups_next = train_next_df['rally_uid']

# Targets are re-coded to consecutive class indices; the encoders are kept so
# predictions can be mapped back to the original ids later.
le_action = LabelEncoder()
y_action = le_action.fit_transform(train_next_df['next_actionId'].astype(int))

le_point = LabelEncoder()
y_point = le_point.fit_transform(train_next_df['next_pointId'].astype(int))

# Outcome task: uses every training row, including the last row of each rally.
X_outcome = train_feats_df[features]
y_outcome = train_feats_df['rally_outcome']
groups_outcome = train_feats_df['rally_uid']

# Test inputs: only the last row of each test rally is predicted from.
test_final_rows = test_feats_df.groupby('rally_uid').tail(1)
X_test = test_final_rows[features]
test_rally_uids = test_final_rows['rally_uid']

print(f"‚úì X_next shape: {X_next.shape}")
print(f"‚úì X_outcome shape: {X_outcome.shape}")
print(f"‚úì X_test shape: {X_test.shape}")
print(f"‚úì Number of features: {len(features)}")

## 5. Training Functions

In [None]:
# =========================================================
# 🤖 SECTION 5: Model Training Functions
# =========================================================

def train_lgb(X, y, groups, X_test, params, cat_feats, n_splits=5):
    """Train LightGBM with GroupKFold cross-validation.

    Args:
        X: Feature DataFrame (rows are selected with ``.iloc``).
        y: Labels aligned row-by-row (by position) with ``X``; an array or a
            pandas Series.
        groups: Group label per row, given to ``GroupKFold.split`` so rows of
            one group never straddle train and validation.
        X_test: Features scored by every fold model.
        params: LightGBM parameters. ``params['objective']`` is required;
            ``params['num_class']`` sets the prediction width for
            ``'multiclass'``.
        cat_feats: Passed as ``categorical_feature`` to both datasets.
        n_splits: Number of folds.

    Returns:
        ``(oof_preds, test_preds)``: out-of-fold predictions for every row of
        ``X`` (2-D for multiclass, 1-D otherwise) and the per-fold test
        predictions averaged over folds.
    """
    # Index labels by POSITION from here on. A pandas Series would be indexed
    # by label in `y[train_idx]`, which only coincides with row positions
    # when its index is a default RangeIndex.
    y = np.asarray(y)

    gkf = GroupKFold(n_splits=n_splits)
    num_class = params.get('num_class', 1)
    is_multiclass = params['objective'] == 'multiclass'

    oof_preds = np.zeros((len(X), num_class)) if is_multiclass else np.zeros(len(X))
    test_preds_list = []

    for train_idx, val_idx in gkf.split(X, y, groups):
        X_tr, y_tr = X.iloc[train_idx], y[train_idx]
        X_val, y_val = X.iloc[val_idx], y[val_idx]

        # If a class occurs only in the validation fold, copy one validation
        # row of that class into the training data so the fold model has seen
        # it. NOTE: those rows are then in both train and validation.
        if is_multiclass:
            missing = set(np.unique(y_val)) - set(np.unique(y_tr))
            if missing:
                add_idx = [val_idx[np.where(y_val == label)[0][0]] for label in missing]
                X_tr = pd.concat([X_tr, X.iloc[add_idx]])
                y_tr = np.concatenate([y_tr, y[add_idx]])

        dtrain = lgb.Dataset(X_tr, label=y_tr, categorical_feature=cat_feats)
        dval = lgb.Dataset(X_val, label=y_val, categorical_feature=cat_feats, reference=dtrain)

        # Early stopping (patience 50) is monitored on the validation fold;
        # per-iteration logging is disabled.
        model = lgb.train(
            params, dtrain, valid_sets=[dval],
            callbacks=[lgb.early_stopping(50, verbose=False), lgb.log_evaluation(0)]
        )

        oof_preds[val_idx] = model.predict(X_val)
        test_preds_list.append(model.predict(X_test))

    return oof_preds, np.mean(test_preds_list, axis=0)


def train_cat(X, y, groups, X_test, params, cat_indices, n_splits=5):
    """Train CatBoost with GroupKFold cross-validation.

    Args:
        X: Feature DataFrame (rows are selected with ``.iloc``).
        y: Labels aligned row-by-row (by position) with ``X``; an array or a
            pandas Series.
        groups: Group label per row, given to ``GroupKFold.split``.
        X_test: Features scored by every fold model.
        params: Keyword arguments for ``CatBoostClassifier``. The task is
            treated as multiclass when ``params['loss_function']`` contains
            ``'MultiClass'``.
        cat_indices: Column positions passed as ``cat_features`` to the pools.
        n_splits: Number of folds.

    Returns:
        ``(oof_preds, test_preds)``: out-of-fold probabilities for every row
        of ``X`` (2-D for multiclass, positive-class probability otherwise)
        and the per-fold test probabilities averaged over folds.
    """
    # Index labels by POSITION from here on. A pandas Series would be indexed
    # by label in `y[train_idx]`, which only coincides with row positions
    # when its index is a default RangeIndex.
    y = np.asarray(y)

    gkf = GroupKFold(n_splits=n_splits)
    is_multiclass = 'MultiClass' in params.get('loss_function', '')
    # Width of the OOF buffer: assumes class labels are the integers
    # 0..max(y), as a LabelEncoder produces.
    num_class = int(np.max(y) + 1) if is_multiclass else 1

    oof_preds = np.zeros((len(X), num_class)) if is_multiclass else np.zeros(len(X))
    test_preds_list = []

    for train_idx, val_idx in gkf.split(X, y, groups):
        X_tr, y_tr = X.iloc[train_idx], y[train_idx]
        X_val, y_val = X.iloc[val_idx], y[val_idx]

        # If a class occurs only in the validation fold, copy one validation
        # row of that class into the training data so the fold model has seen
        # every class. NOTE: those rows are then in both train and validation.
        # NOTE(review): the OOF buffer has `num_class` columns, so each fold
        # model presumably needs to output all classes -- this patch is what
        # keeps a training fold from lacking one.
        if is_multiclass:
            missing = set(np.unique(y_val)) - set(np.unique(y_tr))
            if missing:
                add_idx = [val_idx[np.where(y_val == label)[0][0]] for label in missing]
                X_tr = pd.concat([X_tr, X.iloc[add_idx]])
                y_tr = np.concatenate([y_tr, y[add_idx]])

        train_pool = Pool(X_tr, y_tr, cat_features=cat_indices)
        val_pool = Pool(X_val, y_val, cat_features=cat_indices)

        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50, verbose=0)

        if is_multiclass:
            oof_preds[val_idx] = model.predict_proba(val_pool)
            test_preds_list.append(model.predict_proba(X_test))
        else:
            # Binary task: keep only the probability of the positive class.
            oof_preds[val_idx] = model.predict_proba(val_pool)[:, 1]
            test_preds_list.append(model.predict_proba(X_test)[:, 1])

    return oof_preds, np.mean(test_preds_list, axis=0)

print("‚úì Training functions defined")

## 6. Train Models (50-50 LGBM + CatBoost)

In [None]:
# =========================================================
# 🚀 SECTION 6: Dual-Engine Training (50-50 Blending)
# =========================================================

print("\n[5/8] Starting Dual-Engine Training...")
print("  üåü Using 50-50 blending for better generalization")

# Parameters shared by all LightGBM models.
# NOTE(review): LightGBM applies `subsample` (bagging_fraction) only when
# `subsample_freq` (bagging_freq) is > 0. No frequency is set here, so row
# subsampling appears to be inactive. Enabling it would change the scores, so
# confirm before adding it.
lgb_common = {
    'boosting_type': 'gbdt',
    'n_estimators': 3000,
    'learning_rate': 0.03,
    'num_leaves': 31,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': RANDOM_SEED,
    'n_jobs': -1,
    'verbose': -1
}

# Parameters shared by all CatBoost models.
cat_common = {
    'iterations': 2000,
    'learning_rate': 0.05,
    'depth': 7,
    'random_seed': RANDOM_SEED,
    'verbose': 0
}

if USE_GPU:
    lgb_common['device'] = 'gpu'
    cat_common.update({'task_type': 'GPU', 'devices': '0'})

# CatBoost takes categorical features as column positions. X_next and
# X_outcome are built from the same feature list, so the positions apply to
# both.
cat_indices = [X_next.columns.get_loc(col) for col in categorical_features]

# Every model below uses the fold count from the global configuration; it was
# previously left to the training functions' default.

# === Train Action ID Model ===
print("\n>> Training Action ID Predictor...")
lgb_p = {**lgb_common, 'objective': 'multiclass',
         'num_class': len(le_action.classes_), 'metric': 'multi_logloss'}
cat_p = {**cat_common, 'loss_function': 'MultiClass', 'eval_metric': 'MultiClass'}

oof_lgb1, pred_lgb1 = train_lgb(X_next, y_action, groups_next, X_test, lgb_p,
                                categorical_features, n_splits=N_FOLDS)
oof_cat1, pred_cat1 = train_cat(X_next, y_action, groups_next, X_test, cat_p,
                                cat_indices, n_splits=N_FOLDS)

# 50-50 blending
oof_blended_action = 0.5 * oof_lgb1 + 0.5 * oof_cat1
final_proba_action = 0.5 * pred_lgb1 + 0.5 * pred_cat1

action_f1 = f1_score(y_action, np.argmax(oof_blended_action, axis=1), average='macro')
print(f"   ‚úì Action Blended OOF F1: {action_f1:.4f}")

# === Train Point ID Model ===
print("\n>> Training Point ID Predictor...")
# Same settings as the action model, with the class count of the point target.
# A new dict is built so the action-model parameters are not edited in place.
lgb_p = {**lgb_p, 'num_class': len(le_point.classes_)}

oof_lgb2, pred_lgb2 = train_lgb(X_next, y_point, groups_next, X_test, lgb_p,
                                categorical_features, n_splits=N_FOLDS)
oof_cat2, pred_cat2 = train_cat(X_next, y_point, groups_next, X_test, cat_p,
                                cat_indices, n_splits=N_FOLDS)

# 50-50 blending
oof_blended_point = 0.5 * oof_lgb2 + 0.5 * oof_cat2
final_proba_point = 0.5 * pred_lgb2 + 0.5 * pred_cat2

point_f1 = f1_score(y_point, np.argmax(oof_blended_point, axis=1), average='macro')
print(f"   ‚úì Point Blended OOF F1: {point_f1:.4f}")

# === Train Outcome Model ===
print("\n>> Training Rally Outcome Predictor...")
lgb_p_bin = {**lgb_common, 'objective': 'binary', 'metric': 'auc'}
cat_p_bin = {**cat_common, 'loss_function': 'Logloss', 'eval_metric': 'AUC'}

oof_lgb3, pred_lgb3 = train_lgb(X_outcome, y_outcome, groups_outcome, X_test, lgb_p_bin,
                                categorical_features, n_splits=N_FOLDS)
oof_cat3, pred_cat3 = train_cat(X_outcome, y_outcome, groups_outcome, X_test, cat_p_bin,
                                cat_indices, n_splits=N_FOLDS)

# 50-50 blending
oof_blended_outcome = 0.5 * oof_lgb3 + 0.5 * oof_cat3
final_proba_outcome = 0.5 * pred_lgb3 + 0.5 * pred_cat3

outcome_auc = roc_auc_score(y_outcome, oof_blended_outcome)
print(f"   ‚úì Outcome Blended OOF AUC: {outcome_auc:.4f}")

print("\n‚úì Training completed!")

## 7. Synchronize Predictions

In [None]:
# =========================================================
# 🔄 SECTION 7: Prediction Synchronization
# =========================================================

# Progress marker for the prediction-synchronization stage.
print("\n[6/8] Synchronizing Predictions...")

def synchronize_endings(prob_act, prob_pt, le_act, le_pt, threshold=0.5):
    """Decode action/point predictions with a shared rally-end decision.

    For every row, the end-of-rally probability is the mean of the two
    models' probabilities for label -1. Rows at or above ``threshold`` are
    decoded as -1 for BOTH targets; for all other rows -1 is excluded from
    both targets and the most probable remaining class is returned.

    Args:
        prob_act: 2-D array of action class probabilities (rows x classes).
        prob_pt: 2-D array of point class probabilities, same number of rows.
        le_act: Fitted encoder for actions (uses ``classes_`` and
            ``inverse_transform``).
        le_pt: Fitted encoder for points.
        threshold: Minimum mean end probability for a row to be decoded as
            ended.

    Returns:
        ``(actions, points)``: decoded labels for each row. If either encoder
        has no -1 class, a warning is printed and the plain arg-max decoding
        of each input is returned instead.

    Raises:
        ValueError: For incompatible inputs (e.g. different row counts); only
            a missing -1 label triggers the fallback.
    """
    # Only the label lookup may fail softly; any other ValueError is a real
    # input problem and must not be reported as "-1 label not found".
    try:
        act_neg1_idx = list(le_act.classes_).index(-1)
        pt_neg1_idx = list(le_pt.classes_).index(-1)
    except ValueError:
        print("  ‚ö† Warning: -1 label not found, skipping synchronization")
        return (le_act.inverse_transform(np.argmax(prob_act, axis=1)),
                le_pt.inverse_transform(np.argmax(prob_pt, axis=1)))

    # Mean of the two models' end-of-rally probabilities.
    p_end = (prob_act[:, act_neg1_idx] + prob_pt[:, pt_neg1_idx]) / 2
    ends = p_end >= threshold
    continues = p_end < threshold

    # Work on copies so the caller's probability arrays stay untouched.
    prob_act_mod, prob_pt_mod = prob_act.copy(), prob_pt.copy()

    # Force -1 to win where the rally ends (2.0 exceeds any probability).
    prob_act_mod[ends, act_neg1_idx] = 2.0
    prob_pt_mod[ends, pt_neg1_idx] = 2.0

    # Exclude -1 where it continues. A negative score, unlike 0.0, cannot tie
    # with other classes whose probability is exactly 0.
    prob_act_mod[continues, act_neg1_idx] = -1.0
    prob_pt_mod[continues, pt_neg1_idx] = -1.0

    synced_count = ends.sum()
    print(f"  ‚úì Synchronized {synced_count} rows to END state")

    return (le_act.inverse_transform(np.argmax(prob_act_mod, axis=1)),
            le_pt.inverse_transform(np.argmax(prob_pt_mod, axis=1)))

# Decode the blended test probabilities into final action / point labels,
# with the rally-end (-1) decision shared between the two targets.
final_action, final_point = synchronize_endings(
    final_proba_action, final_proba_point, le_action, le_point
)

## 8. Generate Submission

In [None]:
# =========================================================
# 📤 SECTION 8: Generate Submission
# =========================================================

print("\n[7/8] Generating Submission...")

# One prediction row per test rally.
submission = pd.DataFrame({
    'rally_uid': test_rally_uids,
    'serverGetPoint': final_proba_outcome,
    'pointId': final_point,
    'actionId': final_action
})

# Left-join onto the template ids so every rally_uid of the template appears
# exactly as listed there, predicted or not.
final_submission = submission_df[['rally_uid']].merge(submission, on='rally_uid', how='left')

# Rallies without a prediction get a neutral outcome probability ...
final_submission['serverGetPoint'] = final_submission['serverGetPoint'].fillna(0.5)

# ... and the most frequent action / point id seen in the training data.
valid_action_mode, valid_point_mode = (
    train_df[source_col].mode()[0] for source_col in ('actionId', 'pointId')
)
for target_col, fallback in (('actionId', valid_action_mode), ('pointId', valid_point_mode)):
    final_submission[target_col] = final_submission[target_col].fillna(fallback).astype(int)

print(f"‚úì Submission created: {final_submission.shape}")
print(f"‚úì First 5 rows:")
print(final_submission.head())

In [None]:
# Save to file
import os

output_file = '../submissions/submission_gold.csv'

# Create the target directory first: to_csv does not create missing parent
# directories, and failing here would throw away the whole training run.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
final_submission.to_csv(output_file, index=False)

print(f"  ‚úì Saved to '{output_file}'")
print(f"\n  Summary:")
print(f"    Total predictions: {len(final_submission)}")
print(f"    Action -1 (end): {(final_submission['actionId'] == -1).sum()}")
print(f"    Point -1 (end): {(final_submission['pointId'] == -1).sum()}")
print(f"    Mean serverGetPoint: {final_submission['serverGetPoint'].mean():.4f}")

# Closing banner with the out-of-fold scores computed during training.
print("\n" + "="*60)
print("‚úÖ DONE! Gold version submission generated.")
print("="*60)
print(f"\nüìä OOF Performance Summary:")
print(f"  Action F1:  {action_f1:.4f}")
print(f"  Point F1:   {point_f1:.4f}")
print(f"  Outcome AUC: {outcome_auc:.4f}")
print(f"\nüí° Expected: Better generalization than V6 version")
print(f"   (Higher Private score, lower Public score)")