In [1]:
import pandas as pd
def reshape_to_wide(df):
    """Pivot long per-snapshot rows into one wide row per engine and cycle.

    The input columns are treated as three groups:

    * key columns ``ESN`` and ``Cycles_Since_New``;
    * six ``Cumulative_*`` / ``Cycles_to_*`` columns that are carried over
      as-is (one value per key);
    * every other column except ``Snapshot``, which is spread out into one
      column per snapshot value.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame holding the key columns, ``Snapshot``, the six
        carried-over columns and the per-snapshot measurement columns.

    Returns
    -------
    pandas.DataFrame
        One row per (ESN, Cycles_Since_New). Measurement columns are named
        ``<column>_SS<snapshot>`` and are followed by the carried-over
        columns; the keys are regular columns rather than the index.
    """
    keys = ['ESN', 'Cycles_Since_New']
    carried = [
        'Cumulative_WWs', 'Cumulative_HPC_SVs', 'Cumulative_HPT_SVs',
        'Cycles_to_WW', 'Cycles_to_HPC_SV', 'Cycles_to_HPT_SV',
    ]
    not_measured = set(keys) | set(carried) | {'Snapshot'}
    measured = [name for name in df.columns if name not in not_measured]

    # pivot_table uses its default aggregation (mean) if a key/snapshot pair
    # occurs more than once.
    pivoted = df.pivot_table(index=keys, columns='Snapshot', values=measured)

    # The pivot yields (column, snapshot) pairs; flatten them to plain names.
    pivoted.columns = [
        f"{name}_SS{snapshot}" for name, snapshot in pivoted.columns
    ]

    # Take the first non-null value of each carried-over column per key.
    carried_values = df.groupby(keys)[carried].first()

    return pivoted.join(carried_values).reset_index()

In [10]:
# Load the raw training table; the path is relative to the notebook's working directory.
df = pd.read_csv("training_data.csv")
# Pivot to one row per (ESN, Cycles_Since_New) with one column per sensor/snapshot pair.
df_wide = reshape_to_wide(df)
# Bare last expression so the notebook renders the summary-statistics table.
df_wide.describe()

Unnamed: 0,ESN,Cycles_Since_New,Sensed_Altitude_SS1,Sensed_Altitude_SS2,Sensed_Altitude_SS3,Sensed_Altitude_SS4,Sensed_Altitude_SS5,Sensed_Altitude_SS6,Sensed_Altitude_SS7,Sensed_Altitude_SS8,...,Sensed_WFuel_SS5,Sensed_WFuel_SS6,Sensed_WFuel_SS7,Sensed_WFuel_SS8,Cumulative_WWs,Cumulative_HPC_SVs,Cumulative_HPT_SVs,Cycles_to_WW,Cycles_to_HPC_SV,Cycles_to_HPT_SV
count,8004.0,8004.0,8004.0,8004.0,8004.0,8004.0,3674.0,8004.0,8004.0,8004.0,...,3633.0,7914.0,7914.0,7914.0,8004.0,8004.0,8004.0,8004.0,8004.0,8004.0
mean,102.5,10000.0,722.538926,777.957217,7896.633379,20443.529056,29765.319746,26947.427357,26969.804543,26962.724833,...,0.484201,0.52306,0.498605,0.483981,9.561969,0.739755,2.235007,491.927786,3812.687406,1682.491254
std,1.118104,5776.7496,884.366464,882.809931,9362.584213,405.038595,773.699297,3714.554671,3704.166238,3708.350008,...,0.213515,0.260859,0.059421,0.06538,5.823621,0.731378,1.739967,291.861312,2457.22497,1073.631703
min,101.0,0.0,-489.554524,-435.554524,-53.678484,18999.445476,28999.445476,19954.321516,19946.280264,19941.321516,...,0.0,0.0,0.226675,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,101.75,5000.0,343.321516,401.321516,2719.280264,20219.554225,29002.280264,23004.321516,23002.321516,23004.554225,...,0.467751,0.472183,0.461201,0.446566,4.0,0.0,1.0,240.0,1660.0,770.0
50%,102.5,10000.0,611.554225,670.554225,3265.321516,20483.445476,30022.383496,28005.280264,28000.321516,28004.321516,...,0.485161,0.511504,0.495354,0.485301,10.0,1.0,2.0,490.0,3585.0,1595.0
75%,103.25,15000.0,887.445476,946.445476,5565.554225,20738.280264,30497.098754,29016.379693,29012.321516,29014.554225,...,0.504038,0.561556,0.534901,0.524295,15.0,1.0,4.0,740.0,5790.0,2480.0
max,104.0,20000.0,7922.445476,7962.445476,35001.554225,21009.554225,31002.554225,35018.554225,35010.554225,35019.554225,...,11.905331,19.42042,0.713363,0.692309,20.0,2.0,6.0,1160.0,9530.0,4790.0


In [7]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GroupShuffleSplit

# --- 1. Scoring Function (From Challenge Spec) ---
def time_weighted_error(y_true, y_pred, alpha=0.02, beta=1):
    """Per-sample asymmetric squared error, discounted by the true value.

    Each squared residual ``(y_pred - y_true) ** 2`` is multiplied by
    ``k / (1 + alpha * y_true)`` and by ``beta``, where ``k`` is 2 when the
    prediction is at or above the truth and 1 when it is below.

    Parameters
    ----------
    y_true, y_pred : array-like supporting element-wise arithmetic
        True and predicted values, aligned element by element.
    alpha : float, default 0.02
        Strength of the discount applied as ``y_true`` grows.
    beta : float, default 1
        Overall scale factor applied to every element.

    Returns
    -------
    The element-wise weighted squared errors (not reduced to a scalar).
    """
    residual = y_pred - y_true
    discount = 1 + alpha * y_true
    # Over-predictions count double. Exact hits take this branch too, but
    # their squared error is zero, so the choice does not affect the result.
    penalty = np.where(residual >= 0, 2 / discount, 1 / discount)
    return penalty * (residual ** 2) * beta

def evaluate_submission(df_true, df_pred):
    """Score predictions for the three targets and average the results.

    For each target column the per-row ``time_weighted_error`` is computed
    with ``alpha=0.01`` and ``beta = weight / max(true values)``, then
    averaged over rows. The weight is 1 for ``Cycles_to_WW`` and 2 for
    ``Cycles_to_HPC_SV`` and ``Cycles_to_HPT_SV``.

    Parameters
    ----------
    df_true, df_pred : pandas.DataFrame
        Frames containing the three target columns. Rows are compared by
        position (``.values``), not by index label.

    Returns
    -------
    tuple
        ``(overall, per_target)`` where ``per_target`` is the list of the
        three mean scores in WW, HPC, HPT order and ``overall`` is their
        unweighted mean.
    """
    # (target column, numerator of beta) in reporting order.
    target_weights = (
        ('Cycles_to_WW', 1),
        ('Cycles_to_HPC_SV', 2),
        ('Cycles_to_HPT_SV', 2),
    )

    scores = []
    for column, importance in target_weights:
        actual = df_true[column]
        per_row = time_weighted_error(
            actual.values,
            df_pred[column].values,
            alpha=0.01,
            # Normalise by the largest true value seen for this target.
            beta=importance / float(actual.max()),
        )
        scores.append(np.mean(per_row))

    return np.mean(scores), scores

# --- 2. Data Preparation ---
# Relies on 'df_wide', the reshaped dataframe produced by an earlier cell.
# Model inputs are every column whose name contains one of these markers.
feature_markers = ('Sensed_', 'Cumulative_')
features = [c for c in df_wide.columns if any(marker in c for marker in feature_markers)]
targets = ['Cycles_to_WW', 'Cycles_to_HPC_SV', 'Cycles_to_HPT_SV']
groups = df_wide['ESN']

# Split by engine (ESN): 80% of engines train, the rest validate, so all rows
# of a given engine land on the same side of the split.
gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)
train_idx, val_idx = next(gss.split(df_wide, groups=groups))

train_rows = df_wide.iloc[train_idx]
val_rows = df_wide.iloc[val_idx]
X_train, y_train = train_rows[features], train_rows[targets]
X_val, y_val = val_rows[features], val_rows[targets]

# --- 3. Training (Multi-Output Strategy) ---
# One independent XGBoost regressor per target instead of a single
# multi-output model; all three share the same hyper-parameters.
xgb_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    objective='reg:squarederror',
    n_jobs=-1,
    random_state=42,
)
models = {}
preds_val = pd.DataFrame(index=X_val.index)

print(f"Training Baseline on {len(features)} features...")

for target in targets:
    print(f"  -> Fitting {target}...")
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(X_train, y_train[target])
    models[target] = model
    preds_val[target] = model.predict(X_val)

# --- 4. Evaluation ---
final_score, component_scores = evaluate_submission(y_val, preds_val)
divider = "-" * 30

print(divider)
print(f"BASELINE SCORE: {final_score:.6f}")
print(f"Breakdown -> WW: {component_scores[0]:.6f}, HPC: {component_scores[1]:.6f}, HPT: {component_scores[2]:.6f}")
print(divider)

Training Baseline on 131 features...
  -> Fitting Cycles_to_WW...
  -> Fitting Cycles_to_HPC_SV...
  -> Fitting Cycles_to_HPT_SV...
------------------------------
BASELINE SCORE: 23.400051
Breakdown -> WW: 30.538735, HPC: 14.067881, HPT: 25.593536
------------------------------


In [9]:
import pandas as pd
import numpy as np
import time
from tqdm import tqdm

# Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.impute import SimpleImputer # <--- Added this

# Note: Ensure 'time_weighted_error' is defined from previous cells

class LazyCompetitionEvaluator:
    """Fit a fixed roster of regressors on one target at a time and rank them.

    On construction the feature frames are mean-imputed (imputer fitted on
    the training features only) and a dictionary of ready-to-fit models is
    created. ``evaluate`` then fits every model on a single target column
    and reports the competition score and RMSE on the validation rows.

    The same model instances are reused by every ``evaluate`` call, so each
    call refits them and overwrites the previous fit.
    """

    # Column order of the frame returned by evaluate(); also used to build a
    # well-formed empty frame when no model produced a result.
    RESULT_COLUMNS = ["Model", "Target", "Competition_Score", "RMSE", "Time_Sec"]

    def __init__(self, X_train, y_train, X_val, y_val):
        """Impute the feature frames and set up the candidate models.

        Parameters
        ----------
        X_train, X_val : pandas.DataFrame
            Training / validation features. Missing values are replaced by
            the per-column training mean; columns and index are preserved.
        y_train, y_val : pandas.DataFrame
            Training / validation targets, indexed by target column name in
            ``evaluate``.
        """
        # 1. Handle NaNs globally for consistency
        # We fit the imputer ONLY on training data to avoid data leakage
        print("Preprocessing: Imputing missing values...")
        self.imputer = SimpleImputer(strategy='mean')

        # We need to keep pandas index/columns for clarity, so we wrap the output
        self.X_train = pd.DataFrame(
            self.imputer.fit_transform(X_train),
            columns=X_train.columns,
            index=X_train.index
        )
        self.X_val = pd.DataFrame(
            self.imputer.transform(X_val),
            columns=X_val.columns,
            index=X_val.index
        )

        self.y_train = y_train
        self.y_val = y_val

        # 2. Define Models
        self.models = {
            "LinearRegression": LinearRegression(),
            "Ridge": Ridge(alpha=1.0),
            "Lasso": Lasso(alpha=0.1),
            # ElasticNet usually struggles with high dimensional raw data, but let's keep it
            "ElasticNet": ElasticNet(alpha=0.1),
            "DecisionTree": DecisionTreeRegressor(max_depth=5),
            "RandomForest": RandomForestRegressor(n_estimators=50, max_depth=10, n_jobs=-1, random_state=42),
            "XGBoost": xgb.XGBRegressor(n_estimators=100, max_depth=6, n_jobs=-1, random_state=42),
            "LightGBM": lgb.LGBMRegressor(n_estimators=100, max_depth=6, n_jobs=-1, random_state=42, verbose=-1),
        }

    def evaluate(self, target_col):
        """Fit every model on ``target_col`` and rank them on validation data.

        A model that raises during fit, predict or scoring is reported with a
        ``Failed <name>: <error>`` line and left out of the results; the
        remaining models are still evaluated.

        Parameters
        ----------
        target_col : str
            Name of the target column in ``y_train`` / ``y_val``.

        Returns
        -------
        pandas.DataFrame
            One row per successfully evaluated model with the columns listed
            in ``RESULT_COLUMNS``, sorted by ``Competition_Score`` ascending
            (lower is better). The frame is empty, but still has those
            columns, when no model succeeded.
        """
        print(f"\n--- Evaluating Models for Target: {target_col} ---")

        y_train_target = self.y_train[target_col]
        y_val_target = self.y_val[target_col]

        # Dynamic weighting based on the target max life: targets whose name
        # contains 'SV' are weighted 2x, and the weight is normalised by the
        # largest validation value of the target.
        max_cycles = float(self.y_val[target_col].max())
        beta_weight = 2.0 if 'SV' in target_col else 1.0
        beta_val = beta_weight / max_cycles

        target_results = []

        for name, model in tqdm(self.models.items(), desc="Models"):
            start = time.time()
            try:
                model.fit(self.X_train, y_train_target)
                preds = model.predict(self.X_val)

                # Competition Score
                score = np.mean(time_weighted_error(
                    y_val_target.values,
                    preds,
                    alpha=0.01,
                    beta=beta_val
                ))

                # RMSE for sanity check
                rmse = np.sqrt(np.mean((y_val_target.values - preds)**2))

                target_results.append({
                    "Model": name,
                    "Target": target_col,
                    "Competition_Score": score, # Lower is better
                    "RMSE": rmse,
                    "Time_Sec": time.time() - start
                })
            except Exception as e:
                # Deliberately best-effort: one broken model must not abort the sweep.
                print(f"Failed {name}: {e}")

        # Passing the columns explicitly keeps the sort key present even when
        # target_results is empty; otherwise sort_values raises KeyError.
        results = pd.DataFrame(target_results, columns=self.RESULT_COLUMNS)
        return results.sort_values(by="Competition_Score")

# --- Execution ---
# Build the evaluator once; it imputes the train/validation features on construction.
evaluator = LazyCompetitionEvaluator(X_train, y_train, X_val, y_val)

# Rank every candidate model separately for each of the three targets.
results_ww = evaluator.evaluate('Cycles_to_WW')
results_hpc = evaluator.evaluate('Cycles_to_HPC_SV')
results_hpt = evaluator.evaluate('Cycles_to_HPT_SV')

# Print one leaderboard per target (rows are already sorted, best score first).
summary_cols = ['Model', 'Competition_Score', 'RMSE']
for heading, leaderboard in (
    ("\n=== Top Models for Water Wash (WW) ===", results_ww),
    ("\n=== Top Models for HPC Shop Visit ===", results_hpc),
    ("\n=== Top Models for HPT Shop Visit ===", results_hpt),
):
    print(heading)
    print(leaderboard[summary_cols])

Preprocessing: Imputing missing values...

--- Evaluating Models for Target: Cycles_to_WW ---


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
Models: 100%|██████████| 8/8 [00:06<00:00,  1.19it/s]



--- Evaluating Models for Target: Cycles_to_HPC_SV ---


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
Models: 100%|██████████| 8/8 [00:05<00:00,  1.38it/s]



--- Evaluating Models for Target: Cycles_to_HPT_SV ---


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
Models: 100%|██████████| 8/8 [00:04<00:00,  1.62it/s]


=== Top Models for Water Wash (WW) ===
              Model  Competition_Score         RMSE
6           XGBoost          26.296853   313.504563
7          LightGBM          29.932155   293.308482
4      DecisionTree          39.602777   302.777272
5      RandomForest          42.074734   296.221905
3        ElasticNet         534.706537  1066.573052
1             Ridge         712.414913  1781.078701
0  LinearRegression        2506.918921  3139.570743
2             Lasso        6445.482322  4372.233861

=== Top Models for HPC Shop Visit ===
              Model  Competition_Score          RMSE
5      RandomForest          13.536968    741.190331
7          LightGBM          13.866620    738.960891
6           XGBoost          16.004187    825.349787
4      DecisionTree          21.114273   1064.152211
1             Ridge         353.523798   6320.045423
2             Lasso         911.096896   8632.236589
0  LinearRegression        2674.850415  19861.489118
3        ElasticNet       626


