In [8]:
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor, ExtraTreesRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import RobustScaler

# --- 1. THE PHYSICS ENHANCER ---
def enrich_physics(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with derived physical proxy features.

    The input frame is left untouched; all new columns (and the parsed
    ``'Sample Date'``) live only on the returned copy.

    Columns read: ``'Sample Date'``, ``'pet'``, ``'MNDWI'``, ``'green'``,
    ``'swir22'``, ``'nir'``.

    Columns added:
        season_proxy:    cosine of the day-of-year, shifted so it peaks on day 15.
        evap_stress:     ``pet / (MNDWI + 2)``.
        turbidity_proxy: ``green / (swir22 + 1)``.
        algae_proxy:     normalized difference of ``nir`` and ``green``.

    Raises:
        KeyError: if any of the columns listed above is absent.
    """
    required = ('Sample Date', 'pet', 'MNDWI', 'green', 'swir22', 'nir')
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"enrich_physics: missing required column(s): {missing}")

    # Work on a copy so the caller's DataFrame is not mutated as a side effect.
    df = df.copy()

    # Dates are parsed day-first (e.g. 15-01-2020 is 15 January).
    df['Sample Date'] = pd.to_datetime(df['Sample Date'], dayfirst=True)

    # A: Seasonal Physics (Cyclical)
    # A 365-day cosine cycle; equals 1.0 on day-of-year 15.
    day_of_year = df['Sample Date'].dt.dayofyear
    df['season_proxy'] = np.cos(2 * np.pi * (day_of_year - 15) / 365)

    # B: Concentration Physics (Evaporation vs Water)
    # High PET and low MNDWI = concentrated salts (High EC/TA)
    # The +2 offset keeps the denominator positive — assumes MNDWI lies in
    # [-1, 1]; TODO confirm against the feature extraction.
    df['evap_stress'] = df['pet'] / (df['MNDWI'] + 2)

    # C: Spectral Physics
    # The +1 and 1e-5 offsets guard against a zero denominator.
    df['turbidity_proxy'] = df['green'] / (df['swir22'] + 1)
    df['algae_proxy'] = (df['nir'] - df['green']) / (df['nir'] + df['green'] + 1e-5)

    return df

# --- 2. PREPARE DATA ---
# Training table plus the two validation feature tables (Landsat and TerraClimate).
train_raw = pd.read_csv('water_data.csv')
l_val = pd.read_csv('landsat_features_validation.csv')
t_val = pd.read_csv('terraclimate_features_validation.csv')

# Syncing validation: attach 'pet' from the TerraClimate table to each Landsat
# row, matching on location and sample date. Remaining gaps are forward-filled
# from the preceding row.
# DataFrame.ffill() replaces fillna(method='ffill'), which pandas has deprecated
# (it emits a FutureWarning); the filled values are the same.
# NOTE(review): the submission later pairs predictions with l_val row-by-row,
# which presumes t_val has at most one row per (Longitude, Latitude, Sample Date)
# so the left join does not add rows — confirm.
val_raw = pd.merge(
    l_val,
    t_val[['Longitude', 'Latitude', 'Sample Date', 'pet']],
    on=['Longitude', 'Latitude', 'Sample Date'],
    how='left',
).ffill()

train = enrich_physics(train_raw)
val = enrich_physics(val_raw)

# Model inputs: coordinates, engineered proxies, and raw band/index/climate columns.
features = ['Latitude', 'Longitude', 'season_proxy', 'evap_stress', 'turbidity_proxy', 
            'algae_proxy', 'nir', 'green', 'swir16', 'swir22', 'NDMI', 'MNDWI', 'pet']

X = train[features]
# Three regression targets, predicted jointly; this column order fixes the
# column order of the prediction arrays.
y = train[['Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus']]
X_val = val[features]

# --- 3. THE ENSEMBLE (fixed-weight blend of two multi-output models) ---
# MultiOutputRegressor fits the wrapped regressor once per target column.
hgb_base = HistGradientBoostingRegressor(learning_rate=0.02, max_iter=1200, random_state=42)
et_base = ExtraTreesRegressor(max_depth=15, n_estimators=500, random_state=42)
model1 = MultiOutputRegressor(hgb_base)
model2 = MultiOutputRegressor(et_base)

print("Training Ensemble...")
model1.fit(X, y)
model2.fit(X, y)

# Blend the validation predictions: 60% gradient boosting, 40% extra trees.
hgb_preds = model1.predict(X_val)
et_preds = model2.predict(X_val)
preds_val = (hgb_preds * 0.6) + (et_preds * 0.4)

# --- 4. SUBMISSION ---
# Start from the identifying columns of the untouched Landsat validation table,
# then append one prediction column per target, in the order of the columns of y.
submission_df = l_val[['Longitude', 'Latitude', 'Sample Date']].assign(**{
    'Total Alkalinity': preds_val[:, 0],
    'Electrical Conductance': preds_val[:, 1],
    'Dissolved Reactive Phosphorus': preds_val[:, 2],
})

# Write without the index so the file holds only the six submission columns.
submission_df.to_csv("submission.csv", index=False)
print("Ensemble Submission Ready!")

  on=['Longitude', 'Latitude', 'Sample Date'], how='left').fillna(method='ffill')


Training Ensemble...
Ensemble Submission Ready!
