In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, make_scorer

def _read_with_timestamps(csv_path):
    """Read a CSV file and parse its 'Date_Time' column into datetimes."""
    frame = pd.read_csv(csv_path)
    frame['Date_Time'] = pd.to_datetime(frame['Date_Time'])
    return frame


# Training readings and the test-set feature readings, both with parsed timestamps.
train_df = _read_with_timestamps('Trout_training.csv')
test_features_df = _read_with_timestamps('Trout_testing_features.csv')

# Add calendar columns derived from 'Date_Time' to both frames, in place.
# Each entry maps the new column name to the `.dt` attribute it is read from;
# the dict order fixes the order in which the columns are appended.
temporal_parts = {
    'year': 'year',
    'month': 'month',
    'day': 'day',
    'hour': 'hour',
    'day_of_year': 'dayofyear',
}
for df in (train_df, test_features_df):
    stamps = df['Date_Time'].dt
    for column_name, dt_attribute in temporal_parts.items():
        df[column_name] = getattr(stamps, dt_attribute)

# Reshape the long-format training readings into one row per timestamp with
# one temperature column per depth (pivot_table aggregates duplicates with
# its default, the mean).
depth_wide = train_df.pivot_table(
    values='Water_Temp_C',
    index='Date_Time',
    columns='Depth_m',
).reset_index()

# Float depth labels become "depth_<value>" strings; any other label
# (e.g. 'Date_Time') is kept as it is.
depth_wide.columns = [
    label if not isinstance(label, float) else f"depth_{label}"
    for label in depth_wide.columns
]

# Calendar features, de-duplicated so repeated rows collapse before the join.
calendar_columns = ['Date_Time', 'year', 'month', 'day', 'hour', 'day_of_year']
calendar_lookup = train_df[calendar_columns].drop_duplicates()

# Attach the calendar features, then discard every row with a missing value.
train_pivot = depth_wide.merge(calendar_lookup, on='Date_Time', how='left').dropna()

# Target is the temperature at depth 10.5; every remaining column except the
# raw timestamp is used as a feature.
y = train_pivot['depth_10.5']
X = train_pivot.drop(columns=['Date_Time', 'depth_10.5'])

# Hyper-parameter candidates for the grid search: two values for each of five
# parameters, i.e. 32 combinations.
param_grid = dict(
    n_estimators=[500, 1000],
    learning_rate=[0.01, 0.05],
    max_depth=[5, 10],
    subsample=[0.6, 0.8],
    colsample_bytree=[0.5, 0.8],
)

# Base estimator whose hyper-parameters are tuned.
xgb_reg = xgb.XGBRegressor(random_state=42)

# Nested cross-validation: the outer folds estimate generalisation error,
# the inner folds pick hyper-parameters within each outer training split.
# NOTE(review): both splitters shuffle rows; the rows are indexed by timestamp,
# so confirm that random folds are appropriate for this data.
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)

# Inner loop: exhaustive search scored by negated MSE.
grid_search = GridSearchCV(
    estimator=xgb_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=inner_cv,
    n_jobs=-1,
    verbose=2,
)

# Outer loop: each outer fold runs a full grid search on its training part.
# NOTE(review): n_jobs=-1 is requested here and in the grid search above.
nested_scores = cross_val_score(
    grid_search,
    X,
    y,
    scoring='neg_mean_squared_error',
    cv=outer_cv,
    n_jobs=-1,
)

# Scores are negated MSE values: flip the sign for the mean; the standard
# deviation is unaffected by the sign.
nested_mse_std = nested_scores.std()
nested_mse_mean = -nested_scores.mean()

print(f"Nested CV MSE: Mean = {nested_mse_mean}, Std = {nested_mse_std}")

# Run the grid search once more on every training row and keep the estimator
# it refits with the winning parameter combination.
best_model = grid_search.fit(X, y).best_estimator_

# Prepare the test data for predictions.
# Depth labels are cast to float before pivoting: if the test file only holds
# whole-number depths, pandas may parse them as integers, and the
# isinstance(col, float) rename below would then skip them, leaving column
# names that do not match the "depth_<float>" names used in training.
test_pivot = (
    test_features_df
    .assign(Depth_m=test_features_df['Depth_m'].astype(float))
    .pivot_table(index='Date_Time', columns='Depth_m', values='Water_Temp_C')
    .reset_index()
)
test_pivot.columns = [f"depth_{col}" if isinstance(col, float) else col for col in test_pivot.columns]
test_pivot = test_pivot.merge(
    test_features_df[['Date_Time', 'year', 'month', 'day', 'hour', 'day_of_year']].drop_duplicates(),
    on='Date_Time',
    how='left',
)

# Align the test features with the exact columns (and order) the model was
# trained on. Columns absent from the test data are created as NaN, and
# columns the model never saw are dropped; both cases are reported so they
# do not go unnoticed. When the columns already match, this is a no-op.
test_features = test_pivot.drop(columns=['Date_Time'])
missing_features = [col for col in X.columns if col not in test_features.columns]
unexpected_features = [col for col in test_features.columns if col not in X.columns]
if missing_features or unexpected_features:
    print(
        "Warning: test features differ from training features;",
        "missing (filled with NaN):", missing_features,
        "unexpected (dropped):", unexpected_features,
    )
X_test = test_features.reindex(columns=X.columns)

# Make predictions on the test set
predictions = best_model.predict(X_test)

# Render each test timestamp as "MM/DD/YYYY HH:MM" text for the submission file.
formatted_times = pd.to_datetime(test_pivot['Date_Time']).dt.strftime('%m/%d/%Y %H:%M')

# One predicted temperature per test timestamp.
submission_df = pd.DataFrame({'Date_Time': formatted_times, 'Water_Temp_C': predictions})

# Write the predictions to CSV without the index column.
submission_df.to_csv('lake_trout_predictions.csv', index=False)

print("Submission file created:", submission_df.shape)


Fitting 3 folds for each of 32 candidates, totalling 96 fits
Fitting 3 folds for each of 32 candidates, totalling 96 fits
Fitting 3 folds for each of 32 candidates, totalling 96 fits
Fitting 3 folds for each of 32 candidates, totalling 96 fits
Fitting 3 folds for each of 32 candidates, totalling 96 fits
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=5, n_estimators=500, subsample=0.8; total time=   4.7s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=5, n_estimators=500, subsample=0.8; total time=   4.9s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=5, n_estimators=500, subsample=0.8; total time=   5.2s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=5, n_estimators=500, subsample=0.6; total time=   5.2s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=5, n_estimators=500, subsample=0.8; total time=   5.2s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=5, n_estimators=500, subsample=0.6; total time=   5.3s