In [8]:

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import RobustScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder
import optuna
from optuna.integration import XGBoostPruningCallback, LightGBMPruningCallback
from sklearn.feature_selection import SelectFromModel
import logging
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# 1. Data Loading with Robust Type Conversion and Outlier Handling
def load_data():
    """Load and prepare data with proper type conversion, feature engineering, and outlier handling"""
    try:
        train_data = pd.read_csv('dataset/train.csv')
        test_data = pd.read_csv('dataset/test.csv')
        logging.info("Data loaded successfully")
        
        numeric_cols = ['temperature', 'irradiance', 'humidity', 'panel_age', 
                        'maintenance_count', 'soiling_ratio', 'voltage', 'current',
                        'module_temperature', 'cloud_coverage', 'wind_speed', 'pressure']
        categorical_cols = ['string_id', 'error_code', 'installation_type']
        
        # Convert numeric columns and handle outliers
        for col in numeric_cols:
            train_data[col] = pd.to_numeric(train_data[col], errors='coerce')
            test_data[col] = pd.to_numeric(test_data[col], errors='coerce')
            train_data[col] = train_data[col].replace([np.inf, -np.inf], np.nan)
            test_data[col] = test_data[col].replace([np.inf, -np.inf], np.nan)
            train_data[col] = np.where(train_data[col] < 0, np.nan, train_data[col])
            test_data[col] = np.where(test_data[col] < 0, np.nan, test_data[col])
            # Clip outliers at 1st and 99th percentiles
            p1, p99 = train_data[col].quantile([0.01, 0.99]).values
            train_data[col] = train_data[col].clip(p1, p99)
            test_data[col] = test_data[col].clip(p1, p99)
        
        # Advanced feature engineering
        for data in [train_data, test_data]:
            data['power_output'] = data['voltage'] * data['current']
            data['temp_adjusted_irradiance'] = data['irradiance'] / (1 + 0.005 * (data['module_temperature'].fillna(25) - 25))
            data['soiling_impact'] = data['soiling_ratio'] * data['irradiance']
            data['age_maintenance_ratio'] = data['panel_age'] / (data['maintenance_count'].fillna(0) + 1)
            data['log_irradiance'] = np.log1p(data['irradiance'].fillna(0))
            data['temp_diff'] = data['module_temperature'] - data['temperature']
            data['irradiance_per_age'] = data['irradiance'] / (data['panel_age'].fillna(1) + 1)
            data['wind_cooling'] = data['wind_speed'] * (data['module_temperature'].fillna(25) - data['temperature'].fillna(25))
            data['humidity_pressure_ratio'] = data['humidity'] / (data['pressure'].fillna(1000) + 1)
            data['cloud_irradiance_interaction'] = data['cloud_coverage'] * data['irradiance']
        
        logging.info("Feature engineering completed")
        logging.info(f"NaN counts in train_data: {train_data[numeric_cols + ['power_output', 'temp_adjusted_irradiance', 'soiling_impact', 'age_maintenance_ratio', 'log_irradiance', 'temp_diff', 'irradiance_per_age', 'wind_cooling', 'humidity_pressure_ratio', 'cloud_irradiance_interaction']].isna().sum()}")
        
        X = train_data.drop(columns=['id', 'efficiency'])
        y = train_data['efficiency']
        test_ids = test_data['id']
        X_test = test_data.drop(columns=['id'])
        
        # Verify target variable
        logging.info(f"Target variable dtype: {y.dtype}, NaN count: {y.isna().sum()}")
        if not np.issubdtype(y.dtype, np.number):
            raise ValueError("Target variable 'efficiency' must be numeric")
        
        return X, y, X_test, test_ids
    except Exception as e:
        logging.error(f"Error loading data: {str(e)}")
        raise

# 2. Enhanced Preprocessing with OrdinalEncoder
def get_preprocessor():
    """Create robust preprocessing pipeline with advanced imputation and encoding"""
    numeric_features = ['temperature', 'irradiance', 'humidity', 'panel_age', 
                        'maintenance_count', 'soiling_ratio', 'voltage', 'current',
                        'module_temperature', 'cloud_coverage', 'wind_speed', 'pressure',
                        'power_output', 'temp_adjusted_irradiance', 'soiling_impact',
                        'age_maintenance_ratio', 'log_irradiance', 'temp_diff',
                        'irradiance_per_age', 'wind_cooling', 'humidity_pressure_ratio',
                        'cloud_irradiance_interaction']
    
    categorical_features = ['string_id', 'error_code', 'installation_type']
    
    numeric_transformer = Pipeline(steps=[
        ('imputer', KNNImputer(n_neighbors=5)),
        ('scaler', RobustScaler())])
    
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('ordinal_enc', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)],
        remainder='drop')
    
    return preprocessor, numeric_features

# 3. Hyperparameter Tuning with Optuna
def objective(trial, X, y, preprocessor):
    """Objective function for Optuna hyperparameter optimization with pruning"""
    params_xgb = {
        'n_estimators': trial.suggest_int('xgb_n_estimators', 100, 3000),
        'learning_rate': trial.suggest_float('xgb_lr', 0.005, 0.3, log=True),
        'max_depth': trial.suggest_int('xgb_max_depth', 3, 12),
        'subsample': trial.suggest_float('xgb_subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('xgb_colsample', 0.5, 1.0),
        'min_child_weight': trial.suggest_float('xgb_min_child_weight', 1, 10),
        'gamma': trial.suggest_float('xgb_gamma', 0, 5)
    }
    params_lgbm = {
        'n_estimators': trial.suggest_int('lgbm_n_estimators', 100, 3000),
        'learning_rate': trial.suggest_float('lgbm_lr', 0.005, 0.3, log=True),
        'max_depth': trial.suggest_int('lgbm_max_depth', 3, 12),
        'subsample': trial.suggest_float('lgbm_subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('lgbm_colsample', 0.5, 1.0),
        'num_leaves': trial.suggest_int('lgbm_num_leaves', 20, 150),
        'min_child_samples': trial.suggest_int('lgbm_min_child_samples', 10, 50)
    }
    params_rf = {
        'n_estimators': trial.suggest_int('rf_n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('rf_max_depth', 3, 12),
        'min_samples_split': trial.suggest_int('rf_min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('rf_min_samples_leaf', 1, 10)
    }
    
    xgb = XGBRegressor(**params_xgb, random_state=42, early_stopping_rounds=50, eval_metric='rmse')
    lgbm = LGBMRegressor(**params_lgbm, random_state=42, early_stopping_rounds=50)
    rf = RandomForestRegressor(**params_rf, random_state=42)
    
    xgb_pruner = XGBoostPruningCallback(trial, 'validation_0-rmse')
    lgbm_pruner = LightGBMPruningCallback(trial, 'rmse')
    
    ensemble = VotingRegressor([('xgb', xgb), ('lgbm', lgbm), ('rf', rf)])
    
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    
    for train_idx, val_idx in kf.split(X, pd.qcut(y, q=5, duplicates='drop')):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('regressor', ensemble)
        ])
        
        pipeline.fit(
            X_train, y_train,
            regressor__xgb__eval_set=[(X_val, y_val)],
            regressor__lgbm__eval_set=[(X_val, y_val)],
            regressor__xgb__callbacks=[xgb_pruner],
            regressor__lgbm__callbacks=[lgbm_pruner],
            regressor__xgb__verbose=False,
            regressor__lgbm__verbose=-1
        )
        
        val_preds = pipeline.predict(X_val)
        score = 100 * (1 - np.sqrt(mean_squared_error(y_val, val_preds)))
        scores.append(score)
    
    return np.mean(scores)

# 4. Model Training with Cross-Validation, Feature Selection, and Ensemble
def train_and_evaluate(X, y, preprocessor, numeric_features):
    """Train ensemble model with cross-validation and feature selection"""
    try:
        study = optuna.create_study(
            direction='maximize',
            storage='sqlite:///optuna_study.db',
            study_name='solar_panel_efficiency',
            load_if_exists=True
        )
        
        study.optimize(
            lambda trial: objective(trial, X, y, preprocessor),
            n_trials=30,
            n_jobs=-1,
            timeout=7200
        )
        
        best_params = study.best_params
        logging.info(f"Best hyperparameters: {best_params}")
        logging.info(f"Best CV score: {study.best_value:.2f}")
        
        xgb = XGBRegressor(
            n_estimators=best_params['xgb_n_estimators'],
            learning_rate=best_params['xgb_lr'],
            max_depth=best_params['xgb_max_depth'],
            subsample=best_params['xgb_subsample'],
            colsample_bytree=best_params['xgb_colsample'],
            min_child_weight=best_params['xgb_min_child_weight'],
            gamma=best_params['xgb_gamma'],
            random_state=42,
            early_stopping_rounds=50
        )
        lgbm = LGBMRegressor(
            n_estimators=best_params['lgbm_n_estimators'],
            learning_rate=best_params['lgbm_lr'],
            max_depth=best_params['lgbm_max_depth'],
            subsample=best_params['lgbm_subsample'],
            colsample_bytree=best_params['lgbm_colsample'],
            num_leaves=best_params['lgbm_num_leaves'],
            min_child_samples=best_params['lgbm_min_child_samples'],
            random_state=42,
            early_stopping_rounds=50
        )
        rf = RandomForestRegressor(
            n_estimators=best_params['rf_n_estimators'],
            max_depth=best_params['rf_max_depth'],
            min_samples_split=best_params['rf_min_samples_split'],
            min_samples_leaf=best_params['rf_min_samples_leaf'],
            random_state=42
        )
        
        ensemble = VotingRegressor([('xgb', xgb), ('lgbm', lgbm), ('rf', rf)])
        
        full_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('selector', SelectFromModel(XGBRegressor(random_state=42), max_features=15)),
            ('regressor', ensemble)
        ])
        
        kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        scores, mae_scores, r2_scores = [], [], []
        
        for train_idx, val_idx in kf.split(X, pd.qcut(y, q=5, duplicates='drop')):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            
            full_pipeline.fit(
                X_train, y_train,
                regressor__xgb__eval_set=[(X_val, y_val)],
                regressor__lgbm__eval_set=[(X_val, y_val)],
                regressor__xgb__verbose=False,
                regressor__lgbm__verbose=-1
            )
            
            val_preds = full_pipeline.predict(X_val)
            score = 100 * (1 - np.sqrt(mean_squared_error(y_val, val_preds)))
            mae = mean_absolute_error(y_val, val_preds)
            r2 = r2_score(y_val, val_preds)
            scores.append(score)
            mae_scores.append(mae)
            r2_scores.append(r2)
        
        logging.info(f"Cross-Validation Scores: {scores}")
        logging.info(f"Mean CV Score: {np.mean(scores):.2f} ± {np.std(scores):.2f}")
        logging.info(f"Mean MAE: {np.mean(mae_scores):.4f} ± {np.std(mae_scores):.4f}")
        logging.info(f"Mean R²: {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}")
        
        full_pipeline.fit(X, y)
        
        feature_importance = full_pipeline.named_steps['selector'].estimator_.feature_importances_
        feature_names = numeric_features + categorical_features
        importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
        logging.info(f"Feature Importance:\n{importance_df.sort_values(by='Importance', ascending=False)}")
        
        return full_pipeline
        
    except Exception as e:
        logging.error(f"Error during training: {str(e)}")
        raise

# 5. Main Execution Flow
def main():
    try:
        X, y, X_test, test_ids = load_data()
        preprocessor, numeric_features = get_preprocessor()
        model = train_and_evaluate(X, y, preprocessor, numeric_features)
        test_preds = model.predict(X_test)
        test_preds = np.clip(test_preds, 0, 1)
        
        submission = pd.DataFrame({'id': test_ids, 'efficiency': test_preds})
        if submission.shape == (12000, 2) and list(submission.columns) == ['id', 'efficiency']:
            submission.to_csv('submission.csv', index=False)
            logging.info("Submission file created successfully!")
        else:
            logging.error("Invalid submission format")
            raise ValueError("Submission file does not meet required format (12000 x 2 with columns 'id', 'efficiency')")
            
    except Exception as e:
        logging.error(f"Error in main execution: {str(e)}")
        raise

main()

2025-06-04 20:40:23,643 - INFO - Data loaded successfully
2025-06-04 20:40:23,717 - INFO - Feature engineering completed
2025-06-04 20:40:23,721 - INFO - NaN counts in train_data: temperature                     1001
irradiance                      1411
humidity                         127
panel_age                       1011
maintenance_count               1027
soiling_ratio                   1010
voltage                          993
current                          977
module_temperature               978
cloud_coverage                  1010
wind_speed                       119
pressure                         135
power_output                    1925
temp_adjusted_irradiance        1411
soiling_impact                  2363
age_maintenance_ratio           1011
log_irradiance                     0
temp_diff                       1947
irradiance_per_age              1411
wind_cooling                     119
humidity_pressure_ratio          127
cloud_irradiance_interaction    2333
dtype:

ValueError: Supported target types are: ('binary', 'multiclass'). Got 'unknown' instead.