In [3]:
# Data Preparation

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Read the training and test tables from the dataset directory.
train_data = pd.read_csv('dataset/train.csv')
test_data = pd.read_csv('dataset/test.csv')

# Target column to predict, taken from the training table.
y = train_data['efficiency']

# Model inputs: everything in the training table except the row id and the target.
X = train_data.drop(['id', 'efficiency'], axis=1)

# Test inputs, with the row ids kept aside in their own variable.
X_test = test_data.drop('id', axis=1)
test_ids = test_data['id']

# Columns routed to the numeric branch of the preprocessor.
numeric_features = [
    'temperature',
    'irradiance',
    'humidity',
    'panel_age',
    'maintenance_count',
    'soiling_ratio',
    'voltage',
    'current',
    'module_temperature',
    'cloud_coverage',
    'wind_speed',
    'pressure',
]

# Columns routed to the categorical branch of the preprocessor.
categorical_features = [
    'string_id',
    'error_code',
    'installation_type',
]

# Numeric branch: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode; categories unseen during fit are ignored instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own column list and concatenate the results.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Hold out 20% of the training rows for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# XGBoost model
# NOTE: requires the `xgboost` package in the kernel environment; the import
# below raises ModuleNotFoundError when it is not installed.

from xgboost import XGBRegressor

# Create pipeline: shared preprocessor followed by a gradient-boosted regressor
# that stops early once validation RMSE has not improved for 50 rounds.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', XGBRegressor(n_estimators=1000,
                                                   learning_rate=0.05,
                                                   early_stopping_rounds=50,
                                                   eval_metric='rmse',
                                                   random_state=42))])

# Build the validation matrix used for early stopping. The preprocessor must be
# fitted before transform() can be called, and the arguments of model.fit() are
# evaluated before the pipeline fits anything, so fit it on the training split
# explicitly here. The pipeline fits the preprocessor on the same X_train again
# inside fit(), so the validation matrix is encoded the same way as the data the
# regressor is trained on.
X_val_processed = preprocessor.fit(X_train).transform(X_val)

# Fit the model; `regressor__`-prefixed keyword arguments are forwarded to the
# regressor step's fit().
model.fit(X_train, y_train,
          regressor__eval_set=[(X_val_processed, y_val)],
          regressor__verbose=False)

# Evaluate on the held-out split: score = 100 * (1 - RMSE).
val_preds = model.predict(X_val)
score = 100 * (1 - np.sqrt(mean_squared_error(y_val, val_preds)))
print(f"Validation Score: {score:.2f}")

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Hyperparameter Tuning

from sklearn.base import clone
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for the regressor step of the pipeline
# (the `regressor__` prefix addresses that step's parameters).
param_grid = {
    'regressor__n_estimators': [500, 1000, 1500],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
    'regressor__max_depth': [3, 5, 7],
    'regressor__subsample': [0.8, 0.9, 1.0],
    'regressor__colsample_bytree': [0.8, 0.9, 1.0]
}

# The base pipeline's regressor is configured with early_stopping_rounds=50,
# which XGBoost only accepts when an eval_set is passed to fit(). The search
# fits on cross-validation folds without an eval_set, so tune an unfitted copy
# with early stopping disabled; `model` itself is left untouched. This also
# makes every n_estimators candidate train for its full number of rounds.
search_model = clone(model).set_params(regressor__early_stopping_rounds=None)

# Randomized search: 20 sampled configurations, 3-fold CV, scored by negative RMSE.
search = RandomizedSearchCV(search_model, param_grid, n_iter=20,
                            scoring='neg_root_mean_squared_error',
                            cv=3, verbose=1, random_state=42)
search.fit(X_train, y_train)

print("Best parameters:", search.best_params_)
best_model = search.best_estimator_