<a href="https://colab.research.google.com/github/archashikamal/solar-panel-efficiency-ml/blob/main/Performance__optimization__of__solar_panels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- 1. Imports and Setup ---
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import randint

# --- 2. Load Data ---
# Absolute locations of the training and test CSV files.
train_path = '/content/train.csv'
test_path  = '/content/test.csv'

# Read both files (train first, then test) into DataFrames.
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# --- 3. Feature Engineering & Data Cleaning ---
def preprocess_data(df):
    """Coerce sensor columns to numeric, derive ``Power``, and clean ``error_code``.

    The frame is modified in place and the same object is returned.

    Args:
        df: DataFrame containing the columns ``voltage``, ``current``,
            ``temperature``, ``humidity`` and ``error_code``.

    Returns:
        ``df`` itself, with the four sensor columns converted to numeric
        (entries that cannot be parsed become NaN), a ``Power`` column equal
        to ``voltage * current``, and ``error_code`` as strings with missing
        values replaced by ``'Normal'``.

    Raises:
        KeyError: If any of the required columns is absent from ``df``.
    """
    numerical_cols = ['voltage', 'current', 'temperature', 'humidity']
    for col in numerical_cols:
        # errors='coerce' turns unparseable entries into NaN instead of raising.
        df[col] = pd.to_numeric(df[col], errors='coerce')
    df['Power'] = df['voltage'] * df['current']
    # Fill missing codes *before* casting to str: astype(str) would turn NaN
    # into the literal string 'nan', leaving fillna() nothing to replace.
    df['error_code'] = df['error_code'].fillna('Normal').astype(str)
    return df

# Apply the same cleaning to both splits.
train, test = preprocess_data(train), preprocess_data(test)

# --- 4. Define Features and Target ---
target = 'efficiency'
categorical_features = ['error_code']
numerical_features = ['voltage', 'current', 'Power', 'temperature', 'humidity']
all_features = [*numerical_features, *categorical_features]

# Keep only training rows where the target and every numeric feature are present.
required_cols = [target, *numerical_features]
train_reg = train.dropna(subset=required_cols)
X, y = train_reg[all_features], train_reg[target]

# --- 5. Preprocessing Pipeline ---
# Numeric columns: fill gaps with the column median, then standardise.
# The median is used instead of 'most_frequent' because the mode of a
# continuous measurement is not meaningful: when several values tie for most
# frequent, SimpleImputer returns the smallest one, so a column of mostly
# unique readings would be imputed with (roughly) its minimum.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the most common category, then one-hot
# encode. handle_unknown='ignore' encodes categories unseen during fit as
# all zeros instead of raising at predict time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# --- 6. Custom Score Function ---
def custom_score(y_true, y_pred):
    """Return ``100 * (1 - RMSE)`` for the given true and predicted values."""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return 100 * (1 - rmse)

# --- 7. Hyperparameter Tuning (Colab-optimized) ---
model = RandomForestRegressor(random_state=42)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Small search space for Colab. Note that scipy's randint(low, high) samples
# integers from the half-open interval [low, high), so `high` is excluded.
param_grid = {
    'model__n_estimators': randint(100, 200),  # 100..199
    'model__max_depth': [None, 10],
    'model__max_features': ['sqrt'],
    'model__min_samples_split': randint(2, 5),  # 2, 3 or 4
    'model__min_samples_leaf': randint(1, 3),  # 1 or 2
    'model__bootstrap': [True]
}

grid_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_grid,
    cv=3,              # Fewer folds for speed/memory
    scoring='neg_mean_squared_error',
    n_jobs=2,          # Limit parallel jobs for Colab
    verbose=2,
    random_state=42    # Seed candidate sampling so reruns try the same configurations
)

print("Starting hyperparameter tuning...")
grid_search.fit(X, y)
print("Best parameters found:", grid_search.best_params_)
# The scorer is negated MSE, so flip the sign to report a positive MSE.
print("Best CV MSE:", -grid_search.best_score_)

best_pipeline = grid_search.best_estimator_

# --- 8. Cross-Validation with Best Model ---
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mse_scores = []
rmse_scores = []
r2_scores = []
score_scores = []

print("\nEfficiency Prediction Results (5-Fold CV):")
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    # Slice features and target into this fold's training and validation rows.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # Refit the tuned pipeline on the training rows, then predict the held-out rows.
    y_pred = best_pipeline.fit(X_train, y_train).predict(X_val)

    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_val, y_pred)
    score = custom_score(y_val, y_pred)

    # Record each metric in its per-fold history list.
    for history, value in zip(
        (mse_scores, rmse_scores, r2_scores, score_scores),
        (mse, rmse, r2, score),
    ):
        history.append(value)

    print(f"  Fold {fold+1}: MSE={mse:.6f}, RMSE={rmse:.6f}, R2={r2:.6f}, Score={score:.6f}")

# Average every metric across the folds before reporting.
avg_mse, avg_rmse, avg_r2, avg_score = (
    np.mean(values)
    for values in (mse_scores, rmse_scores, r2_scores, score_scores)
)

print("\nAverage Results:")
print(f"  MSE:   {avg_mse:.6f}")
print(f"  RMSE:  {avg_rmse:.6f}")
print(f"  R2:    {avg_r2:.6f}")
print(f"  Score: {avg_score:.6f}")

# --- 9. Train on Full Data and Predict Test Set ---
# Refit on every usable training row before predicting the test set.
best_pipeline.fit(X, y)
X_test = test[all_features]
efficiency_pred = best_pipeline.predict(X_test)

# --- 10. Prepare Submission File ---
submission = pd.DataFrame({
    'id': test['id'],
    'efficiency': efficiency_pred
})

# Validate with an explicit raise rather than `assert`: assert statements are
# removed when Python runs with -O, which would silently skip this check.
if submission.shape != (12000, 2):
    raise ValueError(f"Expected 12000x2, got {submission.shape}")
submission.to_csv('submission_final.csv', index=False)
print("\nSubmission file 'submission_final.csv' created!")


Starting hyperparameter tuning...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters found: {'model__bootstrap': True, 'model__max_depth': 10, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 105}
Best CV MSE: 0.018071750885422216

Efficiency Prediction Results (5-Fold CV):
  Fold 1: MSE=0.017728, RMSE=0.133145, R2=0.085843, Score=86.685455
  Fold 2: MSE=0.018412, RMSE=0.135689, R2=0.077407, Score=86.431099
  Fold 3: MSE=0.019075, RMSE=0.138114, R2=0.081500, Score=86.188630
  Fold 4: MSE=0.017819, RMSE=0.133488, R2=0.089638, Score=86.651193
  Fold 5: MSE=0.017290, RMSE=0.131493, R2=0.111160, Score=86.850672

Average Results:
  MSE:   0.018065
  RMSE:  0.134386
  R2:    0.089110
  Score: 86.561410

Submission file 'submission_final.csv' created!


In [None]:
from google.colab import files

# Hand the generated submission file to Colab's download helper.
submission_filename = 'submission_final.csv'
files.download(submission_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>