Solar Power Output Prediction

Predict solar power output (DC power / AC power) using weather features such as irradiation, ambient temperature, etc.
This helps with efficient forecasting and grid management.

Dataset: Solar Power Generation Data
Source: Kaggle
https://www.kaggle.com/datasets/anikannal/solar-power-generation-data/data

In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns


# Load the plant's generation readings and weather-sensor readings.
gen_df = pd.read_csv("Plant_1_Generation_Data.csv")
weather_df = pd.read_csv("Plant_1_Weather_Sensor_Data.csv")

# Quick look
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("Generation Data:")
print(gen_df.head())
gen_df.info()

print("\nWeather Data:")
print(weather_df.head())
weather_df.info()



# Convert DATE_TIME to datetime so both frames share a comparable timestamp
# dtype for the later merge.
# NOTE(review): the format is inferred here. If the two CSVs write dates in
# different day/month orders, pass an explicit `format=` per file -- confirm
# against the raw data.
gen_df['DATE_TIME'] = pd.to_datetime(gen_df['DATE_TIME'])
weather_df['DATE_TIME'] = pd.to_datetime(weather_df['DATE_TIME'])

# Remove exact duplicate rows
gen_df = gen_df.drop_duplicates()
weather_df = weather_df.drop_duplicates()

# Fill missing values with the per-column median.
# Each list is filtered to the columns that actually exist in its frame:
# selecting a missing label with df[[...]] raises KeyError, which would abort
# the script if, for example, the generation file carries no weather columns.
numeric_gen_cols = [
    col
    for col in ['DC_POWER', 'AC_POWER', 'IRRADIATION', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE']
    if col in gen_df.columns
]
numeric_weather_cols = [
    col
    for col in ['AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION']  # adjust as needed
    if col in weather_df.columns
]

if numeric_gen_cols:
    gen_df[numeric_gen_cols] = gen_df[numeric_gen_cols].fillna(gen_df[numeric_gen_cols].median())
if numeric_weather_cols:
    weather_df[numeric_weather_cols] = weather_df[numeric_weather_cols].fillna(
        weather_df[numeric_weather_cols].median()
    )

# Correct negative power values: floor them at zero. clip() is the vectorised
# equivalent of applying max(x, 0) to every row.
gen_df['DC_POWER'] = gen_df['DC_POWER'].clip(lower=0)
gen_df['AC_POWER'] = gen_df['AC_POWER'].clip(lower=0)


# Merge datasets on DATE_TIME, and on SOURCE_KEY only when that is meaningful.
# Both frames may carry a SOURCE_KEY column whose values identify different
# things (presumably inverters in the generation file vs. a sensor array in
# the weather file -- confirm against the dataset description). An inner join
# on such a key matches nothing and silently yields an empty frame, so
# SOURCE_KEY is used as a join key only if the two frames share some value.
join_keys = ['DATE_TIME']
weather_side = weather_df
if 'SOURCE_KEY' in gen_df.columns and 'SOURCE_KEY' in weather_df.columns:
    if gen_df['SOURCE_KEY'].isin(weather_df['SOURCE_KEY']).any():
        join_keys.append('SOURCE_KEY')
    else:
        # Drop the weather-side key so the generation SOURCE_KEY keeps its
        # plain column name instead of becoming SOURCE_KEY_x / SOURCE_KEY_y.
        weather_side = weather_df.drop(columns='SOURCE_KEY')

combined_df = pd.merge(gen_df, weather_side, on=join_keys, how='inner')

print("\nCombined Dataset:")
print(combined_df.head())
# info() prints its own report and returns None; do not wrap it in print().
combined_df.info()

# Weather-derived predictors. After the merge a column keeps its plain name
# when only one input frame had it, and gets an _x / _y suffix when both did,
# so every spelling is listed and only those present in combined_df are used.
candidate_features = [
    'IRRADIATION',
    'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE',
    'AMBIENT_TEMPERATURE_x', 'MODULE_TEMPERATURE_x',
    'AMBIENT_TEMPERATURE_y', 'MODULE_TEMPERATURE_y',
]
features = [col for col in candidate_features if col in combined_df.columns]
if not features:
    # Fail here with a clear message rather than later with a zero-column X.
    raise KeyError(f"None of the expected feature columns are present: {candidate_features}")
target = 'DC_POWER'

# Design matrix and target vector (previously referenced but never defined).
X = combined_df[features]
y = combined_df[target]

# Test / Train split: 80% train, 20% test, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")

