# Solar Power Forecasting - Full Pipeline

태양광 발전량 예측 전체 파이프라인

1. 데이터 로드 및 병합
2. 데이터 전처리
3. 피처 엔지니어링
4. 데이터 분할 및 모델 학습 (XGBoost)
5. 모델 평가 및 시각화
6. 모델 저장

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
import joblib
import warnings
warnings.filterwarnings('ignore')

# Plot defaults: seaborn-style white grid, 12x6 figures, 12pt base font
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 12,
})

## 1. 데이터 로드 및 병합

In [None]:
# Read the raw CSVs: one generation file and one weather-sensor file per plant.
# Paths are relative to the notebook's working directory.
gen1, weather1 = (
    pd.read_csv('../data/Plant_1_Generation_Data.csv'),
    pd.read_csv('../data/Plant_1_Weather_Sensor_Data.csv'),
)
gen2, weather2 = (
    pd.read_csv('../data/Plant_2_Generation_Data.csv'),
    pd.read_csv('../data/Plant_2_Weather_Sensor_Data.csv'),
)

# Quick sanity check on the loaded shapes
print(f"Plant 1 - Generation: {gen1.shape}, Weather: {weather1.shape}")
print(f"Plant 2 - Generation: {gen2.shape}, Weather: {weather2.shape}")

In [None]:
# Convert DATE_TIME from text to datetime on all four frames.
# Generation files are parsed day-first; weather files use pandas' default inference.
# NOTE(review): presumably the generation CSVs store dates as DD-MM-YYYY — confirm
# for both plants, because dayfirst is a preference, not a strict format check.
gen1['DATE_TIME'] = pd.to_datetime(gen1['DATE_TIME'], dayfirst=True)
weather1['DATE_TIME'] = pd.to_datetime(weather1['DATE_TIME'])
gen2['DATE_TIME'] = pd.to_datetime(gen2['DATE_TIME'], dayfirst=True)
weather2['DATE_TIME'] = pd.to_datetime(weather2['DATE_TIME'])

# Only plant 1's covered period is reported here
print("Date range:")
print(f"Gen1: {gen1['DATE_TIME'].min()} ~ {gen1['DATE_TIME'].max()}")
print(f"Weather1: {weather1['DATE_TIME'].min()} ~ {weather1['DATE_TIME'].max()}")

In [None]:
# Collapse the per-inverter rows into one row per timestamp.
# Each power/yield column is SUMMED (not averaged), giving plant-level totals.
gen1_agg = (
    gen1.groupby('DATE_TIME')[['DC_POWER', 'AC_POWER', 'DAILY_YIELD', 'TOTAL_YIELD']]
    .sum()
    .reset_index()
)

gen2_agg = (
    gen2.groupby('DATE_TIME')[['DC_POWER', 'AC_POWER', 'DAILY_YIELD', 'TOTAL_YIELD']]
    .sum()
    .reset_index()
)

print(f"Aggregated - Gen1: {gen1_agg.shape}, Gen2: {gen2_agg.shape}")

In [None]:
# Attach the weather readings to each plant's aggregated generation rows.
# The inner join keeps only timestamps present in both sources.
df1 = gen1_agg.merge(
    weather1[['DATE_TIME', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION']],
    on='DATE_TIME',
    how='inner',
).assign(PLANT_ID=1)

df2 = gen2_agg.merge(
    weather2[['DATE_TIME', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION']],
    on='DATE_TIME',
    how='inner',
).assign(PLANT_ID=2)

# Stack both plants into one frame and put it in chronological order
df = (
    pd.concat([df1, df2], ignore_index=True)
    .sort_values('DATE_TIME')
    .reset_index(drop=True)
)

print(f"Combined dataset: {df.shape}")
df.head()

## 2. 데이터 전처리

In [None]:
# Report the number of missing values per column, then the total row count
print("Missing values:")
print(df.isna().sum())
print(f"\nTotal rows: {len(df)}")

In [None]:
# Discard every row that has a missing value in any column (no-op if there are none)
df = df.dropna(axis=0, how='any')
print(f"After dropping NA: {df.shape}")

In [None]:
# Outlier handling: count the rows with negative DC_POWER, then keep only
# the rows where DC_POWER is zero or positive
print(f"Negative DC_POWER count: {(df['DC_POWER'] < 0).sum()}")
df = df.loc[df['DC_POWER'].ge(0)]
print(f"After removing negatives: {df.shape}")

In [None]:
# Summary statistics of the cleaned frame (rendered as the cell output)
df.describe()

## 3. 피처 엔지니어링

In [None]:
# Calendar fields extracted from the timestamp
df['hour'] = df['DATE_TIME'].dt.hour
df['minute'] = df['DATE_TIME'].dt.minute
df['day'] = df['DATE_TIME'].dt.day
df['month'] = df['DATE_TIME'].dt.month
df['dayofweek'] = df['DATE_TIME'].dt.weekday  # alias of dayofweek: Monday=0 ... Sunday=6
df['is_weekend'] = df['dayofweek'].ge(5).astype(int)  # 1 for Saturday/Sunday, else 0

# Index of the 15-minute interval within the day: 0 at 00:00 up to 95 at 23:45
df['time_slot'] = df['hour'].mul(4) + df['minute'].floordiv(15)

print("Time features added:")
df[['DATE_TIME', 'hour', 'minute', 'day', 'month', 'dayofweek', 'is_weekend', 'time_slot']].head(10)

In [None]:
# Cyclical encoding: map hour (period 24) and day of week (period 7) onto the
# unit circle so the last value of a cycle sits next to the first
df = df.assign(
    hour_sin=np.sin(2 * np.pi * df['hour'] / 24),
    hour_cos=np.cos(2 * np.pi * df['hour'] / 24),
    dayofweek_sin=np.sin(2 * np.pi * df['dayofweek'] / 7),
    dayofweek_cos=np.cos(2 * np.pi * df['dayofweek'] / 7),
)

print("Cyclical features added")

In [None]:
# Two derived physical features:
#   temp_diff   - module temperature minus ambient temperature
#   irr_squared - squared irradiation, to let the model pick up a non-linear effect
df = df.assign(
    temp_diff=df['MODULE_TEMPERATURE'] - df['AMBIENT_TEMPERATURE'],
    irr_squared=df['IRRADIATION'] ** 2,
)

print("Additional features added")
df.head()

In [None]:
# Final check after feature engineering: print the frame's shape and the full column list
print(f"Final dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")

## 4. 데이터 분할 및 모델 학습

In [None]:
# Model inputs, grouped by origin. 'minute', 'day' and 'month' are computed
# earlier in the notebook but are not part of this list.
feature_cols = [
    # raw weather-sensor readings
    'AMBIENT_TEMPERATURE',
    'MODULE_TEMPERATURE',
    'IRRADIATION',
    # calendar features
    'hour',
    'dayofweek',
    'is_weekend',
    'time_slot',
    # cyclical encodings
    'hour_sin',
    'hour_cos',
    'dayofweek_sin',
    'dayofweek_cos',
    # derived features and plant identifier
    'temp_diff',
    'irr_squared',
    'PLANT_ID',
]

# Regression target: the DC power column of the combined frame
target_col = 'DC_POWER'

X, y = df[feature_cols], df[target_col]

print(f"Features: {X.shape}")
print(f"Target: {y.shape}")

In [None]:
# Hold out the last 20% of rows as the test set. shuffle=False keeps the row
# order, so with df sorted by DATE_TIME the test rows come after the train rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

print(f"Train set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

In [None]:
# XGBoost regressor with hand-picked hyperparameters (no tuning in this notebook)
model = XGBRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    random_state=42,
)

# Both splits are passed as eval sets so their metric is logged while fitting
# (verbose=50). No early stopping is configured in this cell, so the test split
# is only monitored here, not used to choose the number of trees.
model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=50)

print("\nModel training completed!")

## 5. 모델 평가

In [None]:
# Predictions of the fitted model on the training split, then on the test split
y_train_pred, y_test_pred = (model.predict(part) for part in (X_train, X_test))

# Evaluation metrics
def evaluate_model(y_true, y_pred, dataset_name):
    """Print and return RMSE, MAE and R2 for one set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values, aligned with ``y_true``.
        dataset_name: Label used in the printed header (e.g. "Train").

    Returns:
        Dict with the keys ``'rmse'``, ``'mae'`` and ``'r2'``.
    """
    scores = {
        # RMSE is the square root of sklearn's mean squared error
        'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred),
    }

    print(f"\n{dataset_name} Performance:")
    print(f"  RMSE: {scores['rmse']:,.2f}")
    print(f"  MAE:  {scores['mae']:,.2f}")
    print(f"  R2:   {scores['r2']:.4f}")

    return scores

# Score the training split first, then the held-out split
train_metrics = evaluate_model(y_true=y_train, y_pred=y_train_pred, dataset_name="Train")
test_metrics = evaluate_model(y_true=y_test, y_pred=y_test_pred, dataset_name="Test")

In [None]:
# Pair each input column with the importance reported by the fitted model,
# ordered from most to least important
feature_importance = pd.DataFrame(
    {'feature': feature_cols, 'importance': model.feature_importances_}
).sort_values('importance', ascending=False)

# Horizontal bar chart, most important feature at the top
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature', palette='viridis')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance')
plt.tight_layout()
plt.show()

print("\nTop 5 Features:")
print(feature_importance.head(5))

In [None]:
# Actual vs predicted on the test set: scatter plot (left) and time series (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Scatter plot with the y = x reference line; points on the line are perfect predictions
axes[0].scatter(y_test, y_test_pred, alpha=0.3, s=10)
axes[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
axes[0].set_xlabel('Actual DC Power')
axes[0].set_ylabel('Predicted DC Power')
axes[0].set_title('Actual vs Predicted (Test Set)')

# Time series comparison over the last (up to) 500 test points.
# Clamp to the test-set size: slicing with [-n_points:] returns fewer values than
# range(n_points) when the test set is shorter, and plot() rejects mismatched lengths.
n_points = min(500, len(y_test))
axes[1].plot(range(n_points), y_test.values[-n_points:], label='Actual', alpha=0.7)
axes[1].plot(range(n_points), y_test_pred[-n_points:], label='Predicted', alpha=0.7)
axes[1].set_xlabel('Time Index')
axes[1].set_ylabel('DC Power')
# Title reports the number of points actually drawn (unchanged when it is 500)
axes[1].set_title(f'Actual vs Predicted (Last {n_points} points)')
axes[1].legend()

plt.tight_layout()
plt.show()

In [None]:
# Residual analysis: residual = actual - predicted on the test set
residuals = y_test - y_test_pred

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: histogram of residuals with a reference line at zero
axes[0].hist(residuals, bins=50, edgecolor='black', alpha=0.7)
axes[0].axvline(x=0, color='r', linestyle='--')
axes[0].set(xlabel='Residual', ylabel='Frequency', title='Residual Distribution')

# Right panel: residuals against the predicted value, to show whether the
# error changes with the size of the prediction
axes[1].scatter(y_test_pred, residuals, alpha=0.3, s=10)
axes[1].axhline(y=0, color='r', linestyle='--')
axes[1].set(xlabel='Predicted DC Power', ylabel='Residual', title='Residuals vs Predicted')

plt.tight_layout()
plt.show()

## 6. 모델 저장

In [None]:
import os

# Make sure the output folder exists (no error if it already does)
os.makedirs('../models', exist_ok=True)

# Persist the fitted model
# NOTE(review): joblib output is pickle-based, so loading presumably needs
# compatible library versions — confirm before relying on it long-term.
model_path = '../models/xgboost_solar_power.joblib'
joblib.dump(model, model_path)
print(f"Model saved to: {model_path}")

# Persist the feature list, target name and both metric dicts next to the model
feature_info = dict(
    feature_cols=feature_cols,
    target_col=target_col,
    train_metrics=train_metrics,
    test_metrics=test_metrics,
)
joblib.dump(feature_info, '../models/feature_info.joblib')
print("Feature info saved!")

## 결과 요약

### 모델 성능
- **RMSE**: Root Mean Squared Error (평균 제곱근 오차, 낮을수록 좋음)
- **MAE**: Mean Absolute Error (평균 절대 오차, 낮을수록 좋음)  
- **R2**: 결정계수 (1에 가까울수록 좋음)

### 주요 피처
1. IRRADIATION (일사량) - 가장 중요
2. MODULE_TEMPERATURE (모듈 온도)
3. hour (시간)
4. time_slot (시간대)

### 다음 단계
- 하이퍼파라미터 튜닝
- 다른 모델 비교 (LightGBM, LSTM 등)
- 실시간 예측 API 구축

In [None]:
# Final summary: dataset size, feature count, model type and test-set metrics.
# NOTE(review): the "kW" suffix assumes DC_POWER is recorded in kW — confirm
# against the dataset description.
print(
    "="*50,
    "Solar Power Forecasting Model - Summary",
    "="*50,
    f"\nDataset: {len(df):,} samples",
    f"Features: {len(feature_cols)}",
    f"Model: XGBoost Regressor",
    f"\nTest Performance:",
    f"  R2 Score: {test_metrics['r2']:.4f}",
    f"  RMSE: {test_metrics['rmse']:,.2f} kW",
    f"  MAE: {test_metrics['mae']:,.2f} kW",
    "\n" + "="*50,
    sep="\n",
)