# Capstone 3: Forecasting Solar Power Using LTSM  
### _"Exploring ML Techniques for Solar Predictions"_
## Preprocessing   and   Training   Data   Development Objective: 
Goal: Create a cleaned development dataset you can use to complete the modeling step of your project.  
Steps:   
    ● Create   dummy   or   indicator   features   for   categorical   variables  
    ● Standardize   the   magnitude   of   numeric   features   using   a   scaler  
    ● Split   into   testing   and   training   datasets 

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
import datetime

In [14]:
os.chdir('C:/Users/aamal/Desktop/Springboard/Springboard_DataScience/Capstone-3-Solar/Data')

df = pd.read_csv('solar_cleaned.csv')

In [6]:
df.dtypes

DATE_TIME               object
PLANT_ID                 int64
AMBIENT_TEMPERATURE    float64
MODULE_TEMPERATURE     float64
IRRADIATION            float64
SOURCE_KEY              object
DC_POWER               float64
AC_POWER               float64
DAILY_YIELD            float64
TOTAL_YIELD            float64
Inverter_Efficiency    float64
Cumulative_Yield       float64
Normalized_Yield       float64
Plant_Efficiency       float64
DATE                    object
TIME                    object
HOUR                     int64
dtype: object

In [16]:
df['DATE_TIME'] = pd.to_datetime(df['DATE_TIME'], format='%Y-%m-%d %H:%M:%S', errors='coerce')

## Create indicators features for categorical variables

In [24]:
# Encode SOURCE_KEY as categorical
df['SOURCE_KEY'] = df['SOURCE_KEY'].astype('category')
df['SOURCE_KEY_CODE'] = df['SOURCE_KEY'].cat.codes  # Create numeric codes



In [26]:
source_key_map = dict(enumerate(df['SOURCE_KEY'].cat.categories))

In [28]:
import json

with open('source_key_mapping.json', 'w') as f:
    json.dump(source_key_map, f)

#save the mapping for reference

In [30]:
print(source_key_map)

{0: '1BY6WEcLGh8j5v7', 1: '1IF53ai7Xc0U56Y', 2: '3PZuoBAID5Wc2HD', 3: '4UPUqMRk7TRMgml', 4: '7JYdWkrLSPkdwr4', 5: '81aHJ1q11NBPMrL', 6: '9kRcWv60rDACzjR', 7: 'Et9kgGMDl729KT4', 8: 'IQ2d7wF4YD8zU1Q', 9: 'LYwnQax7tkwH5Cb', 10: 'LlT2YUhhzqhg5Sw', 11: 'McdE0feGgRqW7Ca', 12: 'Mx2yZCDsyf6DPfv', 13: 'NgDl19wMapZy17u', 14: 'PeE6FRyGXUgsRhN', 15: 'Qf4GUc1pJu5T6c6', 16: 'Quc1TzYxW2pYoWX', 17: 'V94E5Ben1TlhnDV', 18: 'VHMLBKoKgIrUVDU', 19: 'WRmjgnKYAwPKWDb', 20: 'WcxssY2VbP4hApt', 21: 'YxYtjZvoooNbGkE', 22: 'ZnxXDlPa8U1GXgE', 23: 'ZoEaEvLYb1n2sOq', 24: 'adLQvlD726eNBSB', 25: 'bvBOhCH3iADSZry', 26: 'iCRJl6heRkivqQ3', 27: 'ih0vzX44oOqAx2f', 28: 'mqwcsP2rE7J0TFp', 29: 'oZ35aAeoifZaQzV', 30: 'oZZkBaNadn6DNKz', 31: 'pkci93gMrogZuBj', 32: 'q49J1IKaHRwDQnt', 33: 'rGa61gmuvPhdLxV', 34: 'rrq4fwE8jgrTyWY', 35: 'sjndEbLyjtCKgGv', 36: 'uHbuxQJl8lW7ozc', 37: 'vOuJvMaM2sgwLmb', 38: 'wCURE6d3bPkepu2', 39: 'xMbIugepa2P7lBB', 40: 'xoJJ8DcxJEcupym', 41: 'z9Y9gH1T5YWrNuG', 42: 'zBIq5rxdHJRwDNY', 43: 'zVJPv84UY57bAof

## Standardize the magnitude of numeric features using a scaler 

In [33]:
scaler = MinMaxScaler()
numerical_cols = ['AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION', 
                   'DC_POWER', 'AC_POWER', 'Inverter_Efficiency', 'Cumulative_Yield', 
                    'Normalized_Yield', 'Plant_Efficiency']
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [35]:
#Handle Skewness: Apply logarithmic transformations for features like DC_POWER and AC_POWER to reduce the impact of skewed distributions.
df['LOG_DC_POWER'] = np.log1p(df['DC_POWER'])  # Avoid log(0) errors
df['LOG_AC_POWER'] = np.log1p(df['AC_POWER'])

In [49]:
df['Lag_1_DC_POWER'] = df['DC_POWER'].shift(1)  # Previous 15 minutes
df['Lag_4_DC_POWER'] = df['DC_POWER'].shift(4)  # 1 hour ago

df['Lag_1_AC_POWER'] = df['AC_POWER'].shift(1)
df['Lag_4_AC_POWER'] = df['AC_POWER'].shift(4)

In [57]:
df.isnull().sum()

DATE_TIME              0
PLANT_ID               0
AMBIENT_TEMPERATURE    0
MODULE_TEMPERATURE     0
IRRADIATION            0
SOURCE_KEY             0
DC_POWER               0
AC_POWER               0
DAILY_YIELD            0
TOTAL_YIELD            0
Inverter_Efficiency    0
Cumulative_Yield       0
Normalized_Yield       0
Plant_Efficiency       0
DATE                   0
TIME                   0
HOUR                   0
SOURCE_KEY_CODE        0
LOG_DC_POWER           0
LOG_AC_POWER           0
Lag_1_DC_POWER         1
Lag_4_DC_POWER         4
Lag_1_AC_POWER         1
Lag_4_AC_POWER         4
dtype: int64

Drop missing values since they are direct result of lagging. Also important to drop since time-series models rely on complete sequences for analysis. 

In [60]:
df.dropna(subset=['Lag_1_DC_POWER', 'Lag_4_DC_POWER', 'Lag_1_AC_POWER', 'Lag_4_AC_POWER'], inplace=True)

## Split   into   testing   and   training   datasets 

In [66]:
df.sort_values(by='DATE_TIME', inplace=True)

In [68]:
train_size = int(len(df) * 0.8)
train, test = df.iloc[:train_size], df.iloc[train_size:]

In [70]:
df.sort_index(inplace=True)

# 80% training, 20% testing
train_size = int(len(df) * 0.8)

# Split into train and test sets
train = df.iloc[:train_size]
test = df.iloc[train_size:]

# Inspect the split
print("Training data:")
print(train.shape)
print("Testing data:")
print(test.shape)

Training data:
(109174, 24)
Testing data:
(27294, 24)


### Save data

In [72]:
datapath = 'C:/Users/aamal/Desktop/Springboard/Springboard_DataScience/Capstone-3-Solar/data'
solar_process = 'solar_preprocess.csv'
filepath= os.path.join(datapath, solar_process)

df.to_csv(filepath, index= False)
print(f"Data saved successfully to '{filepath}'")

# Save training data
train.to_csv('train_data.csv', index=False)

# Save testing data
test.to_csv('test_data.csv', index=False)

print("Training and testing datasets saved successfully!")

Data saved successfully to 'C:/Users/aamal/Desktop/Springboard/Springboard_DataScience/Capstone-3-Solar/data\solar_preprocess.csv'
Training and testing datasets saved successfully!
