## ☀️ Solar Radiation Regression 

Given *solar data from different time periods*, let's try to predict the **solar radiation** of a given period.

We will use XGBoost to make our predictions. 

Data source: https://www.kaggle.com/datasets/dronio/SolarEnergy

### Importing Libraries

In [8]:
import numpy as np
import pandas as pd

import re
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import optuna
import xgboost as xgb

from sklearn.metrics import r2_score

In [2]:
data = pd.read_csv('SolarPrediction.csv')
data

Unnamed: 0,UNIXTime,Data,Time,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,TimeSunRise,TimeSunSet
0,1475229326,9/29/2016 12:00:00 AM,23:55:26,1.21,48,30.46,59,177.39,5.62,06:13:00,18:13:00
1,1475229023,9/29/2016 12:00:00 AM,23:50:23,1.21,48,30.46,58,176.78,3.37,06:13:00,18:13:00
2,1475228726,9/29/2016 12:00:00 AM,23:45:26,1.23,48,30.46,57,158.75,3.37,06:13:00,18:13:00
3,1475228421,9/29/2016 12:00:00 AM,23:40:21,1.21,48,30.46,60,137.71,3.37,06:13:00,18:13:00
4,1475228124,9/29/2016 12:00:00 AM,23:35:24,1.17,48,30.46,62,104.95,5.62,06:13:00,18:13:00
...,...,...,...,...,...,...,...,...,...,...,...
32681,1480587604,12/1/2016 12:00:00 AM,00:20:04,1.22,44,30.43,102,145.42,6.75,06:41:00,17:42:00
32682,1480587301,12/1/2016 12:00:00 AM,00:15:01,1.17,44,30.42,102,117.78,6.75,06:41:00,17:42:00
32683,1480587001,12/1/2016 12:00:00 AM,00:10:01,1.20,44,30.42,102,145.19,9.00,06:41:00,17:42:00
32684,1480586702,12/1/2016 12:00:00 AM,00:05:02,1.23,44,30.42,101,164.19,7.87,06:41:00,17:42:00


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32686 entries, 0 to 32685
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   UNIXTime                32686 non-null  int64  
 1   Data                    32686 non-null  object 
 2   Time                    32686 non-null  object 
 3   Radiation               32686 non-null  float64
 4   Temperature             32686 non-null  int64  
 5   Pressure                32686 non-null  float64
 6   Humidity                32686 non-null  int64  
 7   WindDirection(Degrees)  32686 non-null  float64
 8   Speed                   32686 non-null  float64
 9   TimeSunRise             32686 non-null  object 
 10  TimeSunSet              32686 non-null  object 
dtypes: float64(4), int64(3), object(4)
memory usage: 2.7+ MB


In [5]:
print("Total missing values: ", data.isna().sum().sum())

Total missing values:  0


### Feature Engineering

In [6]:
df = data.copy()

In [7]:
df

Unnamed: 0,UNIXTime,Data,Time,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,TimeSunRise,TimeSunSet
0,1475229326,9/29/2016 12:00:00 AM,23:55:26,1.21,48,30.46,59,177.39,5.62,06:13:00,18:13:00
1,1475229023,9/29/2016 12:00:00 AM,23:50:23,1.21,48,30.46,58,176.78,3.37,06:13:00,18:13:00
2,1475228726,9/29/2016 12:00:00 AM,23:45:26,1.23,48,30.46,57,158.75,3.37,06:13:00,18:13:00
3,1475228421,9/29/2016 12:00:00 AM,23:40:21,1.21,48,30.46,60,137.71,3.37,06:13:00,18:13:00
4,1475228124,9/29/2016 12:00:00 AM,23:35:24,1.17,48,30.46,62,104.95,5.62,06:13:00,18:13:00
...,...,...,...,...,...,...,...,...,...,...,...
32681,1480587604,12/1/2016 12:00:00 AM,00:20:04,1.22,44,30.43,102,145.42,6.75,06:41:00,17:42:00
32682,1480587301,12/1/2016 12:00:00 AM,00:15:01,1.17,44,30.42,102,117.78,6.75,06:41:00,17:42:00
32683,1480587001,12/1/2016 12:00:00 AM,00:10:01,1.20,44,30.42,102,145.19,9.00,06:41:00,17:42:00
32684,1480586702,12/1/2016 12:00:00 AM,00:05:02,1.23,44,30.42,101,164.19,7.87,06:41:00,17:42:00


In [14]:
df['Month'] = df['Data'].apply(lambda x: re.search(r'^\d+', x).group(0)).astype(int)
df['Day'] = df['Data'].apply(lambda x: re.search(r'(?<=\/)\d+(?=\/)', x).group(0)).astype(int)
df['Year'] = df['Data'].apply(lambda x: re.search(r'(?<=\/)\d+(?=\s)', x).group(0)).astype(int)
df = df.drop('Data', axis=1)
df

Unnamed: 0,UNIXTime,Time,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,TimeSunRise,TimeSunSet,Month,Day,Year
0,1475229326,23:55:26,1.21,48,30.46,59,177.39,5.62,06:13:00,18:13:00,9,29,2016
1,1475229023,23:50:23,1.21,48,30.46,58,176.78,3.37,06:13:00,18:13:00,9,29,2016
2,1475228726,23:45:26,1.23,48,30.46,57,158.75,3.37,06:13:00,18:13:00,9,29,2016
3,1475228421,23:40:21,1.21,48,30.46,60,137.71,3.37,06:13:00,18:13:00,9,29,2016
4,1475228124,23:35:24,1.17,48,30.46,62,104.95,5.62,06:13:00,18:13:00,9,29,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32681,1480587604,00:20:04,1.22,44,30.43,102,145.42,6.75,06:41:00,17:42:00,12,1,2016
32682,1480587301,00:15:01,1.17,44,30.42,102,117.78,6.75,06:41:00,17:42:00,12,1,2016
32683,1480587001,00:10:01,1.20,44,30.42,102,145.19,9.00,06:41:00,17:42:00,12,1,2016
32684,1480586702,00:05:02,1.23,44,30.42,101,164.19,7.87,06:41:00,17:42:00,12,1,2016


In [15]:
df['Hour'] = df['Time'].apply(lambda x: re.search(r'^\d+', x).group(0)).astype(int)
df['Minute'] = df['Time'].apply(lambda x: re.search(r'(?<=:)\d+(?=:)', x).group(0)).astype(int)
df['Second'] = df['Time'].apply(lambda x: re.search(r'\d+$', x).group(0)).astype(int)
df = df.drop('Time', axis=1)
df

Unnamed: 0,UNIXTime,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,TimeSunRise,TimeSunSet,Month,Day,Year,Hour,Minute,Second
0,1475229326,1.21,48,30.46,59,177.39,5.62,06:13:00,18:13:00,9,29,2016,23,55,26
1,1475229023,1.21,48,30.46,58,176.78,3.37,06:13:00,18:13:00,9,29,2016,23,50,23
2,1475228726,1.23,48,30.46,57,158.75,3.37,06:13:00,18:13:00,9,29,2016,23,45,26
3,1475228421,1.21,48,30.46,60,137.71,3.37,06:13:00,18:13:00,9,29,2016,23,40,21
4,1475228124,1.17,48,30.46,62,104.95,5.62,06:13:00,18:13:00,9,29,2016,23,35,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32681,1480587604,1.22,44,30.43,102,145.42,6.75,06:41:00,17:42:00,12,1,2016,0,20,4
32682,1480587301,1.17,44,30.42,102,117.78,6.75,06:41:00,17:42:00,12,1,2016,0,15,1
32683,1480587001,1.20,44,30.42,102,145.19,9.00,06:41:00,17:42:00,12,1,2016,0,10,1
32684,1480586702,1.23,44,30.42,101,164.19,7.87,06:41:00,17:42:00,12,1,2016,0,5,2


In [16]:
df['Sunrise_Hour'] = df['TimeSunRise'].apply(lambda x: re.search(r'^\d+', x).group(0)).astype(int)
df['Sunrise_Minute'] = df['TimeSunRise'].apply(lambda x: re.search(r'(?<=:)\d+(?=:)', x).group(0)).astype(int)

df['Sunset_Hour'] = df['TimeSunSet'].apply(lambda x: re.search(r'^\d+', x).group(0)).astype(int)
df['Sunset_Minute'] = df['TimeSunSet'].apply(lambda x: re.search(r'(?<=:)\d+(?=:)', x).group(0)).astype(int)

df = df.drop(['TimeSunRise', 'TimeSunSet'], axis=1)
df

Unnamed: 0,UNIXTime,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,Month,Day,Year,Hour,Minute,Second,Sunrise_Hour,Sunrise_Minute,Sunset_Hour,Sunset_Minute
0,1475229326,1.21,48,30.46,59,177.39,5.62,9,29,2016,23,55,26,6,13,18,13
1,1475229023,1.21,48,30.46,58,176.78,3.37,9,29,2016,23,50,23,6,13,18,13
2,1475228726,1.23,48,30.46,57,158.75,3.37,9,29,2016,23,45,26,6,13,18,13
3,1475228421,1.21,48,30.46,60,137.71,3.37,9,29,2016,23,40,21,6,13,18,13
4,1475228124,1.17,48,30.46,62,104.95,5.62,9,29,2016,23,35,24,6,13,18,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32681,1480587604,1.22,44,30.43,102,145.42,6.75,12,1,2016,0,20,4,6,41,17,42
32682,1480587301,1.17,44,30.42,102,117.78,6.75,12,1,2016,0,15,1,6,41,17,42
32683,1480587001,1.20,44,30.42,102,145.19,9.00,12,1,2016,0,10,1,6,41,17,42
32684,1480586702,1.23,44,30.42,101,164.19,7.87,12,1,2016,0,5,2,6,41,17,42


In [17]:
df.dtypes

UNIXTime                    int64
Radiation                 float64
Temperature                 int64
Pressure                  float64
Humidity                    int64
WindDirection(Degrees)    float64
Speed                     float64
Month                       int64
Day                         int64
Year                        int64
Hour                        int64
Minute                      int64
Second                      int64
Sunrise_Hour                int64
Sunrise_Minute              int64
Sunset_Hour                 int64
Sunset_Minute               int64
dtype: object

### Splitting/Scaling 

In [18]:
y = df['Radiation'].copy()
X = df.drop('Radiation', axis=1).copy()

In [19]:
scaler = StandardScaler()

X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [20]:
X

Unnamed: 0,UNIXTime,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,Month,Day,Year,Hour,Minute,Second,Sunrise_Hour,Sunrise_Minute,Sunset_Hour,Sunset_Minute
0,-0.937753,-0.500439,0.678974,-0.616253,0.407620,-0.178738,-1.391540,1.512248,0.0,1.655482,1.589236,0.660811,0.0,-0.943425,1.369126,-1.525765
1,-0.937854,-0.500439,0.678974,-0.654730,0.400285,-0.823359,-1.391540,1.512248,0.0,1.655482,1.299687,0.429167,0.0,-0.943425,1.369126,-1.525765
2,-0.937953,-0.500439,0.678974,-0.693206,0.183490,-0.823359,-1.391540,1.512248,0.0,1.655482,1.010138,0.660811,0.0,-0.943425,1.369126,-1.525765
3,-0.938054,-0.500439,0.678974,-0.577776,-0.069497,-0.823359,-1.391540,1.512248,0.0,1.655482,0.720589,0.274737,0.0,-0.943425,1.369126,-1.525765
4,-0.938153,-0.500439,0.678974,-0.500823,-0.463407,-0.178738,-1.391540,1.512248,0.0,1.655482,0.431040,0.506381,0.0,-0.943425,1.369126,-1.525765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32681,0.845373,-1.145490,0.130250,1.038241,0.023209,0.145006,1.344003,-1.701824,0.0,-1.672098,-0.437606,-1.037912,0.0,0.860876,-0.730393,0.293766
32682,0.845273,-1.145490,-0.052658,1.038241,-0.309138,0.145006,1.344003,-1.701824,0.0,-1.672098,-0.727155,-1.269556,0.0,0.860876,-0.730393,0.293766
32683,0.845173,-1.145490,-0.052658,1.038241,0.020443,0.789627,1.344003,-1.701824,0.0,-1.672098,-1.016704,-1.269556,0.0,0.860876,-0.730393,0.293766
32684,0.845073,-1.145490,-0.052658,0.999764,0.248901,0.465884,1.344003,-1.701824,0.0,-1.672098,-1.306253,-1.192341,0.0,0.860876,-0.730393,0.293766


In [26]:
X = X.drop(['Year', 'Sunrise_Hour'], axis=1)
X

Unnamed: 0,UNIXTime,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,Month,Day,Hour,Minute,Second,Sunrise_Minute,Sunset_Hour,Sunset_Minute
0,-0.937753,-0.500439,0.678974,-0.616253,0.407620,-0.178738,-1.391540,1.512248,1.655482,1.589236,0.660811,-0.943425,1.369126,-1.525765
1,-0.937854,-0.500439,0.678974,-0.654730,0.400285,-0.823359,-1.391540,1.512248,1.655482,1.299687,0.429167,-0.943425,1.369126,-1.525765
2,-0.937953,-0.500439,0.678974,-0.693206,0.183490,-0.823359,-1.391540,1.512248,1.655482,1.010138,0.660811,-0.943425,1.369126,-1.525765
3,-0.938054,-0.500439,0.678974,-0.577776,-0.069497,-0.823359,-1.391540,1.512248,1.655482,0.720589,0.274737,-0.943425,1.369126,-1.525765
4,-0.938153,-0.500439,0.678974,-0.500823,-0.463407,-0.178738,-1.391540,1.512248,1.655482,0.431040,0.506381,-0.943425,1.369126,-1.525765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32681,0.845373,-1.145490,0.130250,1.038241,0.023209,0.145006,1.344003,-1.701824,-1.672098,-0.437606,-1.037912,0.860876,-0.730393,0.293766
32682,0.845273,-1.145490,-0.052658,1.038241,-0.309138,0.145006,1.344003,-1.701824,-1.672098,-0.727155,-1.269556,0.860876,-0.730393,0.293766
32683,0.845173,-1.145490,-0.052658,1.038241,0.020443,0.789627,1.344003,-1.701824,-1.672098,-1.016704,-1.269556,0.860876,-0.730393,0.293766
32684,0.845073,-1.145490,-0.052658,0.999764,0.248901,0.465884,1.344003,-1.701824,-1.672098,-1.306253,-1.192341,0.860876,-0.730393,0.293766


In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=100)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.8, random_state=200)

In [28]:
X_train.shape, X_test.shape, X_val.shape

((18304, 14), (9806, 14), (4576, 14))

In [29]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

In [30]:
dtrain

<xgboost.core.DMatrix at 0x76061ec28b50>

### Training

In [37]:
def get_model_rmse(params):
    model = xgb.train(params, dtrain, num_boost_round=100, evals=[(dval, 'eval')], early_stopping_rounds=10, verbose_eval=0)
    results = model.eval(dval)
    rmse = float(re.search(r'[\d.]+$', results).group(0))
    return rmse

In [38]:
def objective(trial):
    learning_rate = trial.suggest_loguniform('learning_rate', 0.00001, 10.0)
    max_depth = trial.suggest_int('max_depth', 4, 8)
    l1_reg = trial.suggest_loguniform('l1_reg', 0.00001, 10.0)
    l2_reg = trial.suggest_loguniform('l2_reg', 0.00001, 10.0)

    params = {'learning_rate': learning_rate, 'max_depth': max_depth, 'alpha': l1_reg, 'lambda': l2_reg}

    return get_model_rmse(params)

In [39]:
study = optuna.create_study()
study.optimize(objective, n_trials=100, show_progress_bar=True)

[I 2025-03-30 10:37:25,480] A new study created in memory with name: no-name-72040383-d5d8-4878-bec5-823cc40cc706


  0%|          | 0/100 [00:00<?, ?it/s]

  learning_rate = trial.suggest_loguniform('learning_rate', 0.00001, 10.0)
  l1_reg = trial.suggest_loguniform('l1_reg', 0.00001, 10.0)
  l2_reg = trial.suggest_loguniform('l2_reg', 0.00001, 10.0)


[I 2025-03-30 10:37:25,933] Trial 0 finished with value: 142.38152827287917 and parameters: {'learning_rate': 0.012331243214187085, 'max_depth': 6, 'l1_reg': 0.09926000352150509, 'l2_reg': 0.0015426596791508758}. Best is trial 0 with value: 142.38152827287917.
[I 2025-03-30 10:37:26,619] Trial 1 finished with value: 105.17978010698015 and parameters: {'learning_rate': 0.01919430435964637, 'max_depth': 8, 'l1_reg': 0.08545364039043445, 'l2_reg': 5.705504732350006e-05}. Best is trial 1 with value: 105.17978010698015.
[I 2025-03-30 10:37:26,778] Trial 2 finished with value: 318.5835937554384 and parameters: {'learning_rate': 3.4698905875670265e-05, 'max_depth': 4, 'l1_reg': 0.024414458045417954, 'l2_reg': 0.0005416253804673675}. Best is trial 1 with value: 105.17978010698015.
[I 2025-03-30 10:37:27,014] Trial 3 finished with value: 317.1904220836364 and parameters: {'learning_rate': 9.09844459315777e-05, 'max_depth': 4, 'l1_reg': 1.1800944134069815, 'l2_reg': 1.346300216599482}. Best is t

In [41]:
best_params = study.best_params
best_params

{'learning_rate': 0.18680642691589489,
 'max_depth': 8,
 'l1_reg': 1.0521369682832528e-05,
 'l2_reg': 7.397123995890879}

In [42]:
model = xgb.train(best_params, dtrain, num_boost_round=10000, evals=[(dval, 'eval')], early_stopping_rounds=10)

[0]	eval-rmse:268.15351
[1]	eval-rmse:227.44220
[2]	eval-rmse:194.48263
[3]	eval-rmse:169.17011
[4]	eval-rmse:149.81739
[5]	eval-rmse:134.63231
[6]	eval-rmse:123.03539
[7]	eval-rmse:113.81258
[8]	eval-rmse:107.50600
[9]	eval-rmse:102.82537
[10]	eval-rmse:99.39182
[11]	eval-rmse:96.67288
[12]	eval-rmse:94.80953
[13]	eval-rmse:93.44560
[14]	eval-rmse:92.54637


Parameters: { "l1_reg", "l2_reg" } are not used.

  self.starting_round = model.num_boosted_rounds()


[15]	eval-rmse:91.49904
[16]	eval-rmse:90.38235
[17]	eval-rmse:89.77014
[18]	eval-rmse:89.40376
[19]	eval-rmse:88.94293
[20]	eval-rmse:88.80607
[21]	eval-rmse:88.60663
[22]	eval-rmse:88.60545
[23]	eval-rmse:88.53499
[24]	eval-rmse:88.52435
[25]	eval-rmse:88.44512
[26]	eval-rmse:88.24876
[27]	eval-rmse:88.17823
[28]	eval-rmse:88.27999
[29]	eval-rmse:88.18417
[30]	eval-rmse:88.10707
[31]	eval-rmse:87.96140
[32]	eval-rmse:87.88420
[33]	eval-rmse:87.62091
[34]	eval-rmse:87.56283
[35]	eval-rmse:87.51526
[36]	eval-rmse:87.42580
[37]	eval-rmse:87.40823
[38]	eval-rmse:87.37898
[39]	eval-rmse:87.41656
[40]	eval-rmse:87.41498
[41]	eval-rmse:87.36599
[42]	eval-rmse:87.36765
[43]	eval-rmse:87.27881
[44]	eval-rmse:87.23976
[45]	eval-rmse:87.18432
[46]	eval-rmse:87.12900
[47]	eval-rmse:87.13128
[48]	eval-rmse:86.98482
[49]	eval-rmse:86.97945
[50]	eval-rmse:86.93093
[51]	eval-rmse:86.92348
[52]	eval-rmse:86.94057
[53]	eval-rmse:86.95345
[54]	eval-rmse:86.87177
[55]	eval-rmse:86.65723
[56]	eval-rmse:8

### Results

In [43]:
y_true = np.array(y_test, dtype=float)
y_true

array([1.22, 1.22, 1.21, ..., 1.22, 1.22, 1.24])

In [44]:
y_pred = np.array(model.predict(dtest), dtype=float)
y_pred

array([ 0.50289249,  0.22582763, -0.8963173 , ...,  3.64432526,
        1.62027025,  0.03743032])

In [45]:
r2 = r2_score(y_true, y_pred)

print("R^2 Score: {:.4f}".format(r2))

R^2 Score: 0.9381
