## Import libraries

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import StratifiedKFold

from lightgbm import LGBMRegressor

np.random.seed(42)

## Load source datasets

In [2]:
# Read the training CSV, report its shape and preview the first rows.
train_csv_path = "../input/mh-wsmlc/train.csv"
train = pd.read_csv(train_csv_path)
print(f"train: {train.shape}")
train.head()

train: (175296, 18)


Unnamed: 0,Year,Month,Day,Hour,Minute,Clearsky DHI,Clearsky DNI,Clearsky GHI,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,Solar Zenith Angle,Precipitable Water,Wind Direction,Wind Speed,Fill Flag
0,2009,1,1,0,0,0,0,0,0,0.0,5.0,1010,75.34,106.15,0.499,346.1,3.1,0
1,2009,1,1,0,30,0,0,0,0,1.0,5.0,1010,80.81,112.28,0.49,346.1,3.1,0
2,2009,1,1,1,0,0,0,0,4,0.0,5.0,1010,78.27,118.5,0.482,347.9,3.2,0
3,2009,1,1,1,30,0,0,0,4,0.0,4.0,1010,78.27,124.78,0.478,347.9,3.1,0
4,2009,1,1,2,0,0,0,0,4,0.0,4.0,1010,76.45,131.12,0.475,350.0,3.0,0


In [3]:
# Read the test CSV, report its shape and preview the first rows.
test_csv_path = "../input/mh-wsmlc/test.csv"
test = pd.read_csv(test_csv_path)
print(f"test: {test.shape}")
test.head()

test: (17520, 18)


Unnamed: 0,Year,Month,Day,Hour,Minute,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,Solar Zenith Angle,Precipitable Water,Wind Direction,Wind Speed,Fill Flag,Clearsky DHI,Clearsky DNI,Clearsky GHI
0,2019,1,1,0,0,7,18.4,18.8,1008,97.7,106.23,3.5,190,2.3,0,,,
1,2019,1,1,0,30,3,18.4,18.6,1008,98.92,112.36,3.5,187,2.5,0,,,
2,2019,1,1,1,0,3,18.2,18.5,1008,98.35,118.58,3.5,184,2.8,0,,,
3,2019,1,1,1,30,3,18.2,18.3,1008,99.58,124.86,3.5,185,3.0,0,,,
4,2019,1,1,2,0,0,18.0,18.0,1008,99.71,131.2,3.6,186,3.1,0,,,


## Feature Engineering

In [4]:
def encode(data, col, max_val):
    """Add sine/cosine encodings of a periodic column to ``data``.

    The column is mapped to the angle ``2 * pi * value / max_val`` and two new
    columns, ``<col>_sin`` and ``<col>_cos``, are written into ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding ``col``; it is modified in place.
    col : str
        Name of the column to encode.
    max_val : number
        Divisor used as the period of the encoding.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the two columns added.
    """
    sin_name = col + '_sin'
    cos_name = col + '_cos'
    angle = 2 * np.pi * data[col] / max_val
    data[sin_name] = np.sin(angle)
    data[cos_name] = np.cos(angle)
    return data

In [5]:
# Calendar features derived from the Year / Month / Day columns.
#
# The date is assembled for the whole frame in one call instead of formatting
# and re-parsing a string for every row. Frame-style `pd.to_datetime` looks for
# component columns named year/month/day, hence the lowercase rename.
train_dates = pd.to_datetime(
    train[['Year', 'Month', 'Day']].rename(columns=str.lower)
)

train['Quarter'] = train_dates.dt.quarter
# `Timestamp.week` is the same attribute the original per-row code read; it is
# mapped per element because the vectorised accessor for the week number
# differs between pandas versions.
train['Week'] = train_dates.map(lambda ts: ts.week)
train['DayofWeek'] = train_dates.dt.dayofweek
# dayofweek is 0 = Monday ... 6 = Sunday, so 5 and 6 are Saturday and Sunday.
train['isWeekend'] = np.where(train['DayofWeek'].isin([5, 6]), 1, 0)

# Month -> season code, same grouping as before:
# Feb-Mar = 0, Apr-Jun = 1, Jul-Aug = 2, Sep-Nov = 3, Dec-Jan = 4.
season_by_month = {
    1: 4, 2: 0, 3: 0,
    4: 1, 5: 1, 6: 1,
    7: 2, 8: 2,
    9: 3, 10: 3, 11: 3,
    12: 4,
}
train['season'] = train['Month'].map(season_by_month)

train.head()

Unnamed: 0,Year,Month,Day,Hour,Minute,Clearsky DHI,Clearsky DNI,Clearsky GHI,Cloud Type,Dew Point,...,Solar Zenith Angle,Precipitable Water,Wind Direction,Wind Speed,Fill Flag,Quarter,Week,DayofWeek,isWeekend,season
0,2009,1,1,0,0,0,0,0,0,0.0,...,106.15,0.499,346.1,3.1,0,1,1,3,0,4
1,2009,1,1,0,30,0,0,0,0,1.0,...,112.28,0.49,346.1,3.1,0,1,1,3,0,4
2,2009,1,1,1,0,0,0,0,4,0.0,...,118.5,0.482,347.9,3.2,0,1,1,3,0,4
3,2009,1,1,1,30,0,0,0,4,0.0,...,124.78,0.478,347.9,3.1,0,1,1,3,0,4
4,2009,1,1,2,0,0,0,0,4,0.0,...,131.12,0.475,350.0,3.0,0,1,1,3,0,4


In [6]:
# Calendar features derived from the Year / Month / Day columns.
#
# The date is assembled for the whole frame in one call instead of formatting
# and re-parsing a string for every row. Frame-style `pd.to_datetime` looks for
# component columns named year/month/day, hence the lowercase rename.
test_dates = pd.to_datetime(
    test[['Year', 'Month', 'Day']].rename(columns=str.lower)
)

test['Quarter'] = test_dates.dt.quarter
# `Timestamp.week` is the same attribute the original per-row code read; it is
# mapped per element because the vectorised accessor for the week number
# differs between pandas versions.
test['Week'] = test_dates.map(lambda ts: ts.week)
test['DayofWeek'] = test_dates.dt.dayofweek
# dayofweek is 0 = Monday ... 6 = Sunday, so 5 and 6 are Saturday and Sunday.
test['isWeekend'] = np.where(test['DayofWeek'].isin([5, 6]), 1, 0)

# Month -> season code, same grouping as before:
# Feb-Mar = 0, Apr-Jun = 1, Jul-Aug = 2, Sep-Nov = 3, Dec-Jan = 4.
season_by_month = {
    1: 4, 2: 0, 3: 0,
    4: 1, 5: 1, 6: 1,
    7: 2, 8: 2,
    9: 3, 10: 3, 11: 3,
    12: 4,
}
test['season'] = test['Month'].map(season_by_month)

test.head()

Unnamed: 0,Year,Month,Day,Hour,Minute,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,...,Wind Speed,Fill Flag,Clearsky DHI,Clearsky DNI,Clearsky GHI,Quarter,Week,DayofWeek,isWeekend,season
0,2019,1,1,0,0,7,18.4,18.8,1008,97.7,...,2.3,0,,,,1,1,1,0,4
1,2019,1,1,0,30,3,18.4,18.6,1008,98.92,...,2.5,0,,,,1,1,1,0,4
2,2019,1,1,1,0,3,18.2,18.5,1008,98.35,...,2.8,0,,,,1,1,1,0,4
3,2019,1,1,1,30,3,18.2,18.3,1008,99.58,...,3.0,0,,,,1,1,1,0,4
4,2019,1,1,2,0,0,18.0,18.0,1008,99.71,...,3.1,0,,,,1,1,1,0,4


In [7]:
# Ratio and interaction features, computed column-wise.
# A ratio is set to 0 wherever its denominator is exactly 0 (same rule as the
# former row-by-row version); `.where` keeps the quotient where the condition
# holds and substitutes 0 elsewhere.
temperature = train['Temperature']
dew_point = train['Dew Point']

train['Dew Point / Temperature'] = (dew_point / temperature).where(temperature != 0, 0)
train['Dew Point / Pressure'] = dew_point / train['Pressure']
train['Precipitable Water / Dew Point'] = (
    train['Precipitable Water'] / dew_point
).where(dew_point != 0, 0)
train['Temperature / Pressure'] = temperature / train['Pressure']
train['Humidity * Water'] = train['Relative Humidity'] * train['Precipitable Water']
train['Humidity * Wind'] = train['Relative Humidity'] * train['Wind Speed']

# Sine/cosine encodings. The periods are written out explicitly instead of
# being read from `train[col].max()`, so they cannot drift with the contents of
# the frame. The calendar values below are what those maxima evaluate to on
# this data (ISO week 53 occurs within the 2009-2018 span) — TODO confirm if
# the input data changes.
# NOTE(review): period 6 for DayofWeek (values 0..6) and 23 for Hour (0..23)
# give the first and last value the same encoding, and period 180 for
# Wind Direction (degrees up to 360) folds opposite directions together.
# 7 / 24 / 360 would avoid that, but must be changed here and in the test
# cell together, followed by retraining.
cyclical_periods = [
    ('Month', 12),
    ('Quarter', 4),
    ('season', 4),
    ('Week', 53),
    ('DayofWeek', 6),
    ('Day', 31),
    ('Hour', 23),
    ('Solar Zenith Angle', 90),
    ('Wind Direction', 180),
]
for cyc_col, cyc_period in cyclical_periods:
    train = encode(train, cyc_col, cyc_period)

# Drop the raw columns that are now represented by their encodings
# (Hour and Solar Zenith Angle are kept).
train = train.drop(columns=['Month', 'Quarter', 'season', 'Day',
                            'Week', 'DayofWeek', 'Wind Direction'])
train.head()

Unnamed: 0,Year,Hour,Minute,Clearsky DHI,Clearsky DNI,Clearsky GHI,Cloud Type,Dew Point,Temperature,Pressure,...,DayofWeek_sin,DayofWeek_cos,Day_sin,Day_cos,Hour_sin,Hour_cos,Solar Zenith Angle_sin,Solar Zenith Angle_cos,Wind Direction_sin,Wind Direction_cos
0,2009,0,0,0,0,0,0,0.0,5.0,1010,...,1.224647e-16,-1.0,0.201299,0.97953,0.0,1.0,0.903335,0.428935,-0.466387,0.884581
1,2009,0,30,0,0,0,0,1.0,5.0,1010,...,1.224647e-16,-1.0,0.201299,0.97953,0.0,1.0,0.999882,0.015358,-0.466387,0.884581
2,2009,1,0,0,0,0,4,0.0,5.0,1010,...,1.224647e-16,-1.0,0.201299,0.97953,0.269797,0.962917,0.913545,-0.406737,-0.409923,0.91212
3,2009,1,30,0,0,0,4,0.0,4.0,1010,...,1.224647e-16,-1.0,0.201299,0.97953,0.269797,0.962917,0.654477,-0.756082,-0.409923,0.91212
4,2009,2,0,0,0,0,4,0.0,4.0,1010,...,1.224647e-16,-1.0,0.201299,0.97953,0.519584,0.854419,0.267575,-0.963537,-0.34202,0.939693


In [8]:
# Ratio and interaction features, computed column-wise.
# A ratio is set to 0 wherever its denominator is exactly 0 (same rule as the
# former row-by-row version); `.where` keeps the quotient where the condition
# holds and substitutes 0 elsewhere.
temperature = test['Temperature']
dew_point = test['Dew Point']

test['Dew Point / Temperature'] = (dew_point / temperature).where(temperature != 0, 0)
test['Dew Point / Pressure'] = dew_point / test['Pressure']
test['Precipitable Water / Dew Point'] = (
    test['Precipitable Water'] / dew_point
).where(dew_point != 0, 0)
test['Temperature / Pressure'] = temperature / test['Pressure']
test['Humidity * Water'] = test['Relative Humidity'] * test['Precipitable Water']
test['Humidity * Wind'] = test['Relative Humidity'] * test['Wind Speed']

# Sine/cosine encodings. The periods are fixed constants rather than
# `test[col].max()`: the largest ISO week number in the single test year
# differs from the one in the multi-year training data (52 vs 53), so deriving
# the period from each frame encoded the same week to different sin/cos values
# in train and test. The constants are the values the training-frame maxima
# evaluate to — TODO confirm if the training data changes.
# NOTE(review): period 6 for DayofWeek (values 0..6) and 23 for Hour (0..23)
# give the first and last value the same encoding, and period 180 for
# Wind Direction (degrees up to 360) folds opposite directions together.
# 7 / 24 / 360 would avoid that, but must be changed here and in the train
# cell together, followed by retraining.
cyclical_periods = [
    ('Month', 12),
    ('Quarter', 4),
    ('season', 4),
    ('Week', 53),
    ('DayofWeek', 6),
    ('Day', 31),
    ('Hour', 23),
    ('Solar Zenith Angle', 90),
    ('Wind Direction', 180),
]
for cyc_col, cyc_period in cyclical_periods:
    test = encode(test, cyc_col, cyc_period)

# Drop the raw columns that are now represented by their encodings
# (Hour and Solar Zenith Angle are kept).
test = test.drop(columns=['Month', 'Quarter', 'season', 'Day',
                          'Week', 'DayofWeek', 'Wind Direction'])
test.head()

Unnamed: 0,Year,Hour,Minute,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,Solar Zenith Angle,Precipitable Water,...,DayofWeek_sin,DayofWeek_cos,Day_sin,Day_cos,Hour_sin,Hour_cos,Solar Zenith Angle_sin,Solar Zenith Angle_cos,Wind Direction_sin,Wind Direction_cos
0,2019,0,0,7,18.4,18.8,1008,97.7,106.23,3.5,...,0.866025,0.5,0.201299,0.97953,0.0,1.0,0.905717,0.423883,0.34202,0.939693
1,2019,0,30,3,18.4,18.6,1008,98.92,112.36,3.5,...,0.866025,0.5,0.201299,0.97953,0.0,1.0,0.999952,0.009774,0.241922,0.970296
2,2019,1,0,3,18.2,18.5,1008,98.35,118.58,3.5,...,0.866025,0.5,0.201299,0.97953,0.269797,0.962917,0.91126,-0.411832,0.139173,0.990268
3,2019,1,30,3,18.2,18.3,1008,99.58,124.86,3.5,...,0.866025,0.5,0.201299,0.97953,0.269797,0.962917,0.650244,-0.759725,0.173648,0.984808
4,2019,2,0,0,18.0,18.0,1008,99.71,131.2,3.6,...,0.866025,0.5,0.201299,0.97953,0.519584,0.854419,0.262189,-0.965016,0.207912,0.978148


In [9]:
def shift_features(df, features, shifts, fill_value=0):
    """Add backward and forward row-offset differences for each feature.

    For every ``feature`` and every ``shift`` two columns are written to ``df``:

    * ``<feature>-<shift>abs_shift``: value minus the value ``shift`` rows earlier
    * ``<feature>+<shift>abs_shift``: value minus the value ``shift`` rows later

    Despite the ``abs`` in the column names, the signed difference is stored.
    Offsets are by row position, so the frame is expected to be in the intended
    order already.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; modified in place.
    features : iterable of str
        Columns to difference.
    shifts : iterable of int
        Row offsets to use.
    fill_value : scalar, default 0
        Value substituted for rows shifted in from beyond either end of the
        frame. With the default, the first/last ``shift`` rows hold the raw
        feature value rather than a true difference; pass ``numpy.nan`` to
        mark them as missing instead.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for feature in features:
        current = df[feature]
        for shift in shifts:
            earlier = current.shift(periods=shift, fill_value=fill_value)
            later = current.shift(periods=-shift, fill_value=fill_value)
            df[feature + "-" + str(shift) + "abs_shift"] = current - earlier
            df[feature + "+" + str(shift) + "abs_shift"] = current - later

    # Zero out infinities in the whole frame. Both signs are handled:
    # previously only +inf was replaced and -inf was left in place.
    df.replace(to_replace=[np.inf, -np.inf], value=0, inplace=True)
    return df

In [10]:
# Columns that get row-offset difference features. The order matters: it
# determines the order of the generated columns.
features = [
    'Dew Point',
    'Temperature',
    'Pressure',
    'Relative Humidity',
    'Precipitable Water',
    'Wind Speed',
    'Solar Zenith Angle_sin',
    'Solar Zenith Angle_cos',
    'Wind Direction_sin',
    'Wind Direction_cos',
    'Dew Point / Temperature',
    'Dew Point / Pressure',
    'Temperature / Pressure',
    'Precipitable Water / Dew Point',
    'Humidity * Water',
    'Humidity * Wind',
]

# Row offsets; each one yields a backward and a forward difference per feature.
shifts = [1, 2, 3, 6, 9, 12, 18, 24, 36, 48, 72]

train = shift_features(df=train, features=features, shifts=shifts)
train.head()

  after removing the cwd from sys.path.
  """


Unnamed: 0,Year,Hour,Minute,Clearsky DHI,Clearsky DNI,Clearsky GHI,Cloud Type,Dew Point,Temperature,Pressure,...,Humidity * Wind-18abs_shift,Humidity * Wind+18abs_shift,Humidity * Wind-24abs_shift,Humidity * Wind+24abs_shift,Humidity * Wind-36abs_shift,Humidity * Wind+36abs_shift,Humidity * Wind-48abs_shift,Humidity * Wind+48abs_shift,Humidity * Wind-72abs_shift,Humidity * Wind+72abs_shift
0,2009,0,0,0,0,0,0,0.0,5.0,1010,...,233.554,39.25,233.554,12.658,233.554,76.63,233.554,64.861,233.554,123.554
1,2009,0,30,0,0,0,0,1.0,5.0,1010,...,250.511,56.185,250.511,5.58,250.511,106.73,250.511,69.554,250.511,90.511
2,2009,1,0,0,0,0,4,0.0,5.0,1010,...,250.464,54.159,250.464,-31.704,250.464,106.407,250.464,65.538,250.464,40.464
3,2009,1,30,0,0,0,4,0.0,4.0,1010,...,242.637,46.263,242.637,-86.601,242.637,112.521,242.637,66.577,242.637,25.816
4,2009,2,0,0,0,0,4,0.0,4.0,1010,...,229.35,17.19,229.35,-92.586,229.35,98.57,229.35,49.79,229.35,-9.375


In [11]:
# Same difference features for the test frame, using the feature list and
# offsets defined for the training frame.
test = shift_features(df=test, features=features, shifts=shifts)
test.head()

  after removing the cwd from sys.path.
  """


Unnamed: 0,Year,Hour,Minute,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,Solar Zenith Angle,Precipitable Water,...,Humidity * Wind-18abs_shift,Humidity * Wind+18abs_shift,Humidity * Wind-24abs_shift,Humidity * Wind+24abs_shift,Humidity * Wind-36abs_shift,Humidity * Wind+36abs_shift,Humidity * Wind-48abs_shift,Humidity * Wind+48abs_shift,Humidity * Wind-72abs_shift,Humidity * Wind+72abs_shift
0,2019,0,0,7,18.4,18.8,1008,97.7,106.23,3.5,...,224.71,-93.786,224.71,-44.264,224.71,-117.81,224.71,104.71,224.71,165.382
1,2019,0,30,3,18.4,18.6,1008,98.92,112.36,3.5,...,247.3,-67.196,247.3,-23.04,247.3,-86.657,247.3,137.3,247.3,180.254
2,2019,1,0,3,18.2,18.5,1008,98.35,118.58,3.5,...,275.38,-41.58,275.38,-11.198,275.38,-48.532,275.38,165.38,275.38,187.486
3,2019,1,30,3,18.2,18.3,1008,99.58,124.86,3.5,...,298.74,-8.625,298.74,16.65,298.74,-11.796,298.74,188.74,298.74,195.956
4,2019,2,0,0,18.0,18.0,1008,99.71,131.2,3.6,...,309.101,10.031,309.101,18.611,309.101,18.767,309.101,209.101,309.101,194.369


In [12]:
# Model inputs: every column of the test frame except the three targets and
# the raw Hour / Solar Zenith Angle columns. `list.remove` raises ValueError
# if one of these names is missing from the frame.
# Note: this rebinds `features`, which until here held the shift-feature list.
features = list(test.columns)
for excluded_col in ('Clearsky DHI', 'Clearsky DNI', 'Clearsky GHI',
                     'Hour', 'Solar Zenith Angle'):
    features.remove(excluded_col)
len(features)

387

In [13]:
# Columns passed to LightGBM as categorical; they are cast to int in both frames.
cat_cols = ['Year','Minute','isWeekend','Pressure','Fill Flag','Cloud Type']

for frame in (train, test):
    frame[cat_cols] = frame[cat_cols].astype(int)

# Positions of the categorical columns within the model feature matrix.
feature_columns = train[features].columns
cat_cols_indices = [feature_columns.get_loc(col) for col in cat_cols]
print(cat_cols_indices)

[0, 1, 10, 5, 9, 2]


In [14]:
# Every model feature that is not categorical is scaled.
num_cols = [col for col in features if col not in cat_cols]

# Fit on the training frame only, then apply the same transform to both frames.
scaler = RobustScaler()
scaler.fit(train[num_cols])
for frame in (train, test):
    frame[num_cols] = scaler.transform(frame[num_cols])

In [15]:
# Keep the full frame for later; the model is trained only on rows whose solar
# zenith angle is below 93 and whose hour is not one of 1..9.
train_org = train.copy()

sza_below_93 = train['Solar Zenith Angle'] < 93
hour_1_to_9 = train['Hour'].isin([1,2,3,4,5,6,7,8,9])
train = train[sza_below_93 & ~hour_1_to_9].reset_index(drop=True).copy()

print(f"train_org: {train_org.shape} \ntrain: {train.shape}")

train_org: (175296, 392) 
train: (91563, 392)


In [16]:
# Keep the full frame for later; predictions are made only for rows whose solar
# zenith angle is below 93 and whose hour is not one of 1..9.
test_org = test.copy()

sza_below_93 = test['Solar Zenith Angle'] < 93
hour_1_to_9 = test['Hour'].isin([1,2,3,4,5,6,7,8,9])
test = test[sza_below_93 & ~hour_1_to_9].reset_index(drop=True).copy()

print(f"test_org: {test_org.shape} \ntest: {test.shape}")

test_org: (17520, 392) 
test: (9150, 392)


## LightGBM

In [17]:
# 10-fold cross-validated LightGBM, one model family per target.
# For each target this stores out-of-fold predictions in train['<target>_lgb']
# and the fold-averaged test predictions in test['<target>_lgb'].

# The splitter is seeded, so every call to .split() yields the same folds;
# folds are stratified by Year.
splits = 10
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)

# Hyper-parameters shared by every fold and every target.
lgb_params = dict(
    boosting_type='gbdt',
    num_leaves=52,
    max_depth=6,
    learning_rate=0.075,
    n_estimators=5000,
    objective='regression',
    metric='rmse',
    min_child_samples=20,
    subsample=0.78,
    subsample_freq=4,
    colsample_bytree=0.58,
    reg_alpha=10.0,
    reg_lambda=0.5,
    random_state=42,
)

for target in ['Clearsky DHI','Clearsky DNI','Clearsky GHI']:
    print(f"Training for target: {target}\n")

    oof_preds = np.zeros((train.shape[0],))
    model_preds = 0

    for fold, (train_idx, valid_idx) in enumerate(skf.split(train[features], train['Year'])):
        X_train = train.loc[train_idx, features]
        X_valid = train.loc[valid_idx, features]
        y_train = train.loc[train_idx, target]
        y_valid = train.loc[valid_idx, target]

        model = LGBMRegressor(**lgb_params)

        # NOTE(review): `early_stopping_rounds` and `verbose` as fit() keyword
        # arguments are not accepted by LightGBM 4.x (callbacks replace them)
        # — confirm the installed version before upgrading.
        model.fit(
            X_train, y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            categorical_feature=cat_cols_indices,
            early_stopping_rounds=50,
            verbose=1000,
        )

        # Each fold contributes 1/splits of the final test prediction.
        model_preds += model.predict(test[features], num_iteration=model.best_iteration_) / splits
        oof_preds[valid_idx] = model.predict(X_valid, num_iteration=model.best_iteration_)

        print(f"\nFold-{fold+1} | MSE: {mean_squared_error(y_valid, oof_preds[valid_idx])}\n")

    train[target+'_lgb'] = oof_preds
    test[target+'_lgb'] = model_preds

    model_mse = mean_squared_error(train[target], oof_preds)
    print(f"All Folds | MSE: {model_mse}\n")

Training for target: Clearsky DHI

[1000]	training's rmse: 5.9196	valid_1's rmse: 9.71703
[2000]	training's rmse: 3.1786	valid_1's rmse: 8.05282
[3000]	training's rmse: 2.00306	valid_1's rmse: 7.40173
[4000]	training's rmse: 1.37616	valid_1's rmse: 7.10005
[5000]	training's rmse: 1.00644	valid_1's rmse: 6.95402

Fold-1 | MSE: 48.358325888977454

[1000]	training's rmse: 5.9033	valid_1's rmse: 9.69838
[2000]	training's rmse: 3.1662	valid_1's rmse: 8.04582
[3000]	training's rmse: 2.00022	valid_1's rmse: 7.42123
[4000]	training's rmse: 1.38512	valid_1's rmse: 7.13983
[5000]	training's rmse: 1.0063	valid_1's rmse: 6.98973

Fold-2 | MSE: 48.856290836839605

[1000]	training's rmse: 5.91838	valid_1's rmse: 9.934
[2000]	training's rmse: 3.16923	valid_1's rmse: 8.10088
[3000]	training's rmse: 2.00901	valid_1's rmse: 7.43985
[4000]	training's rmse: 1.38455	valid_1's rmse: 7.13671
[5000]	training's rmse: 1.00457	valid_1's rmse: 6.96884

Fold-3 | MSE: 48.56476337053676

[1000]	training's rmse: 5.95

In [18]:
# Attach the out-of-fold predictions to the full (unfiltered) training frame.
# Rows are matched on the complete set of model features; columns present in
# both frames but not used as keys get the `_x` (full frame) / `_y` (filtered
# frame) suffixes. Rows that were filtered out keep NaN in the right-hand columns.
#
# `validate='many_to_one'` makes pandas check that the key combination is
# unique in the filtered frame and raise MergeError otherwise. Without it, a
# repeated key would silently duplicate rows and shift every later row out of
# position.
Xtrain = pd.merge(
    train_org,
    train,
    how='left',
    on=features,
    sort=False,
    validate='many_to_one'
)

print(f"Xtrain: {Xtrain.shape}")
Xtrain.head()

Xtrain: (175296, 400)


Unnamed: 0,Year,Hour_x,Minute,Clearsky DHI_x,Clearsky DNI_x,Clearsky GHI_x,Cloud Type,Dew Point,Temperature,Pressure,...,Humidity * Wind-72abs_shift,Humidity * Wind+72abs_shift,Hour_y,Clearsky DHI_y,Clearsky DNI_y,Clearsky GHI_y,Solar Zenith Angle_y,Clearsky DHI_lgb,Clearsky DNI_lgb,Clearsky GHI_lgb
0,2009,0,0,0,0,0,0,-1.153846,-1.153846,1010,...,1.449289,0.765293,,,,,,,,
1,2009,0,30,0,0,0,0,-1.076923,-1.153846,1010,...,1.554479,0.560281,,,,,,,,
2,2009,1,0,0,0,0,4,-1.153846,-1.153846,1010,...,1.554188,0.24977,,,,,,,,
3,2009,1,30,0,0,0,4,-1.153846,-1.230769,1010,...,1.505634,0.158888,,,,,,,,
4,2009,2,0,0,0,0,4,-1.153846,-1.230769,1010,...,1.42321,-0.05945,,,,,,,,


In [19]:
# Attach the fold-averaged predictions to the full (unfiltered) test frame.
# Rows are matched on the complete set of model features; columns present in
# both frames but not used as keys get the `_x` (full frame) / `_y` (filtered
# frame) suffixes. Rows that were filtered out keep NaN in the right-hand columns.
#
# `validate='many_to_one'` makes pandas check that the key combination is
# unique in the filtered frame and raise MergeError otherwise. Without it, a
# repeated key would silently duplicate rows, and the submission, which copies
# predictions by row position, would be misaligned.
Xtest = pd.merge(
    test_org,
    test,
    how='left',
    on=features,
    sort=False,
    validate='many_to_one'
)

print(f"Xtest: {Xtest.shape}")
Xtest.head()

Xtest: (17520, 400)


Unnamed: 0,Year,Hour_x,Minute,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,Solar Zenith Angle_x,Precipitable Water,...,Humidity * Wind-72abs_shift,Humidity * Wind+72abs_shift,Hour_y,Solar Zenith Angle_y,Clearsky DHI_y,Clearsky DNI_y,Clearsky GHI_y,Clearsky DHI_lgb,Clearsky DNI_lgb,Clearsky GHI_lgb
0,2019,0,0,7,0.261538,-0.092308,1008,0.416255,106.23,0.235775,...,1.394427,1.02481,,,,,,,,
1,2019,0,30,3,0.261538,-0.107692,1008,0.453956,112.36,0.235775,...,1.53456,1.117081,,,,,,,,
2,2019,1,0,3,0.246154,-0.115385,1008,0.436341,118.58,0.235775,...,1.70875,1.161952,,,,,,,,
3,2019,1,30,3,0.246154,-0.130769,1008,0.474351,124.86,0.235775,...,1.85366,1.214503,,,,,,,,
4,2019,2,0,0,0.230769,-0.153846,1008,0.478368,131.2,0.272964,...,1.917933,1.204656,,,,,,,,


In [20]:
# Post-process the predictions with boolean masks instead of five row-by-row
# passes over the whole frame; the resulting values are the same.
lgb_cols = ['Clearsky DHI_lgb', 'Clearsky DNI_lgb', 'Clearsky GHI_lgb']

# Rows excluded from modelling (zenith angle >= 93 or hour in 1..9) get a
# prediction of 0 for all three targets.
excluded_rows = (
    (Xtrain['Solar Zenith Angle_x'] >= 93)
    | Xtrain['Hour_x'].isin([1,2,3,4,5,6,7,8,9])
)
Xtrain.loc[excluded_rows, lgb_cols] = 0

# Wherever the DHI prediction is exactly 0, DNI and GHI are forced to 0 too.
dhi_is_zero = Xtrain['Clearsky DHI_lgb'] == 0
Xtrain.loc[dhi_is_zero, ['Clearsky DNI_lgb', 'Clearsky GHI_lgb']] = 0

Xtrain.head()

Unnamed: 0,Year,Hour_x,Minute,Clearsky DHI_x,Clearsky DNI_x,Clearsky GHI_x,Cloud Type,Dew Point,Temperature,Pressure,...,Humidity * Wind-72abs_shift,Humidity * Wind+72abs_shift,Hour_y,Clearsky DHI_y,Clearsky DNI_y,Clearsky GHI_y,Solar Zenith Angle_y,Clearsky DHI_lgb,Clearsky DNI_lgb,Clearsky GHI_lgb
0,2009,0,0,0,0,0,0,-1.153846,-1.153846,1010,...,1.449289,0.765293,,,,,,0.0,0.0,0.0
1,2009,0,30,0,0,0,0,-1.076923,-1.153846,1010,...,1.554479,0.560281,,,,,,0.0,0.0,0.0
2,2009,1,0,0,0,0,4,-1.153846,-1.153846,1010,...,1.554188,0.24977,,,,,,0.0,0.0,0.0
3,2009,1,30,0,0,0,4,-1.153846,-1.230769,1010,...,1.505634,0.158888,,,,,,0.0,0.0,0.0
4,2009,2,0,0,0,0,4,-1.153846,-1.230769,1010,...,1.42321,-0.05945,,,,,,0.0,0.0,0.0


In [21]:
# MSE of the post-processed predictions against the targets over the full
# training frame (the `_x` columns come from the unfiltered frame).
dhi_mse = mean_squared_error(Xtrain['Clearsky DHI_x'], Xtrain['Clearsky DHI_lgb'])
dni_mse = mean_squared_error(Xtrain['Clearsky DNI_x'], Xtrain['Clearsky DNI_lgb'])
ghi_mse = mean_squared_error(Xtrain['Clearsky GHI_x'], Xtrain['Clearsky GHI_lgb'])

print(f"""
Clearsky DHI: {dhi_mse}
Clearsky DNI: {dni_mse}
Clearsky GHI: {ghi_mse}
"""
)


Clearsky DHI: 25.26175035192573
Clearsky DNI: 227.7344457447344
Clearsky GHI: 7.75056636434122



In [22]:
# Post-process the predictions with boolean masks instead of five row-by-row
# passes over the whole frame; the resulting values are the same.
lgb_cols = ['Clearsky DHI_lgb', 'Clearsky DNI_lgb', 'Clearsky GHI_lgb']

# Rows excluded from modelling (zenith angle >= 93 or hour in 1..9) get a
# prediction of 0 for all three targets.
excluded_rows = (
    (Xtest['Solar Zenith Angle_x'] >= 93)
    | Xtest['Hour_x'].isin([1,2,3,4,5,6,7,8,9])
)
Xtest.loc[excluded_rows, lgb_cols] = 0

# Wherever the DHI prediction is exactly 0, DNI and GHI are forced to 0 too.
dhi_is_zero = Xtest['Clearsky DHI_lgb'] == 0
Xtest.loc[dhi_is_zero, ['Clearsky DNI_lgb', 'Clearsky GHI_lgb']] = 0

Xtest.head()

Unnamed: 0,Year,Hour_x,Minute,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,Solar Zenith Angle_x,Precipitable Water,...,Humidity * Wind-72abs_shift,Humidity * Wind+72abs_shift,Hour_y,Solar Zenith Angle_y,Clearsky DHI_y,Clearsky DNI_y,Clearsky GHI_y,Clearsky DHI_lgb,Clearsky DNI_lgb,Clearsky GHI_lgb
0,2019,0,0,7,0.261538,-0.092308,1008,0.416255,106.23,0.235775,...,1.394427,1.02481,,,,,,0.0,0.0,0.0
1,2019,0,30,3,0.261538,-0.107692,1008,0.453956,112.36,0.235775,...,1.53456,1.117081,,,,,,0.0,0.0,0.0
2,2019,1,0,3,0.246154,-0.115385,1008,0.436341,118.58,0.235775,...,1.70875,1.161952,,,,,,0.0,0.0,0.0
3,2019,1,30,3,0.246154,-0.130769,1008,0.474351,124.86,0.235775,...,1.85366,1.214503,,,,,,0.0,0.0,0.0
4,2019,2,0,0,0.230769,-0.153846,1008,0.478368,131.2,0.272964,...,1.917933,1.204656,,,,,,0.0,0.0,0.0


In [23]:
# Persist both frames, including the prediction columns, as gzip-compressed pickles.
for frame, pickle_path in ((Xtrain, './train_lgb.pkl'), (Xtest, './test_lgb.pkl')):
    frame.to_pickle(pickle_path, compression='gzip')

## Create submission file

In [24]:
# Fill the sample submission with the post-processed test predictions.
submission = pd.read_csv("../input/mh-wsmlc/sample_submission.csv")

# Predictions are copied row for row, so both frames must have the same number
# of rows. Fail loudly instead of writing a silently misaligned or
# NaN-padded file.
if len(submission) != len(Xtest):
    raise ValueError(
        f"submission has {len(submission)} rows but Xtest has {len(Xtest)}"
    )

for target_col in ['Clearsky DHI', 'Clearsky DNI', 'Clearsky GHI']:
    # `.values` copies by position, independent of either frame's index.
    submission[target_col] = Xtest[target_col + '_lgb'].values

submission.to_csv("./lgb_submission.csv", index=False)
submission.head()

Unnamed: 0,Clearsky DHI,Clearsky DNI,Clearsky GHI
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0
