In [None]:
#Library for importing csv files
import pandas as pd

#Library for matrix operations
import numpy as np

#Open source library for handling global holidays
import holidays 

# Seed both random number generators used in this notebook (Python's `random`
# and NumPy's legacy global generator) with one shared value so that a
# restart-and-run-all produces the same random draws.
import random
SEED = 2023
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Read the training table from the Kaggle input directory, report its row
# count, and preview the first rows as the cell output.
train_df = pd.read_csv(
    "/kaggle/input/predict-of-electric-power/prediction of power/train_df.csv"
)
print(f"len(train_df):{len(train_df)}")
train_df.head()

In [None]:
# Read the test table from the Kaggle input directory, report its row count,
# and preview the first rows as the cell output.
test_df = pd.read_csv(
    "/kaggle/input/predict-of-electric-power/prediction of power/test_df.csv"
)
print(f"len(test_df):{len(test_df)}")
test_df.head()

In [None]:
# Feature engineering on train and test together.
# The two frames are stacked (train rows first) so that every calendar feature
# below is computed identically for both; later cells split them again by
# position, so the row order must not change here.
total_df = pd.concat((train_df, test_df), axis=0)

# Forward fill: every missing value takes the value of the previous row.
# `fillna(method='ffill')` is deprecated in pandas 2.x; `ffill()` is equivalent.
total_df = total_df.ffill()

month = total_df['month']
day = total_df['day']
hour = total_df['hour']

# Cyclic encoding of the month (period 12).
total_df['sin_month'] = np.sin(2 * np.pi * month / 12)
total_df['cos_month'] = np.cos(2 * np.pi * month / 12)

# Season indicators.
total_df['spring'] = (month >= 3) & (month <= 5)
total_df['summer'] = (month >= 6) & (month <= 8)
total_df['fall'] = (month >= 9) & (month <= 11)
# Fix: winter wraps around the year end (Dec, Jan, Feb). The previous
# `(month==12) & (month<=2)` can never be true, so the flag was always False.
total_df['winter'] = (month == 12) | (month <= 2)

# Cyclic encoding of the day of month.
# NOTE(review): the period is 30 although months have up to 31 days, so day 31
# gets the same encoding as day 1. Kept as in the original.
total_df['sin_day'] = np.sin(2 * np.pi * day / 30)
total_df['cos_day'] = np.cos(2 * np.pi * day / 30)

# First third / last third of the month.
total_df['up_of_month'] = (day <= 10)
total_df['down_of_month'] = (day > 20)

# Part of day: morning is hours 6-12, afternoon is hours 13-19, and evening is
# everything else (1 minus the two boolean flags gives a 0/1 integer).
total_df['morning'] = (hour > 5) & (hour <= 12)
total_df['afternoon'] = (hour > 12) & (hour <= 19)
total_df['evening'] = 1 - total_df['morning'] - total_df['afternoon']

# Temporary timestamp column, used only to derive weekday / day-of-year /
# holiday features; it is dropped again at the end of the cell.
total_df['ds'] = pd.to_datetime(total_df[['year', 'month', 'day', 'hour']])
total_df['weekday'] = total_df['ds'].dt.weekday
total_df['sin_week'] = np.sin(2 * np.pi * total_df['weekday'] / 7)
total_df['cos_week'] = np.cos(2 * np.pi * total_df['weekday'] / 7)
total_df['is_friday'] = (total_df['weekday'] == 4)
total_df['is_weekend'] = (total_df['weekday'] == 5) | (total_df['weekday'] == 6)
total_df['day_of_year'] = total_df['ds'].dt.dayofyear

# Chinese public holidays.
# Fix: the old test compared the lookup result with the *string* "None", which
# is never equal to the `None` object returned for ordinary days, so every row
# was flagged as a holiday. Membership (`date in holiday`) is the correct test.
# The lookup runs once per distinct calendar date instead of once per row.
holiday = holidays.China()
calendar_dates = total_df['ds'].dt.date
holiday_flag_by_date = {d: int(d in holiday) for d in calendar_dates.unique()}
# `.to_numpy()` assigns positionally, like the original list assignment.
total_df['is_holiday'] = calendar_dates.map(holiday_flag_by_date).to_numpy()

# Cyclic encoding of the hour (period 24).
total_df['sin_hour'] = np.sin(2 * np.pi * hour / 24)
total_df['cos_hour'] = np.cos(2 * np.pi * hour / 24)

# The raw timestamp is not a model feature.
total_df = total_df.drop(columns=['ds'])
total_df.head()

In [None]:
# Target encoding.
# The training rows are the first len(train_df) rows of total_df; re-slice
# them so they carry the engineered features.
train_df = total_df[:len(train_df)]

# For every column except the target that has at most 500 distinct values in
# the training rows, attach two new columns to total_df: the mean and the
# standard deviation of `power` over the training rows sharing that value.
# NOTE(review): these statistics use every training row, including the rows
# that later serve as validation folds, so the cross-validated RMSE is likely
# optimistic. Confirm whether out-of-fold encoding is wanted.
TARGET_STATS = (("_target_mean", "mean"), ("_target_std", "std"))
for column in train_df.keys():
    distinct_values = np.unique(train_df[column].values)
    if len(distinct_values) > 500 or column == "power":
        continue

    power_by_level = train_df['power'].groupby([train_df[column]])
    for suffix, stat_name in TARGET_STATS:
        stat = getattr(power_by_level, stat_name)()
        lookup = pd.DataFrame({
            column: stat.keys().values,
            column + suffix: stat.values,
        })
        # A left merge keeps every row of total_df in its original order.
        total_df = pd.merge(total_df, lookup, on=column, how="left")

# Original author's note, presumably: also encoding the 25% / 75% quantiles,
# low and high ran out of memory, so only mean and std are kept.
# Split back into train and test by position and release the combined frame.
n_train_rows = len(train_df)
train_df = total_df[:n_train_rows]
test_df = total_df[n_train_rows:]
del total_df
print(f"total_feature_counts:{len(train_df.keys().values)}")

In [None]:
from sklearn.model_selection import KFold #Import the k-fold cross validation function in the machine learning library
from lightgbm import LGBMRegressor #Import the integrated learning algorithm lightgbm
def RMSE(y_true,y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Both inputs are converted to float NumPy arrays first, so values are
    compared position by position. This makes the function accept plain
    lists and prevents pandas from aligning two Series on their index
    labels, which would silently produce NaN for non-matching indexes.

    Args:
        y_true: array-like of observed values.
        y_pred: array-like of predicted values, same length as ``y_true``.

    Returns:
        The RMSE as a NumPy float.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

In [None]:
# K-fold cross-validated training of LightGBM regressors.
# One model is trained per fold and kept in `models`; the next cell averages
# their predictions on the test set.

# Early stopping and periodic logging go through LightGBM's callback API:
# `fit(verbose=...)` was removed in LightGBM 4.x and raises a TypeError there.
from lightgbm import early_stopping, log_evaluation

print("start fit.")
folds = 10 #Divide the data into 10 parts
y = train_df['power']
X = train_df.drop(['power'], axis=1)

train_RMSE = []
valid_RMSE = []
# Store the list of learned models
models = []

#Shuffle the data set randomly and divide it into folds
kf = KFold(n_splits=folds, shuffle=True, random_state=2023)

#Each split uses 9 parts for training and 1 part for validation (row positions)
for train_index, valid_index in kf.split(X):

    #Select the training and validation rows of this fold by position
    x_train_cv = X.iloc[train_index]
    y_train_cv = y.iloc[train_index]
    x_valid_cv = X.iloc[valid_index]
    y_valid_cv = y.iloc[valid_index]

    #LightGBM regression model with the hyper-parameters of the original notebook
    model = LGBMRegressor(colsample_bytree=0.6503468706312049,
              learning_rate=0.020010277043886332, max_bin=127,
              min_child_samples=8, n_estimators=1001,
              num_leaves=402, reg_alpha=0.01631050699150689,
              reg_lambda=0.01300300057057842, verbose=-1)

    #Train on x_train_cv; evaluate on both the training and the validation part
    model.fit(
        x_train_cv,
        y_train_cv,
        eval_set=[(x_train_cv, y_train_cv), (x_valid_cv, y_valid_cv)],
        callbacks=[
            #Stop when the validation metric has not improved for 100 rounds
            early_stopping(stopping_rounds=100),
            #Print the evaluation results every 100 iterations
            log_evaluation(period=100),
        ],
    )

    #Predict the training part and the validation part with the best iteration
    y_pred_train = model.predict(x_train_cv, num_iteration=model.best_iteration_)
    y_pred_valid = model.predict(x_valid_cv, num_iteration=model.best_iteration_)

    #Arguments follow the RMSE(y_true, y_pred) signature (the value is symmetric)
    train_rmse = RMSE(y_train_cv, y_pred_train)
    valid_rmse = RMSE(y_valid_cv, y_pred_valid)

    train_RMSE.append(train_rmse)
    valid_RMSE.append(valid_rmse)
    #Save model into list
    models.append(model)
    #Prints the scores of all folds finished so far
    print(f"train_RMSE:{train_RMSE},valid_RMSE:{valid_RMSE}")

train_RMSE = np.array(train_RMSE)
valid_RMSE = np.array(valid_RMSE)

print(f"mean_train_RMSE:{np.mean(train_RMSE)}")
print(f"mean_valid_RMSE:{np.mean(valid_RMSE)}")

In [None]:
# Predict the test set with every fold model and average the predictions.
# The features are passed as a DataFrame, exactly as during training: `.values`
# would coerce the mixed bool/numeric columns into a single-dtype array and
# drop the column names the models were fitted with.
test_X = test_df.drop(['power'], axis=1)

#One prediction vector per saved fold model, each at its best iteration
preds_test = [
    model.predict(test_X, num_iteration=model.best_iteration_)
    for model in models
]

#Stack into an array with one row per model and one column per test row
preds_test_np = np.array(preds_test)
#Average across models, i.e. column-wise
test_pred = preds_test_np.mean(axis=0)
#Replace non-positive predictions with 0
test_pred = np.where(test_pred <= 0, 0, test_pred)
test_pred[:50]

In [None]:
# Load the sample submission file, overwrite its `power` column with the
# averaged test predictions, and save it without the index column.
submission = pd.read_csv(
    "/kaggle/input/predict-of-electric-power/prediction of power/sample_submission.csv"
)
submission['power'] = test_pred
submission.to_csv("baseline.csv", index=False)
submission.head()