In [1]:
#Library for importing csv files
import pandas as pd

#Library for matrix operations
import numpy as np

#Open source library for handling global holidays
import holidays

#Set random seeds to ensure that the model can be reproduced
import random
np.random.seed(2023)
random.seed(2023)

In [4]:
train_df=pd.read_csv("train_df.csv")
print(f"len(train_df):{len(train_df)}")
train_df.head()

len(train_df):263160


Unnamed: 0,id,id_encode,hour,f1,f2,parking_free,year,month,day,power
0,0,0.0,0.0,0.0,0.0,1.0,2022.0,4.0,15.0,0.0
1,1,0.0,1.0,0.0,0.0,1.0,2022.0,4.0,15.0,0.0
2,2,0.0,2.0,0.0,0.0,1.0,2022.0,4.0,15.0,0.0
3,3,0.0,3.0,0.0,0.0,1.0,2022.0,4.0,15.0,0.0
4,4,0.0,4.0,0.0,0.0,1.0,2022.0,4.0,15.0,0.0


In [5]:
test_df=pd.read_csv("test_df.csv")
print(f"len(test_df):{len(test_df)}")
test_df.head()

len(test_df):17136


Unnamed: 0,id,id_encode,hour,f1,f2,parking_free,year,month,day
0,8401,0,0,2,178.0,1,2023,4,1
1,8402,0,1,2,178.0,1,2023,4,1
2,8403,0,2,2,178.0,1,2023,4,1
3,8404,0,3,2,178.0,1,2023,4,1
4,8405,0,4,2,178.0,1,2023,4,1


In [7]:
total_df=pd.concat((train_df,test_df),axis=0)

#forward_fill: Missing values ​​are filled according to the previous value
total_df.fillna(method='ffill', inplace=True)

total_df['sin_month']=np.sin(2*np.pi*total_df['month']/12)
total_df['cos_month']=np.cos(2*np.pi*total_df['month']/12)

total_df['spring']=(total_df['month']>=3)&(total_df['month']<=5)
total_df['summer']=(total_df['month']>=6)&(total_df['month']<=8)
total_df['fall']=(total_df['month']>=9)&(total_df['month']<=11)
total_df['winter']=(total_df['month']==12)&(total_df['month']<=2)

total_df['sin_day']=np.sin(2*np.pi*total_df['day']/30)
total_df['cos_day']=np.cos(2*np.pi*total_df['day']/30)

total_df['up_of_month']=(total_df['day']<=10)
total_df['down_of_month']=(total_df['day']>20)
total_df['morning']=(total_df['hour']>5)&(total_df['hour']<=12)
total_df['afternoon']=(total_df['hour']>12)&(total_df['hour']<=19)
total_df['evening']=1-total_df['morning']-total_df['afternoon']

total_df['ds']=pd.to_datetime(total_df[['year','month','day','hour']])
total_df['weekday']=total_df['ds'].dt.weekday
total_df['sin_week']=np.sin(2*np.pi*total_df['weekday']/7)
total_df['cos_week']=np.cos(2*np.pi*total_df['weekday']/7)
total_df['is_friday']=(total_df['weekday']==4)
total_df['is_weekend']=(total_df['weekday']==5)|(total_df['weekday']==6)
total_df['day_of_year']=total_df['ds'].dt.dayofyear

#GetChina's holidays
# holiday = holidays.China()
# total_df['ds']=pd.to_datetime(total_df[['year','month','day','hour']])
# ds=total_df['ds'].values
# is_holiday = [0 if holiday.get(pd.to_datetime(ds[i]))=="None" else 1 for i in range(len(ds))]
# total_df['is_holiday']=is_holiday

total_df['sin_hour']=np.sin(2*np.pi*total_df['hour']/24)
total_df['cos_hour']=np.cos(2*np.pi*total_df['hour']/24)


total_df.drop(['ds'],axis=1,inplace=True)
total_df.head()

Unnamed: 0,id,id_encode,hour,f1,f2,parking_free,year,month,day,power,...,afternoon,evening,weekday,sin_week,cos_week,is_friday,is_weekend,day_of_year,sin_hour,cos_hour
0,0,0.0,0.0,0.0,0.0,1.0,2022.0,4.0,15.0,0.0,...,False,1,4,-0.433884,-0.900969,True,False,105,0.0,1.0
1,1,0.0,1.0,0.0,0.0,1.0,2022.0,4.0,15.0,0.0,...,False,1,4,-0.433884,-0.900969,True,False,105,0.258819,0.965926
2,2,0.0,2.0,0.0,0.0,1.0,2022.0,4.0,15.0,0.0,...,False,1,4,-0.433884,-0.900969,True,False,105,0.5,0.866025
3,3,0.0,3.0,0.0,0.0,1.0,2022.0,4.0,15.0,0.0,...,False,1,4,-0.433884,-0.900969,True,False,105,0.707107,0.707107
4,4,0.0,4.0,0.0,0.0,1.0,2022.0,4.0,15.0,0.0,...,False,1,4,-0.433884,-0.900969,True,False,105,0.866025,0.5


In [8]:
train_df=total_df[:len(train_df)]
keys=train_df.keys()
for key in keys:
    values=np.unique(train_df[key].values)#Get the value of each column

    if len(values)<=500 and key!="power":
        #print(f"key:{key},values:{values}")
        key_target=train_df['power'].groupby([train_df[key]]).mean()
        keys=key_target.keys().values
        target=key_target.values
        key_target=pd.DataFrame({key:keys,key+"_target_mean":target})
        total_df=pd.merge(total_df,key_target,on=key,how="left")
        key_target=train_df['power'].groupby([train_df[key]]).std()
        keys=key_target.keys().values
        target=key_target.values
        key_target=pd.DataFrame({key:keys,key+"_target_std":target})
        total_df=pd.merge(total_df,key_target,on=key,how="left")

#Extract all data according to 25% and 75%, low and high will be out_of_memory
train_df=total_df[:len(train_df)]
test_df=total_df[len(train_df):]
del total_df
print(f"total_feature_counts:{len(train_df.keys().values)}")

total_feature_counts:87


In [9]:
from sklearn.model_selection import KFold #Import the k-fold cross validation function in the machine learning library
from lightgbm import LGBMRegressor #Import the integrated learning algorithm lightgbm
def RMSE(y_true,y_pred):
    return np.sqrt(np.mean((y_true-y_pred)**2))

In [11]:
print("start fit.")
folds = 10 #Divide the data into 10 parts
y=train_df['power']
X=train_df.drop(['power'],axis=1)

train_RMSE=[]
valid_RMSE=[]
# Store the list of learned models
models = []

#Shuffle the data set randomly and divide it into folds
kf = KFold(n_splits=folds, shuffle=True, random_state=2023)

#Divide x_train into a training set and a verification set in a ratio of 9:1, and take out the subscripts
for train_index, valid_index in kf.split(X):

    #Get the data of the training set and validation set based on the subscripts
    x_train_cv = X.iloc[train_index]
    y_train_cv = y.iloc[train_index]
    x_valid_cv =X.iloc[valid_index]
    y_valid_cv = y.iloc[valid_index]

    #Call the LightGBM regression model and add parameters
    model = LGBMRegressor(colsample_bytree=0.6503468706312049,
              learning_rate=0.020010277043886332, max_bin=127,
              min_child_samples=8, n_estimators=1001,
              early_stopping_rounds = 100, #If the accuracy has not improved after training for 100 times, stop training.
              num_leaves=402,reg_alpha=0.01631050699150689,
              reg_lambda=0.01300300057057842,verbose=-1)


    #Use x_train_cv to train the model, and use x_train_cv and x_valid_cv to evaluate together
    model.fit(
        x_train_cv,
        y_train_cv,
        eval_set = [(x_train_cv, y_train_cv), (x_valid_cv, y_valid_cv)],
        #verbose = 100, #Iterate 100 times and output a result
    )

    #Predict the training set
    y_pred_train = model.predict(x_train_cv, num_iteration=model.best_iteration_)
    #Predict on the validation set
    y_pred_valid = model.predict(x_valid_cv, num_iteration=model.best_iteration_)

    train_rmse=RMSE(y_pred_train,y_train_cv)
    valid_rmse=RMSE(y_pred_valid,y_valid_cv)

    train_RMSE.append(train_rmse)
    valid_RMSE.append(valid_rmse)
    #Save model into list
    models.append(model)
    print(f"train_RMSE:{train_RMSE},valid_RMSE:{valid_RMSE}")

train_RMSE=np.array(train_RMSE)
valid_RMSE=np.array(valid_RMSE)

print(f"mean_train_RMSE:{np.mean(train_RMSE)}")
print(f"mean_valid_RMSE:{np.mean(valid_RMSE)}")

start fit.
train_RMSE:[23.194480956562046],valid_RMSE:[31.474384618192367]
train_RMSE:[23.194480956562046, 23.212277481245703],valid_RMSE:[31.474384618192367, 31.655733641282897]
train_RMSE:[23.194480956562046, 23.212277481245703, 23.142818149800085],valid_RMSE:[31.474384618192367, 31.655733641282897, 31.943070837266777]
train_RMSE:[23.194480956562046, 23.212277481245703, 23.142818149800085, 23.168241227707266],valid_RMSE:[31.474384618192367, 31.655733641282897, 31.943070837266777, 30.45274609680333]
train_RMSE:[23.194480956562046, 23.212277481245703, 23.142818149800085, 23.168241227707266, 23.19372150329578],valid_RMSE:[31.474384618192367, 31.655733641282897, 31.943070837266777, 30.45274609680333, 30.962065577803127]
train_RMSE:[23.194480956562046, 23.212277481245703, 23.142818149800085, 23.168241227707266, 23.19372150329578, 23.17651734467083],valid_RMSE:[31.474384618192367, 31.655733641282897, 31.943070837266777, 30.45274609680333, 30.962065577803127, 31.28622114690404]
train_RMSE:[

In [12]:
test_X=test_df.drop(['power'],axis=1).values
#Use each saved model to predict x_test once, and then take the average
preds_test = []

for model in models:

    pred = model.predict(test_X, num_iteration=model.best_iteration_)

    preds_test.append(pred)

#Convert the prediction results into np.array
preds_test_np = np.array(preds_test)
#Average the prediction results of each model by column
test_pred= preds_test_np.mean(axis=0)
test_pred=np.where(test_pred<=0,0,test_pred)
test_pred[:50]

array([0.00000000e+00, 4.55014405e-01, 6.24000967e+00, 7.20728026e+00,
       6.18965131e-01, 4.17385249e-01, 2.34421971e+00, 3.58776790e+00,
       2.72461383e+00, 3.70978294e+00, 4.83750517e+00, 5.75435185e+00,
       9.30031335e+00, 7.65141756e+00, 8.54818668e+00, 1.34758436e+01,
       6.59925831e+01, 2.31452822e+02, 4.23995150e+02, 5.73849239e+02,
       3.52254583e+02, 1.60894171e+02, 1.51045073e+01, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 2.15682547e-01, 7.19736897e-01,
       0.00000000e+00, 0.00000000e+00, 3.93235257e-03, 1.15068104e+00,
       5.65058446e-01, 2.18732613e+00, 2.88797286e+00, 5.30516938e+00,
       8.76200833e-01, 5.95387748e+00, 9.68250408e+00, 7.77868637e+00,
       5.29908599e+01, 2.25671497e+02, 4.02227661e+02, 5.75869684e+02,
       3.83619157e+02, 1.91170755e+02, 2.57700688e+01, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00])

In [13]:
submission=pd.read_csv("sample_submission.csv")
submission['power']=test_pred
submission.to_csv("baseline.csv",index=None)
submission.head()

Unnamed: 0,id,power
0,8401,0.0
1,8402,0.455014
2,8403,6.24001
3,8404,7.20728
4,8405,0.618965


In [16]:
import torch
from torch.utils.data import DataLoader, TensorDataset

In [51]:
X_train_tensor.shape

torch.Size([263160, 86])

In [76]:
y_train_tensor.shape

torch.Size([263160])

In [57]:
X_train_tensor = torch.Tensor(train_df.drop(['power'],axis=1).values.astype(np.float32))
y_train_tensor = torch.Tensor(train_df['power'].values.astype(np.float32))
# y_train_tensor = y_train_tensor.unsqueeze(dim=0)

X_test_tensor = torch.Tensor(test_df.drop(['power'],axis=1).values.astype(np.float32))
y_test_tensor = torch.Tensor(test_df['power'].values.astype(np.float32))
# y_test_tensor = y_test_tensor.unsqueeze(dim=0)

In [72]:
batch_size = 32

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [28]:
import torch.nn as nn
import torch.optim as optim

In [35]:
device = "cpu"

In [73]:
class RegressionModel(nn.Module):
    def __init__(self, input_features, hidden_units, output_size):
        super().__init__()
        self.linear_layer_stack = nn.Sequential(
            nn.Linear(in_features=input_features, out_features=hidden_units),
            nn.ReLU(),
            nn.Linear(in_features=hidden_units, out_features=hidden_units),
            nn.ReLU(),
            nn.Linear(in_features=hidden_units, out_features=hidden_units),
            nn.ReLU(),
            nn.Linear(in_features=hidden_units, out_features=output_size),
        )

    def forward(self, x):
        return self.linear_layer_stack(x)

In [74]:
NN_model = RegressionModel(input_features= X_train_tensor.shape[1],
                    output_size=1,
                    hidden_units= 32).to(device)

In [64]:
criterion = nn.MSELoss()  # Mean Squared Error for regression
optimizer = optim.Adam(NN_model.parameters(), lr=0.001)

In [99]:
NN_model.train()
for inputs, targets in train_loader:
    output = NN_model(inputs)
    print(output.shape)
    targets = targets.unsqueeze(dim=1)
    print(targets.shape)
    loss = criterion(output, targets)
    print(loss.item())
    break

torch.Size([1, 32])
torch.Size([32, 86])
torch.Size([32])


In [100]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    NN_model.train()
    for inputs, targets in train_loader:
        targets = targets.unsqueeze(dim=1)
        outputs = NN_model(inputs)
        loss = criterion(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Evaluation on the test set using RMSE
NN_model.eval()
with torch.no_grad():
    rmse_sum = 0.0
    num_samples = 0
    for inputs, targets in train_loader:
        targets = targets.unsqueeze(dim=1)
        test_outputs = NN_model(inputs)
        test_loss = criterion(test_outputs, targets)
        print(test_loss)
        rmse_sum += torch.sqrt(test_loss).item()  # Take the square root for RMSE
        # rmse_sum += test_loss
        num_samples += len(targets)

rmse_test = rmse_sum / num_samples
print(f'Root Mean Squared Error on Test Set: {rmse_test}')

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


KeyboardInterrupt: ignored