# Data manipulation
includes importing the data and feature engineering

In [1]:
#Library for importing csv files
import pandas as pd
#Library for matrix operations
import numpy as np
#Set random seeds to ensure that the model can be reproduced
import random
np.random.seed(42)
random.seed(42)

## importing training data and test data

- important note: the the test dataframe refers to the dataset which we want to predict the values and measure the RMSE, its not involved in the training and test phase. It is used after the traing is done.

In [2]:
# importing data which is in .csv format
train_df=pd.read_csv("train_df.csv")
print(f"len(train_df):{len(train_df)}")
test_df=pd.read_csv("test_df.csv")
print(f"len(test_df):{len(test_df)}")

len(train_df):329304
len(test_df):17136


In [3]:
# concatinating the training and test data frames for futher data manipulation
total_df=pd.concat((train_df,test_df),axis=0)
# dropping f1 and f2 columns in the dataset since it is not informative
total_df.drop(["f1","f2"], axis=1,inplace= True)
print(total_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 346440 entries, 0 to 17135
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            346440 non-null  int64  
 1   id_encode     346440 non-null  int64  
 2   hour          346440 non-null  int64  
 3   parking_free  346440 non-null  int64  
 4   year          346440 non-null  int64  
 5   month         346440 non-null  int64  
 6   day           346440 non-null  int64  
 7   power         329304 non-null  float64
dtypes: float64(1), int64(7)
memory usage: 23.8 MB
None


In [4]:
#forward_fill: Missing values ​​are filled according to the previous value
total_df.fillna(method='ffill', inplace=True)
total_df["day_sin"] = np.sin(total_df["day"]*(2*np.pi / 30))
total_df["day_cos"] = np.cos(total_df["day"]*(2*np.pi / 30))
total_df['sin_month']=np.sin(2*np.pi*total_df['month']/12)
total_df['cos_month']=np.cos(2*np.pi*total_df['month']/12)
total_df['sin_hour']=np.sin(2*np.pi*total_df['hour']/24)
total_df['cos_hour']=np.cos(2*np.pi*total_df['hour']/24)
total_df.drop(columns= ["day", "month","hour"], axis=1, inplace=True)

In [5]:
total_df.drop(columns=["id", "year", "parking_free"], inplace= True)
print(total_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 346440 entries, 0 to 17135
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   id_encode  346440 non-null  int64  
 1   power      346440 non-null  float64
 2   Day sin    346440 non-null  float64
 3   Day cos    346440 non-null  float64
 4   sin month  346440 non-null  float64
 5   cos month  346440 non-null  float64
 6   sin_hour   346440 non-null  float64
 7   cos_hour   346440 non-null  float64
dtypes: float64(7), int64(1)
memory usage: 23.8 MB
None


## Train and test Data Frames
extracing test_df and train_df from our total_df with new features

In [6]:
main_df=total_df[:len(train_df)]
prediction_df=total_df[len(train_df):]

print(f"train set length : {len(main_df)}\nprediction set length: {len(prediction_df)}")

train set length : 329304
prediction set length: 17136


## normalizing the power column
Normalizing using the Standard Deviation Normalization method

In [7]:
main_df_power_mean = main_df["power"].mean()
main_df_power_std = main_df["power"].std()

In [8]:
main_df["power"] = (main_df["power"] - main_df_power_mean) / main_df_power_std

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df["power"] = (main_df["power"] - main_df_power_mean) / main_df_power_std


# Gradient Boosting Ensemble Method
using LGBMRegressor for predictions

In [9]:
from sklearn.model_selection import KFold #Import the k-fold cross validation function in the machine learning library
from lightgbm import LGBMRegressor #Import the integrated learning algorithm lightgbm
import lightgbm as lgb
def RMSE(y_true,y_pred):
    return np.sqrt(np.mean((y_true-y_pred)**2))

## Tunning Hyperparameters
Tuning hypereparameters of the lgm model using bayesian optimization

In [None]:
!pip install scikit-learn
!pip install scikit-optimize
!pip install matplotlib
!pip install bayesian-optimization

In [11]:
## importing libraries
from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split

# laoding dataset
y = main_df['power']
X = main_df.drop(['power'],axis=1)

# defining the optimization function for the bayesian Optimization
def lgbm_cv(num_leaves, learning_rate, n_estimators, subsample, colsample_bytree, reg_alpha, reg_lambda):
    params = {
        'num_leaves': int(num_leaves),
        'learning_rate': learning_rate,
        'n_estimators': int(n_estimators),
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'objective': 'regression',
        'boosting_type': 'gbdt',
        'random_state': 42,
        'n_jobs': -1
    }

    # Set up k-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = []

    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Create and train the LGBMRegressor model
        model = lgb.LGBMRegressor(**params)
        model.fit(X_train, y_train)

        # Evaluate the model on the validation set
        score = model.score(X_val, y_val)
        scores.append(score)

    # Calculate the mean score across folds
    mean_score = np.mean(scores)

    return mean_score


In [12]:
# Define the search space for hyperparameters
pbounds = {
    'num_leaves': (50, 200),
    'learning_rate': (0.01, 0.3),
    'n_estimators': (50, 200),
    'subsample': (0.8, 1.0),
    'colsample_bytree': (0.8, 1.0),
    'reg_alpha': (0.0, 1.0),
    'reg_lambda': (0.0, 1.0),
}

# Initialize BayesianOptimization
lgbm_bo = BayesianOptimization(f=lgbm_cv, pbounds=pbounds, random_state=42)

# Perform Bayesian Optimization
init_points = 10
n_iter = 30
lgbm_bo.maximize(init_points=init_points, n_iter=n_iter)

# Get the best hyperparameters
best_params = lgbm_bo.max['params']
best_params['num_leaves'] = int(best_params['num_leaves'])
best_params['n_estimators'] = int(best_params['n_estimators'])

print("Best Hyperparameters:", best_params)

|   iter    |  target   | colsam... | learni... | n_esti... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 167
[LightGBM] [Info] Number of data points in the train set: 263443, number of used features: 7
[LightGBM] [Info] Start training from score 0.000563
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 167
[LightGBM] [Info] Number of data points in the train set: 263443, number of used features: 7
[LightGBM] [Info] Start training from score -0.001119
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 167
[LightGBM] [Info] Number of data points in the train set: 263443, number of used features: 7
[LightGBM] [Info] Start training from score 0.000842
You can set `force_col_wise=true` to remove the overhead.
[LightGBM]

In [13]:
print("start fit....")
folds = 10 #Divide the data into 10 parts
y = main_df['power']
X = main_df.drop(['power'],axis=1)

train_RMSE=[]
valid_RMSE=[]

# Store the list of learned models
models = []

#Shuffle the data set randomly and divide it into folds
kf = KFold(n_splits=folds, shuffle=True, random_state=42)

#Divide x_train into a training set and a verification set in a ratio of 9:1, and take out the subscripts
for train_index, valid_index in kf.split(X):

    #Get the data of the training set and validation set based on the subscripts
    x_train_cv = X.iloc[train_index]
    y_train_cv = y.iloc[train_index]
    x_valid_cv =X.iloc[valid_index]
    y_valid_cv = y.iloc[valid_index]

    #Call the LightGBM regression model and add parameters
    model = LGBMRegressor(**best_params)

    #Use x_train_cv to train the model, and use x_train_cv and x_valid_cv to evaluate together
    model.fit(
        x_train_cv,
        y_train_cv,
        eval_set = [(x_train_cv, y_train_cv), (x_valid_cv, y_valid_cv)],
        #verbose = 100, #Iterate 100 times and output a result
    )

    #Predict the training set
    y_pred_train = model.predict(x_train_cv, num_iteration=model.best_iteration_)
    #Predict on the validation set
    y_pred_valid = model.predict(x_valid_cv, num_iteration=model.best_iteration_)

    train_rmse=RMSE(y_pred_train,y_train_cv)
    valid_rmse=RMSE(y_pred_valid,y_valid_cv)

    train_RMSE.append(train_rmse)
    valid_RMSE.append(valid_rmse)
    #Save model into list
    models.append(model)
    #print(f"train_RMSE:{train_RMSE},valid_RMSE:{valid_RMSE}")

train_RMSE=np.array(train_RMSE)
valid_RMSE=np.array(valid_RMSE)

print(f"mean_train_RMSE:{np.mean(train_RMSE)}")
print(f"mean_valid_RMSE:{np.mean(valid_RMSE)}")

start fit....
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 167
[LightGBM] [Info] Number of data points in the train set: 296373, number of used features: 7
[LightGBM] [Info] Start training from score -0.000389
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 167
[LightGBM] [Info] Number of data points in the train set: 296373, number of used features: 7
[LightGBM] [Info] Start training from score 0.000891
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 167
[LightGBM] [Info] Number of data points in the train set: 296373, number of used features: 7
[LightGBM] [Info] Start training from score -0.000059
You can set `force_row_wise=true` to remove the overhead.
And if memory is not en

In [17]:
test_X = prediction_df.drop(['power'],axis=1).values
#Use each saved model to predict x_test once, and then take the average
preds_test = []

for model in models:

    pred = model.predict(test_X, num_iteration=model.best_iteration_)

    preds_test.append(pred)

# Reverse the normalization
original_predictions = preds_test * total_df["power"].std() + total_df["power"].mean()
#Convert the prediction results into np.array
preds_test_np = np.array(original_predictions)
#Average the prediction results of each model by column
test_pred= preds_test_np.mean(axis=0)
test_pred=np.where(test_pred<=0,0,test_pred)
test_pred[:50]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 1.17034853, 2.32652798, 5.178213  ,
       3.99238684, 1.38790999, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 1.18549959, 2.44841415, 5.42407367, 4.24353637,
       1.5783384 , 0.        , 0.        , 0.        , 0.        ])

In [18]:
submission=pd.read_csv("sample_submission.csv")
submission['power']=test_pred
submission.to_csv("baseline.csv",index=None)
submission.head()

Unnamed: 0,id,power
0,8401,0.0
1,8402,0.0
2,8403,0.0
3,8404,0.0
4,8405,0.0


# Neural Network model
Using a Multi Layer model for predection

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
torch.manual_seed(42)

<torch._C.Generator at 0x7f0dd832cbb0>

## Train and Test split

In [None]:
train_test_ratio = 0.8
train_dataset = main_df[ : int(len(train_df)*train_test_ratio)]
test_dataset = main_df[int(len(train_df)*train_test_ratio) : ]

In [None]:
X_train_tensor = torch.Tensor(train_dataset.drop(['power'],axis=1).values.astype(np.float32))
y_train_tensor = torch.Tensor(train_dataset['power'].values.astype(np.float32))

X_test_tensor = torch.Tensor(test_dataset.drop(['power'],axis=1).values.astype(np.float32))
y_test_tensor = torch.Tensor(test_dataset['power'].values.astype(np.float32))

In [None]:
X_train_tensor.shape , y_train_tensor.shape

(torch.Size([210528, 7]), torch.Size([210528]))

In [None]:
batch_size = 128

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
import torch.nn as nn
import torch.optim as optim
device = "cpu"

In [None]:
class RegressionModel(nn.Module):
    def __init__(self, input_features, hidden_units, output_size):
        super().__init__()
        self.linear_layer_stack = nn.Sequential(
            nn.Linear(in_features=input_features, out_features=hidden_units),
            nn.ReLU(),
            nn.Linear(in_features=hidden_units, out_features=4),
            nn.ReLU(),
            nn.Linear(in_features=4, out_features=output_size),
        )

    def forward(self, x):
        return self.linear_layer_stack(x)

In [None]:
nn_model = RegressionModel(input_features= X_train_tensor.shape[1],
                    output_size=1,
                    hidden_units= 8).to(device)

In [None]:
criterion = nn.MSELoss()  # Mean Squared Error for regression
optimizer = optim.AdamW(nn_model.parameters(), lr=0.02)

In [None]:
## try a single run on the model to check the output shapes
nn_model.train()
for inputs, targets in train_loader:
    output = nn_model(inputs)
    print("output shape = ", output.shape)
    targets = targets.unsqueeze(dim=1)
    print("targets shape = ", targets.shape)
    loss = criterion(output, targets)
    print("loss value = ", loss.item())
    break

output shape =  torch.Size([128, 1])
targets shape =  torch.Size([128, 1])
loss value =  1.1553386449813843


In [None]:
# Training loop
num_epochs = 10

for epoch in range(num_epochs):
  print(f"Epoch: {epoch}\n-------")

  nn_model.train()
  ### Training
  train_loss = 0
  for batch, (inputs, targets) in enumerate(train_loader):
    targets = targets.unsqueeze(dim=1)
    # 1. Forward pass
    outputs = nn_model(inputs)
    # 2. Calculate loss (per batch)
    loss = torch.sqrt(criterion(outputs, targets))
    train_loss += loss # accumulatively add up the loss per epoch
    # 3. Optimizer zero grad
    optimizer.zero_grad()
    # 4. Loss backward
    loss.backward()
    # 5. Optimizer step
    optimizer.step()
    #if batch % 400 == 0:
      #print(f"Looked at {batch * len(inputs)}/{len(train_loader.dataset)} samples")

  # Divide total train loss by length of train dataloader (average loss per batch per epoch)
  train_loss /= len(train_loader)
  ### Testing
  # Setup variables for accumulatively adding up loss and accuracy
  test_loss = 0
  nn_model.eval()
  with torch.inference_mode():
    rmse_sum = 0.0
    num_samples = 0
    for inputs, targets in test_loader:
      targets = targets.unsqueeze(dim=1)
      # 1. Forward pass
      test_outputs = nn_model(inputs)
      # 2. Calculate loss (accumatively)
      test_loss += torch.sqrt(criterion(test_outputs, targets))

    # Divide total test loss by length of test dataloader (per epoch)
    test_loss /= len(test_loader)
  ## Print out what's happening
  print(f"\nTrain loss: {train_loss:.5f} | Test loss: {test_loss:.5f}")


Epoch: 0
-------

Train loss: 0.79628 | Test loss: 0.98649
Epoch: 1
-------

Train loss: 0.75656 | Test loss: 1.03925
Epoch: 2
-------

Train loss: 0.74396 | Test loss: 1.05232
Epoch: 3
-------

Train loss: 0.73867 | Test loss: 1.13611
Epoch: 4
-------

Train loss: 0.73675 | Test loss: 1.07861
Epoch: 5
-------

Train loss: 0.73654 | Test loss: 1.07169
Epoch: 6
-------

Train loss: 0.73681 | Test loss: 1.16946
Epoch: 7
-------

Train loss: 0.73753 | Test loss: 1.05498
Epoch: 8
-------

Train loss: 0.73740 | Test loss: 1.14926
Epoch: 9
-------

Train loss: 0.73784 | Test loss: 1.04795
