# Data manipulation
includes importing the data and feature engineering

In [None]:
!pip install scikit-learn
!pip install scikit-optimize
!pip install matplotlib
!pip install bayesian-optimization

In [2]:
#Library for importing csv files
import pandas as pd
#Library for matrix operations
import numpy as np
#Set random seeds to ensure that the model can be reproduced
import random
np.random.seed(42)
random.seed(42)

## importing training data and test data

- important note: the the test dataframe refers to the dataset which we want to predict the values and measure the RMSE, its not involved in the training and test phase. It is used after the traing is done.

In [3]:
# importing data which is in .csv format
train_df=pd.read_csv("train_df.csv")
print(f"len(train_df):{len(train_df)}")
test_df=pd.read_csv("test_df.csv")
print(f"len(test_df):{len(test_df)}")

len(train_df):329304
len(test_df):17136


In [4]:
# concatinating the training and test data frames for futher data manipulation
total_df=pd.concat((train_df,test_df),axis=0)
# dropping f1 and f2 columns in the dataset since it is not informative
total_df.drop(["f1","f2"], axis=1,inplace= True)
print(total_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 346440 entries, 0 to 17135
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            346440 non-null  int64  
 1   id_encode     346440 non-null  int64  
 2   hour          346440 non-null  int64  
 3   parking_free  346440 non-null  int64  
 4   year          346440 non-null  int64  
 5   month         346440 non-null  int64  
 6   day           346440 non-null  int64  
 7   power         329304 non-null  float64
dtypes: float64(1), int64(7)
memory usage: 23.8 MB
None


In [5]:
#forward_fill: Missing values ​​are filled according to the previous value
total_df.fillna(method='ffill', inplace=True)
total_df["day_sin"] = np.sin(total_df["day"]*(2*np.pi / 30))
total_df["day_cos"] = np.cos(total_df["day"]*(2*np.pi / 30))
total_df['sin_month']=np.sin(2*np.pi*total_df['month']/12)
total_df['cos_month']=np.cos(2*np.pi*total_df['month']/12)
total_df['sin_hour']=np.sin(2*np.pi*total_df['hour']/24)
total_df['cos_hour']=np.cos(2*np.pi*total_df['hour']/24)
total_df.drop(columns= ["day", "month","hour"], axis=1, inplace=True)

In [6]:
total_df.drop(columns=["id", "year", "parking_free"], inplace= True)
print(total_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 346440 entries, 0 to 17135
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   id_encode  346440 non-null  int64  
 1   power      346440 non-null  float64
 2   day_sin    346440 non-null  float64
 3   day_cos    346440 non-null  float64
 4   sin_month  346440 non-null  float64
 5   cos_month  346440 non-null  float64
 6   sin_hour   346440 non-null  float64
 7   cos_hour   346440 non-null  float64
dtypes: float64(7), int64(1)
memory usage: 23.8 MB
None


## Train and test Data Frames
extracing test_df and train_df from our total_df with new features

In [7]:
main_df=total_df[:len(train_df)]
prediction_df=total_df[len(train_df):]
print(f"train set length : {len(main_df)}\nprediction set length: {len(prediction_df)}")

train set length : 329304
prediction set length: 17136


## normalizing the power column
Normalizing using the Standard Deviation Normalization method

In [8]:
main_df_power_mean = main_df["power"].mean()
main_df_power_std = main_df["power"].std()

In [9]:
main_df.loc[:, "power"] = (main_df["power"] - main_df_power_mean) / main_df_power_std

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df.loc[:, "power"] = (main_df["power"] - main_df_power_mean) / main_df_power_std


# Gradient Boosting Ensemble Method
using LGBMRegressor for predictions

In [10]:
from sklearn.model_selection import KFold #Import the k-fold cross validation function in the machine learning library
from lightgbm import LGBMRegressor #Import the integrated learning algorithm lightgbm
import lightgbm as lgb
def RMSE(y_true,y_pred):
    return np.sqrt(np.mean((y_true-y_pred)**2))

## Tunning Hyperparameters
Tuning hypereparameters of the lgm model using bayesian optimization

In [11]:
## importing libraries
from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split

# laoding dataset
y = main_df['power']
X = main_df.drop(['power'],axis=1)

# defining the optimization function for the bayesian Optimization
def lgbm_cv(num_leaves, learning_rate, n_estimators, subsample, colsample_bytree, reg_alpha, reg_lambda):
    params = {
        'num_leaves': int(num_leaves),
        'learning_rate': learning_rate,
        'n_estimators': int(n_estimators),
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'objective': 'regression',
        'boosting_type': 'gbdt',
        'random_state': 42,
        'metric': 'RMSE',
        'n_jobs': -1
    }

    # Set up k-fold cross-validation
    kf = KFold(n_splits=2, shuffle=True, random_state=42)
    scores = []

    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Create and train the LGBMRegressor model
        model = lgb.LGBMRegressor(**params)
        model.fit(X_train, y_train)

        # Evaluate the model on the validation set
        score = model.score(X_val, y_val)
        scores.append(score)

    # Calculate the mean score across folds
    mean_score = np.mean(scores)

    return mean_score


In [12]:
# Define the search space for hyperparameters
parameter_bounds = {
    'num_leaves': (100, 400),
    'learning_rate': (0.001, 0.1),
    'n_estimators': (100, 500),
    'subsample': (0.8, 1.0),
    'colsample_bytree': (0.8, 1.0),
    'reg_alpha': (0.0, 1.0),
    'reg_lambda': (0.0, 1.0),
}

# Initialize BayesianOptimization
lgbm_bo = BayesianOptimization(f=lgbm_cv, pbounds=parameter_bounds, random_state=42)

# Perform Bayesian Optimization
init_points = 10
n_iter = 50
lgbm_bo.maximize(init_points=init_points, n_iter=n_iter)

# Get the best hyperparameters
best_params = lgbm_bo.max['params']
best_params['num_leaves'] = int(best_params['num_leaves'])
best_params['n_estimators'] = int(best_params['n_estimators'])

print("Best Hyperparameters:", best_params)

|   iter    |  target   | colsam... | learni... | n_esti... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 167
[LightGBM] [Info] Number of data points in the train set: 164652, number of used features: 7
[LightGBM] [Info] Start training from score -0.001562
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 167
[LightGBM] [Info] Number of data points in the train set: 164652, number of used features: 7
[LightGBM] [Info] Start training from score 0.001562
| [0m1        [0m | [0m0.8572   [0m | [0m0.8749   [0m | [0m0.09512  [0m | [0m392.8    [0m | [0m279.6    [0m | [0m0.156    [0m | [0m0.156    [0m | [0m0.8116   [0m |
You can set `force_row_wise=true` to remove the overhead.
A

In [13]:
print("start fit....")
folds = 10 #Divide the data into 10 parts
y = main_df['power']
X = main_df.drop(['power'],axis=1)

train_RMSE=[]
valid_RMSE=[]

# Store the list of learned models
models = []

#Shuffle the data set randomly and divide it into folds
kf = KFold(n_splits=folds, shuffle=True, random_state=42)
# Add the early_stopping_rounds parameter
best_params['early_stopping_rounds'] = 100

#Divide x_train into a training set and a verification set in a ratio of 9:1, and take out the subscripts
for train_index, valid_index in kf.split(X):

    #Get the data of the training set and validation set based on the subscripts
    x_train_cv = X.iloc[train_index]
    y_train_cv = y.iloc[train_index]
    x_valid_cv =X.iloc[valid_index]
    y_valid_cv = y.iloc[valid_index]

    #Call the LightGBM regression model and add parameters
    model = LGBMRegressor(**best_params)

    #Use x_train_cv to train the model, and use x_train_cv and x_valid_cv to evaluate together
    model.fit(
        x_train_cv,
        y_train_cv,
        eval_set = [(x_train_cv, y_train_cv), (x_valid_cv, y_valid_cv)],
        #verbose = 100, #Iterate 100 times and output a result
    )

    #Predict the training set
    y_pred_train = model.predict(x_train_cv, num_iteration=model.best_iteration_)
    #Predict on the validation set
    y_pred_valid = model.predict(x_valid_cv, num_iteration=model.best_iteration_)

    train_rmse=RMSE(y_pred_train,y_train_cv)
    valid_rmse=RMSE(y_pred_valid,y_valid_cv)

    train_RMSE.append(train_rmse)
    valid_RMSE.append(valid_rmse)
    #Save model into list
    models.append(model)
    #print(f"train_RMSE:{train_RMSE},valid_RMSE:{valid_RMSE}")

train_RMSE=np.array(train_RMSE)
valid_RMSE=np.array(valid_RMSE)

print(f"mean_train_RMSE:{np.mean(train_RMSE)}")
print(f"mean_valid_RMSE:{np.mean(valid_RMSE)}")

start fit....
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 167
[LightGBM] [Info] Number of data points in the train set: 296373, number of used features: 7
[LightGBM] [Info] Start training from score -0.000389
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[417]	training's l2: 0.105273	valid_1's l2: 0.133162
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 167
[LightGBM] [Info] Number of data points in the train set: 296373, number of used features: 7
[LightGBM] [Info] Start training from score 0.000891
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[416]	training's l2: 0.105357	valid_1's l2: 0.130291
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.

In [14]:
test_X = prediction_df.drop(['power'],axis=1).values
#Use each saved model to predict x_test once, and then take the average
preds_test = []

for model in models:

    pred = model.predict(test_X, num_iteration=model.best_iteration_)

    preds_test.append(pred)

# Reverse the normalization
original_predictions = np.array(preds_test) * main_df_power_std + main_df_power_mean
#Convert the prediction results into np.array
preds_test_np = np.array(original_predictions)
#Average the prediction results of each model by column
test_pred= preds_test_np.mean(axis=0)
test_pred=np.where(test_pred<=0,0,test_pred)
test_pred[:50]

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 5.58791040e-01, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.06586236e+00,
       3.86919772e+01, 2.13029429e+02, 3.51593533e+02, 6.72190411e+02,
       5.43425289e+02, 2.33160258e+02, 1.85291068e+01, 0.00000000e+00,
       3.42970776e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       7.56439283e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 7.63837635e-01, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.56318779e+00,
       4.09465130e+01, 2.14847107e+02, 3.62472044e+02, 6.94458088e+02,
       5.66678035e+02, 2.52548719e+02, 2.37140593e+01, 4.81879064e-01,
       7.09819427e-01, 0.00000000e+00])

In [15]:
submission=pd.read_csv("sample_submission.csv")
submission['power']=test_pred
submission.to_csv("baseline.csv",index=None)
submission.head()

Unnamed: 0,id,power
0,8401,0.0
1,8402,0.0
2,8403,0.0
3,8404,0.0
4,8405,0.0
