<a href="https://www.kaggle.com/gavisr/wids-2022-lgbm-for-beginners?scriptVersionId=88815341" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Libraries 📚

In [None]:
import pandas as pd
import numpy as np
import os
import random

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import lightgbm as lgb

import warnings
warnings.simplefilter('ignore')

# Load Datasets 🔃

In [None]:
# Folder that holds the competition CSV files; every read below is relative to it.
INPUT = "../input/widsdatathon2022/"

# Raw training and test tables.
train = pd.read_csv(INPUT + "train.csv")
test = pd.read_csv(INPUT + "test.csv")
# Sample submission; its `site_eui` column is overwritten with model predictions
# near the end of the notebook before being written back out.
df_submission = pd.read_csv(INPUT + "sample_solution.csv")
# CSV passed to getNewTrainDataFrame as the source of pseudo labels.
PSEUDO_PATH = "../input/lgbmsub/submission6.csv"

# Data Manipulation 🔢

In [None]:
# Column names shared by the rest of the notebook.
ID = "id"
TARGET = "site_eui"

# Preliminary target vector; it is rebuilt after pseudo labelling, once the
# extra rows have been appended to `train`.
target = train[TARGET]

print(f'Train set has {train.shape[0]} rows and {train.shape[1]} columns.')
print(f'Test set has {test.shape[0]} rows and {test.shape[1]} columns.')



In [None]:
def getNewTrainDataFrame(TRAIN_PATH,TEST_PATH,PSEUDO_PATH,ID,TARGET):
    """Append pseudo-labelled test rows to the training data.

    The pseudo-label file is read, its ID column is dropped, and the remaining
    column(s) are attached side by side to the test rows. Those labelled test
    rows are then stacked underneath the training rows.

    Parameters
    ----------
    TRAIN_PATH, TEST_PATH : pandas.DataFrame, or anything ``pd.read_csv`` accepts
        Training and test data. The notebook passes already-loaded DataFrames;
        a path or buffer is read with ``pd.read_csv``.
    PSEUDO_PATH : path or buffer accepted by ``pd.read_csv``
        CSV holding the ID column and the predicted target for each test row,
        in the same row order as the test data.
    ID : str
        Name of the identifier column to drop from the pseudo-label file.
    TARGET : str
        Name of the target column the pseudo-label file must provide.

    Returns
    -------
    pandas.DataFrame
        Training rows followed by the pseudo-labelled test rows, with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If the pseudo-label file lacks the TARGET column or its row count
        differs from the test data.
    """
    def _as_frame(source):
        # Accept either a loaded frame or something read_csv understands.
        return source if isinstance(source, pd.DataFrame) else pd.read_csv(source)

    # Use the arguments rather than notebook globals, so the function works on
    # whatever the caller hands in.
    train_df = _as_frame(TRAIN_PATH)
    test_df = _as_frame(TEST_PATH)

    pseudo = pd.read_csv(PSEUDO_PATH)
    pseudo = pseudo.drop([ID],axis=1)

    if TARGET not in pseudo.columns:
        raise ValueError(f"pseudo-label file has no '{TARGET}' column")
    if len(pseudo) != len(test_df):
        raise ValueError(
            f"pseudo-label file has {len(pseudo)} rows but test data has {len(test_df)}"
        )

    # concat(axis=1) aligns on the index, so both sides get a clean positional
    # index first; otherwise a non-default test index would mis-join labels.
    pseudo_train = pd.concat(
        [test_df.reset_index(drop=True), pseudo.reset_index(drop=True)], axis=1
    )

    new_train = pd.concat([train_df, pseudo_train], axis=0,ignore_index=True)

    return new_train

In [None]:
# Pseudo labeling: extend the training data with test rows labelled by an
# earlier submission file (PSEUDO_PATH).
# NOTE(review): `copyfile` is not used anywhere in the visible notebook.
from shutil import copyfile

# `train` is overwritten with the enlarged frame, so re-running this cell on a
# live kernel would append the pseudo-labelled rows a second time.
train = getNewTrainDataFrame(train,test,PSEUDO_PATH,ID,TARGET)

def checkNull_fillData(df):
    """Fill missing values in ``df`` in place.

    For every column that contains at least one null:
    - numeric columns (any integer or float width) get the column median;
    - all other columns get the string ``"Missing"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. The medians are computed from ``df`` itself.

    Returns
    -------
    None
        The frame is mutated; nothing is returned.
    """
    for col in df.columns:
        missing = df[col].isnull()
        if not missing.any():
            continue
        # Check the dtype family rather than the exact names "float64"/"int64",
        # so float32/int32 columns are not filled with the string "Missing".
        if pd.api.types.is_numeric_dtype(df[col]):
            df.loc[missing, col] = df[col].median()
        else:
            df.loc[missing, col] = "Missing"
                
# Fill nulls in place. Each frame is filled from its own column medians
# (test is not filled with train statistics).
checkNull_fillData(train)
checkNull_fillData(test)

In [None]:
# Single seed value and helper used to seed the random sources
SEED = 42
def seed_everything(seed=SEED):
    """Seed Python's ``random``, NumPy's global RNG and ``PYTHONHASHSEED``.

    Parameters
    ----------
    seed : int
        Value applied to every source; defaults to the notebook-wide ``SEED``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(SEED)

# Encode label 🔖

In [None]:
le = LabelEncoder()

# Encode every text column with ONE mapping shared by train and test.
# Fitting a separate encoder per frame can assign different integers to the
# same category whenever the two frames do not contain the same set of values,
# which would make the encoded test column meaningless to a model fitted on train.
for i in train.columns:
    if train[i].dtypes == 'object':
        train_values = train[i].astype(str)
        if i in test.columns:
            test_values = test[i].astype(str)
            le.fit(pd.concat([train_values, test_values], ignore_index=True))
            test[i] = le.transform(test_values)
        else:
            le.fit(train_values)
        train[i] = le.transform(train_values)
        print(i)

# Text columns that exist only in test (already-encoded shared columns are no
# longer of object dtype, so they are skipped here).
for i in test.columns:
    if test[i].dtypes == 'object':
        test[i] = le.fit_transform(test[i].astype(str))
        print(i)

In [None]:
# Target vector taken from the current `train` frame (at this point it already
# includes the pseudo-labelled rows), so it stays row-aligned with the features.
target = train[TARGET]

In [None]:
# Remove the identifier column from both frames and the target column from train.
# Both names are reassigned, so this cell fails with a KeyError if run twice.
train = train.drop([ID, TARGET], axis=1)
test = test.drop( [ID], axis=1)

In [None]:
# Display the feature-matrix and target shapes; the row counts should match.
train.shape,target.shape

# Iterative Imputer 🛠

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# NOTE(review): checkNull_fillData has already filled nulls in both frames, so
# there may be nothing left for the imputer to estimate — confirm it is still needed.
imputer = IterativeImputer(random_state= int(SEED), max_iter=10, initial_strategy='mean')

# Fit the imputation model on train only, then apply that same model to test.
# Re-fitting on test would estimate test values from a different model than the
# one the training features went through.
train_im = pd.DataFrame(imputer.fit_transform(train))
test_im = pd.DataFrame(imputer.transform(test))

# The imputer returns plain arrays, so both frames now have positional
# (0..n-1) column labels instead of the original feature names.
train = train_im
test = test_im

# Build Model 👷🏽‍♀️

In [None]:
import optuna  # pip install optuna

def objective(trial, train=train, target=target):
    """Optuna objective: hold-out RMSE of a LightGBM regressor.

    Splits ``train``/``target`` 80/20 with a fixed seed, fits a model with the
    hyper-parameters suggested by ``trial`` on the 80% part, early-stops on the
    20% part and returns the RMSE measured on that same 20% part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    train, target
        Feature matrix and target vector; default to the notebook globals as
        they exist when this cell is executed.

    Returns
    -------
    float
        Root mean squared error on the evaluation split (lower is better).
    """
    X_train, X_eval, y_train, y_eval = train_test_split(train, target, test_size = 0.20, random_state = 42)
    print("Train/Eval Sizes : ", X_train.shape, X_eval.shape, y_train.shape, y_eval.shape)

    param = {
        'metric': 'rmse', 
        'random_state': 42,
        'n_estimators': 20000,
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        # NOTE(review): LightGBM only applies row subsampling when
        # subsample_freq > 0 (default 0), so this value currently has no effect.
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20]),
        # LightGBM requires num_leaves > 1, so the search starts at 2.
        'num_leaves' : trial.suggest_int('num_leaves', 2, 2000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
#         'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100)
                }
        
    lgb_model = lgb.LGBMRegressor(**param)
    # Fit on the training split only. Fitting on the full data would include
    # the evaluation rows, making early stopping and the returned score
    # measure memorisation instead of generalisation.
    # Early stopping / logging go through callbacks because the
    # `early_stopping_rounds` and `verbose` fit arguments were removed in
    # LightGBM 4.
    lgb_model.fit(X_train,
                  y_train,
                  eval_set=[(X_eval, y_eval)],
                  eval_metric="rmse",
                  callbacks=[lgb.early_stopping(stopping_rounds=100),
                             lgb.log_evaluation(period=1000)])

    preds = lgb_model.predict(X_eval)
    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn no longer accepts.
    rmse = np.sqrt(mean_squared_error(y_eval, preds))
    return rmse

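* direction = 'minimize', since we want to minimize the RMSE.
* show_progress_bar = True displays a progress bar while the study runs.
* n_trials = 5 is the number of trials, i.e. how many hyper-parameter combinations are evaluated.
* sampler = TPESampler() selects the Tree-structured Parzen Estimator, a Bayesian optimization sampling technique.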

In [None]:
from optuna.samplers import TPESampler

# Seed the sampler: an unseeded TPESampler draws different hyper-parameters on
# every run, so the search would not be reproducible even though the rest of
# the notebook is seeded with SEED.
study = optuna.create_study(direction="minimize",sampler=TPESampler(seed=SEED), study_name="LGBM1")
study.optimize(objective, n_trials=5, show_progress_bar = True)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Best hyper-parameters found by the study. This dict contains only the values
# sampled through `trial.suggest_*`; the fixed entries that `objective` sets
# itself (metric, random_state, n_estimators) are not part of it.
params1 = study.best_params
params1

In [None]:
# Same split (same seed) as inside `objective`, so the evaluation rows here are
# the ones the hyper-parameters were scored on.
X_train, X_eval, y_train, y_eval = train_test_split(train, target, test_size = 0.20, random_state = 42)
print("Train/Eval Sizes : ", X_train.shape, X_eval.shape, y_train.shape, y_eval.shape)

# `params1` holds only the tuned values. Re-add the fixed settings used during
# the search; without them the model would fall back to LightGBM's default
# number of trees, far too few for the small learning rates being tuned.
final_params = {'metric': 'rmse', 'random_state': 42, 'n_estimators': 20000, **params1}
lgb_model = lgb.LGBMRegressor(**final_params)

# Fit on the training split so that early stopping is driven by rows the model
# has not seen. Early stopping / logging use callbacks because the
# `early_stopping_rounds` and `verbose` fit arguments were removed in LightGBM 4.
# NOTE(review): to use every row for the final model, refit on the full data
# with n_estimators set to the best iteration found here.
lgb_model.fit(X_train,
              y_train,
              eval_set=[(X_eval, y_eval)],
              eval_metric="rmse",
              callbacks=[lgb.early_stopping(stopping_rounds=100),
                         lgb.log_evaluation(period=1000)])

preds = lgb_model.predict(test)

# Make Prediction

In [None]:
# Predict the target for every test row with the fitted model and display the array.
lgb_predict = lgb_model.predict(test)
lgb_predict

In [None]:
# Make sure lengths are correct: fail here, with a clear message, rather than
# later when the predictions are assigned to the submission frame.
assert len(lgb_predict) == len(df_submission.site_eui), (
    f"{len(lgb_predict)} predictions for {len(df_submission.site_eui)} submission rows"
)
len(lgb_predict), len(df_submission.site_eui)

# Prepare submission

In [None]:
# Replace the placeholder values in the submission frame with the predictions.
df_submission[TARGET] = lgb_predict

In [None]:
# Write the submission file to the working directory, without the index column.
SUBMISSION_PATH = "submission.csv"
df_submission.to_csv(SUBMISSION_PATH,index=False)
# df_submission.head()