In [None]:
# Warning Libraries
import warnings
# Silence every warning category for the rest of the session. Note that this
# also hides deprecation notices raised by the libraries imported below.
warnings.filterwarnings("ignore")
# Narrower alternative (disabled): only ignore FutureWarning.
# warnings.simplefilter(action='ignore', category=FutureWarning)

# Scientific and Data Manipulation Libraries 
import pandas as pd
import numpy as np



# Data Preprocessing, Machine Learning and Metrics Libraries 
from sklearn.preprocessing            import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing            import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.metrics                  import mean_squared_error



# from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR
from sklearn.neural_network import MLPRegressor

from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedKFold, train_test_split, cross_val_score


# Boosting Algorithms 
from xgboost                          import XGBRegressor
from catboost                         import CatBoostRegressor
from lightgbm                         import LGBMRegressor

from sklearn.model_selection import GridSearchCV

In [None]:
# Load the competition files: training set, test set and the submission template.
train = pd.read_csv('../input/tabular-playground-series-aug-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-aug-2021/test.csv')
sub = pd.read_csv('../input/tabular-playground-series-aug-2021/sample_submission.csv')

# First 5 rows of each frame (labels and tables are rendered in the order given).
display(
    'Train Head :', train.head(n=5),
    'Test Head :', test.head(n=5),
)

# Column names, dtypes and non-null counts.
train.info()
test.info()

# Descriptive statistics of both frames.
display(
    'Train Description :', train.describe(),
    'Test  Description :', test.describe(),
)

# Optional correlation heatmap (lighter colour = higher correlation).
# Disabled; `sns` is not among the imports of this notebook's first cell.
# sns.heatmap(train.corr(), annot = True)

In [None]:
# Split the training frame into the target and the feature matrix.
y = train['loss']
X = train.drop(columns=['loss', 'id'])

# The test frame only needs its identifier column removed.
test = test.drop(columns='id')

# Column groups are determined once, before any transformation touches dtypes.
categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns

# Ordinal-encode the categorical columns: categories are learned from X only
# and then applied unchanged to the test frame.
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(X[categorical_ix])
X[categorical_ix] = ordinal_encoder.transform(X[categorical_ix])
test[categorical_ix] = ordinal_encoder.transform(test[categorical_ix])

# one-hot encoding (disabled alternative)
# one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
# X[categorical_ix] = one_hot_encoder.fit_transform(X[categorical_ix])
# test[categorical_ix] = one_hot_encoder.transform(test[categorical_ix])

# Scale the numerical columns: statistics are fitted on X and reused for test.
# Other scalers imported above: StandardScaler, MinMaxScaler, MaxAbsScaler.
scaler = RobustScaler()
scaler.fit(X[numerical_ix])
X[numerical_ix] = scaler.transform(X[numerical_ix])
test[numerical_ix] = scaler.transform(test[numerical_ix])

In [None]:
# Sanity check: dimensions of the feature matrix, the target and the test features.
(X.shape, y.shape, test.shape)

In [None]:
# References on out-of-fold predictions and K-fold boosting workflows:
# https://machinelearningmastery.com/out-of-fold-predictions-in-machine-learning/
# https://www.kaggle.com/miguelquiceno/30-days-kfold-xgboost#Modeling
# https://machinelearningmastery.com/nested-cross-validation-for-machine-learning-with-python/

# CatBoost parameter sets kept for experimentation. The loop below trains
# LightGBM and does not read `hyperparams`, `ctb_params`, `SEED` or
# `N_ESTIMATORS`.
hyperparams = {'depth': 6, 'iterations': 200, 'l2_leaf_reg': 3, 'learning_rate': 0.1,'loss_function':'RMSE','eval_metric':"RMSE", 'verbose' : False}

SEED = 299792458
N_ESTIMATORS = 10000
ctb_params = {
    'bootstrap_type': 'Poisson',
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_seed': SEED,
    'task_type': 'GPU',
    'max_depth': 8,
    'learning_rate': 5e-3,
    'n_estimators': N_ESTIMATORS,
    'max_bin': 280,
    'min_data_in_leaf': 64,
    'l2_leaf_reg': 0.01,
    'subsample': 0.8,
    'verbose' : False
}

# Single source of truth for the fold count: it drives both the splitter and
# the averaging of test predictions / fold scores below.
N_SPLITS = 10
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

test_preds = np.zeros(len(test))      # running average of per-fold test predictions
mean_rmse = 0                         # running average of per-fold validation RMSE
fold_preds = np.zeros((X.shape[0],))  # out-of-fold prediction for every training row

for num, (train_id, valid_id) in enumerate(kf.split(X)):
    # KFold.split yields positional row indices, so select by position
    # (.iloc) rather than by index label (.loc).
    X_train, y_train = X.iloc[train_id], y.iloc[train_id]
    X_valid, y_valid = X.iloc[valid_id], y.iloc[valid_id]

    eval_set = [(X_valid, y_valid)]

    # Disabled alternatives:
    # model = XGBRegressor(n_estimators=2000, # 1375
    #                      max_depth = 3,
    #                      learning_rate=0.14,
    #                      colsample_bytree= 0.5,
    #                      subsample=0.99,
    #                      random_state=1, reg_alpha = 25.4)
    # model.fit(X_train, y_train, early_stopping_rounds = 100, eval_set=eval_set, verbose=False)
    #
    # model = CatBoostRegressor(verbose=False)
    # model.fit(X_train, y_train, eval_set=eval_set)

    model = LGBMRegressor(force_col_wise=True, verbose=0)
    model.fit(X_train, y_train, eval_set=eval_set)

    # Each fold contributes an equal share to the final test prediction.
    test_preds += model.predict(test) / N_SPLITS

    fold_preds[valid_id] = model.predict(X_valid)
    # RMSE computed as sqrt(MSE); avoids the `squared=False` argument, which
    # newer scikit-learn releases no longer accept.
    fold_rmse = np.sqrt(mean_squared_error(y_valid, fold_preds[valid_id]))
    print(f"\n{num+1} --- RMSE : {fold_rmse}")

    mean_rmse += fold_rmse / N_SPLITS

print(f"Mean RMSE : {mean_rmse}")



In [None]:
# Write the test predictions into the submission frame's 'loss' column.
# Bracket assignment always targets a column; attribute-style assignment
# (sub.loss = ...) only does so when a 'loss' column already exists and would
# otherwise just set a Python attribute on the frame.
sub['loss'] = test_preds
sub.head()

In [None]:
# Save the submission without the index column, then show its dimensions.
sub.to_csv(path_or_buf="submission.csv", index=False)
sub.shape