In [121]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_hist_gradient_boosting  
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import median_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from matplotlib.patches import ConnectionPatch


In [122]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
df_sub = pd.read_csv("sample_submission.csv")

df_train = df_train.drop(columns="ID")
df_test = df_test.drop(columns="ID")

In [123]:
# from statsmodels.stats.outliers_influence import variance_inflation_factor
# from statsmodels.tools.tools import add_constant


# df_train_with_const = add_constant(df_train) 


# vif_data = pd.DataFrame()
# vif_data["feature"] = df_train_with_const.columns
# vif_data["VIF"] = [variance_inflation_factor(df_train_with_const.values, i) for i in range(df_train_with_const.shape[1])]

# print(vif_data)

In [124]:
# vif_data = vif_data[vif_data['feature'] != 'const']
# plt.figure(figsize=(10, 6))
# plt.barh(vif_data['feature'], vif_data['VIF'], color='skyblue')
# plt.xlabel('VIF')
# plt.ylabel('Feature')
# plt.title('Variance Inflation Factor (VIF)')
# plt.grid(axis='x')
# plt.show()

In [125]:
# numeric_columns = df_train.select_dtypes(include=['float64', 'int64'])

# def dist(train_dataset, columns_list, rows, cols):
#     fig, axs = plt.subplots(rows, cols, figsize=(40, 20))
#     axs = axs.flatten()
#     for i, col in enumerate(columns_list):
#         sns.kdeplot(train_dataset[col], ax=axs[i], fill=True, alpha=0.5, linewidth=0.5, color='#058279', label='Train')
#         axs[i].set_title(f'{col}, Train skewness: {train_dataset[col].skew():.2f}')
#         axs[i].legend()
#         axs[i].axis('off')
#         axs[i].set_xticks([])
#         axs[i].set_yticks([])
#         median_train = train_dataset[col].median()
#         axs[i].axvline(x=median_train, color='#4caba4', linestyle='--')
#         axs[i].legend(labels=['Train', 'Median'])
        
#     fig.suptitle('Distribution of Numeric Columns', fontsize=30)
#     plt.tight_layout()
#     sns.despine(left=True, bottom=True) 

# dist(train_dataset=df_train, columns_list=numeric_columns.columns, rows=4, cols=6)

In [126]:
X = df_train
y = df_train["y"]

In [127]:
from scipy.stats import entropy
from scipy.signal import welch,find_peaks


def count_peaks(row):
    peaks, _ = find_peaks(row)
    return len(peaks)

def spectral_entropy(row):
    _, psd = welch(row)
    return entropy(psd)

def cleaning(dataset):
    
    features = dataset.columns.tolist() 
    dataset['mean_features'] = 0.1 * dataset[features].mean(axis=1)
    dataset['std_features'] = dataset[features].std(axis=1)
    dataset['max_features'] = dataset[features].max(axis=1)
    dataset['min_features'] = dataset[features].min(axis=1)
    dataset['range_features'] = dataset['max_features'] - dataset['min_features']
    dataset['variance_features'] = dataset[features].var(axis=1)
    dataset['skewness_features'] = dataset[features].skew(axis=1)
    dataset['sum_features'] = dataset[features].sum(axis=1)

   
    mean_abs_dev = (dataset[features] - dataset[features].mean(axis=1).values.reshape(-1, 1)).abs().mean(axis=1)
    median_abs_dev = (dataset[features] - dataset[features].median(axis=1).values.reshape(-1, 1)).abs().mean(axis=1)
    range_abs_diff = (dataset[features] - dataset[features].median(axis=1).values.reshape(-1, 1)).abs().max(axis=1) - (dataset[features] - dataset[features].median(axis=1).values.reshape(-1, 1)).abs().min(axis=1)
    geometric_mean = np.exp(np.log(dataset[features].replace(0, 1)).mean(axis=1))
    harmonic_mean = len(features) / (1 / dataset[features].replace(0, 1)).sum(axis=1)
    coeff_variation = dataset['std_features'] / dataset['mean_features']

   
    quartiles = dataset[features].quantile([0.25, 0.5, 0.75], axis=1)
    dataset['first_quartile'] = quartiles.loc[0.25]
    dataset['second_quartile'] = quartiles.loc[0.5]
    dataset['third_quartile'] = quartiles.loc[0.75]

 
    kurtosis_features = dataset[features].kurtosis(axis=1)
    dataset['kurtosis_features'] = kurtosis_features

    dataset['mean_absolute_deviation'] = mean_abs_dev
    dataset['median_absolute_deviation'] = median_abs_dev
    dataset['range_abs_diff'] = range_abs_diff
    dataset['geometric_mean'] = geometric_mean
    dataset['harmonic_mean'] = harmonic_mean
    dataset['coeff_variation'] = coeff_variation
    
    # peak_features = dataset[features].apply(count_peaks, axis=1)
    # dataset['peak_frequency'] = peak_features
    
    # spectral_entropy_features = dataset[features].apply(spectral_entropy, axis=1)
    # dataset['spectral_entropy'] = spectral_entropy_features   


    # entropy_features = dataset[features].apply(entropy, axis=1)
    # dataset['entropy'] = entropy_features

    # dataset.drop(features, axis=1, inplace=True)

    return dataset

In [128]:
# df_train = cleaning(df_train)
# df_test = cleaning(df_test)

In [129]:
# X_train, X_valid = train_test_split(X, test_size=0.2, random_state=42)

In [130]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_train_data = df_train
scaled_test_data = df_train

In [131]:
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from xgboost import XGBRegressor



xgb_params ={'n_estimators':600,
             'max_depth': 10,
           
             'learning_rate': 0.06,
             'random_state':42,
             }
             

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

xgb_predictions = np.zeros(len(scaled_train_data))
xgb_true_labels = np.zeros(len(scaled_train_data))
xgb_test_predictions = np.zeros(len(scaled_test_data))

for fold, (train_idx, val_idx) in enumerate(kf.split(scaled_train_data, y)):
    X_train, X_val = scaled_train_data.iloc[train_idx], scaled_train_data.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    xgb_model = XGBRegressor(**xgb_params)

    xgb_model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_val, y_val)],
                  # early_stopping_rounds=10,
                  verbose=100)

    xgb_fold_preds = xgb_model.predict(X_val)
    xgb_fold_test_preds = xgb_model.predict(scaled_test_data)

    xgb_predictions[val_idx] = xgb_fold_preds
    xgb_true_labels[val_idx] = y_val
    xgb_test_predictions += xgb_fold_test_preds / n_splits  


overall_metric_xgb = r2_score(xgb_true_labels, xgb_predictions)
print("Overall R^2 (XGBRegressor):", overall_metric_xgb)

[0]	validation_0-rmse:2.53414	validation_1-rmse:2.51639


[100]	validation_0-rmse:0.02938	validation_1-rmse:0.18155
[200]	validation_0-rmse:0.01494	validation_1-rmse:0.18169
[300]	validation_0-rmse:0.01121	validation_1-rmse:0.18178
[400]	validation_0-rmse:0.00898	validation_1-rmse:0.18183
[500]	validation_0-rmse:0.00750	validation_1-rmse:0.18186
[599]	validation_0-rmse:0.00643	validation_1-rmse:0.18188
[0]	validation_0-rmse:2.53699	validation_1-rmse:2.50379
[100]	validation_0-rmse:0.02864	validation_1-rmse:0.18000
[200]	validation_0-rmse:0.01487	validation_1-rmse:0.18153
[300]	validation_0-rmse:0.01096	validation_1-rmse:0.18157
[400]	validation_0-rmse:0.00890	validation_1-rmse:0.18162
[500]	validation_0-rmse:0.00744	validation_1-rmse:0.18167
[599]	validation_0-rmse:0.00631	validation_1-rmse:0.18165
[0]	validation_0-rmse:2.51512	validation_1-rmse:2.59601
[100]	validation_0-rmse:0.03047	validation_1-rmse:0.27705
[200]	validation_0-rmse:0.01485	validation_1-rmse:0.25939
[300]	validation_0-rmse:0.01094	validation_1-rmse:0.25902
[400]	validation_0

In [132]:
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold


n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
catboost_params ={
             'random_state':42,
             'learning_rate': 0.011277016304363601, 
             'depth': 8, 
             'subsample': 0.8675506657380021, 
             'colsample_bylevel': 0.7183884158632279, 
             'min_data_in_leaf': 98
            }
catboost_predictions = np.zeros(len(scaled_train_data))
catboost_true_labels = np.zeros(len(scaled_train_data))
catboost_test_predictions = np.zeros(len(scaled_test_data))

for fold, (train_idx, val_idx) in enumerate(kf.split(scaled_train_data, y)):
    X_train, X_val = scaled_train_data.iloc[train_idx], scaled_train_data.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    catboost_model = CatBoostRegressor(**catboost_params)

    catboost_model.fit(X_train, y_train,
                       eval_set=(X_val, y_val),
                       early_stopping_rounds=10)

    catboost_fold_preds = catboost_model.predict(X_val)
    
    catboost_fold_test_preds = catboost_model.predict(scaled_test_data)

    catboost_predictions[val_idx] = catboost_fold_preds
    catboost_true_labels[val_idx] = y_val
    catboost_test_predictions += catboost_fold_test_preds / n_splits 

overall_metric_catboost = r2_score(catboost_true_labels, catboost_predictions)
print("Overall R^2 (CatBoostRegressor):", overall_metric_catboost)

0:	learn: 2.6653163	test: 2.6474844	best: 2.6474844 (0)	total: 11.4ms	remaining: 11.4s
1:	learn: 2.6386888	test: 2.6199350	best: 2.6199350 (1)	total: 19.4ms	remaining: 9.68s
2:	learn: 2.6129517	test: 2.5934152	best: 2.5934152 (2)	total: 121ms	remaining: 40.1s
3:	learn: 2.5873331	test: 2.5670880	best: 2.5670880 (3)	total: 127ms	remaining: 31.7s
4:	learn: 2.5611286	test: 2.5401678	best: 2.5401678 (4)	total: 138ms	remaining: 27.4s
5:	learn: 2.5356264	test: 2.5139236	best: 2.5139236 (5)	total: 144ms	remaining: 23.9s
6:	learn: 2.5095454	test: 2.4871057	best: 2.4871057 (6)	total: 162ms	remaining: 23s
7:	learn: 2.4845189	test: 2.4613165	best: 2.4613165 (7)	total: 170ms	remaining: 21.1s
8:	learn: 2.4600344	test: 2.4361503	best: 2.4361503 (8)	total: 177ms	remaining: 19.5s
9:	learn: 2.4357245	test: 2.4111001	best: 2.4111001 (9)	total: 184ms	remaining: 18.2s
10:	learn: 2.4116267	test: 2.3861762	best: 2.3861762 (10)	total: 188ms	remaining: 16.9s
11:	learn: 2.3876801	test: 2.3612551	best: 2.3612551

In [133]:
from lightgbm import LGBMRegressor

n_splits =5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Parametere derived from here https://www.kaggle.com/code/thiagomantuani/ps4e5-flood-prediction-get-started
# One can get this using optuna, gris, random or baysein technique as well which will take time
#https://www.kaggle.com/code/harshitstark/regression-with-a-flood-prediction-dataset/notebook

lgb_params = {
    'boosting_type': 'gbdt', 
    'n_estimators':1500, 
    'learning_rate' :  0.012,    
    'num_leaves' : 250, 
    'subsample_for_bin': 165700, 
    'min_child_samples': 114, 
    'reg_alpha': 2.075e-06, 
    'reg_lambda': 3.839e-07, 
    'colsample_bytree': 0.9634,
    'subsample': 0.9592, 
    'max_depth': 10,
    'random_state':0,
    'verbosity':-1}

lgbm_predictions = np.zeros(len(scaled_train_data))
lgbm_true_labels = np.zeros(len(scaled_train_data))
lgbm_test_predictions = np.zeros(len(scaled_test_data))

for fold, (train_idx, val_idx) in enumerate(kf.split(scaled_train_data, y)):
    X_train, X_val = scaled_train_data.iloc[train_idx], scaled_train_data.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    lgbm_model = LGBMRegressor(**lgb_params)

    lgbm_model.fit(X_train, y_train,
                   eval_set=[(X_val, y_val)],
                   eval_metric='rmse',)

    lgbm_fold_preds = lgbm_model.predict(X_val)
    lgbm_fold_test_preds = lgbm_model.predict(scaled_test_data)

    lgbm_predictions[val_idx] = lgbm_fold_preds
    lgbm_true_labels[val_idx] = y_val
    lgbm_test_predictions += lgbm_fold_test_preds / n_splits

overall_metric_lgbm = r2_score(lgbm_true_labels, lgbm_predictions)
print("Overall R^2 (LGBMRegressor):", overall_metric_lgbm)

Overall R^2 (LGBMRegressor): 0.9723686619696512


In [137]:
lgbm_test_predictions

array([83.42713567, 79.4335091 , 82.17745013, ..., 84.7518744 ,
       86.20738949, 83.69929013])

In [136]:
df_sub['y'] = (lgbm_test_predictions *0.7)[:4986] + (catboost_test_predictions*0.0 + xgb_test_predictions*0.3)[:4986]
df_sub.to_csv('Xgboost.csv', index=False)

  df_sub['y'] = lgbm_test_predictions[:,4985] *0.7 + (catboost_test_predictions*0.0[:,4985] + xgb_test_predictions*0.3[:,4985])
  df_sub['y'] = lgbm_test_predictions[:,4985] *0.7 + (catboost_test_predictions*0.0[:,4985] + xgb_test_predictions*0.3[:,4985])
  df_sub['y'] = lgbm_test_predictions[:,4985] *0.7 + (catboost_test_predictions*0.0[:,4985] + xgb_test_predictions*0.3[:,4985])
  df_sub['y'] = lgbm_test_predictions[:,4985] *0.7 + (catboost_test_predictions*0.0[:,4985] + xgb_test_predictions*0.3[:,4985])
  df_sub['y'] = lgbm_test_predictions[:,4985] *0.7 + (catboost_test_predictions*0.0[:,4985] + xgb_test_predictions*0.3[:,4985])
  df_sub['y'] = lgbm_test_predictions[:,4985] *0.7 + (catboost_test_predictions*0.0[:,4985] + xgb_test_predictions*0.3[:,4985])
  df_sub['y'] = lgbm_test_predictions[:,4985] *0.7 + (catboost_test_predictions*0.0[:,4985] + xgb_test_predictions*0.3[:,4985])
  df_sub['y'] = lgbm_test_predictions[:,4985] *0.7 + (catboost_test_predictions*0.0[:,4985] + xgb_test_p

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed