In [19]:
# Import Libraries
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

import warnings
warnings.filterwarnings("ignore")

from umap import UMAP

import lightgbm as lgbm
from lightgbm import LGBMRegressor

from xgboost import XGBRegressor

import re



In [20]:
# Read in Data

data = pd.read_csv("data.csv", index_col='id')
data.columns =  [re.sub(' \[.*\]', '', c) for c in data.columns]


data_ext = pd.read_csv("original.csv", index_col='id')
data_ext.columns = [re.sub(' \[.*\]', '', c) for c in data_ext.columns]


target_col = 'x_e_out'

In [21]:
# Code is from Mykhailo Savchenko @shalfey https://www.kaggle.com/competitions/playground-series-s3e15/discussion/413826

author_geometry_mapping = { 'Inasaka': 'tube', 'Peskov': 'tube', 'Thompson': 'tube', 'Weatherhead': 'tube', 'Williams': 'tube', 'Beus': 'annulus', 'Janssen': 'annulus', 'Mortimore': 'annulus', 'Kossolapov': 'plate', 'Richenderfer': 'plate' }
data['geometry'] = data['geometry'].fillna(data['author'].map(author_geometry_mapping))

data.loc[(data['D_e'].isna() | data['D_h'].isna()) & (data['geometry'].isna()), 'geometry'] = 'tube'
data['D_e'] = data.apply(lambda row: row['D_h'] if row['geometry'] == 'tube' and pd.isna(row['D_e']) else row['D_e'], axis=1)
data['D_h'] = data.apply(lambda row: row['D_e'] if row['geometry'] == 'tube' and pd.isna(row['D_h']) else row['D_h'], axis=1)
data.loc[data['D_h'] == data['D_e'], 'geometry'] = 'tube'

author_list = ['Inasaka', 'Peskov', 'Thompson', 'Weatherhead', 'Williams']
filtered_rows = data[(data['author'].isin(author_list)) & (data['geometry'] != 'tube')]
data.loc[filtered_rows.index, 'geometry'] = 'tube'

In [22]:
#Data Pre-processing

#Inasaka
data.loc[(data['author'] == 'Inasaka') & ((data['D_h'] == 3.0) | (data['length'] == 100.0)) & (data['D_e'].isnull()), 'D_e'] = 3
data.loc[(data['author'] == 'Inasaka') & ((data['D_e'] == 3.0) | (data['length'] == 100.0)) & (data['D_h'].isnull()), 'D_h'] = 3
data.loc[(data['author'] == 'Inasaka') & ((data['D_e'] == 3.0) | (data['D_h'] == 3.0)) & (data['length'].isnull()), 'length'] = 100

#Kossolapov
data.loc[(data['author'] == 'Kossolapov') & ((data['D_h'] == 120.0) | (data['length'] == 10.0)) & (data['D_e'].isnull()), 'D_e'] = 15
data.loc[(data['author'] == 'Kossolapov') & ((data['D_e'] == 15.0) | (data['length'] == 10.0)) & (data['D_h'].isnull()), 'D_h'] = 120
data.loc[(data['author'] == 'Kossolapov') & ((data['D_e'] == 15.0) | (data['D_h'] == 120.0)) & (data['length'].isnull()), 'length'] = 10
data.loc[(data['author'] == 'Kossolapov') & (data['pressure'].isnull()), 'pressure'] = 0.1

#Mortimore
data.loc[(data['author'] == 'Mortimore') & ((data['D_h'] == 13.3) | (data['length'] == 2134.0)) & (data['D_e'].isnull()), 'D_e'] = 5
data.loc[(data['author'] == 'Mortimore') & ((data['D_e'] == 5.0) | (data['length'] == 2134.0)) & (data['D_h'].isnull()), 'D_h'] = 13.3
data.loc[(data['author'] == 'Mortimore') & ((data['D_e'] == 5.0) | (data['D_h'] == 13.3)) & (data['length'].isnull()), 'length'] = 2134

#Richenderfer
data.loc[(data['author'] == 'Richenderfer') & ((data['D_h'] == 120.0) | (data['length'] == 10.0)) & (data['D_e'].isnull()), 'D_e'] = 15
data.loc[(data['author'] == 'Richenderfer') & ((data['D_e'] == 15.0) | (data['length'] == 10.0)) & (data['D_h'].isnull()), 'D_h'] = 120
data.loc[(data['author'] == 'Richenderfer') & ((data['D_e'] == 15.0) | (data['D_h'] == 120.0)) & (data['length'].isnull()), 'length'] = 10

#Peskov
data.loc[(data['author'] == 'Peskov') & ((data['D_h'] == 10.0) | (data['geometry'] == 'tube')) & (data['D_e'].isnull()), 'D_e'] = 10
data.loc[(data['author'] == 'Peskov') & ((data['D_e'] == 10.0) | (data['geometry'] == 'tube')) & (data['D_h'].isnull()), 'D_h'] = 10
data.loc[(data['author'] == 'Peskov') & ((data['D_e'] == 10.0) | (data['D_h'] == 10.0)) & (data['geometry'].isnull()), 'geometry'] = 'tube'

#Williams
data.loc[(data['author'] == 'Williams') & ((data['D_h'] == 9.5) | (data['length'] == 1836)) & (data['D_e'].isnull()), 'D_e'] = 9.5
data.loc[(data['author'] == 'Williams') & ((data['D_e'] == 9.5) | (data['length'] == 1836)) & (data['D_h'].isnull()), 'D_h'] = 9.5
data.loc[(data['author'] == 'Williams') & ((data['D_e'] == 9.5) | (data['D_h'] == 9.5)) & (data['length'].isnull()), 'length'] = 1836
data.loc[(data['author'] == 'Williams') & (data['geometry'].isnull()), 'geometry'] = 'tube'

#Weatherhead
data.loc[(data['author'] == 'Weatherhead') & (data['D_h'] == 7.7) & (data['D_e'].isnull()), 'D_e'] = 7.7
data.loc[(data['author'] == 'Weatherhead') & (data['D_h'] == 11.1) & (data['D_e'].isnull()), 'D_e'] = 11.1
data.loc[(data['author'] == 'Weatherhead') & (data['D_e'] == 7.7) & (data['D_h'].isnull()), 'D_h'] = 7.7
data.loc[(data['author'] == 'Weatherhead') & (data['D_e'] == 11.1) & (data['D_h'].isnull()), 'D_h'] = 11.1
data.loc[(data['author'] == 'Weatherhead') & (data['geometry'].isnull()), 'geometry'] = 'tube'
data.loc[(data['author'] == 'Weatherhead') & ((data['D_h'] == 7.7) | (data['D_h'] == 11.1)) & (data['length'].isnull()), 'length'] = 457
data.loc[(data['author'] == 'Weatherhead') & (data['pressure'].isnull()), 'pressure'] = 13.79

#Beus
data.loc[(data['author'] == 'Beus') & ((data['D_h'] == 15.2) | (data['length'] == 2134.0)) & (data['D_e'].isnull()), 'D_e'] = 5.6
data.loc[(data['author'] == 'Beus') & ((data['D_e'] == 5.6) | (data['length'] == 2134.0)) & (data['D_h'].isnull()), 'D_h'] = 15.2
data.loc[(data['author'] == 'Beus') & ((data['D_e'] == 5.6) | (data['D_h'] == 15.2)) & (data['length'].isnull()), 'length'] = 2134
data.loc[(data['author'] == 'Beus') & (data['geometry'].isnull()), 'geometry'] = 'annulus'

#Janssen
data.loc[(data['author'] == 'Janssen') & (data['D_e'] == 12.7) & (data['length'] < 1200.0) & (data['D_h'].isnull()), 'D_h'] = 38.1
data.loc[(data['author'] == 'Janssen') & (data['D_e'] == 12.7) & (data['length'] > 1200.0) & (data['D_h'].isnull()), 'D_h'] = 42.3
data.loc[(data['author'] == 'Janssen') & ((data['D_h'] == 38.1) | (data['D_h'] == 42.3)) & (data['D_e'].isnull()), 'D_e'] = 12.7

data.loc[(data['author'] == 'Janssen') & ((data['D_h'] == 15.9) | (data['length'] == 914)) & (data['D_e'].isnull()), 'D_e'] = 6.4
data.loc[(data['author'] == 'Janssen') & ((data['D_e'] == 6.4) | (data['length'] == 914)) & (data['D_h'].isnull()), 'D_h'] = 15.9
data.loc[(data['author'] == 'Janssen') & ((data['D_e'] == 6.4) | (data['D_h'] == 15.9)) & (data['length'].isnull()), 'length'] = 914

data.loc[(data['author'] == 'Janssen') & (data['D_h'] == 96.3) & (data['length'] == 1778) & (data['D_e'].isnull()), 'D_e'] = 22.2
data.loc[(data['author'] == 'Janssen') & (data['D_e'] == 22.2) & (data['length'] == 1778) & (data['D_h'].isnull()), 'D_h'] = 96.3
data.loc[(data['author'] == 'Janssen') & (data['D_e'] == 22.2) & (data['D_h'] == 96.3) & (data['length'].isnull()), 'length'] = 1778

data.loc[(data['author'] == 'Janssen') & (data['D_h'] == 24.6) & (data['D_e'].isnull()), 'D_e'] = 8.5
data.loc[(data['author'] == 'Janssen') & (data['D_e'] == 8.5) & (data['length'] == 1778) & (data['D_h'].isnull()), 'D_h'] = 24.6
data.loc[(data['author'] == 'Janssen') & (data['D_e'] == 8.5) & (data['D_h'] == 24.6) & (data['length'].isnull()), 'length'] = 1778

data.loc[(data['author'] == 'Janssen') & (data['geometry'].isnull()), 'geometry'] = 'annulus'

#Thompson
data.loc[(data['author'] == 'Thompson') & (data['D_h'].notnull()) & (data['D_e'].isnull()), 'D_e'] = data.loc[(data['author'] == 'Thompson') & (data['D_h'].notnull()) & (data['D_e'].isnull()), 'D_h']
data.loc[(data['author'] == 'Thompson') & (data['D_e'].notnull()) & (data['D_h'].isnull()), 'D_h'] = data.loc[(data['author'] == 'Thompson') & (data['D_e'].notnull()) & (data['D_h'].isnull()), 'D_e']

data.loc[(data['author'] == 'Thompson') & (data['geometry'].isnull()), 'geometry'] = 'tube'

In [23]:
#Save original target NaN values, add original dataset to aid imputing, grab number of rows of synthetic data

target = data[target_col]
data_all = pd.concat([data, data_ext], ignore_index=True)

data_nrows = data.shape[0]


In [24]:
#Use K-nearest neighbors for imputation of remaining numerical values

knni = KNNImputer(n_neighbors = 89)
data_imp = knni.fit_transform(data_all.select_dtypes(include=np.number))
data_imp = pd.DataFrame(data_imp, columns=data_all.select_dtypes(include=np.number).columns)
data_all = pd.concat([data_all.iloc[:,:2], data_imp], axis=1)

In [25]:
#Remove original dataset from data and restore NaN target values

data = data_all.iloc[:data_nrows]
data.loc[:,target_col] = target

In [26]:
# Remove any remaining NaN values from categorical columns

#Inasaka
data.loc[(data['D_h'] == 3.0) & (data['D_e'] == 3.0) & (data['length'] == 100.0) & (data['author'].isnull()), 'author'] = 'Inasaka'
data.loc[(data['author'] == 'Inasaka') & (data['geometry'].isnull()), 'geometry'] = 'tube'

#Kossolapov (too similar to Richenderfer, and less common)
#data.loc[(data['D_h'] == 120.0) & (data['D_e'] == 15.0) & (data['length'] == 10.0) & (data['author'].isnull()), 'author'] = 'Kossolapov'
data.loc[(data['author'] == 'Kossolapov') & (data['geometry'].isnull()), 'geometry'] = 'plate'

#Mortimore
data.loc[(data['D_h'] == 13.3) & (data['D_e'] == 5.0) & (data['length'] == 2134.0) & (data['author'].isnull()), 'author'] = 'Mortimore'
data.loc[(data['author'] == 'Mortimore') & (data['geometry'].isnull()), 'geometry'] = 'annulus'

#Richenderfer
data.loc[(data['D_h'] == 120.0) & (data['D_e'] == 15.0) & (data['length'] == 10.0) & (data['author'].isnull()), 'author'] = 'Richenderfer'
data.loc[(data['author'] == 'Richenderfer') & (data['geometry'].isnull()), 'geometry'] = 'plate'

#Peskov (This overlaps a bit with Thompson, but a D_e and D_h of 10,10 is more commonly Peskov)
data.loc[(data['D_h'] == 10.0) & (data['D_e'] == 10.0) & (data['author'].isnull()), 'author'] = 'Peskov'
data.loc[(data['author'] == 'Peskov') & (data['geometry'].isnull()), 'geometry'] = 'tube'

#Williams (This overlaps a bit with Thompson, but a D_e and D_h of 9.5,9.5 is more commonly Williams)
data.loc[(data['D_h'] == 9.5) & (data['D_e'] == 9.5) & (data['length'] == 1836) & (data['author'].isnull()), 'author'] = 'Williams'
data.loc[(data['author'] == 'Williams') & (data['geometry'].isnull()), 'geometry'] = 'tube'

#Weatherhead (This overlaps a bit with Thompson, but a D_e and D_h of 7.7,7.7 is more commonly Weatherhead)
data.loc[(data['D_h'] == 7.7) & (data['D_e'] == 7.7) & (data['length'] == 457) & (data['author'].isnull()), 'author'] = 'Weatherhead'
data.loc[(data['D_h'] == 11.1) & (data['D_e'] == 11.1) & (data['length'] == 457) & (data['author'].isnull()), 'author'] = 'Weatherhead'
data.loc[(data['author'] == 'Weatherhead') & (data['geometry'].isnull()), 'geometry'] = 'tube'

#Beus
data.loc[(data['D_h'] == 15.2) & (data['D_e'] == 5.6) & (data['length'] == 2134.0) & (data['author'].isnull()), 'author'] = 'Beus'
data.loc[(data['author'] == 'Beus') & (data['geometry'].isnull()), 'geometry'] = 'annulus'

#Janssen
data.loc[((data['D_h'] == 38.1) | (data['D_h'] == 42.3)) & (data['D_e'] == 12.7) & (data['author'].isnull()), 'author'] = 'Janssen'

data.loc[(data['D_h'] == 15.9) & (data['D_e'] == 6.4) & (data['length'] == 914.0) & (data['author'].isnull()), 'author'] = 'Janssen'

data.loc[(data['D_h'] == 96.3) & (data['D_e'] == 22.2) & (data['length'] == 1778.0) & (data['author'].isnull()), 'author'] = 'Janssen'

data.loc[(data['D_h'] == 24.6) & (data['D_e'] == 8.5) & (data['length'] == 1778.0) & (data['author'].isnull()), 'author'] = 'Janssen'

data.loc[(data['author'] == 'Janssen') & (data['geometry'].isnull()), 'geometry'] = 'annulus'

#Thompson
data.loc[data['author'].isnull(), 'author'] = 'Thompson'
data.loc[data['geometry'].isnull(), 'geometry'] = 'tube'

In [27]:
def featurecreation(data):
    # Feature Creation
    # Adiabatic surface area
    data.loc[:,'adiabatic_surface_area'] = data.loc[:,'D_e'] * data.loc[:,'length']

    
    # Surface area to horizontal diameter ratio
    data.loc[:,'surface_diameter_ratio'] = data.loc[:,'D_e'] / data.loc[:,'D_h']
    
    return data

In [28]:
#Create additional features

data = featurecreation(data)
data_ext = featurecreation(data_ext)

In [29]:
#Create train and test sets

data_tr = data[data[target_col].notnull()]

data_test = data[data[target_col].isnull()]

unique_targets = data_tr[target_col].unique()

In [30]:
#inspired by aldparis @adaubas https://www.kaggle.com/competitions/playground-series-s3e14/discussion/409242

class PLSRegressionWrapper(PLSRegression):
    def transform(self, X):
        return super().transform(X)


class FeatureEngineering():

    def __init__(self, cat_cols, num_cols, feat_group_A = ['pressure', 'mass_flux', 'chf_exp'], feat_group_B = ['pressure', 'mass_flux', 'chf_exp']):
        
        self.cat_cols = cat_cols
        self.num_cols = num_cols
        
        self.feat_group_A = feat_group_A
        self.feat_group_B = feat_group_B
        self.groups = [self.feat_group_A, self.feat_group_B]

    def fit(self, x, y=None, n_components_A=0, n_components_B=0, method_A='pca', method_B='pca'):
        
        self.cat_transformer = make_pipeline(OneHotEncoder())
        self.num_transformer = make_pipeline(StandardScaler())
        
        
        self.preprocessor = ColumnTransformer(
                            transformers=[('cat', self.cat_transformer, self.cat_cols),
                                          ('num', self.num_transformer, self.num_cols)
                                          ])
        
        self.preprocessor.fit(x)
        
        self.n_components = [n_components_A, n_components_B]
        self.methods = [method_A, method_B]
        self.dim_red = [0]*len(self.n_components)
        
        for i in range(len(self.n_components)):
            
            if self.n_components[i] > 0:
                
                if self.methods[i] == 'pca':
                    self.dim_red[i] = PCA(n_components = self.n_components[i], random_state=8)
                    self.dim_red[i].fit(x[self.groups[i]])
                
                if self.methods[i] == 'umap':
                    self.dim_red[i] = UMAP(n_components = self.n_components[i], random_state=8)
                    self.dim_red[i].fit(x[self.groups[i]])
                    
                if self.methods[i] == 'pls':
                    self.dim_red[i] = PLSRegressionWrapper(n_components = self.n_components[i])
                    self.dim_red[i].fit(x[self.groups[i]], y)       
                    
        
        return self
    
    def transform(self, x, y=None):

        df = x.copy()
        df = self.preprocessor.transform(df)
        
        cols = []
        
        for cat in self.cat_cols:
            cols = cols + sorted(x[cat].unique().tolist())
        
        cols = cols + self.num_cols
        
        df = pd.DataFrame(df, columns=cols)
        
        for i in range(len(self.n_components)):
            
            if self.n_components[i] > 0:
                dr_cols = [f"{self.methods[i]}{i}_{j}" for j in range(self.n_components[i])]
                df[dr_cols] = self.dim_red[i].transform(df[self.groups[i]])
                
        
        return df

In [39]:
def predict_test(X_tr, y_tr, X_ext, y_ext, X_test):
    """
    predict_test uses KFold cross validation to return predictions
    y_pred for test set X_test by utilizing training data X_tr,y_tr 
    and external data X_ext,y_ext
    
    :X_tr:   Training data features
    :y_tr:   Training data target
    :X_ext:  External data features added to every fold during KFold cross validation
    :y_ext:  External data target added to every fold during KFold cross validation
    :X_test: Test dataset used for predictions
    
    :return y_pred: Predictions based on test dataset X_test
    """
    
    
    kf = KFold(n_splits=10, random_state=8, shuffle=True)
    y_pred = pd.Series(0, index=X_test.index)
    
    callbacks = [lgbm.log_evaluation(period=2001, show_stdv=False),
                 lgbm.early_stopping(stopping_rounds=100, verbose=False)]
    
    
    cat_cols_1 = ['author']
    num_cols_1 = ['pressure', 'mass_flux', 'D_e', 'D_h','length', 'chf_exp']
    features_1 = cat_cols_1 + num_cols_1
    
    cat_cols_2 = ['author']
    num_cols_2 = ['pressure', 'mass_flux', 'D_e', 'D_h','length', 'chf_exp', 'adiabatic_surface_area', 'surface_diameter_ratio']
    features_2 = cat_cols_2 + num_cols_2
    
    cat_cols_3 = ['author']
    num_cols_3 = ['pressure', 'mass_flux', 'D_e', 'D_h','length', 'chf_exp', 'adiabatic_surface_area', 'surface_diameter_ratio']
    features_3 = cat_cols_3 + num_cols_3
    
    cat_cols_4 = ['author']
    num_cols_4 = ['pressure', 'mass_flux', 'D_e', 'D_h','length', 'chf_exp']
    features_4 = cat_cols_4 + num_cols_4
    
    params_1 = {'objective' : 'regression_l2',
          'metric' : 'rmse',
          'n_estimators' : 2000,
          'learning_rate': 0.01016320397965528,
          'max_depth': 10,
          'num_leaves': 1239,
          'colsample_bytree': 0.342285821102964,
          'max_bin': 588,
          'min_child_samples': 17,
          'subsample': 0.4967871237928858,
          'reg_alpha': 0.011037673638980894,
          'reg_lambda': 2.67015907440627,
          'boosting_type': 'gbdt',
          'data_sample_strategy': 'goss',
          'verbose': -1} 
    
    params_2 = {'objective' : 'regression_l2',
          'metric' : 'rmse',
          'n_estimators' : 2000,
          'learning_rate': 0.014936477998424232,
          'max_depth': 9,
          'num_leaves': 338,
          'colsample_bytree': 0.2834900944718662,
          'max_bin': 939,
          'min_child_samples': 20,
          'subsample': 0.3882591137382151,
          'reg_alpha': 0.006987253765394774,
          'reg_lambda': 1.9803759175177449,
          'boosting_type': 'gbdt',
          'data_sample_strategy': 'goss',
          'verbose': -1}
    
    params_3 = {'objective' : 'regression_l2',
          'metric' : 'rmse',
          'n_estimators' : 2000,
          'learning_rate': 0.030194630334037888,
          'max_depth': 10,
          'num_leaves': 1456,
          'colsample_bytree': 0.3677827029249661,
          'max_bin': 910,
          'min_child_samples': 16,
          'subsample': 0.5430561217554016,
          'reg_alpha': 0.05766238378926456,
          'reg_lambda': 3.827008003976793,
          'boosting_type': 'gbdt',
          'verbose': -1}
    
    
    params_4 = {'objective' : 'reg:squarederror',
          'eval_metric' : 'rmse',
          'verbosity' : 0,
          'silence' : True,
          'n_estimators' : 2000,
          'learning_rate': 0.05957325414211462,
          'max_depth': 8,
          'colsample_bytree': 0.3892362281817108,
          'alpha': 0.3060792413899057,
          'lambda': 2.2730782167388046,
          'gamma': 0.007808066644093886,
          'max_bin': 526,
          'subsample': 0.7846145456232383,
          'booster': 'gbtree',
          'min_child_weight': 15}
    
    fe_1 = FeatureEngineering(cat_cols_1, num_cols_1)
    fe_2 = FeatureEngineering(cat_cols_2, num_cols_2)
    fe_3 = FeatureEngineering(cat_cols_3, num_cols_3)
    fe_4 = FeatureEngineering(cat_cols_4, num_cols_4)
    
    
    for i, [train_idx, val_idx] in enumerate(kf.split(X_tr, y_tr)):
        X_t, X_val = X_tr.iloc[train_idx], X_tr.iloc[val_idx]
        y_t, y_val = y_tr.iloc[train_idx], y_tr.iloc[val_idx]
        
        X_train = pd.concat([X_t, X_ext], ignore_index=True)
        y_train = pd.concat([y_t, y_ext], ignore_index=True)
        
        fe_1.fit(X_train, y_train, 0, 0)
        fe_2.fit(X_train, y_train, 0, 0)
        fe_3.fit(X_train, y_train, 1, 0, 'pls')      
        fe_4.fit(X_train, y_train, 0, 0)
        
        X_train_transform_1 = fe_1.transform(X_train[features_1])
        X_train_transform_2 = fe_2.transform(X_train[features_2])
        X_train_transform_3 = fe_3.transform(X_train[features_3])
        X_train_transform_4 = fe_4.transform(X_train[features_4])
        
        X_val_transform_1 = fe_1.transform(X_val[features_1])
        X_val_transform_2 = fe_2.transform(X_val[features_2])
        X_val_transform_3 = fe_3.transform(X_val[features_3])
        X_val_transform_4 = fe_4.transform(X_val[features_4])
        
        X_test_transform_1 = fe_1.transform(X_test[features_1])
        X_test_transform_2 = fe_2.transform(X_test[features_2])
        X_test_transform_3 = fe_3.transform(X_test[features_3])
        X_test_transform_4 = fe_4.transform(X_test[features_4])
        
        model_1 = LGBMRegressor(**params_1)
        model_2 = LGBMRegressor(**params_2)
        model_3 = LGBMRegressor(**params_3)
        model_4 = XGBRegressor(**params_4)
        
        model_1.fit(X_train_transform_1, y_train, eval_set=[(X_val_transform_1, y_val)], callbacks=callbacks)
        model_2.fit(X_train_transform_2, y_train, eval_set=[(X_val_transform_2, y_val)], callbacks=callbacks)
        model_3.fit(X_train_transform_3, y_train, eval_set=[(X_val_transform_3, y_val)], callbacks=callbacks)
        model_4.fit(X_train_transform_4, y_train, eval_set=[(X_val_transform_4, y_val)], early_stopping_rounds=100, verbose=0)
        
        
        y_pred += (0.25*model_1.predict(X_test_transform_1) + 0.25*model_2.predict(X_test_transform_2) + 0.1*model_3.predict(X_test_transform_3) + 0.4*model_4.predict(X_test_transform_4))/10
        
    return y_pred

In [32]:
sample = pd.read_csv("sample_submission.csv", index_col='id')
sample.head()

Unnamed: 0_level_0,x_e_out [-]
id,Unnamed: 1_level_1
4,0.0
7,0.0
10,0.0
12,0.0
23,0.0


In [40]:
X_tr = data_tr.drop(target_col, axis=1)
y_tr = data_tr[target_col]

X_ext = data_ext.drop(target_col, axis=1)
y_ext = data_ext[target_col]

X_test = data_test.drop(target_col, axis=1)


y_test = predict_test(X_tr, y_tr, X_ext, y_ext, X_test)
y_test = pd.DataFrame(y_test, index=sample.index, columns=['x_e_out [-]'])

y_test.head()

Unnamed: 0_level_0,x_e_out [-]
id,Unnamed: 1_level_1
4,-0.008476
7,-0.079818
10,-0.043033
12,0.005858
23,0.044331


In [41]:
y_test.to_csv('predictions.csv', columns=['x_e_out [-]'], index=True)