In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn import preprocessing
import optuna

# Step 2: Load the data

Next, we'll load the training and test data.  

We set `index_col=0` in the code cell below to use the `id` column to index the DataFrame.  (*If you're not sure how this works, try temporarily removing `index_col=0` and see how it changes the result.*)

In [3]:
# Load the training data
train = pd.read_csv("../input/train10fold/train-folds (1).csv")
test = pd.read_csv("../input/30daysofml/test.csv")
submission_data = pd.read_csv("../input/30daysofmlsubmisison/sample_submission.csv")
# Preview the data
train.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target,kfold
0,1,B,B,B,C,B,B,A,E,C,...,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634,3
1,2,B,B,A,A,B,D,A,F,A,...,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233,7
2,3,A,A,A,C,B,D,A,D,A,...,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351,3
3,4,B,B,A,C,B,D,A,E,C,...,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253,6
4,6,A,A,A,C,B,D,A,E,A,...,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226,8


In [4]:
# Separate target from features
features = [col for col in train.columns if col not in ('id', 'target', 'kfold')]
object_cols = [col for col in features if 'cat' in col]

ordinal_encoder = OrdinalEncoder()

xtest = test[features]
xtest = xtest.copy()
xtest[object_cols] = ordinal_encoder.fit_transform(xtest[object_cols])

In [5]:
final_valid_preds = {}
final_test_preds = []
rmse_score = []
for fold in range(10):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    
    valid_ids = xvalid.id.values.tolist() # to keep all the ids of the validation data
        
    ytrain = xtrain.target
    yvalid = xvalid.target
        
    xtrain = xtrain[features]
    xvalid = xvalid[features]
    
    xtrain[object_cols]= ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.fit_transform(xvalid[object_cols])
    
    best_params = {'learning_rate': 0.07853392035787837,
                     'colsample_bytree': 0.170759104940733,
                     'max_depth': 3,
                     'reg_lambda': 1.7549293092194938e-05,
                     'reg_alpha': 14.68267919457715,
                     'subsample': 0.8031450486786944,
                     'alpha': 30
                  }
    model = XGBRegressor(objective='reg:squarederror',
                         n_estimators=5000,
                         random_state=0,
                         **best_params,
                         #tree_method='gpu_hist',
                         #gpu_id=0, 
                         #predictor='gpu_predictor'
                        )

    model.fit(xtrain, ytrain, 
              early_stopping_rounds=300, 
              eval_set=[(xvalid, yvalid)], 
              verbose=1000)
    
    valid_preds = model.predict(xvalid)
    test_preds = model.predict(xtest)
    
    final_valid_preds.update(dict(zip(valid_ids, valid_preds)))
    final_test_preds.append(test_preds)

    rmse = mean_squared_error(yvalid, valid_preds, squared=False)
    rmse_score.append(rmse)
    print(fold, rmse)
    
print(np.mean(rmse_score), np.std(rmse_score))

final_valid_preds = pd.DataFrame.from_dict(final_valid_preds,orient='index').reset_index()
final_valid_preds.columns = ['id', 'pred_1']
final_valid_preds.to_csv('valid_pred1.csv', index=False)

submission_data.target = np.mean(np.column_stack(final_test_preds), axis = 1)
submission_data.columns = ['id', 'pred_1']
submission_data.to_csv('test_pred1.csv', index=False)

[0]	validation_0-rmse:7.17455
[1000]	validation_0-rmse:0.72290
[2000]	validation_0-rmse:0.72151
[3000]	validation_0-rmse:0.72127
[3370]	validation_0-rmse:0.72133
0 0.7212725942845155
[0]	validation_0-rmse:7.16556
[1000]	validation_0-rmse:0.71983
[2000]	validation_0-rmse:0.71795
[3000]	validation_0-rmse:0.71753
[3326]	validation_0-rmse:0.71758
1 0.7175117673071886
[0]	validation_0-rmse:7.17122
[1000]	validation_0-rmse:0.71942
[2000]	validation_0-rmse:0.71776
[2761]	validation_0-rmse:0.71755
2 0.7175037585245023
[0]	validation_0-rmse:7.17485
[1000]	validation_0-rmse:0.71787
[2000]	validation_0-rmse:0.71667
[2646]	validation_0-rmse:0.71671
3 0.7166240188322381
[0]	validation_0-rmse:7.17642
[1000]	validation_0-rmse:0.72622
[2000]	validation_0-rmse:0.72446
[3000]	validation_0-rmse:0.72424
[3409]	validation_0-rmse:0.72429
4 0.7242002627469918
[0]	validation_0-rmse:7.17321
[1000]	validation_0-rmse:0.72021
[2000]	validation_0-rmse:0.71844
[3000]	validation_0-rmse:0.71805
[3880]	validation_0-rm

In [6]:
#model 2
import lightgbm as lgb


final_valid_preds = {}
final_test_preds = []
rmse_score = []
for fold in range(10):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    
    valid_ids = xvalid.id.values.tolist() # to keep all the ids of the validation data
        
    ytrain = xtrain.target
    yvalid = xvalid.target
        
    xtrain = xtrain[features]
    xvalid = xvalid[features]
    
    xtrain[object_cols]= ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.fit_transform(xvalid[object_cols])
    
    best_params = {'learning_rate': 0.07853392035787837,
                     'colsample_bytree': 0.170759104940733,
                     'max_depth': 3,
                     'reg_lambda': 1.7549293092194938e-05,
                     'reg_alpha': 14.68267919457715,
                     'subsample': 0.8031450486786944,
                     'alpha': 30
                  }
    model = lgb.LGBMRegressor(n_estimators=5000,
                         random_state=0,
                         **best_params,
                         #device='gpu'
                             )

    model.fit(xtrain, ytrain, 
              early_stopping_rounds=300, 
              eval_set=[(xvalid, yvalid)], 
              verbose=1000)
    
    valid_preds = model.predict(xvalid)
    test_preds = model.predict(xtest)
    
    final_valid_preds.update(dict(zip(valid_ids, valid_preds)))
    final_test_preds.append(test_preds)

    rmse = mean_squared_error(yvalid, valid_preds, squared=False)
    rmse_score.append(rmse)
    print(fold, rmse)
    
print(np.mean(rmse_score), np.std(rmse_score))

final_valid_preds = pd.DataFrame.from_dict(final_valid_preds,orient='index').reset_index()
final_valid_preds.columns = ['id', 'pred_2']
final_valid_preds.to_csv('valid_pred2.csv', index=False)

submission_data.target = np.mean(np.column_stack(final_test_preds), axis = 1)
submission_data.columns = ['id', 'pred_2']
submission_data.to_csv('test_pred2.csv', index=False)

Training until validation scores don't improve for 300 rounds
[1000]	valid_0's l2: 0.522734
[2000]	valid_0's l2: 0.520559
[3000]	valid_0's l2: 0.52022
Early stopping, best iteration is:
[2967]	valid_0's l2: 0.520209
0 0.7212550067583782
Training until validation scores don't improve for 300 rounds
[1000]	valid_0's l2: 0.518351
[2000]	valid_0's l2: 0.515797
[3000]	valid_0's l2: 0.515298
Early stopping, best iteration is:
[2992]	valid_0's l2: 0.51529
1 0.7178373633773891
Training until validation scores don't improve for 300 rounds
[1000]	valid_0's l2: 0.518059
[2000]	valid_0's l2: 0.51555
[3000]	valid_0's l2: 0.515118
Early stopping, best iteration is:
[2952]	valid_0's l2: 0.515081
2 0.7176913197063105
Training until validation scores don't improve for 300 rounds
[1000]	valid_0's l2: 0.515027
[2000]	valid_0's l2: 0.513134
Early stopping, best iteration is:
[2290]	valid_0's l2: 0.512985
3 0.7162294615462711
Training until validation scores don't improve for 300 rounds
[1000]	valid_0's l2



In [7]:

#Model 3

# Load the training data
train = pd.read_csv("../input/train10fold/train-folds (1).csv")
test = pd.read_csv("../input/30daysofml/test.csv")
submission_data = pd.read_csv("../input/30daysofmlsubmisison/sample_submission.csv")

# Separate target from features
features = [col for col in train.columns if col not in ('id', 'target', 'kfold')]
# Separate object columns from features
object_cols = [col for col in features if 'cat' in col]
# Separate numerical columns from the features
numerical_cols = [col for col in train.columns if col.startswith('cont')]


oneHotEnc = preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')

xtest = test[features]
xtest = xtest.copy()
xtest_ohe = oneHotEnc.fit_transform(xtest[object_cols])
xtest_ohe = pd.DataFrame(xtest_ohe, columns=[f'ohe_{i}' for i
                                             in range(xtest_ohe.shape[1])])
xtest = pd.concat([xtest, xtest_ohe], axis = 1)
xtest = xtest.drop(object_cols, axis = 1)

scaler = preprocessing.StandardScaler()
xtest = scaler.fit_transform(xtest)

final_valid_preds = {}
final_test_preds = []
rmse_score = []
for fold in range(10):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    
    valid_ids = xvalid.id.values.tolist() # to keep all the ids of the validation data
        
    ytrain = xtrain.target
    yvalid = xvalid.target
        
    xtrain = xtrain[features]
    xvalid = xvalid[features]
    
    xtrain_ohe= oneHotEnc.fit_transform(xtrain[object_cols])
    xvalid_ohe = oneHotEnc.transform(xvalid[object_cols])
    
    xtrain_ohe = pd.DataFrame(xtrain_ohe, columns=[f'ohe_{i}' for i
                                             in range(xtrain_ohe.shape[1])])
    xvalid_ohe = pd.DataFrame(xvalid_ohe, columns=[f'ohe_{i}' for i
                                             in range(xvalid_ohe.shape[1])])
    
    xtrain = pd.concat([xtrain, xtrain_ohe], axis = 1)
    xvalid = pd.concat([xvalid, xvalid_ohe], axis = 1)
    
    xtrain = xtrain.drop(object_cols, axis = 1)
    xvalid = xvalid.drop(object_cols, axis = 1)
    
    xtrain = scaler.fit_transform(xtrain)
    xvalid = scaler.transform(xvalid)
    
    best_params = {'learning_rate': 0.07853392035787837,
                     'colsample_bytree': 0.170759104940733,
                     'max_depth': 3,
                     'reg_lambda': 1.7549293092194938e-05,
                     'reg_alpha': 14.68267919457715,
                     'subsample': 0.8031450486786944,
                     'alpha': 30
                  }
    model = XGBRegressor(objective='reg:squarederror',
                         n_estimators=5000,
                         random_state=0,
                         **best_params,
                         #tree_method='gpu_hist',
                         #gpu_id=0, 
                         #predictor='gpu_predictor'
                        )

    model.fit(xtrain, ytrain, 
              early_stopping_rounds=300, 
              eval_set=[(xvalid, yvalid)], 
              verbose=1000)
    
    valid_preds = model.predict(xvalid)
    test_preds = model.predict(xtest)
    
    final_valid_preds.update(dict(zip(valid_ids, valid_preds)))
    final_test_preds.append(test_preds)

    rmse = mean_squared_error(yvalid, valid_preds, squared=False)
    rmse_score.append(rmse)
    print(fold, rmse)
    
print(np.mean(rmse_score), np.std(rmse_score))

final_valid_preds = pd.DataFrame.from_dict(final_valid_preds,orient='index').reset_index()
final_valid_preds.columns = ['id', 'pred_3']
final_valid_preds.to_csv('valid_pred3.csv', index=False)

submission_data.target = np.mean(np.column_stack(final_test_preds), axis = 1)
submission_data.columns = ['id', 'pred_3']
submission_data.to_csv('test_pred3.csv', index=False)

[0]	validation_0-rmse:7.17455
[1000]	validation_0-rmse:0.72301
[2000]	validation_0-rmse:0.72151
[2784]	validation_0-rmse:0.72136
0 0.7213148461565618
[0]	validation_0-rmse:7.16555
[1000]	validation_0-rmse:0.71977
[2000]	validation_0-rmse:0.71808
[3000]	validation_0-rmse:0.71777
[3164]	validation_0-rmse:0.71780
1 0.71774507954462
[0]	validation_0-rmse:7.17125
[1000]	validation_0-rmse:0.71940
[2000]	validation_0-rmse:0.71775
[2879]	validation_0-rmse:0.71750
2 0.7174274006019555
[0]	validation_0-rmse:7.17485
[1000]	validation_0-rmse:0.71797
[2000]	validation_0-rmse:0.71682
[2612]	validation_0-rmse:0.71684
3 0.7167750349291848
[0]	validation_0-rmse:7.17640
[1000]	validation_0-rmse:0.72627
[2000]	validation_0-rmse:0.72432
[3000]	validation_0-rmse:0.72396
[3622]	validation_0-rmse:0.72389
4 0.7238720665529246
[0]	validation_0-rmse:7.17319
[1000]	validation_0-rmse:0.72032
[2000]	validation_0-rmse:0.71859
[3000]	validation_0-rmse:0.71828
[3172]	validation_0-rmse:0.71830
5 0.7182100358864684
[0]

In [8]:
#Model 4
# Load the training data
train = pd.read_csv("../input/train10fold/train-folds (1).csv")
test = pd.read_csv("../input/30daysofml/test.csv")
submission_data = pd.read_csv("../input/30daysofmlsubmisison/sample_submission.csv")

# Separate target from features
features = [col for col in train.columns if col not in ('id', 'target', 'kfold')]
# Separate object columns from features
object_cols = [col for col in features if 'cat' in col]
# Separate numerical columns from the features
numerical_cols = [col for col in train.columns if col.startswith('cont')]


oneHotEnc = preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')

xtest = test[features]
xtest = xtest.copy()
xtest_ohe = oneHotEnc.fit_transform(xtest[object_cols])
xtest_ohe = pd.DataFrame(xtest_ohe, columns=[f'ohe_{i}' for i
                                             in range(xtest_ohe.shape[1])])
xtest = pd.concat([xtest, xtest_ohe], axis = 1)
xtest = xtest.drop(object_cols, axis = 1)

#scaler = preprocessing.StandardScaler()
#xtest = scaler.fit_transform(xtest)

final_valid_preds = {}
final_test_preds = []
rmse_score = []
for fold in range(10):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    
    valid_ids = xvalid.id.values.tolist() # to keep all the ids of the validation data
        
    ytrain = xtrain.target
    yvalid = xvalid.target
        
    xtrain = xtrain[features]
    xvalid = xvalid[features]
    
    xtrain_ohe= oneHotEnc.fit_transform(xtrain[object_cols])
    xvalid_ohe = oneHotEnc.transform(xvalid[object_cols])
    
    xtrain_ohe = pd.DataFrame(xtrain_ohe, columns=[f'ohe_{i}' for i
                                             in range(xtrain_ohe.shape[1])])
    xvalid_ohe = pd.DataFrame(xvalid_ohe, columns=[f'ohe_{i}' for i
                                             in range(xvalid_ohe.shape[1])])
    
    xtrain = pd.concat([xtrain, xtrain_ohe], axis = 1)
    xvalid = pd.concat([xvalid, xvalid_ohe], axis = 1)
    
    xtrain = xtrain.drop(object_cols, axis = 1)
    xvalid = xvalid.drop(object_cols, axis = 1)
    
    best_params = {'learning_rate': 0.07853392035787837,
                     'colsample_bytree': 0.170759104940733,
                     'max_depth': 3,
                     'reg_lambda': 1.7549293092194938e-05,
                     'reg_alpha': 14.68267919457715,
                     'subsample': 0.8031450486786944,
                     'alpha': 30
                  }
    model = XGBRegressor(objective='reg:squarederror',
                         n_estimators=5000,
                         random_state=0,
                         **best_params,
                         #tree_method='gpu_hist',
                         #gpu_id=0, 
                         #predictor='gpu_predictor'
                        )

    model.fit(xtrain, ytrain, 
              early_stopping_rounds=300, 
              eval_set=[(xvalid, yvalid)], 
              verbose=1000)
    
    valid_preds = model.predict(xvalid)
    test_preds = model.predict(xtest)
    
    final_valid_preds.update(dict(zip(valid_ids, valid_preds)))
    final_test_preds.append(test_preds)

    rmse = mean_squared_error(yvalid, valid_preds, squared=False)
    rmse_score.append(rmse)
    print(fold, rmse)
    
print(np.mean(rmse_score), np.std(rmse_score))

final_valid_preds = pd.DataFrame.from_dict(final_valid_preds,orient='index').reset_index()
final_valid_preds.columns = ['id', 'pred_4']
final_valid_preds.to_csv('valid_pred4.csv', index=False)

submission_data.target = np.mean(np.column_stack(final_test_preds), axis = 1)
submission_data.columns = ['id', 'pred_4']
submission_data.to_csv('test_pred4.csv', index=False)

[0]	validation_0-rmse:7.17455
[1000]	validation_0-rmse:0.72301
[2000]	validation_0-rmse:0.72154
[3000]	validation_0-rmse:0.72136
[3083]	validation_0-rmse:0.72135
0 0.7213187453491325
[0]	validation_0-rmse:7.16555
[1000]	validation_0-rmse:0.71973
[2000]	validation_0-rmse:0.71803
[2848]	validation_0-rmse:0.71778
1 0.7177792843575497
[0]	validation_0-rmse:7.17125
[1000]	validation_0-rmse:0.71949
[2000]	validation_0-rmse:0.71779
[2945]	validation_0-rmse:0.71754
2 0.7174973968392352
[0]	validation_0-rmse:7.17485
[1000]	validation_0-rmse:0.71800
[2000]	validation_0-rmse:0.71682
[2150]	validation_0-rmse:0.71688
3 0.7167809030107852
[0]	validation_0-rmse:7.17640
[1000]	validation_0-rmse:0.72632
[2000]	validation_0-rmse:0.72444
[3000]	validation_0-rmse:0.72410
[3452]	validation_0-rmse:0.72407
4 0.724044439569008
[0]	validation_0-rmse:7.17319
[1000]	validation_0-rmse:0.72032
[2000]	validation_0-rmse:0.71863
[3000]	validation_0-rmse:0.71834
[3205]	validation_0-rmse:0.71834
5 0.7182868281001097
[0

In [9]:
#model 5: 
# Load the training data
train = pd.read_csv("../input/train10fold/train-folds (1).csv")
test = pd.read_csv("../input/30daysofml/test.csv")
submission_data = pd.read_csv("../input/30daysofmlsubmisison/sample_submission.csv")


# Separate target from features
features = [col for col in train.columns if col not in ('id', 'target', 'kfold')]
object_cols = [col for col in features if 'cat' in col]

ordinal_encoder = OrdinalEncoder()

xtest = test[features]
xtest = xtest.copy()
xtest[object_cols] = ordinal_encoder.fit_transform(xtest[object_cols])

# standardization
numerical_cols = [col for col in train.columns if col.startswith('cont')]

scaler = preprocessing.StandardScaler()
xtest[numerical_cols] = scaler.fit_transform(xtest[numerical_cols])


final_valid_preds = {}
final_test_preds = []
rmse_score = []
for fold in range(10):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    valid_ids = xvalid.id.values.tolist() # to keep all the ids of the validation data

    
    ytrain = xtrain.target
    yvalid = xvalid.target
    
    xtrain = xtrain[features]
    xvalid = xvalid[features]
    
    xtrain[object_cols]= ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.fit_transform(xvalid[object_cols])
    
    xtrain[numerical_cols] = scaler.fit_transform(xtrain[numerical_cols])
    xvalid[numerical_cols] = scaler.fit_transform(xvalid[numerical_cols])
    
    
    xtrain = scaler.fit_transform(xtrain)
    xvalid = scaler.transform(xvalid)
    
    best_params = {'learning_rate': 0.07853392035787837,
                     'colsample_bytree': 0.170759104940733,
                     'max_depth': 3,
                     'reg_lambda': 1.7549293092194938e-05,
                     'reg_alpha': 14.68267919457715,
                     'subsample': 0.8031450486786944,
                     'alpha': 30
                  }
    model = XGBRegressor(objective='reg:squarederror',
                         n_estimators=5000,
                         random_state=0,
                         **best_params,
                         #tree_method='gpu_hist',
                         #gpu_id=0, 
                         #predictor='gpu_predictor'
                        )

    model.fit(xtrain, ytrain, 
              early_stopping_rounds=300, 
              eval_set=[(xvalid, yvalid)], 
              verbose=1000)
    
    valid_preds = model.predict(xvalid)
    test_preds = model.predict(xtest)
    
    final_valid_preds.update(dict(zip(valid_ids, valid_preds)))
    final_test_preds.append(test_preds)

    rmse = mean_squared_error(yvalid, valid_preds, squared=False)
    rmse_score.append(rmse)
    print(fold, rmse)
    
print(np.mean(rmse_score), np.std(rmse_score))

final_valid_preds = pd.DataFrame.from_dict(final_valid_preds,orient='index').reset_index()
final_valid_preds.columns = ['id', 'pred_5']
final_valid_preds.to_csv('valid_pred5.csv', index=False)

submission_data.target = np.mean(np.column_stack(final_test_preds), axis = 1)
submission_data.columns = ['id', 'pred_5']
submission_data.to_csv('test_pred5.csv', index=False)

[0]	validation_0-rmse:7.17455
[1000]	validation_0-rmse:0.72353
[2000]	validation_0-rmse:0.72254
[2356]	validation_0-rmse:0.72258
0 0.7225288329141927
[0]	validation_0-rmse:7.16556
[1000]	validation_0-rmse:0.72132
[2000]	validation_0-rmse:0.72005
[3000]	validation_0-rmse:0.71982
[3237]	validation_0-rmse:0.71986
1 0.7197910302922584
[0]	validation_0-rmse:7.17122
[1000]	validation_0-rmse:0.72067
[2000]	validation_0-rmse:0.71983
[2748]	validation_0-rmse:0.71982
2 0.7197114577004486
[0]	validation_0-rmse:7.17485
[1000]	validation_0-rmse:0.71843
[2000]	validation_0-rmse:0.71761
[2394]	validation_0-rmse:0.71764
3 0.7175822779454631
[0]	validation_0-rmse:7.17642
[1000]	validation_0-rmse:0.72732
[2000]	validation_0-rmse:0.72597
[2736]	validation_0-rmse:0.72589
4 0.7258740487327159
[0]	validation_0-rmse:7.17321
[1000]	validation_0-rmse:0.72110
[2000]	validation_0-rmse:0.71996
[3000]	validation_0-rmse:0.71991
[3056]	validation_0-rmse:0.71991
5 0.7198240601896531
[0]	validation_0-rmse:7.17618
[100

In [10]:
#create new train dataset from valid predictions
df = pd.read_csv('../input/train10fold/train-folds (1).csv')
df_test = pd.read_csv('../input/30daysofml/test.csv')
submission_data = pd.read_csv('../input/30daysofmlsubmisison/sample_submission.csv')

df1 = pd.read_csv('./valid_pred1.csv')
df2 = pd.read_csv('./valid_pred2.csv')
df3 = pd.read_csv('./valid_pred3.csv')
df4 = pd.read_csv('./valid_pred4.csv')
df5 = pd.read_csv('./valid_pred5.csv')

df_test1 = pd.read_csv('./test_pred1.csv')
df_test2 = pd.read_csv('./test_pred2.csv')
df_test3 = pd.read_csv('./test_pred3.csv')
df_test4 = pd.read_csv('./test_pred4.csv')
df_test5 = pd.read_csv('./test_pred5.csv')

df = df.merge(df1, on='id', how='left')
df = df.merge(df2, on='id', how='left')
df = df.merge(df3, on='id', how='left')
df = df.merge(df4, on='id', how='left')
df = df.merge(df5, on='id', how='left')

df_test = df_test.merge(df_test1, on='id', how='left')
df_test = df_test.merge(df_test2, on='id', how='left')
df_test = df_test.merge(df_test3, on='id', how='left')
df_test = df_test.merge(df_test4, on='id', how='left')
df_test = df_test.merge(df_test5, on='id', how='left')

df.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont11,cont12,cont13,target,kfold,pred_1,pred_2,pred_3,pred_4,pred_5
0,1,B,B,B,C,B,B,A,E,C,...,0.377873,0.322401,0.86985,8.113634,3,8.377886,8.336947,8.407614,8.396788,8.349377
1,2,B,B,A,A,B,D,A,F,A,...,0.921701,0.261975,0.465083,8.481233,7,8.243284,8.332997,8.316395,8.316636,8.293005
2,3,A,A,A,C,B,D,A,D,A,...,0.620126,0.541474,0.763846,8.364351,3,8.160936,8.1548,8.167449,8.153219,8.172601
3,4,B,B,A,C,B,D,A,E,C,...,0.71461,0.54015,0.280682,8.049253,6,8.392286,8.326371,8.406107,8.41251,8.360853
4,6,A,A,A,C,B,D,A,E,A,...,0.776742,0.625849,0.250823,7.97226,8,8.235176,8.23493,8.222899,8.280911,8.274248


In [11]:
df_test.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont9,cont10,cont11,cont12,cont13,pred_1,pred_2,pred_3,pred_4,pred_5
0,0,B,B,B,C,B,B,A,E,E,...,0.290258,0.244476,0.087914,0.301831,0.845702,8.117014,8.117014,8.23563,8.116377,8.309135
1,5,A,B,A,C,B,C,A,E,C,...,0.288276,0.549568,0.905097,0.850684,0.69394,8.410134,8.410134,8.355839,8.391582,8.422293
2,15,B,A,A,A,B,B,A,E,D,...,0.427871,0.491667,0.384315,0.376689,0.508099,8.389371,8.389371,8.39706,8.404661,8.393273
3,16,B,B,A,C,B,D,A,E,A,...,0.39109,0.98834,0.411828,0.393585,0.461372,8.481123,8.481123,8.582939,8.516331,8.648811
4,17,B,B,A,C,B,C,A,E,C,...,0.390253,0.648932,0.385935,0.370401,0.900412,8.15746,8.15746,8.239879,8.164751,8.288846


In [14]:
from sklearn.linear_model import LinearRegression

useful_features = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']


xtest = df_test.copy()
xtest = df_test[useful_features]
xtest.head()

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
0,8.117014,8.117014,8.23563,8.116377,8.309135
1,8.410134,8.410134,8.355839,8.391582,8.422293
2,8.389371,8.389371,8.39706,8.404661,8.393273
3,8.481123,8.481123,8.582939,8.516331,8.648811
4,8.15746,8.15746,8.239879,8.164751,8.288846


In [15]:
from sklearn.linear_model import LinearRegression

useful_features = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']


xtest = df_test.copy()
xtest = xtest[useful_features]

final_preds = []
rmse_score = []
for fold in range(10):
    xtrain = df[train.kfold != fold].reset_index(drop=True)
    xvalid = df[train.kfold == fold].reset_index(drop=True)
    
    ytrain = xtrain.target
    yvalid = xvalid.target
        
    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]
    
    model = LinearRegression()
    model.fit(xtrain, ytrain)
    
    valid_preds = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_preds.append(test_preds)

    rmse = mean_squared_error(yvalid, valid_preds, squared=False)
    rmse_score.append(rmse)
    print(fold, rmse)
    
print(np.mean(rmse_score), np.std(rmse_score))

0 0.7208650359767519
1 0.7172846212504096
2 0.7171648960783967
3 0.7160883011756618
4 0.7237816844723031
5 0.7177422556459399
6 0.7172186741185006
7 0.7124847410763324
8 0.7174623623902057
9 0.7126163984894537
0.7172708970673956 0.0031867123503120723


In [16]:
submission_data.target = np.mean(np.column_stack(final_preds), axis = 1)
submission_data.to_csv('submission.csv', index=False)