In [48]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn import preprocessing
import optuna

# Step 2: Load the data

Next, we'll load the training and test data.  

We set `index_col=0` in the code cell below to use the `id` column to index the DataFrame.  (*If you're not sure how this works, try temporarily removing `index_col=0` and see how it changes the result.*)

In [45]:
# Load the training data
train = pd.read_csv("../input/train10fold/train-folds (1).csv")
test = pd.read_csv("../input/30daysofml/test.csv")
submission_data = pd.read_csv("../input/30daysofmlsubmisison/sample_submission.csv")
# Preview the data
train.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target,kfold
0,1,B,B,B,C,B,B,A,E,C,...,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634,3
1,2,B,B,A,A,B,D,A,F,A,...,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233,7
2,3,A,A,A,C,B,D,A,D,A,...,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351,3
3,4,B,B,A,C,B,D,A,E,C,...,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253,6
4,6,A,A,A,C,B,D,A,E,A,...,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226,8


In [46]:
# Separate target from features
features = [col for col in train.columns if col not in ('id', 'target', 'kfold')]
object_cols = [col for col in features if 'cat' in col]

ordinal_encoder = OrdinalEncoder()

xtest = test[features]
xtest = xtest.copy()
xtest[object_cols] = ordinal_encoder.fit_transform(xtest[object_cols])

In [34]:
final_valid_preds = {}
final_test_preds = []
rmse_score = []
for fold in range(10):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    
    valid_ids = xvalid.id.values.tolist() # to keep all the ids of the validation data
        
    ytrain = xtrain.target
    yvalid = xvalid.target
        
    xtrain = xtrain[features]
    xvalid = xvalid[features]
    
    xtrain[object_cols]= ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.fit_transform(xvalid[object_cols])
    
    best_params = {'learning_rate': 0.07853392035787837,
                     'colsample_bytree': 0.170759104940733,
                     'max_depth': 3,
                     'reg_lambda': 1.7549293092194938e-05,
                     'reg_alpha': 14.68267919457715,
                     'subsample': 0.8031450486786944,
                     'alpha': 30
                  }
    model = XGBRegressor(objective='reg:squarederror',
                         n_estimators=5000,
                         random_state=0,
                         **best_params,
                         tree_method='gpu_hist',
                         gpu_id=0, 
                         predictor='gpu_predictor'
                        )

    model.fit(xtrain, ytrain, 
              early_stopping_rounds=300, 
              eval_set=[(xvalid, yvalid)], 
              verbose=1000)
    
    valid_preds = model.predict(xvalid)
    test_preds = model.predict(xtest)
    
    final_valid_preds.update(dict(zip(valid_ids, valid_preds)))
    final_test_preds.append(test_preds)

    rmse = mean_squared_error(yvalid, valid_preds, squared=False)
    rmse_score.append(rmse)
    print(fold, rmse)
    
print(np.mean(rmse_score), np.std(rmse_score))

final_valid_preds = pd.DataFrame.from_dict(final_valid_preds,orient='index').reset_index()
final_valid_preds.columns = ['id', 'pred_1']
final_valid_preds.to_csv('valid_pred1.csv', index=False)

submission_data.target = np.mean(np.column_stack(final_test_preds), axis = 1)
submission_data.columns = ['id', 'pred_1']
submission_data.to_csv('test_pred1.csv', index=False)

[0]	validation_0-rmse:7.17455
[1000]	validation_0-rmse:0.72290
[2000]	validation_0-rmse:0.72151


KeyboardInterrupt: 

In [47]:
#model 2
import lightgbm as lgb


final_valid_preds = {}
final_test_preds = []
rmse_score = []
for fold in range(10):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    
    valid_ids = xvalid.id.values.tolist() # to keep all the ids of the validation data
        
    ytrain = xtrain.target
    yvalid = xvalid.target
        
    xtrain = xtrain[features]
    xvalid = xvalid[features]
    
    xtrain[object_cols]= ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.fit_transform(xvalid[object_cols])
    
    best_params = {'learning_rate': 0.07853392035787837,
                     'colsample_bytree': 0.170759104940733,
                     'max_depth': 3,
                     'reg_lambda': 1.7549293092194938e-05,
                     'reg_alpha': 14.68267919457715,
                     'subsample': 0.8031450486786944,
                     'alpha': 30
                  }
    model = lgb.LGBMRegressor(n_estimators=5000,
                         random_state=0,
                         **best_params,
                         device='gpu')

    model.fit(xtrain, ytrain, 
              early_stopping_rounds=300, 
              eval_set=[(xvalid, yvalid)], 
              verbose=1000)
    
    valid_preds = model.predict(xvalid)
    test_preds = model.predict(xtest)
    
    final_valid_preds.update(dict(zip(valid_ids, valid_preds)))
    final_test_preds.append(test_preds)

    rmse = mean_squared_error(yvalid, valid_preds, squared=False)
    rmse_score.append(rmse)
    print(fold, rmse)
    
print(np.mean(rmse_score), np.std(rmse_score))

final_valid_preds = pd.DataFrame.from_dict(final_valid_preds,orient='index').reset_index()
final_valid_preds.columns = ['id', 'pred_2']
final_valid_preds.to_csv('valid_pred2.csv', index=False)

submission_data.target = np.mean(np.column_stack(final_test_preds), axis = 1)
submission_data.columns = ['id', 'pred_2']
submission_data.to_csv('test_pred2.csv', index=False)

Please use predictor argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 300 rounds
[1000]	valid_0's l2: 0.522734
[2000]	valid_0's l2: 0.520559
[3000]	valid_0's l2: 0.52022
Early stopping, best iteration is:
[2967]	valid_0's l2: 0.520209
0 0.7212550067667121


Please use predictor argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 300 rounds
[1000]	valid_0's l2: 0.518351
[2000]	valid_0's l2: 0.515797
[3000]	valid_0's l2: 0.515298
Early stopping, best iteration is:
[2992]	valid_0's l2: 0.51529
1 0.7178373633743628


Please use predictor argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 300 rounds
[1000]	valid_0's l2: 0.518059
[2000]	valid_0's l2: 0.51555
[3000]	valid_0's l2: 0.515118
Early stopping, best iteration is:
[2952]	valid_0's l2: 0.515081
2 0.7176913196792831


Please use predictor argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 300 rounds
[1000]	valid_0's l2: 0.515027
[2000]	valid_0's l2: 0.513042
Early stopping, best iteration is:
[2485]	valid_0's l2: 0.512817
3 0.7161121189696065


Please use predictor argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 300 rounds
[1000]	valid_0's l2: 0.528252
[2000]	valid_0's l2: 0.525516
[3000]	valid_0's l2: 0.524993
Early stopping, best iteration is:
[3384]	valid_0's l2: 0.524936
4 0.7245279252822381


Please use predictor argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 300 rounds
[1000]	valid_0's l2: 0.518847
[2000]	valid_0's l2: 0.516447
[3000]	valid_0's l2: 0.51607
[4000]	valid_0's l2: 0.516071
Early stopping, best iteration is:
[3757]	valid_0's l2: 0.516007
5 0.718340210569739


Please use predictor argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 300 rounds
[1000]	valid_0's l2: 0.517954
[2000]	valid_0's l2: 0.515511
[3000]	valid_0's l2: 0.515313
Early stopping, best iteration is:
[2885]	valid_0's l2: 0.515233
6 0.7177983002786967


Please use predictor argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 300 rounds
[1000]	valid_0's l2: 0.511028
[2000]	valid_0's l2: 0.508706
Early stopping, best iteration is:
[2583]	valid_0's l2: 0.508466
7 0.7130682709452598


Please use predictor argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 300 rounds
[1000]	valid_0's l2: 0.51866
[2000]	valid_0's l2: 0.516062
[3000]	valid_0's l2: 0.515364
Early stopping, best iteration is:
[3609]	valid_0's l2: 0.515231
8 0.7177985796760998


Please use predictor argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 300 rounds
[1000]	valid_0's l2: 0.511396
[2000]	valid_0's l2: 0.509107
[3000]	valid_0's l2: 0.508447
Early stopping, best iteration is:
[3376]	valid_0's l2: 0.508272
9 0.7129318283875675
0.7177360923929565 0.003261955631323601


In [73]:
# Load the training data
train = pd.read_csv("../input/train10fold/train-folds (1).csv")
test = pd.read_csv("../input/30daysofml/test.csv")

# Separate target from features
features = [col for col in train.columns if col not in ('id', 'target', 'kfold')]
object_cols = [col for col in features if 'cat' in col]

ordinal_encoder = OrdinalEncoder()

xtest = test[features]
xtest = xtest.copy()
xtest[object_cols] = ordinal_encoder.fit_transform(xtest[object_cols])

#model 3
from catboost import CatBoostRegressor

final_valid_preds = {}
final_test_preds = []
rmse_score = []
for fold in range(10):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    
    valid_ids = xvalid.id.values.tolist() # to keep all the ids of the validation data
        
    ytrain = xtrain.target
    yvalid = xvalid.target
        
    xtrain = xtrain[features]
    xvalid = xvalid[features]
    
    xtrain[object_cols]= ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.fit_transform(xvalid[object_cols])
    
    best_params = {'learning_rate': 0.07853392035787837,
                     'max_depth': 3,
                     'reg_lambda': 1.7549293092194938e-05,
                     'subsample': 0.8031450486786944
                  }
    model = CatBoostRegressor(iterations=200,
                         random_state=0,
                         **best_params,
                         task_type='GPU',
                         devices='0'
                        )

    model.fit(xtrain, ytrain, 
              early_stopping_rounds=300, 
              eval_set=[(xvalid, yvalid)], 
              verbose=1000)
    
    valid_preds = model.predict(xvalid)
    test_preds = model.predict(xtest)
    
    final_valid_preds.update(dict(zip(valid_ids, valid_preds)))
    final_test_preds.append(test_preds)

    rmse = mean_squared_error(yvalid, valid_preds, squared=False)
    rmse_score.append(rmse)
    print(fold, rmse)
    
print(np.mean(rmse_score), np.std(rmse_score))

final_valid_preds = pd.DataFrame.from_dict(final_valid_preds,orient='index').reset_index()
final_valid_preds.columns = ['id', 'pred_3']
final_valid_preds.to_csv('valid_pred3.csv', index=False)

submission_data.target = np.mean(np.column_stack(final_test_preds), axis = 1)
submission_data.columns = ['id', 'pred_3']
submission_data.to_csv('test_pred3.csv', index=False)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 0.7456224	test: 0.7487633	best: 0.7487633 (0)	total: 6.94ms	remaining: 1.38s
199:	learn: 0.7323407	test: 0.7358480	best: 0.7358480 (199)	total: 1.14s	remaining: 0us
bestTest = 0.7358479908
bestIteration = 199
0 0.7358479150269729
0:	learn: 0.7459196	test: 0.7458535	best: 0.7458535 (0)	total: 6.83ms	remaining: 1.36s
199:	learn: 0.7324704	test: 0.7332655	best: 0.7332655 (199)	total: 1.13s	remaining: 0us
bestTest = 0.7332655405
bestIteration = 199
1 0.7332655658536577
0:	learn: 0.7460566	test: 0.7444133	best: 0.7444133 (0)	total: 6.67ms	remaining: 1.33s
199:	learn: 0.7323208	test: 0.7320975	best: 0.7320975 (199)	total: 1.07s	remaining: 0us
bestTest = 0.7320975101
bestIteration = 199
2 0.732097571578492
0:	learn: 0.7460805	test: 0.7441788	best: 0.7441788 (0)	total: 7.16ms	remaining: 1.43s
199:	learn: 0.7331103	test: 0.7316573	best: 0.7316573 (199)	total: 1.17s	remaining: 0us
bestTest = 0.7316572717
bestIteration = 199
3 0.7316573611236431
0:	learn: 0.7448066	test: 0.7556985	best:



In [None]:
# Load the training data
train = pd.read_csv("../input/train-kfolds/train-folds.csv")
test = pd.read_csv("../input/30-days-of-ml/test.csv")
submission_data = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")
# Preview the data
train.head()

# Separate target from features
features = [col for col in train.columns if col not in ('id', 'target', 'kfold')]
object_cols = [col for col in features if 'cat' in col]

ordinal_encoder = OrdinalEncoder()

xtest = test[features]
xtest = xtest.copy()
xtest[object_cols] = ordinal_encoder.fit_transform(xtest[object_cols])

# standardization
numerical_cols = [col for col in train.columns if col.startswith('cont')]

scaler = preprocessing.StandardScaler()
xtest[numerical_cols] = scaler.fit_transform(xtest[numerical_cols])


final_preds = []
rmse_valid = []
for fold in range(5):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    
    ytrain = xtrain.target
    yvalid = xvalid.target
    
    xtrain = xtrain[features]
    xvalid = xvalid[features]
    
    xtrain[object_cols]= ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.fit_transform(xvalid[object_cols])
    
    xtrain[numerical_cols] = scaler.fit_transform(xtrain[numerical_cols])
    xvalid[numerical_cols] = scaler.fit_transform(xvalid[numerical_cols])
    
    
    model = XGBRegressor(colsample_bytree=0.2,
                         learning_rate=0.6,max_depth=4, alpha=30,
                         n_estimators=200, tree_method='gpu_hist',
                         gpu_id=0, predictor='gpu_predictor')

    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_preds.append(test_preds)
    
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    rmse_valid.append(rmse)
    print(fold, rmse)
    
print(np.mean(rmse_valid), np.std(rmse_valid))

In [74]:

#Model 3

# Load the training data
train = pd.read_csv("../input/train10fold/train-folds (1).csv")
test = pd.read_csv("../input/30daysofml/test.csv")
submission_data = pd.read_csv("../input/30daysofmlsubmisison/sample_submission.csv")

# Separate target from features
features = [col for col in train.columns if col not in ('id', 'target', 'kfold')]
# Separate object columns from features
object_cols = [col for col in features if 'cat' in col]
# Separate numerical columns from the features
numerical_cols = [col for col in train.columns if col.startswith('cont')]


oneHotEnc = preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')

xtest = test[features]
xtest = xtest.copy()
xtest_ohe = oneHotEnc.fit_transform(xtest[object_cols])
xtest_ohe = pd.DataFrame(xtest_ohe, columns=[f'ohe_{i}' for i
                                             in range(xtest_ohe.shape[1])])
xtest = pd.concat([xtest, xtest_ohe], axis = 1)
xtest = xtest.drop(object_cols, axis = 1)

scaler = preprocessing.StandardScaler()
xtest = scaler.fit_transform(xtest)

final_valid_preds = {}
final_test_preds = []
rmse_score = []
for fold in range(10):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    
    valid_ids = xvalid.id.values.tolist() # to keep all the ids of the validation data
        
    ytrain = xtrain.target
    yvalid = xvalid.target
        
    xtrain = xtrain[features]
    xvalid = xvalid[features]
    
    xtrain_ohe= oneHotEnc.fit_transform(xtrain[object_cols])
    xvalid_ohe = oneHotEnc.transform(xvalid[object_cols])
    
    xtrain_ohe = pd.DataFrame(xtrain_ohe, columns=[f'ohe_{i}' for i
                                             in range(xtrain_ohe.shape[1])])
    xvalid_ohe = pd.DataFrame(xvalid_ohe, columns=[f'ohe_{i}' for i
                                             in range(xvalid_ohe.shape[1])])
    
    xtrain = pd.concat([xtrain, xtrain_ohe], axis = 1)
    xvalid = pd.concat([xvalid, xvalid_ohe], axis = 1)
    
    xtrain = xtrain.drop(object_cols, axis = 1)
    xvalid = xvalid.drop(object_cols, axis = 1)
    
    xtrain = scaler.fit_transform(xtrain)
    xvalid = scaler.transform(xvalid)
    
    best_params = {'learning_rate': 0.07853392035787837,
                     'colsample_bytree': 0.170759104940733,
                     'max_depth': 3,
                     'reg_lambda': 1.7549293092194938e-05,
                     'reg_alpha': 14.68267919457715,
                     'subsample': 0.8031450486786944,
                     'alpha': 30
                  }
    model = XGBRegressor(objective='reg:squarederror',
                         n_estimators=5000,
                         random_state=0,
                         **best_params,
                         tree_method='gpu_hist',
                         gpu_id=0, 
                         predictor='gpu_predictor'
                        )

    model.fit(xtrain, ytrain, 
              early_stopping_rounds=300, 
              eval_set=[(xvalid, yvalid)], 
              verbose=1000)
    
    valid_preds = model.predict(xvalid)
    test_preds = model.predict(xtest)
    
    final_valid_preds.update(dict(zip(valid_ids, valid_preds)))
    final_test_preds.append(test_preds)

    rmse = mean_squared_error(yvalid, valid_preds, squared=False)
    rmse_score.append(rmse)
    print(fold, rmse)
    
print(np.mean(rmse_score), np.std(rmse_score))

final_valid_preds = pd.DataFrame.from_dict(final_valid_preds,orient='index').reset_index()
final_valid_preds.columns = ['id', 'pred_3']
final_valid_preds.to_csv('valid_pred3.csv', index=False)

submission_data.target = np.mean(np.column_stack(final_test_preds), axis = 1)
submission_data.columns = ['id', 'pred_3']
submission_data.to_csv('test_pred3.csv', index=False)

[0]	validation_0-rmse:7.17455
[1000]	validation_0-rmse:0.72301
[2000]	validation_0-rmse:0.72151
[2784]	validation_0-rmse:0.72136
0 0.7213148461565618
[0]	validation_0-rmse:7.16555
[1000]	validation_0-rmse:0.71977
[2000]	validation_0-rmse:0.71808
[3000]	validation_0-rmse:0.71777
[3163]	validation_0-rmse:0.71781
1 0.71774507954462
[0]	validation_0-rmse:7.17125
[1000]	validation_0-rmse:0.71940
[2000]	validation_0-rmse:0.71775
[2879]	validation_0-rmse:0.71750
2 0.7174274006019555
[0]	validation_0-rmse:7.17485
[1000]	validation_0-rmse:0.71797
[2000]	validation_0-rmse:0.71682
[2613]	validation_0-rmse:0.71684
3 0.7167750349291848
[0]	validation_0-rmse:7.17640
[1000]	validation_0-rmse:0.72627
[2000]	validation_0-rmse:0.72432
[3000]	validation_0-rmse:0.72396
[3622]	validation_0-rmse:0.72389
4 0.7238720665529246
[0]	validation_0-rmse:7.17319
[1000]	validation_0-rmse:0.72032
[2000]	validation_0-rmse:0.71859
[3000]	validation_0-rmse:0.71828
[3173]	validation_0-rmse:0.71830
5 0.7182100358864684
[0]

In [69]:
#Model 4
# Load the training data
train = pd.read_csv("../input/train10fold/train-folds (1).csv")
test = pd.read_csv("../input/30daysofml/test.csv")
submission_data = pd.read_csv("../input/30daysofmlsubmisison/sample_submission.csv")

# Separate target from features
features = [col for col in train.columns if col not in ('id', 'target', 'kfold')]
# Separate object columns from features
object_cols = [col for col in features if 'cat' in col]
# Separate numerical columns from the features
numerical_cols = [col for col in train.columns if col.startswith('cont')]


oneHotEnc = preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')

xtest = test[features]
xtest = xtest.copy()
xtest_ohe = oneHotEnc.fit_transform(xtest[object_cols])
xtest_ohe = pd.DataFrame(xtest_ohe, columns=[f'ohe_{i}' for i
                                             in range(xtest_ohe.shape[1])])
xtest = pd.concat([xtest, xtest_ohe], axis = 1)
xtest = xtest.drop(object_cols, axis = 1)

#scaler = preprocessing.StandardScaler()
#xtest = scaler.fit_transform(xtest)

final_valid_preds = {}
final_test_preds = []
rmse_score = []
for fold in range(10):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    
    valid_ids = xvalid.id.values.tolist() # to keep all the ids of the validation data
        
    ytrain = xtrain.target
    yvalid = xvalid.target
        
    xtrain = xtrain[features]
    xvalid = xvalid[features]
    
    xtrain_ohe= oneHotEnc.fit_transform(xtrain[object_cols])
    xvalid_ohe = oneHotEnc.transform(xvalid[object_cols])
    
    xtrain_ohe = pd.DataFrame(xtrain_ohe, columns=[f'ohe_{i}' for i
                                             in range(xtrain_ohe.shape[1])])
    xvalid_ohe = pd.DataFrame(xvalid_ohe, columns=[f'ohe_{i}' for i
                                             in range(xvalid_ohe.shape[1])])
    
    xtrain = pd.concat([xtrain, xtrain_ohe], axis = 1)
    xvalid = pd.concat([xvalid, xvalid_ohe], axis = 1)
    
    xtrain = xtrain.drop(object_cols, axis = 1)
    xvalid = xvalid.drop(object_cols, axis = 1)
    
    best_params = {'learning_rate': 0.07853392035787837,
                     'colsample_bytree': 0.170759104940733,
                     'max_depth': 3,
                     'reg_lambda': 1.7549293092194938e-05,
                     'reg_alpha': 14.68267919457715,
                     'subsample': 0.8031450486786944,
                     'alpha': 30
                  }
    model = XGBRegressor(objective='reg:squarederror',
                         n_estimators=5000,
                         random_state=0,
                         **best_params,
                         tree_method='gpu_hist',
                         gpu_id=0, 
                         predictor='gpu_predictor'
                        )

    model.fit(xtrain, ytrain, 
              early_stopping_rounds=300, 
              eval_set=[(xvalid, yvalid)], 
              verbose=1000)
    
    valid_preds = model.predict(xvalid)
    test_preds = model.predict(xtest)
    
    final_valid_preds.update(dict(zip(valid_ids, valid_preds)))
    final_test_preds.append(test_preds)

    rmse = mean_squared_error(yvalid, valid_preds, squared=False)
    rmse_score.append(rmse)
    print(fold, rmse)
    
print(np.mean(rmse_score), np.std(rmse_score))

final_valid_preds = pd.DataFrame.from_dict(final_valid_preds,orient='index').reset_index()
final_valid_preds.columns = ['id', 'pred_4']
final_valid_preds.to_csv('valid_pred4.csv', index=False)

submission_data.target = np.mean(np.column_stack(final_test_preds), axis = 1)
submission_data.columns = ['id', 'pred_4']
submission_data.to_csv('test_pred4.csv', index=False)

[0]	validation_0-rmse:7.17455
[1000]	validation_0-rmse:0.72301
[2000]	validation_0-rmse:0.72154
[3000]	validation_0-rmse:0.72136
[3084]	validation_0-rmse:0.72135
0 0.7213187453491325
[0]	validation_0-rmse:7.16555
[1000]	validation_0-rmse:0.71973
[2000]	validation_0-rmse:0.71803
[2848]	validation_0-rmse:0.71778
1 0.7177792843575497
[0]	validation_0-rmse:7.17125
[1000]	validation_0-rmse:0.71949
[2000]	validation_0-rmse:0.71779
[2945]	validation_0-rmse:0.71754
2 0.7174973968392352
[0]	validation_0-rmse:7.17485
[1000]	validation_0-rmse:0.71800
[2000]	validation_0-rmse:0.71682
[2150]	validation_0-rmse:0.71688
3 0.7167809030107852
[0]	validation_0-rmse:7.17640
[1000]	validation_0-rmse:0.72632
[2000]	validation_0-rmse:0.72444
[3000]	validation_0-rmse:0.72410
[3451]	validation_0-rmse:0.72407
4 0.724044439569008
[0]	validation_0-rmse:7.17319
[1000]	validation_0-rmse:0.72032
[2000]	validation_0-rmse:0.71863
[3000]	validation_0-rmse:0.71834
[3204]	validation_0-rmse:0.71834
5 0.7182868281001097
[0

In [None]:
#create new train dataset from valid predictions
df = pd.read_csv('../input/train10fold/train-folds (1).csv')
df_test = pd.read_csv('../input/30daysofml/test.csv')
submission_data = pd.read_csv('../input/30daysofmlsubmisison/sample_submission.csv')

df1 = pd.read_csv('./valid_pred1.csv')
df2 = pd.read_csv('./valid_pred2.csv')
df3 = pd.read_csv('./valid_pred3.csv')
df4 = pd.read_csv('./valid_pred4.csv')

df_test1 = pd.read_csv('./test_pred1.csv')
df_test2 = pd.read_csv('./test_pred2.csv')
df_test3 = pd.read_csv('./test_pred3.csv')
df_test4 = pd.read_csv('./test_pred4.csv')

df = df.merge(df1, on='id', how='left')
df = df.merge(df1, on='id', how='left')
df = df.merge(df1, on='id', how='left')
df = df.merge(df1, on='id', how='left')

df_test = df_test.merge(df_test1, on='id', how='left')
df_test = df_test.merge(df_test2, on='id', how='left')
df_test = df_test.merge(df_test3, on='id', how='left')
df_test = df_test.merge(df_test4, on='id', how='left')

df.head()

In [None]:
useful_features = ['pred1', 'pred2', 'pred3', 'pred4']
df_test = df_test[useful_features]

x_test = df_test.copy()

final_preds = []
rmse_score = []
for fold in range(10):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    
    ytrain = xtrain.target
    yvalid = xvalid.target
        
    xtrain = xtrain[features]
    xvalid = xvalid[features]
    
    xtrain[object_cols]= ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.fit_transform(xvalid[object_cols])
    
    best_params = {'learning_rate': 0.07853392035787837,
                     'max_depth': 3,
                     'subsample': 0.8031450486786944,
                     'alpha': 0.9
                  }
    model = LinearRegression(random_state=0)
    model.fit(xtrain, ytrain)
    
    valid_preds = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_preds.append(test_preds)

    rmse = mean_squared_error(yvalid, valid_preds, squared=False)
    rmse_score.append(rmse)
    print(fold, rmse)
    
print(np.mean(rmse_score), np.std(rmse_score))

In [None]:
submission_data.target = np.mean(np.column_stack(final_preds), axis = 1)
submission_data.columns = ['id', 'pred_4']
submission_data.to_csv('submission.csv', index=False)