In [None]:
import lightgbm as lgb
import numpy as np
import optuna
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBRegressor

# Step 2: Load the data

Next, we'll load the training and test data.  

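The code cell below reads the files without `index_col=0`, so `id` stays a regular column rather than becoming the DataFrame index; later cells rely on that column to collect validation ids and to merge predictions. (*If you're not sure how this works, try temporarily adding `index_col=0` and see how it changes the result.*)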

In [None]:
# Read the training set (with its `kfold` column), the test set and the
# sample submission, in that order.
train, test, submission_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/train10fold/train-folds (1).csv",
        "../input/30daysofml/test.csv",
        "../input/30daysofmlsubmisison/sample_submission.csv",
    )
)
# Show the first rows of the training frame
train.head()

In [None]:
# Feature columns: everything except the row id, the target and the fold label
features = [col for col in train.columns if col not in ('id', 'target', 'kfold')]
# Categorical columns are the features whose name contains 'cat'
object_cols = [col for col in features if 'cat' in col]

# Fit the encoder on the TRAINING categories and only transform the test set,
# so a given category receives the same integer code in both frames.
# (Fitting on the test set alone ties the codes to whatever categories happen
# to be present there.)
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(train[object_cols])

# .copy() so the encoded columns are written into an independent frame
xtest = test[features].copy()
xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

In [None]:
# Model 1: XGBoost on ordinal-encoded categorical features, trained once per fold.
# Outputs:
#   valid_pred1.csv - out-of-fold predictions (columns: id, pred_1)
#   test_pred1.csv  - test predictions averaged over the folds (columns: id, pred_1)

# The hyper-parameters are the same for every fold, so they are built once.
# NOTE(review): XGBoost documents `alpha` as an alias of `reg_alpha`, yet both are
# passed here with different values - confirm which one is intended.
best_params = {'learning_rate': 0.07853392035787837,
               'colsample_bytree': 0.170759104940733,
               'max_depth': 3,
               'reg_lambda': 1.7549293092194938e-05,
               'reg_alpha': 14.68267919457715,
               'subsample': 0.8031450486786944,
               'alpha': 30
               }

final_valid_preds = {}   # validation id -> out-of-fold prediction
final_test_preds = []    # one array of test predictions per fold
rmse_score = []          # validation RMSE per fold
for fold in range(10):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)

    valid_ids = xvalid.id.values.tolist()  # to keep all the ids of the validation data

    ytrain = xtrain.target
    yvalid = xvalid.target

    # .copy() so the encoded columns are written into independent frames
    # instead of into slices of `train` / `test`.
    xtrain = xtrain[features].copy()
    xvalid = xvalid[features].copy()
    xtest_fold = test[features].copy()

    # Fit the encoder on the training fold only, then apply that same mapping
    # to the validation fold and the test set so the codes agree everywhere.
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest_fold[object_cols] = ordinal_encoder.transform(xtest_fold[object_cols])

    model = XGBRegressor(objective='reg:squarederror',
                         n_estimators=5000,
                         random_state=0,
                         **best_params,
                         #tree_method='gpu_hist',
                         #gpu_id=0,
                         #predictor='gpu_predictor'
                        )

    model.fit(xtrain, ytrain,
              early_stopping_rounds=300,
              eval_set=[(xvalid, yvalid)],
              verbose=1000)

    valid_preds = model.predict(xvalid)
    test_preds = model.predict(xtest_fold)

    final_valid_preds.update(dict(zip(valid_ids, valid_preds)))
    final_test_preds.append(test_preds)

    # RMSE as the square root of the mean squared error
    rmse = np.sqrt(mean_squared_error(yvalid, valid_preds))
    rmse_score.append(rmse)
    print(fold, rmse)

print(np.mean(rmse_score), np.std(rmse_score))

# Out-of-fold predictions, one row per training id
valid_pred_df = pd.DataFrame.from_dict(final_valid_preds, orient='index').reset_index()
valid_pred_df.columns = ['id', 'pred_1']
valid_pred_df.to_csv('valid_pred1.csv', index=False)

# Build a fresh frame instead of renaming the columns of the shared
# `submission_data`: renaming it here removes its `target` column for every
# later cell that reuses the same object.
test_pred_df = pd.DataFrame({
    'id': test['id'].values,
    'pred_1': np.mean(np.column_stack(final_test_preds), axis=1),
})
test_pred_df.to_csv('test_pred1.csv', index=False)

In [None]:
# Model 2: LightGBM on ordinal-encoded categorical features, trained once per fold.
# Outputs:
#   valid_pred2.csv - out-of-fold predictions (columns: id, pred_2)
#   test_pred2.csv  - test predictions averaged over the folds (columns: id, pred_2)

# The hyper-parameters are the same for every fold, so they are built once.
# NOTE(review): these values are the ones used for the XGBoost models; LightGBM
# documents `alpha` as a Huber/quantile-loss parameter rather than an L1 penalty -
# confirm it has the intended effect here.
best_params = {'learning_rate': 0.07853392035787837,
               'colsample_bytree': 0.170759104940733,
               'max_depth': 3,
               'reg_lambda': 1.7549293092194938e-05,
               'reg_alpha': 14.68267919457715,
               'subsample': 0.8031450486786944,
               'alpha': 30
               }

final_valid_preds = {}   # validation id -> out-of-fold prediction
final_test_preds = []    # one array of test predictions per fold
rmse_score = []          # validation RMSE per fold
for fold in range(10):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)

    valid_ids = xvalid.id.values.tolist()  # to keep all the ids of the validation data

    ytrain = xtrain.target
    yvalid = xvalid.target

    # .copy() so the encoded columns are written into independent frames
    # instead of into slices of `train` / `test`.
    xtrain = xtrain[features].copy()
    xvalid = xvalid[features].copy()
    xtest_fold = test[features].copy()

    # Fit the encoder on the training fold only, then apply that same mapping
    # to the validation fold and the test set so the codes agree everywhere.
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest_fold[object_cols] = ordinal_encoder.transform(xtest_fold[object_cols])

    model = lgb.LGBMRegressor(n_estimators=5000,
                              random_state=0,
                              **best_params,
                              #device='gpu'
                             )

    model.fit(xtrain, ytrain,
              early_stopping_rounds=300,
              eval_set=[(xvalid, yvalid)],
              verbose=1000)

    valid_preds = model.predict(xvalid)
    test_preds = model.predict(xtest_fold)

    final_valid_preds.update(dict(zip(valid_ids, valid_preds)))
    final_test_preds.append(test_preds)

    # RMSE as the square root of the mean squared error
    rmse = np.sqrt(mean_squared_error(yvalid, valid_preds))
    rmse_score.append(rmse)
    print(fold, rmse)

print(np.mean(rmse_score), np.std(rmse_score))

# Out-of-fold predictions, one row per training id
valid_pred_df = pd.DataFrame.from_dict(final_valid_preds, orient='index').reset_index()
valid_pred_df.columns = ['id', 'pred_2']
valid_pred_df.to_csv('valid_pred2.csv', index=False)

# Build the output frame explicitly. Assigning through `submission_data.target`
# only updates a column when one named `target` exists; if the shared frame's
# columns were renamed by an earlier cell, the assignment sets a plain Python
# attribute and the file would be written with stale predictions.
test_pred_df = pd.DataFrame({
    'id': test['id'].values,
    'pred_2': np.mean(np.column_stack(final_test_preds), axis=1),
})
test_pred_df.to_csv('test_pred2.csv', index=False)

In [None]:

# Model 3: XGBoost on one-hot encoded categoricals with all features standardized.
# Outputs:
#   valid_pred3.csv - out-of-fold predictions (columns: id, pred_3)
#   test_pred3.csv  - test predictions averaged over the folds (columns: id, pred_3)

# Load the training data
train = pd.read_csv("../input/train10fold/train-folds (1).csv")
test = pd.read_csv("../input/30daysofml/test.csv")
submission_data = pd.read_csv("../input/30daysofmlsubmisison/sample_submission.csv")

# Separate target from features
features = [col for col in train.columns if col not in ('id', 'target', 'kfold')]
# Separate object columns from features
object_cols = [col for col in features if 'cat' in col]
# Separate numerical columns from the features
numerical_cols = [col for col in train.columns if col.startswith('cont')]

oneHotEnc = preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')
scaler = preprocessing.StandardScaler()

# Raw test features. They are encoded and scaled inside the loop with the
# transformers fitted on each training fold, so the test matrix always has the
# same column layout and scaling as the data the fold's model was trained on.
xtest = test[features].copy()

# The hyper-parameters are the same for every fold, so they are built once.
# NOTE(review): XGBoost documents `alpha` as an alias of `reg_alpha`, yet both are
# passed here with different values - confirm which one is intended.
best_params = {'learning_rate': 0.07853392035787837,
               'colsample_bytree': 0.170759104940733,
               'max_depth': 3,
               'reg_lambda': 1.7549293092194938e-05,
               'reg_alpha': 14.68267919457715,
               'subsample': 0.8031450486786944,
               'alpha': 30
               }

final_valid_preds = {}   # validation id -> out-of-fold prediction
final_test_preds = []    # one array of test predictions per fold
rmse_score = []          # validation RMSE per fold
for fold in range(10):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)

    valid_ids = xvalid.id.values.tolist()  # to keep all the ids of the validation data

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[features]
    xvalid = xvalid[features]

    # Fit the one-hot encoder on the training fold; only transform the others
    xtrain_ohe = oneHotEnc.fit_transform(xtrain[object_cols])
    xvalid_ohe = oneHotEnc.transform(xvalid[object_cols])
    xtest_ohe = oneHotEnc.transform(xtest[object_cols])

    # All three matrices come from the same fitted encoder, so they share one
    # set of generated column names.
    ohe_names = [f'ohe_{i}' for i in range(xtrain_ohe.shape[1])]

    # Replace the raw categorical columns by their one-hot columns
    xtrain = pd.concat([xtrain.drop(object_cols, axis=1),
                        pd.DataFrame(xtrain_ohe, columns=ohe_names, index=xtrain.index)],
                       axis=1)
    xvalid = pd.concat([xvalid.drop(object_cols, axis=1),
                        pd.DataFrame(xvalid_ohe, columns=ohe_names, index=xvalid.index)],
                       axis=1)
    xtest_fold = pd.concat([xtest.drop(object_cols, axis=1),
                            pd.DataFrame(xtest_ohe, columns=ohe_names, index=xtest.index)],
                           axis=1)

    # Standardize with statistics from the training fold only
    xtrain = scaler.fit_transform(xtrain)
    xvalid = scaler.transform(xvalid)
    xtest_fold = scaler.transform(xtest_fold)

    model = XGBRegressor(objective='reg:squarederror',
                         n_estimators=5000,
                         random_state=0,
                         **best_params,
                         #tree_method='gpu_hist',
                         #gpu_id=0,
                         #predictor='gpu_predictor'
                        )

    model.fit(xtrain, ytrain,
              early_stopping_rounds=300,
              eval_set=[(xvalid, yvalid)],
              verbose=1000)

    valid_preds = model.predict(xvalid)
    test_preds = model.predict(xtest_fold)

    final_valid_preds.update(dict(zip(valid_ids, valid_preds)))
    final_test_preds.append(test_preds)

    # RMSE as the square root of the mean squared error
    rmse = np.sqrt(mean_squared_error(yvalid, valid_preds))
    rmse_score.append(rmse)
    print(fold, rmse)

print(np.mean(rmse_score), np.std(rmse_score))

# Out-of-fold predictions, one row per training id
valid_pred_df = pd.DataFrame.from_dict(final_valid_preds, orient='index').reset_index()
valid_pred_df.columns = ['id', 'pred_3']
valid_pred_df.to_csv('valid_pred3.csv', index=False)

# Fold-averaged test predictions, keyed by the ids of the rows that were predicted
test_pred_df = pd.DataFrame({
    'id': test['id'].values,
    'pred_3': np.mean(np.column_stack(final_test_preds), axis=1),
})
test_pred_df.to_csv('test_pred3.csv', index=False)

In [None]:
# Model 4: XGBoost on one-hot encoded categoricals (no feature scaling).
# Outputs:
#   valid_pred4.csv - out-of-fold predictions (columns: id, pred_4)
#   test_pred4.csv  - test predictions averaged over the folds (columns: id, pred_4)

# Load the training data
train = pd.read_csv("../input/train10fold/train-folds (1).csv")
test = pd.read_csv("../input/30daysofml/test.csv")
submission_data = pd.read_csv("../input/30daysofmlsubmisison/sample_submission.csv")

# Separate target from features
features = [col for col in train.columns if col not in ('id', 'target', 'kfold')]
# Separate object columns from features
object_cols = [col for col in features if 'cat' in col]
# Separate numerical columns from the features
numerical_cols = [col for col in train.columns if col.startswith('cont')]

oneHotEnc = preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')

# Raw test features. They are one-hot encoded inside the loop with the encoder
# fitted on each training fold, so the test frame always has the same columns,
# in the same order, as the data the fold's model was trained on.
xtest = test[features].copy()

# The hyper-parameters are the same for every fold, so they are built once.
# NOTE(review): XGBoost documents `alpha` as an alias of `reg_alpha`, yet both are
# passed here with different values - confirm which one is intended.
best_params = {'learning_rate': 0.07853392035787837,
               'colsample_bytree': 0.170759104940733,
               'max_depth': 3,
               'reg_lambda': 1.7549293092194938e-05,
               'reg_alpha': 14.68267919457715,
               'subsample': 0.8031450486786944,
               'alpha': 30
               }

final_valid_preds = {}   # validation id -> out-of-fold prediction
final_test_preds = []    # one array of test predictions per fold
rmse_score = []          # validation RMSE per fold
for fold in range(10):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)

    valid_ids = xvalid.id.values.tolist()  # to keep all the ids of the validation data

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[features]
    xvalid = xvalid[features]

    # Fit the one-hot encoder on the training fold; only transform the others
    xtrain_ohe = oneHotEnc.fit_transform(xtrain[object_cols])
    xvalid_ohe = oneHotEnc.transform(xvalid[object_cols])
    xtest_ohe = oneHotEnc.transform(xtest[object_cols])

    # All three matrices come from the same fitted encoder, so they share one
    # set of generated column names.
    ohe_names = [f'ohe_{i}' for i in range(xtrain_ohe.shape[1])]

    # Replace the raw categorical columns by their one-hot columns
    xtrain = pd.concat([xtrain.drop(object_cols, axis=1),
                        pd.DataFrame(xtrain_ohe, columns=ohe_names, index=xtrain.index)],
                       axis=1)
    xvalid = pd.concat([xvalid.drop(object_cols, axis=1),
                        pd.DataFrame(xvalid_ohe, columns=ohe_names, index=xvalid.index)],
                       axis=1)
    xtest_fold = pd.concat([xtest.drop(object_cols, axis=1),
                            pd.DataFrame(xtest_ohe, columns=ohe_names, index=xtest.index)],
                           axis=1)

    model = XGBRegressor(objective='reg:squarederror',
                         n_estimators=5000,
                         random_state=0,
                         **best_params,
                         #tree_method='gpu_hist',
                         #gpu_id=0,
                         #predictor='gpu_predictor'
                        )

    model.fit(xtrain, ytrain,
              early_stopping_rounds=300,
              eval_set=[(xvalid, yvalid)],
              verbose=1000)

    valid_preds = model.predict(xvalid)
    test_preds = model.predict(xtest_fold)

    final_valid_preds.update(dict(zip(valid_ids, valid_preds)))
    final_test_preds.append(test_preds)

    # RMSE as the square root of the mean squared error
    rmse = np.sqrt(mean_squared_error(yvalid, valid_preds))
    rmse_score.append(rmse)
    print(fold, rmse)

print(np.mean(rmse_score), np.std(rmse_score))

# Out-of-fold predictions, one row per training id
valid_pred_df = pd.DataFrame.from_dict(final_valid_preds, orient='index').reset_index()
valid_pred_df.columns = ['id', 'pred_4']
valid_pred_df.to_csv('valid_pred4.csv', index=False)

# Fold-averaged test predictions, keyed by the ids of the rows that were predicted
test_pred_df = pd.DataFrame({
    'id': test['id'].values,
    'pred_4': np.mean(np.column_stack(final_test_preds), axis=1),
})
test_pred_df.to_csv('test_pred4.csv', index=False)

In [None]:
# Model 5: XGBoost on ordinal-encoded categoricals with standardized numerical columns.
# Outputs:
#   valid_pred5.csv - out-of-fold predictions (columns: id, pred_5)
#   test_pred5.csv  - test predictions averaged over the folds (columns: id, pred_5)

# Load the training data
train = pd.read_csv("../input/train10fold/train-folds (1).csv")
test = pd.read_csv("../input/30daysofml/test.csv")
submission_data = pd.read_csv("../input/30daysofmlsubmisison/sample_submission.csv")

# Separate target from features
features = [col for col in train.columns if col not in ('id', 'target', 'kfold')]
object_cols = [col for col in features if 'cat' in col]
# Columns to standardize
numerical_cols = [col for col in train.columns if col.startswith('cont')]

ordinal_encoder = OrdinalEncoder()
scaler = preprocessing.StandardScaler()

# Raw test features. They are encoded and scaled inside the loop with the
# transformers fitted on each training fold.
xtest = test[features].copy()

# The hyper-parameters are the same for every fold, so they are built once.
# NOTE(review): XGBoost documents `alpha` as an alias of `reg_alpha`, yet both are
# passed here with different values - confirm which one is intended.
best_params = {'learning_rate': 0.07853392035787837,
               'colsample_bytree': 0.170759104940733,
               'max_depth': 3,
               'reg_lambda': 1.7549293092194938e-05,
               'reg_alpha': 14.68267919457715,
               'subsample': 0.8031450486786944,
               'alpha': 30
               }

final_valid_preds = {}   # validation id -> out-of-fold prediction
final_test_preds = []    # one array of test predictions per fold
rmse_score = []          # validation RMSE per fold
for fold in range(10):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    valid_ids = xvalid.id.values.tolist()  # to keep all the ids of the validation data

    ytrain = xtrain.target
    yvalid = xvalid.target

    # .copy() so the transformed columns are written into independent frames
    xtrain = xtrain[features].copy()
    xvalid = xvalid[features].copy()
    xtest_fold = xtest.copy()

    # Fit the encoder on the training fold only and reuse that mapping for the
    # validation fold and the test set.
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest_fold[object_cols] = ordinal_encoder.transform(xtest_fold[object_cols])

    # Standardize the numerical columns with the training fold's statistics and
    # apply the SAME fitted scaler to the validation fold and the test set.
    # Refitting on each split, or re-scaling the whole train/validation matrix
    # without doing the same to the test set, leaves the model predicting on
    # data prepared differently from what it was trained on.
    xtrain[numerical_cols] = scaler.fit_transform(xtrain[numerical_cols])
    xvalid[numerical_cols] = scaler.transform(xvalid[numerical_cols])
    xtest_fold[numerical_cols] = scaler.transform(xtest_fold[numerical_cols])

    model = XGBRegressor(objective='reg:squarederror',
                         n_estimators=5000,
                         random_state=0,
                         **best_params,
                         #tree_method='gpu_hist',
                         #gpu_id=0,
                         #predictor='gpu_predictor'
                        )

    model.fit(xtrain, ytrain,
              early_stopping_rounds=300,
              eval_set=[(xvalid, yvalid)],
              verbose=1000)

    valid_preds = model.predict(xvalid)
    test_preds = model.predict(xtest_fold)

    final_valid_preds.update(dict(zip(valid_ids, valid_preds)))
    final_test_preds.append(test_preds)

    # RMSE as the square root of the mean squared error
    rmse = np.sqrt(mean_squared_error(yvalid, valid_preds))
    rmse_score.append(rmse)
    print(fold, rmse)

print(np.mean(rmse_score), np.std(rmse_score))

# Out-of-fold predictions, one row per training id
valid_pred_df = pd.DataFrame.from_dict(final_valid_preds, orient='index').reset_index()
valid_pred_df.columns = ['id', 'pred_5']
valid_pred_df.to_csv('valid_pred5.csv', index=False)

# Fold-averaged test predictions, keyed by the ids of the rows that were predicted
test_pred_df = pd.DataFrame({
    'id': test['id'].values,
    'pred_5': np.mean(np.column_stack(final_test_preds), axis=1),
})
test_pred_df.to_csv('test_pred5.csv', index=False)

In [None]:
# Assemble the stacking inputs: the original train/test frames with one extra
# column per saved prediction file, joined on `id`.
df = pd.read_csv('../input/train10fold/train-folds (1).csv')
df_test = pd.read_csv('../input/30daysofml/test.csv')
submission_data = pd.read_csv('../input/30daysofmlsubmisison/sample_submission.csv')

# Validation-prediction files written by the five model cells
df1, df2, df3, df4, df5 = map(pd.read_csv, (
    './valid_pred1.csv',
    './valid_pred2.csv',
    './valid_pred3.csv',
    './valid_pred4.csv',
    './valid_pred5.csv',
))

# Test-prediction files written by the five model cells
df_test1, df_test2, df_test3, df_test4, df_test5 = map(pd.read_csv, (
    './test_pred1.csv',
    './test_pred2.csv',
    './test_pred3.csv',
    './test_pred4.csv',
    './test_pred5.csv',
))

# Left-join each prediction frame in turn so every original row is kept
for valid_pred_frame in (df1, df2, df3, df4, df5):
    df = df.merge(valid_pred_frame, on='id', how='left')

for test_pred_frame in (df_test1, df_test2, df_test3, df_test4, df_test5):
    df_test = df_test.merge(test_pred_frame, on='id', how='left')

df.head()

In [None]:
# Preview the first rows of the test frame after the prediction files were merged in
df_test.head()

In [None]:
# Meta-features for the blender: one prediction column per base model
useful_features = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']

# Select the meta-feature columns of the test frame and preview them.
# (A full copy of df_test beforehand is unnecessary: it would be overwritten
# by this selection immediately.)
xtest = df_test[useful_features]
xtest.head()

In [None]:
# Level-2 model: a linear regression fitted on the base models' validation
# predictions, evaluated on the same 10 folds and applied to the test predictions.
useful_features = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']

# Meta-features of the test set
xtest = df_test[useful_features].copy()

final_preds = []   # one array of blended test predictions per fold
rmse_score = []    # validation RMSE per fold
for fold in range(10):
    # Select the folds with df's OWN `kfold` column. Masking `df` with a boolean
    # Series built from a different frame only works while both frames happen to
    # share the exact same index.
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    model = LinearRegression()
    model.fit(xtrain, ytrain)

    valid_preds = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_preds.append(test_preds)

    # RMSE as the square root of the mean squared error
    rmse = np.sqrt(mean_squared_error(yvalid, valid_preds))
    rmse_score.append(rmse)
    print(fold, rmse)

print(np.mean(rmse_score), np.std(rmse_score))

In [None]:
# Final submission: average the blender's test predictions over the folds.
# Item assignment is used on purpose: attribute-style assignment
# (`submission_data.target = ...`) only updates a column that already exists
# and otherwise sets a plain Python attribute without adding a column.
submission_data['target'] = np.mean(np.column_stack(final_preds), axis=1)
submission_data.to_csv('submission.csv', index=False)