In [None]:
import warnings
warnings.filterwarnings('ignore')
import re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb
from nltk.stem import WordNetLemmatizer
stemmer = WordNetLemmatizer()
from scipy import sparse
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.decomposition import TruncatedSVD

In [None]:
# Load the train and test tables, then add the modelling target.
TRAIN_CSV = 'C:\\Kaggle\\BooksPrice\\Participants_Data\\Data_Train08.csv'
TEST_CSV = 'C:\\Kaggle\\BooksPrice\\Participants_Data\\Data_Test08.csv'
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

# Every model below is fitted on log10(Price + 1); the final cell inverts
# this with 10**x - 1 before writing the submission.
train['Price_Log'] = np.log10(1 + train['Price'])
train.hist(column='Price_Log')

In [None]:
# Target columns must never be used as model inputs. 'Price_Log' is created
# in the loading cell, so if it is not already one of the first four CSV
# columns it is appended at the END of `train` and a plain positional slice
# would pick it up (target leakage, and a KeyError on `test`, which has no
# such column).
TARGET_COLUMNS = {'Price', 'Price_Log'}

# Features are still selected positionally (everything from the 5th column on).
# NOTE(review): assumes the first four columns are identifiers/targets
# (e.g. id, FOLD_NUM, Price, Price_Log) -- confirm against the CSV layout.
FeatureNames = [col for col in train.columns[4:] if col not in TARGET_COLUMNS]
FeatureNames

In [None]:
fold_list = list(train.FOLD_NUM.unique())
fold_list.sort()
fold_list

In [None]:
training_cv_predictions = train[['id','FOLD_NUM','Price','Price_Log']]

In [None]:
# LightGBM training parameters used by the first-level LightGBM CV loop.
param = dict(
    learning_rate=0.001,
    max_depth=10,
    min_data_in_leaf=3,
    bagging_freq=5,
    bagging_fraction=0.3,
    feature_fraction=0.3,
    boost='gbdt',
    objective='regression',
    metric='rmse',
    seed=534,
)

In [None]:
# ---- First-level model: LightGBM, scored out-of-fold ----
# For each fold: train on the other folds, early-stop on the held-out fold,
# store the held-out predictions and add this fold's test predictions to a
# running sum (a later cell divides the sum by the number of folds).

# The test feature matrix is the same for every fold, so build it once.
test_tf = test[FeatureNames].values

scored_folds = []       # held-out frames (with prediction column), one per fold
sub_data_preds = None   # running SUM of per-fold test predictions

for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # .copy(): a column is added below; writing to a filtered slice of `train`
    # is chained assignment (the resulting pandas warning is hidden because
    # the notebook silences all warnings).
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    temp_train_tf = temp_train[FeatureNames].values
    temp_val_tf = temp_val[FeatureNames].values

    trn_data = lgb.Dataset(temp_train_tf, label=temp_train['Price_Log'])
    val_data = lgb.Dataset(temp_val_tf, label=temp_val['Price_Log'])

    # The round cap is effectively unlimited; early stopping on the held-out
    # fold decides the actual number of trees.
    model = lgb.train(param, trn_data, 2000000, valid_sets = val_data, verbose_eval=50, early_stopping_rounds = 300)

    temp_val['Price_Log_Pred_LGB'] = model.predict(temp_val_tf, num_iteration=model.best_iteration)

    print("Fold RMSLE = ",sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_LGB'])))

    scored_folds.append(temp_val)
    fold_test_preds = model.predict(test_tf, num_iteration=model.best_iteration)
    sub_data_preds = fold_test_preds if sub_data_preds is None else sub_data_preds + fold_test_preds

# Concatenate once instead of growing a DataFrame inside the loop.
CV_SCORED_DATA = pd.concat(scored_folds, ignore_index=True)

In [None]:
print("CV RMSLE = ",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LGB'])))
print("CV 1-RMSLE = ",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LGB'])))

In [None]:
training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_LGB']], on='id')
training_cv_predictions.head()

In [None]:
sub_data_preds = sub_data_preds / len(fold_list)
test['Price_Log_Pred_LGB'] = sub_data_preds
test['Price_Log_Pred_LGB'].describe()

In [None]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
def nn_model(input_shape):
    """Build the (uncompiled) feed-forward regression network.

    Architecture: Dense(30, elu) -> Dropout(0.05) -> Dense(15, elu)
    -> Dropout(0.05) -> Dense(1).

    Parameters
    ----------
    input_shape : int
        Number of input features; passed to the first layer as ``input_dim``.

    Returns
    -------
    Sequential
        The assembled model; the caller is responsible for compiling it.
    """
    layer_stack = [
        Dense(30, input_dim=input_shape, activation='elu'),
        Dropout(0.05),
        Dense(15, activation='elu'),
        Dropout(0.05),
        Dense(1),  # single linear output unit for the regression target
    ]
    network = Sequential()
    for layer in layer_stack:
        network.add(layer)
    return network

In [None]:
# ---- First-level model: Keras feed-forward network, scored out-of-fold ----
# For each fold: train with early stopping on the held-out fold, checkpoint the
# best weights to disk, reload that checkpoint and use it for the held-out and
# test predictions. Test predictions are summed here and averaged in a later cell.

keras01_Models = []     # paths of the per-fold checkpoint files
batch_size = 64
model_weights_save_path = 'C:/Kaggle/BooksPrice/KerasModels/'

# The test feature matrix is the same for every fold, so build it once.
test_tf = test[FeatureNames].values

scored_folds = []       # held-out frames (with prediction column), one per fold
sub_data_preds = None   # running SUM of per-fold test predictions

for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # .copy(): a column is added below; writing to a filtered slice of `train`
    # is chained assignment (the warning is hidden by the global warnings filter).
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    temp_train_tf = temp_train[FeatureNames].values
    temp_val_tf = temp_val[FeatureNames].values

    model = nn_model(temp_train_tf.shape[1])
    model.compile(loss = 'mean_squared_error', optimizer='adam',metrics = ['mean_squared_error'])
    print(model.summary())

    file_name = "20191018_Keras14_Model_Weights_Fold_"+str(fold_num)+'.h5'
    final_path = model_weights_save_path+file_name
    print("Model Weights File Name : ",final_path)
    keras01_Models.append(final_path)

    # Both callbacks rely on their default monitored quantity.
    es = EarlyStopping(mode='min',
                       verbose=1,
                       patience=10)

    checkpointer = ModelCheckpoint(filepath=final_path,
                                   mode='min',
                                   verbose=1,
                                   save_best_only=True)

    history = model.fit(temp_train_tf, temp_train['Price_Log'].values,
                        epochs = 500,
                        batch_size = batch_size,
                        verbose = 1,
                        shuffle = 'batch',
                        validation_data = (temp_val_tf, temp_val['Price_Log'].values),
                        callbacks = [es,checkpointer])

    # Predict with the best checkpoint rather than the last-epoch weights.
    loaded_model = load_model(final_path)

    # The network ends in Dense(1), so predict() yields an (n, 1) array;
    # flatten it so the column and the running test sum are plain 1-D vectors.
    temp_val['Price_Log_Pred_Keras'] = loaded_model.predict(temp_val_tf).ravel()

    print('Completed for Fold - ',fold_num)
    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_Keras'])))

    scored_folds.append(temp_val)
    fold_test_preds = loaded_model.predict(test_tf).ravel()
    sub_data_preds = fold_test_preds if sub_data_preds is None else sub_data_preds + fold_test_preds

# Concatenate once instead of growing a DataFrame inside the loop.
CV_SCORED_DATA = pd.concat(scored_folds, ignore_index=True)

In [None]:
print(keras01_Models)
print("Keras 01 CV RMSLE = ",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_Keras'])))
print("CV 1-RMSLE = ",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_Keras'])))

In [None]:
training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_Keras']], on='id')
training_cv_predictions.head()

In [None]:
sub_data_preds = sub_data_preds / len(fold_list)
test['Price_Log_Pred_Keras'] = sub_data_preds
test['Price_Log_Pred_Keras'].describe()

In [None]:
training_cv_predictions.to_csv("C:\\Kaggle\\BooksPrice\\CV Scrd Trn Datasets\\20191025_Ensemble17_DS.csv", index=False)
test.to_csv("C:\\Kaggle\\BooksPrice\\CV Scrd Tst Datasets\\20191025_Ensemble17_DS.csv", index=False)

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# ---- First-level model: Ridge regression, scored out-of-fold ----
# For each fold: fit on the other folds, predict the held-out fold and add this
# fold's test predictions to a running sum (averaged in a later cell).

# The test feature matrix is the same for every fold, so build it once.
test_tf = test[FeatureNames].values

scored_folds = []       # held-out frames (with prediction column), one per fold
sub_data_preds = None   # running SUM of per-fold test predictions

for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # .copy(): a column is added below; writing to a filtered slice of `train`
    # is chained assignment (the warning is hidden by the global warnings filter).
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    temp_train_tf = temp_train[FeatureNames].values
    temp_val_tf = temp_val[FeatureNames].values

    # NOTE(review): `normalize=` is deprecated/removed in newer scikit-learn
    # releases -- confirm the pinned version before upgrading.
    ridgereg = Ridge(alpha=0.03,normalize=True)
    ridgereg.fit(temp_train_tf,temp_train['Price_Log'])

    temp_val['Price_Log_Pred_RIDGE'] = ridgereg.predict(temp_val_tf)

    print('Completed for Fold - ',fold_num)
    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_RIDGE'])))

    scored_folds.append(temp_val)
    fold_test_preds = ridgereg.predict(test_tf)
    sub_data_preds = fold_test_preds if sub_data_preds is None else sub_data_preds + fold_test_preds

# Concatenate once instead of growing a DataFrame inside the loop.
CV_SCORED_DATA = pd.concat(scored_folds, ignore_index=True)

In [None]:
print("RIDGE 01 CV RMSLE = ",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_RIDGE'])))
print("CV 1-RMSLE = ",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_RIDGE'])))

In [None]:
training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_RIDGE']], on='id')
training_cv_predictions.head()

In [None]:
sub_data_preds = sub_data_preds / len(fold_list)
test['Price_Log_Pred_RIDGE'] = sub_data_preds
test['Price_Log_Pred_RIDGE'].describe()

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# ---- First-level model: Lasso regression, scored out-of-fold ----
# For each fold: fit on the other folds, predict the held-out fold and add this
# fold's test predictions to a running sum (averaged in a later cell).

# The test feature matrix is the same for every fold, so build it once.
test_tf = test[FeatureNames].values

scored_folds = []       # held-out frames (with prediction column), one per fold
sub_data_preds = None   # running SUM of per-fold test predictions

for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # .copy(): a column is added below; writing to a filtered slice of `train`
    # is chained assignment (the warning is hidden by the global warnings filter).
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    temp_train_tf = temp_train[FeatureNames].values
    temp_val_tf = temp_val[FeatureNames].values

    # max_iter must be an integer; the float literal 1e6 is rejected by
    # scikit-learn's parameter validation in newer releases.
    # NOTE(review): `normalize=` is deprecated/removed in newer scikit-learn
    # releases -- confirm the pinned version before upgrading.
    lassoreg = Lasso(alpha=0.000009,normalize=True, max_iter=1000000)
    lassoreg.fit(temp_train_tf,temp_train['Price_Log'])

    temp_val['Price_Log_Pred_LASSO'] = lassoreg.predict(temp_val_tf)

    print('Completed for Fold - ',fold_num)
    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_LASSO'])))

    scored_folds.append(temp_val)
    fold_test_preds = lassoreg.predict(test_tf)
    sub_data_preds = fold_test_preds if sub_data_preds is None else sub_data_preds + fold_test_preds

# Concatenate once instead of growing a DataFrame inside the loop.
CV_SCORED_DATA = pd.concat(scored_folds, ignore_index=True)

In [None]:
print("LASSO 01 CV RMSLE = ",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LASSO'])))
print("CV 1-RMSLE = ",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LASSO'])))

In [None]:
training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_LASSO']], on='id')
training_cv_predictions.head()

In [None]:
sub_data_preds = sub_data_preds / len(fold_list)
test['Price_Log_Pred_LASSO'] = sub_data_preds
test['Price_Log_Pred_LASSO'].describe()

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# ---- First-level model: k-nearest-neighbours regressor, scored out-of-fold ----
# For each fold: fit on the other folds, predict the held-out fold and add this
# fold's test predictions to a running sum (averaged in a later cell).

# The test feature matrix is the same for every fold, so build it once.
test_tf = test[FeatureNames].values

scored_folds = []       # held-out frames (with prediction column), one per fold
sub_data_preds = None   # running SUM of per-fold test predictions

for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # .copy(): a column is added below; writing to a filtered slice of `train`
    # is chained assignment (the warning is hidden by the global warnings filter).
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    temp_train_tf = temp_train[FeatureNames].values
    temp_val_tf = temp_val[FeatureNames].values

    KNN = KNeighborsRegressor(n_neighbors = 51)
    KNN.fit(temp_train_tf,temp_train['Price_Log'])

    temp_val['Price_Log_Pred_KNN'] = KNN.predict(temp_val_tf)

    print('Completed for Fold - ',fold_num)
    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_KNN'])))

    scored_folds.append(temp_val)
    fold_test_preds = KNN.predict(test_tf)
    sub_data_preds = fold_test_preds if sub_data_preds is None else sub_data_preds + fold_test_preds

# Concatenate once instead of growing a DataFrame inside the loop.
CV_SCORED_DATA = pd.concat(scored_folds, ignore_index=True)

In [None]:
print("KNN 01 CV RMSLE = ",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_KNN'])))
print("CV 1-RMSLE = ",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_KNN'])))

In [None]:
training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_KNN']], on='id')
training_cv_predictions.head()

In [None]:
sub_data_preds = sub_data_preds / len(fold_list)
test['Price_Log_Pred_KNN'] = sub_data_preds
test['Price_Log_Pred_KNN'].describe()

In [None]:
training_cv_predictions.to_csv("C:\\Kaggle\\BooksPrice\\CV Scrd Trn Datasets\\20191025_Ensemble17_DS.csv", index=False)
test.to_csv("C:\\Kaggle\\BooksPrice\\CV Scrd Tst Datasets\\20191025_Ensemble17_DS.csv", index=False)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# ---- First-level model: random forest regressor, scored out-of-fold ----
# For each fold: fit on the other folds, predict the held-out fold and add this
# fold's test predictions to a running sum (averaged in a later cell).

# The test feature matrix is the same for every fold, so build it once.
test_tf = test[FeatureNames].values

scored_folds = []       # held-out frames (with prediction column), one per fold
sub_data_preds = None   # running SUM of per-fold test predictions

for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # .copy(): a column is added below; writing to a filtered slice of `train`
    # is chained assignment (the warning is hidden by the global warnings filter).
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    temp_train_tf = temp_train[FeatureNames].values
    temp_val_tf = temp_val[FeatureNames].values

    # Fixed random_state so every fold's forest is reproducible.
    RF = RandomForestRegressor(n_estimators = 100, min_samples_leaf = 1, max_features = 9, random_state = 123,
                               verbose = 1, max_depth = 40)
    RF.fit(temp_train_tf,temp_train['Price_Log'])

    temp_val['Price_Log_Pred_RF'] = RF.predict(temp_val_tf)

    print('Completed for Fold - ',fold_num)
    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_RF'])))

    scored_folds.append(temp_val)
    fold_test_preds = RF.predict(test_tf)
    sub_data_preds = fold_test_preds if sub_data_preds is None else sub_data_preds + fold_test_preds

# Concatenate once instead of growing a DataFrame inside the loop.
CV_SCORED_DATA = pd.concat(scored_folds, ignore_index=True)

In [None]:
print("RF 01 CV RMSLE = ",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_RF'])))
print("CV 1-RMSLE = ",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_RF'])))

In [None]:
training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_RF']], on='id')
training_cv_predictions.head()

In [None]:
sub_data_preds = sub_data_preds / len(fold_list)
test['Price_Log_Pred_RF'] = sub_data_preds
test['Price_Log_Pred_RF'].describe()

In [None]:
training_cv_predictions.to_csv("C:\\Kaggle\\BooksPrice\\CV Scrd Trn Datasets\\20191025_Ensemble17_DS.csv", index=False)
test.to_csv("C:\\Kaggle\\BooksPrice\\CV Scrd Tst Datasets\\20191025_Ensemble17_DS.csv", index=False)

In [None]:
import xgboost as xgb

In [None]:
# ---- First-level model: XGBoost, scored out-of-fold ----
# For each fold: train on the other folds, early-stop on the held-out fold,
# store the held-out predictions and add this fold's test predictions to a
# running sum (averaged in a later cell).

# Booster parameters are identical for every fold, so define them once.
params1 = { 'seed': 111,
            'colsample_bytree': 0.8,
            'verbosity': 1,
            'subsample': 0.3,
            'learning_rate': 0.01,
            'objective': 'reg:squarederror',
            'max_depth': 3,
            'min_child_weight': 1,
            'booster': 'gbtree',
            'eval_metric': 'rmse' }

# The test matrix is the same for every fold, so build it once.
dtest_full = xgb.DMatrix(data = test[FeatureNames].values)

scored_folds = []       # held-out frames (with prediction column), one per fold
sub_data_preds = None   # running SUM of per-fold test predictions

for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # .copy(): a column is added below; writing to a filtered slice of `train`
    # is chained assignment (the warning is hidden by the global warnings filter).
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    temp_train_tf = temp_train[FeatureNames].values
    temp_val_tf = temp_val[FeatureNames].values

    dtrain = xgb.DMatrix(data = temp_train_tf, label = temp_train['Price_Log'])
    dval = xgb.DMatrix(data = temp_val_tf, label = temp_val['Price_Log'])

    # The held-out fold is listed last, so early stopping is driven by it.
    watchlist = [(dtrain, 'train'), (dval, 'eval')]

    # The round cap is effectively unlimited; early stopping decides the size.
    XGB = xgb.train(params = params1,
                    dtrain = dtrain,
                    num_boost_round = 1000000,
                    evals = watchlist,
                    verbose_eval = 50,
                    early_stopping_rounds = 300)

    # NOTE(review): `ntree_limit` / `best_ntree_limit` are deprecated in newer
    # xgboost releases -- confirm the pinned version before upgrading.
    temp_val['Price_Log_Pred_XGB'] = XGB.predict(xgb.DMatrix(data = temp_val_tf), ntree_limit = XGB.best_ntree_limit)

    print('Completed for Fold - ',fold_num)
    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_XGB'])))

    scored_folds.append(temp_val)
    fold_test_preds = XGB.predict(dtest_full, ntree_limit = XGB.best_ntree_limit)
    sub_data_preds = fold_test_preds if sub_data_preds is None else sub_data_preds + fold_test_preds

# Concatenate once instead of growing a DataFrame inside the loop.
CV_SCORED_DATA = pd.concat(scored_folds, ignore_index=True)

In [None]:
print("XGB 01 CV RMSLE = ",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_XGB'])))
print("CV 1-RMSLE = ",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_XGB'])))

In [None]:
training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_XGB']], on='id')
training_cv_predictions.head()

In [None]:
sub_data_preds = sub_data_preds / len(fold_list)
test['Price_Log_Pred_XGB'] = sub_data_preds
test['Price_Log_Pred_XGB'].describe()

In [None]:
training_cv_predictions.to_csv("C:\\Kaggle\\BooksPrice\\CV Scrd Trn Datasets\\20191025_Ensemble17_DS.csv", index=False)
test.to_csv("C:\\Kaggle\\BooksPrice\\CV Scrd Tst Datasets\\20191025_Ensemble17_DS.csv", index=False)

In [None]:
# Inputs of the second-level (stacking) models: every column after the first
# four (id, FOLD_NUM, Price, Price_Log) of the out-of-fold prediction frame.
# Selected positionally, so this must run before any *_ENS column is merged in.
feature_names_ensemble = training_cv_predictions.columns[4:].tolist()

In [None]:
import seaborn as sns

In [None]:
# Correlation between the first-level out-of-fold predictions and the log target.
corr_columns = feature_names_ensemble + ['Price_Log']
corr = training_cv_predictions[corr_columns].corr()

heatmap_palette = sns.diverging_palette(20, 220, n=200)
ax = sns.heatmap(corr, vmin=0, vmax=1, center=0.5, cmap=heatmap_palette, square=True)

# Slant the x tick labels so the long column names stay readable; the trailing
# semicolon suppresses the list-of-Text repr in the cell output.
x_tick_labels = ax.get_xticklabels()
ax.set_xticklabels(x_tick_labels, rotation=45, horizontalalignment='right');

In [None]:
# ---- Second-level (stacking) model: Ridge on the first-level predictions ----
# For each fold: fit on the out-of-fold predictions of the other folds, predict
# the held-out fold and add this fold's test predictions to a running sum
# (averaged in a later cell).
#
# NOTE(review): the model here is Ridge, yet its output column is named
# 'Price_Log_Pred_LGB_ENS'. The name is kept unchanged because later cells
# read this exact column; renaming requires updating those cells together.

# The test-side inputs are the same for every fold, so select them once.
ens_test_features = test[feature_names_ensemble]

scored_folds = []       # held-out frames (with prediction column), one per fold
sub_data_preds = None   # running SUM of per-fold test predictions

for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = training_cv_predictions[training_cv_predictions['FOLD_NUM'] != fold_num]
    # .copy(): a column is added below; writing to a filtered slice of
    # `training_cv_predictions` is chained assignment (the warning is hidden
    # by the global warnings filter).
    temp_val = training_cv_predictions[training_cv_predictions['FOLD_NUM'] == fold_num].copy()

    ridgereg_e = Ridge(alpha=0.002,normalize=True)
    ridgereg_e.fit(temp_train[feature_names_ensemble],temp_train['Price_Log'])

    temp_val['Price_Log_Pred_LGB_ENS'] = ridgereg_e.predict(temp_val[feature_names_ensemble])

    print("Fold RMSLE = ",sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_LGB_ENS'])))

    scored_folds.append(temp_val)
    fold_test_preds = ridgereg_e.predict(ens_test_features)
    sub_data_preds = fold_test_preds if sub_data_preds is None else sub_data_preds + fold_test_preds

# Concatenate once instead of growing a DataFrame inside the loop.
CV_SCORED_DATA = pd.concat(scored_folds, ignore_index=True)

In [None]:
print("LGB ENS CV RMSLE = ",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LGB_ENS'])))
print("CV 1-RMSLE = ",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LGB_ENS'])))

In [None]:
training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_LGB_ENS']], on='id')
training_cv_predictions.head()

In [None]:
sub_data_preds = sub_data_preds / len(fold_list)
test['Price_Log_Pred_LGB_ENS'] = sub_data_preds
test['Price_Log_Pred_LGB_ENS'].describe()

In [None]:
# ---- Second-level (stacking) model: LightGBM on the first-level predictions ----
# For each fold: train on the out-of-fold predictions of the other folds,
# early-stop on the held-out fold, store the held-out predictions and add this
# fold's test predictions to a running sum (averaged in a later cell).
#
# NOTE(review): the model here is LightGBM, yet its output column is named
# 'Price_Log_Pred_RIDGE_ENS'. The name is kept unchanged because later cells
# read this exact column; renaming requires updating those cells together.

# Parameters are identical for every fold, so define them once.
# NOTE: this rebinds the notebook-level `param` used by the first-level
# LightGBM cell; re-run that cell's parameter definition before re-running it.
param = {
    'learning_rate': 0.001,
    'max_depth': 3,
    'min_data_in_leaf': 1,
    'bagging_freq': 5,
    'bagging_fraction': 0.3,
    'feature_fraction': 0.3,
    'boost': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'seed': 392
}

# The test-side inputs are the same for every fold, so extract them once.
ens_test_tf = test[feature_names_ensemble].values

scored_folds = []       # held-out frames (with prediction column), one per fold
sub_data_preds = None   # running SUM of per-fold test predictions

for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = training_cv_predictions[training_cv_predictions['FOLD_NUM'] != fold_num]
    # .copy(): a column is added below; writing to a filtered slice of
    # `training_cv_predictions` is chained assignment (the warning is hidden
    # by the global warnings filter).
    temp_val = training_cv_predictions[training_cv_predictions['FOLD_NUM'] == fold_num].copy()

    trn_data = lgb.Dataset(temp_train[feature_names_ensemble], label=temp_train['Price_Log'])
    val_data = lgb.Dataset(temp_val[feature_names_ensemble], label=temp_val['Price_Log'])

    # The round cap is effectively unlimited; early stopping on the held-out
    # fold decides the actual number of trees.
    model = lgb.train(param, trn_data, 2000000, valid_sets = val_data, verbose_eval=50, early_stopping_rounds = 300)

    temp_val['Price_Log_Pred_RIDGE_ENS'] = model.predict(temp_val[feature_names_ensemble], num_iteration=model.best_iteration)

    print("Fold RMSLE = ",sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_RIDGE_ENS'])))

    scored_folds.append(temp_val)
    fold_test_preds = model.predict(ens_test_tf, num_iteration=model.best_iteration)
    sub_data_preds = fold_test_preds if sub_data_preds is None else sub_data_preds + fold_test_preds

# Concatenate once instead of growing a DataFrame inside the loop.
CV_SCORED_DATA = pd.concat(scored_folds, ignore_index=True)

In [None]:
print("LGB ENS CV RMSLE = ",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_RIDGE_ENS'])))
print("CV 1-RMSLE = ",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_RIDGE_ENS'])))

In [None]:
training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_RIDGE_ENS']], on='id')
training_cv_predictions.head()

In [None]:
sub_data_preds = sub_data_preds / len(fold_list)
test['Price_Log_Pred_RIDGE_ENS'] = sub_data_preds
test['Price_Log_Pred_RIDGE_ENS'].describe()

In [None]:
training_cv_predictions.to_csv("C:\\Kaggle\\BooksPrice\\CV Scrd Trn Datasets\\20191025_Ensemble17_DS.csv", index=False)
test.to_csv("C:\\Kaggle\\BooksPrice\\CV Scrd Tst Datasets\\20191025_Ensemble17_DS.csv", index=False)

In [None]:
try_ens = 0.25*training_cv_predictions['Price_Log_Pred_RIDGE_ENS'] + 0.75*training_cv_predictions['Price_Log_Pred_LGB_ENS']

print("ENS RMSLE = ",sqrt(mean_squared_error(training_cv_predictions['Price_Log'], try_ens)))
print("1-RMSLE = ",1-sqrt(mean_squared_error(training_cv_predictions['Price_Log'], try_ens)))

In [None]:
training_cv_predictions['Price_Log_Pred_Final_Ens'] = try_ens
test['Price_Log_Pred_Final_Ens'] = 0.25*test['Price_Log_Pred_RIDGE_ENS'] + 0.75*test['Price_Log_Pred_LGB_ENS']

training_cv_predictions.to_csv("C:\\Kaggle\\BooksPrice\\CV Scrd Trn Datasets\\20191025_Ensemble17_DS.csv", index=False)
test.to_csv("C:\\Kaggle\\BooksPrice\\CV Scrd Tst Datasets\\20191025_Ensemble17_DS.csv", index=False)

In [None]:
submission = pd.read_excel('C:/Kaggle/BooksPrice/Participants_Data/Sample_Submission.xlsx', encoding='ISO-8859-1')

In [None]:
sub_data_preds2 = (10**test['Price_Log_Pred_Final_Ens'].values) - 1
pd.DataFrame(sub_data_preds2).describe()

In [None]:
submission['Price'] = sub_data_preds2
submission.to_excel('C:\\Kaggle\\BooksPrice\\Submissions\\20191025_Ensemble17_DS.xlsx', index=False)