In [None]:
import warnings
warnings.filterwarnings('ignore')
import re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer, TfidfVectorizer
import lightgbm as lgb
from nltk.stem import WordNetLemmatizer
stemmer = WordNetLemmatizer()
from scipy import sparse
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
from sklearn.metrics import mean_squared_error
from math import sqrt
import os
import gc
import gensim
from collections import defaultdict
from string import punctuation

In [None]:
class TfidfEmbeddingVectorizer:
    """Embed documents as the IDF-weighted mean of their word vectors.

    Parameters
    ----------
    word2vec : mapping of str -> 1-D numpy array
        Pre-trained word vectors. The embedding width (``self.dim``) is taken
        from the first value in the mapping.

    Documents handed to ``fit`` / ``transform`` may be raw strings (tokenised
    with ``preproc``) or already-tokenised sequences of words (used as-is).
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = next(iter(word2vec.values())).size
        print('Self dim', self.dim)
        self.digit = re.compile(r'(\d+)')

    def preproc(self, text):
        """Split ``text`` on whitespace and return the cleaned word tokens.

        Tokens that contain a digit or that are bare punctuation are dropped;
        remaining tokens have every non-word character removed. Tokens that
        become empty after that clean-up are dropped as well.
        """
        tokens = []
        for raw in text.split():
            if raw.isspace() or self.digit.search(raw) or raw in punctuation:
                continue
            cleaned = re.sub(r'\W+', '', raw)
            if cleaned:
                tokens.append(cleaned)
        return tokens

    def _tokens(self, doc):
        """Return the word tokens of ``doc`` (tokenising it first if it is a string)."""
        # Iterating a raw string yields single characters, which is never what
        # the word-vector lookup wants, so strings are always tokenised first.
        return self.preproc(doc) if isinstance(doc, str) else doc

    def fit(self, X, y=None):
        """Learn one IDF weight per word from the documents in ``X``.

        ``y`` is accepted for API symmetry and ignored. Returns ``self``.
        """
        # A callable analyzer replaces sklearn's whole tokenisation pipeline
        # (``tokenizer`` and ``stop_words`` only apply when analyzer == 'word'),
        # so the tokeniser itself is passed as the analyzer.
        tfidf = TfidfVectorizer(analyzer=self._tokens, max_df=.95, min_df=2, binary=True)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return an array with one row per document in ``X``.

        Each row is the mean of ``vector * idf_weight`` over the document's
        words that exist in ``word2vec``; a document with no known word maps
        to a zero vector of length ``self.dim``.
        """
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in self._tokens(words)
                         if w not in punctuation and w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

    def fit_transform(self, X, y=None):
        """Fit the IDF weights on ``X`` and return its embedding matrix."""
        return self.fit(X, y).transform(X)

In [None]:
# Load the pre-trained vectors into {word: 1-D float array}.
# The fastText ``.vec`` text format starts with a "<vocab_size> <dim>" header
# line; it must not be stored as a word, otherwise the first dictionary value
# is a 1-element array and the embedding width is mis-detected downstream.
# NOTE(review): a genuine 1-dimensional vector file without a header would also
# have a two-field first line - not expected for this 300d file; confirm if reused.
w2v = {}
with open("C:/Kaggle/News/KerasFiles/wiki-news-300d-1M.vec", "r", encoding="utf8") as lines:
    for line_no, line in enumerate(lines):
        # Split on single spaces only, so tokens containing other Unicode
        # whitespace stay intact.
        fields = line.rstrip().split(' ')
        if line_no == 0 and len(fields) == 2:
            continue
        # np.float was only an alias of the builtin float (i.e. float64) and
        # no longer exists in current NumPy releases.
        w2v[fields[0]] = np.fromiter(map(float, fields[1:]), dtype=np.float64)

In [None]:
# Build the document embedder on top of the loaded word vectors; the
# constructor prints the detected embedding width ("Self dim ...").
vect = TfidfEmbeddingVectorizer(w2v)

In [None]:
def cleaning_text(review, remove_stopwords=False, Lem=False):
    """Normalise a (possibly HTML) text into lower-case, space-separated words.

    Parameters
    ----------
    review : str
        Raw text. ``None`` and NaN (how pandas represents a missing cell) are
        treated as empty text instead of raising.
    remove_stopwords : bool
        Drop NLTK English stop words when True.
    Lem : bool
        Lemmatise every word with the module-level WordNet lemmatiser
        (``stemmer``) when True.

    Returns
    -------
    str
        The cleaned words joined by single spaces.
    """
    if review is None or (isinstance(review, float) and review != review):
        review = ''

    review_text = BeautifulSoup(review, "html.parser").get_text()
    # Anything that is not an ASCII letter becomes a space; str.split() below
    # then collapses runs of whitespace, so no separate whitespace pass is needed.
    review_text = re.sub(r'[^a-zA-Z]', ' ', review_text)
    words = review_text.lower().split()

    if remove_stopwords:
        # The stop-word list is read once and cached on the function object;
        # this function is applied row by row, so rebuilding the set on every
        # call is wasted work.
        stops = getattr(cleaning_text, '_english_stops', None)
        if stops is None:
            stops = cleaning_text._english_stops = frozenset(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    if Lem:
        words = [stemmer.lemmatize(w) for w in words] # Lemmatization

    return ' '.join(words)

In [None]:
# All competition CSVs are read with the same single-byte encoding.
csv_encoding = 'ISO-8859-1'

train_orig = pd.read_csv('C:\\Kaggle\\BooksPrice\\Participants_Data\\Data_Train02.csv', encoding=csv_encoding)
test_orig = pd.read_csv('C:\\Kaggle\\BooksPrice\\Participants_Data\\Data_Test02.csv', encoding=csv_encoding)
FeatureNames = pd.read_csv('C:\\Kaggle\\BooksPrice\\Participants_Data\\FeatureNames02.csv', encoding=csv_encoding)

# Scored output of an earlier XGB run: only the id, its log-price prediction
# and (train side only) the fold assignment are kept.
train_other_models = pd.read_csv(
    'C:\\Kaggle\\BooksPrice\\CV Scrd Trn Datasets\\20190930_XGB01_DS.csv',
    encoding=csv_encoding,
)[['id', 'Price_Log_Pred', 'FOLD_NUM']]
test_other_models = pd.read_csv(
    'C:\\Kaggle\\BooksPrice\\CV Scrd Tst Datasets\\20190930_XGB01_DS.csv',
    encoding=csv_encoding,
)[['id', 'Price_Log_Pred']]

# Attach those columns to the raw rows by id (default inner join).
train = train_orig.merge(train_other_models, on='id')
test = test_orig.merge(test_other_models, on='id')

# Regression target: log10(Price + 1), plus a quick look at its distribution.
train['Price_Log'] = np.log10(train['Price'] + 1)
train.hist(column='Price_Log')

# From here on FeatureNames is the plain list of names held in column 'x'.
FeatureNames = FeatureNames['x'].values.tolist()

In [None]:
# Cleaned synopsis text: stop words removed, no lemmatisation.
clean_synopsis = lambda text: cleaning_text(text, True, False)
for frame in (train, test):
    frame['Synopsis2'] = frame['Synopsis'].apply(clean_synopsis)

In [None]:
%%time
trn_mean_emb = vect.fit_transform(train.Synopsis2)
tst_mean_emb = vect.transform(test.Synopsis2)
print("Train Shape : ",trn_mean_emb.shape)
print("Test Shape : ",tst_mean_emb.shape)

In [None]:
trn_mean_emb = pd.DataFrame(trn_mean_emb)
tst_mean_emb = pd.DataFrame(tst_mean_emb)
FeatureNames2 = list(trn_mean_emb.columns)
FeatureNames2

In [None]:
train = pd.concat([train,trn_mean_emb],axis=1)
test = pd.concat([test,tst_mean_emb],axis=1)
print("Train Shape : ",train.shape)
print("Test Shape : ",test.shape)

In [None]:
fold_list = list(train.FOLD_NUM.unique())
fold_list.sort()
fold_list

In [None]:
# LightGBM settings for the first-level model (passed to lgb.train).
# NOTE(review): bagging_freq=0 disables bagging in LightGBM, so
# bagging_fraction is presumably inert here - confirm this is intended.
param = dict(
    learning_rate=0.01,
    max_depth=10,
    min_data_in_leaf=3,
    bagging_freq=0,
    bagging_fraction=0.9,
    feature_fraction=0.2,
    boost='gbdt',
    objective='regression',
    metric='rmse',
    seed=392,
)

In [None]:
# Out-of-fold LightGBM run.
# For every fold: train on the other folds, early-stop on the held-out fold,
# store the held-out predictions, and add the test-set predictions to a
# running sum (a later cell divides that sum by the number of folds).
feature_cols = FeatureNames + FeatureNames2
test_matrix = test[feature_cols].values  # identical for every fold, so built once
scored_folds = []
sub_data_preds = 0
for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # Explicit copy: a prediction column is added below, and writing to a
    # filtered slice of `train` is a chained assignment (the resulting pandas
    # warning is only hidden by the global warnings filter).
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    temp_train_tf = temp_train[feature_cols].values
    temp_val_tf = temp_val[feature_cols].values

    trn_data = lgb.Dataset(temp_train_tf, label=temp_train['Price_Log'])
    val_data = lgb.Dataset(temp_val_tf, label=temp_val['Price_Log'])

    # The round count is only an upper bound; early stopping on the held-out
    # fold decides the real number of trees.
    # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments
    # of older LightGBM releases (callbacks replace them in 4.x) - confirm the
    # installed version before upgrading.
    model = lgb.train(param, trn_data, 2000000, valid_sets = val_data, verbose_eval=50, early_stopping_rounds = 300)

    temp_val['Price_Log_Pred_LGB'] = model.predict(temp_val_tf, num_iteration=model.best_iteration)

    print("Fold RMSLE = ",sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_LGB'])))

    scored_folds.append(temp_val)
    sub_data_preds = sub_data_preds + model.predict(test_matrix, num_iteration=model.best_iteration)

# One concatenation after the loop instead of re-concatenating on every fold.
CV_SCORED_DATA = pd.concat(scored_folds).reset_index(drop=True)

In [None]:
print("CV RMSLE = ",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LGB'])))
print("CV 1-RMSLE = ",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LGB'])))
# LB SCORE : 0.7791

In [None]:
training_cv_predictions = train[['id','FOLD_NUM','Price','Price_Log']]

In [None]:
training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_LGB']], on='id')
training_cv_predictions.head()

In [None]:
sub_data_preds = sub_data_preds / len(fold_list)
test['Price_Log_Pred_LGB'] = sub_data_preds
test['Price_Log_Pred_LGB'].describe()

In [None]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
def nn_model(input_shape):
    """Build the (uncompiled) feed-forward regression network.

    ``input_shape`` is the number of input features. The network is three
    ReLU Dense layers (200, 200, 100 units), each followed by Dropout
    (0.2, 0.2, 0.1) and BatchNormalization, then a single linear output unit.
    """
    hidden_stack = ((200, 0.2), (200, 0.2), (100, 0.1))

    net = Sequential()
    for position, (units, drop_rate) in enumerate(hidden_stack):
        if position == 0:
            # Only the first layer declares the input width.
            net.add(Dense(units, input_dim=input_shape, activation='relu'))
        else:
            net.add(Dense(units, activation='relu'))
        net.add(Dropout(drop_rate))
        net.add(BatchNormalization())
    net.add(Dense(1))
    return net

In [None]:
# Out-of-fold Keras run.
# For every fold: train a fresh network on the other folds, checkpoint the
# best epoch to disk, reload that checkpoint, store the held-out predictions
# and add the test-set predictions to a running sum (averaged in a later cell).
keras01_Models = []  # paths of the per-fold checkpoint files
batch_size = 64
model_weights_save_path = 'C:/Kaggle/BooksPrice/KerasModels/'
feature_cols = FeatureNames + FeatureNames2
test_matrix = test[feature_cols].values  # identical for every fold, so built once
scored_folds = []
sub_data_preds = 0
for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # Explicit copy: a prediction column is added below, and writing to a
    # filtered slice of `train` is a chained assignment.
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    temp_train_tf = temp_train[feature_cols].values
    temp_val_tf = temp_val[feature_cols].values

    model = nn_model(temp_train_tf.shape[1])
    model.compile(loss = 'mean_squared_error', optimizer='adam',metrics = ['mean_squared_error'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line.
    model.summary()

    file_name = "201901004_Keras06_Model_Weights_Fold_"+str(fold_num)+'.h5'
    final_path = model_weights_save_path+file_name
    print("Model Weights File Name : ",final_path)
    keras01_Models.append(final_path)

    # No `monitor` is given, so both callbacks use the Keras default quantity.
    es = EarlyStopping(mode='min',
                       verbose=1,
                       patience=10)

    checkpointer = ModelCheckpoint(filepath=final_path,
                                   mode='min',
                                   verbose=1,
                                   save_best_only=True)

    history = model.fit(temp_train_tf, temp_train['Price_Log'].values,
                        epochs = 500,
                        batch_size = batch_size,
                        verbose = 1,
                        shuffle = 'batch',
                        validation_data = (temp_val_tf, temp_val['Price_Log'].values),
                        callbacks = [es,checkpointer])

    # Score with the checkpointed (best) model, not the last-epoch weights.
    loaded_model = load_model(final_path)
    # The network ends in Dense(1), so predict() returns an (n, 1) array;
    # flatten it before storing it as a column / accumulating it.
    temp_val['Price_Log_Pred_Keras'] = loaded_model.predict(temp_val_tf).ravel()

    print('Completed for Fold - ',fold_num)
    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_Keras'])))

    scored_folds.append(temp_val)
    sub_data_preds = sub_data_preds + loaded_model.predict(test_matrix).ravel()

# One concatenation after the loop instead of re-concatenating on every fold.
CV_SCORED_DATA = pd.concat(scored_folds).reset_index(drop=True)

In [None]:
print(keras01_Models)
print("Keras 01 CV RMSLE = ",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_Keras'])))
print("CV 1-RMSLE = ",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_Keras'])))

In [None]:
training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_Keras']], on='id')
training_cv_predictions.head()

In [None]:
sub_data_preds = sub_data_preds / len(fold_list)
test['Price_Log_Pred_Keras'] = sub_data_preds
test['Price_Log_Pred_Keras'].describe()

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Out-of-fold Ridge run: fit on the other folds, predict the held-out fold,
# and sum the test-set predictions across folds (averaged in a later cell).
feature_cols = FeatureNames + FeatureNames2
test_matrix = test[feature_cols].values  # identical for every fold, so built once
scored_folds = []
sub_data_preds = 0
for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # Explicit copy: a prediction column is added below, and writing to a
    # filtered slice of `train` is a chained assignment.
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    temp_train_tf = temp_train[feature_cols].values
    temp_val_tf = temp_val[feature_cols].values

    # NOTE(review): `normalize=` only exists in older scikit-learn releases;
    # confirm the installed version before upgrading.
    ridgereg = Ridge(alpha=0.2,normalize=True)
    ridgereg.fit(temp_train_tf,temp_train['Price_Log'])

    temp_val['Price_Log_Pred_RIDGE'] = ridgereg.predict(temp_val_tf)

    print('Completed for Fold - ',fold_num)
    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_RIDGE'])))

    scored_folds.append(temp_val)
    sub_data_preds = sub_data_preds + ridgereg.predict(test_matrix)

# One concatenation after the loop instead of re-concatenating on every fold.
CV_SCORED_DATA = pd.concat(scored_folds).reset_index(drop=True)

# Pooled out-of-fold error across all folds.
print("RIDGE 01 CV RMSLE = ",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_RIDGE'])))
print("CV 1-RMSLE = ",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_RIDGE'])))

In [None]:
training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_RIDGE']], on='id')
training_cv_predictions.head()

In [None]:
sub_data_preds = sub_data_preds / len(fold_list)
test['Price_Log_Pred_RIDGE'] = sub_data_preds
test['Price_Log_Pred_RIDGE'].describe()

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Out-of-fold Lasso run: fit on the other folds, predict the held-out fold,
# and sum the test-set predictions across folds (averaged in a later cell).
feature_cols = FeatureNames + FeatureNames2
test_matrix = test[feature_cols].values  # identical for every fold, so built once
scored_folds = []
sub_data_preds = 0
for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # Explicit copy: a prediction column is added below, and writing to a
    # filtered slice of `train` is a chained assignment.
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    temp_train_tf = temp_train[feature_cols].values
    temp_val_tf = temp_val[feature_cols].values

    # max_iter is an iteration COUNT, so it is given as an int (1e6 is a float).
    # NOTE(review): `normalize=` only exists in older scikit-learn releases;
    # confirm the installed version before upgrading.
    lassoreg = Lasso(alpha=0.000025,normalize=True, max_iter=1000000)
    lassoreg.fit(temp_train_tf,temp_train['Price_Log'])

    temp_val['Price_Log_Pred_LASSO'] = lassoreg.predict(temp_val_tf)

    print('Completed for Fold - ',fold_num)
    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_LASSO'])))

    scored_folds.append(temp_val)
    sub_data_preds = sub_data_preds + lassoreg.predict(test_matrix)

# One concatenation after the loop instead of re-concatenating on every fold.
CV_SCORED_DATA = pd.concat(scored_folds).reset_index(drop=True)

# Pooled out-of-fold error across all folds.
print("LASSO 01 CV RMSLE = ",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LASSO'])))
print("CV 1-RMSLE = ",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LASSO'])))

In [None]:
training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_LASSO']], on='id')
training_cv_predictions.head()

In [None]:
sub_data_preds = sub_data_preds / len(fold_list)
test['Price_Log_Pred_LASSO'] = sub_data_preds
test['Price_Log_Pred_LASSO'].describe()

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Out-of-fold k-nearest-neighbours run: fit on the other folds, predict the
# held-out fold, and sum the test-set predictions across folds (averaged in a
# later cell).
feature_cols = FeatureNames + FeatureNames2
test_matrix = test[feature_cols].values  # identical for every fold, so built once
scored_folds = []
sub_data_preds = 0
for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # Explicit copy: a prediction column is added below, and writing to a
    # filtered slice of `train` is a chained assignment.
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    temp_train_tf = temp_train[feature_cols].values
    temp_val_tf = temp_val[feature_cols].values

    KNN = KNeighborsRegressor(n_neighbors = 12)
    KNN.fit(temp_train_tf,temp_train['Price_Log'])

    temp_val['Price_Log_Pred_KNN'] = KNN.predict(temp_val_tf)

    print('Completed for Fold - ',fold_num)
    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_KNN'])))

    scored_folds.append(temp_val)
    sub_data_preds = sub_data_preds + KNN.predict(test_matrix)

# One concatenation after the loop instead of re-concatenating on every fold.
CV_SCORED_DATA = pd.concat(scored_folds).reset_index(drop=True)

# Pooled out-of-fold error across all folds.
print("KNN 01 CV RMSLE = ",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_KNN'])))
print("CV 1-RMSLE = ",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_KNN'])))

In [None]:
training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_KNN']], on='id')
training_cv_predictions.head()

In [None]:
sub_data_preds = sub_data_preds / len(fold_list)
test['Price_Log_Pred_KNN'] = sub_data_preds
test['Price_Log_Pred_KNN'].describe()

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Out-of-fold random-forest run: fit on the other folds, predict the held-out
# fold, and sum the test-set predictions across folds (averaged in a later cell).
feature_cols = FeatureNames + FeatureNames2
# The forest is fitted on a plain ndarray, so the test set is scored as an
# ndarray too (as in every other CV loop); handing over a DataFrame whose
# column labels mix strings and integers is inconsistent with how the model
# was fitted.
test_matrix = test[feature_cols].values
scored_folds = []
sub_data_preds = 0
for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # Explicit copy: a prediction column is added below, and writing to a
    # filtered slice of `train` is a chained assignment.
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    temp_train_tf = temp_train[feature_cols].values
    temp_val_tf = temp_val[feature_cols].values

    RF = RandomForestRegressor(n_estimators = 100, min_samples_leaf = 3, max_features = 60, random_state = 412,
                               verbose = 0, max_depth = 40)
    RF.fit(temp_train_tf,temp_train['Price_Log'])

    temp_val['Price_Log_Pred_RF'] = RF.predict(temp_val_tf)

    print('Completed for Fold - ',fold_num)
    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_RF'])))

    scored_folds.append(temp_val)
    sub_data_preds = sub_data_preds + RF.predict(test_matrix)

# One concatenation after the loop instead of re-concatenating on every fold.
CV_SCORED_DATA = pd.concat(scored_folds).reset_index(drop=True)

In [None]:
print("RF 01 CV RMSLE = ",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_RF'])))
print("CV 1-RMSLE = ",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_RF'])))

In [None]:
training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_RF']], on='id')
training_cv_predictions.head()

In [None]:
sub_data_preds = sub_data_preds / len(fold_list)
test['Price_Log_Pred_RF'] = sub_data_preds
test['Price_Log_Pred_RF'].describe()

In [None]:
import xgboost as xgb

In [None]:
# Out-of-fold XGBoost run: train on the other folds with early stopping on the
# held-out fold, store the held-out predictions, and sum the test-set
# predictions across folds (averaged in a later cell).

# Booster settings do not change between folds, so they are defined once.
params1 = { 'seed': 501,
            'colsample_bytree': 0.2,
            'verbosity': 1,
            'subsample': 0.95,
            'learning_rate': 0.01,
            'objective': 'reg:squarederror',
            'max_depth': 10,
            'min_child_weight': 3,
            'booster': 'gbtree',
            'eval_metric': 'rmse' }

feature_cols = FeatureNames + FeatureNames2
# The test matrix is identical for every fold, so its DMatrix is built once.
dsubmit = xgb.DMatrix(data = test[feature_cols].values)
scored_folds = []
sub_data_preds = 0
for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # Explicit copy: a prediction column is added below, and writing to a
    # filtered slice of `train` is a chained assignment.
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    temp_train_tf = temp_train[feature_cols].values
    temp_val_tf = temp_val[feature_cols].values

    dtrain = xgb.DMatrix(data = temp_train_tf, label = temp_train['Price_Log'])
    dtest = xgb.DMatrix(data = temp_val_tf, label = temp_val['Price_Log'])

    # The held-out fold is listed last in the watchlist.
    watchlist = [(dtrain, 'train'), (dtest, 'eval')]

    XGB = xgb.train(params = params1,
                    dtrain = dtrain,
                    num_boost_round = 10000,
                    evals = watchlist,
                    verbose_eval = 20,
                    early_stopping_rounds = 200)

    # NOTE(review): ntree_limit / best_ntree_limit belong to older XGBoost
    # releases - confirm the installed version before upgrading.
    temp_val['Price_Log_Pred_XGB'] = XGB.predict(xgb.DMatrix(data = temp_val_tf), ntree_limit = XGB.best_ntree_limit)

    print('Completed for Fold - ',fold_num)
    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_XGB'])))

    scored_folds.append(temp_val)
    sub_data_preds = sub_data_preds + XGB.predict(dsubmit, ntree_limit = XGB.best_ntree_limit)

# One concatenation after the loop instead of re-concatenating on every fold.
CV_SCORED_DATA = pd.concat(scored_folds).reset_index(drop=True)

In [None]:
print("XGB 01 CV RMSLE = ",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_XGB'])))
print("CV 1-RMSLE = ",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_XGB'])))

In [None]:
training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_XGB']], on='id')
training_cv_predictions.head()

In [None]:
sub_data_preds = sub_data_preds / len(fold_list)
test['Price_Log_Pred_XGB'] = sub_data_preds
test['Price_Log_Pred_XGB'].describe()

In [None]:
training_cv_predictions.to_csv("C:\\Kaggle\\BooksPrice\\CV Scrd Trn Datasets\\20191007_Stack04_DS.csv", index=False)
test.to_csv("C:\\Kaggle\\BooksPrice\\CV Scrd Tst Datasets\\20191007_Stack04_DS.csv", index=False)

In [None]:
feature_names_ensemble = list(training_cv_predictions.columns[4:])
feature_names_ensemble

In [None]:
import seaborn as sns

In [None]:
# Pairwise correlation between the first-level predictions and the target.
corr = training_cv_predictions[[*feature_names_ensemble, 'Price_Log']].corr()

palette = sns.diverging_palette(20, 220, n=200)
ax = sns.heatmap(corr, vmin=0, vmax=1, center=0.5, cmap=palette, square=True)

# Slant the x tick labels so the long column names stay readable.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right');

In [None]:
# LightGBM settings for the second-level (stacking) model.
param_ensemble = dict(
    learning_rate=0.001,
    max_depth=3,
    min_data_in_leaf=3,
    bagging_freq=5,
    bagging_fraction=0.4,
    feature_fraction=0.5,
    boost='gbdt',
    objective='regression',
    metric='rmse',
    seed=1234,
)

In [None]:
# Second-level (stacking) LightGBM run over the first-level out-of-fold
# predictions, using the same folds: train on the other folds, early-stop on
# the held-out fold, and sum the test-set predictions across folds (averaged
# in a later cell).
test_features = test[feature_names_ensemble]  # identical for every fold, so selected once
scored_folds = []
sub_data_preds = 0
for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = training_cv_predictions[training_cv_predictions['FOLD_NUM'] != fold_num]
    # Explicit copy: a prediction column is added below, and writing to a
    # filtered slice of the stacking table is a chained assignment.
    temp_val = training_cv_predictions[training_cv_predictions['FOLD_NUM'] == fold_num].copy()

    trn_data = lgb.Dataset(temp_train[feature_names_ensemble], label=temp_train['Price_Log'])
    val_data = lgb.Dataset(temp_val[feature_names_ensemble], label=temp_val['Price_Log'])

    # The round count is only an upper bound; early stopping on the held-out
    # fold decides the real number of trees.
    # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments
    # of older LightGBM releases (callbacks replace them in 4.x) - confirm the
    # installed version before upgrading.
    model_e = lgb.train(param_ensemble,
                        trn_data,
                        20000000,
                        valid_sets = val_data,
                        verbose_eval=50,
                        early_stopping_rounds = 300)

    temp_val['Price_Log_Pred_LGB_ENS'] = model_e.predict(temp_val[feature_names_ensemble],
                                                         num_iteration=model_e.best_iteration)

    print("Fold RMSLE = ",sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_LGB_ENS'])))

    scored_folds.append(temp_val)
    sub_data_preds = sub_data_preds + model_e.predict(test_features,
                                                      num_iteration=model_e.best_iteration)

# One concatenation after the loop instead of re-concatenating on every fold.
CV_SCORED_DATA = pd.concat(scored_folds).reset_index(drop=True)

In [None]:
print("LGB ENS CV RMSLE = ",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LGB_ENS'])))
print("CV 1-RMSLE = ",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LGB_ENS'])))

In [None]:
training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_LGB_ENS']], on='id')
training_cv_predictions.head()

In [None]:
sub_data_preds = sub_data_preds / len(fold_list)
test['Price_Log_Pred_LGB_ENS'] = sub_data_preds
test['Price_Log_Pred_LGB_ENS'].describe()

In [None]:
training_cv_predictions.to_csv("C:\\Kaggle\\BooksPrice\\CV Scrd Trn Datasets\\20191007_Stack04_DS.csv", index=False)
test.to_csv("C:\\Kaggle\\BooksPrice\\CV Scrd Tst Datasets\\20191007_Stack04_DS.csv", index=False)

In [None]:
submission = pd.read_excel('C:/Kaggle/BooksPrice/Participants_Data/Sample_Submission.xlsx', encoding='ISO-8859-1')

sub_data_preds2 = (10**test['Price_Log_Pred_LGB_ENS'].values) - 1
pd.DataFrame(sub_data_preds2).describe()

submission['Price'] = sub_data_preds2
submission.to_excel('C:\\Kaggle\\BooksPrice\\Submissions\\20191007_Stack04_DS.xlsx', index=False)

In [None]:
pd.DataFrame(sub_data_preds2).describe()