In [None]:
import warnings
warnings.filterwarnings('ignore')
import re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer, TfidfVectorizer
import lightgbm as lgb
from nltk.stem import WordNetLemmatizer
stemmer = WordNetLemmatizer()
from scipy import sparse
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
from sklearn.metrics import mean_squared_error
from math import sqrt
from gensim.models import Word2Vec
import nltk
tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
import sys
import gensim
from gensim.models.doc2vec import LabeledSentence

In [None]:
# Root folder of the competition workspace. Every input read in this cell lives
# below it, so relocating the project only requires editing this one line.
DATA_DIR = 'C:\\Kaggle\\BooksPrice\\'
# Encoding shared by all CSV files read in this cell.
CSV_ENCODING = 'ISO-8859-1'

# Competition train/test tables and the table whose 'x' column lists feature names.
train_orig = pd.read_csv(DATA_DIR + 'Participants_Data\\Data_Train02.csv', encoding=CSV_ENCODING)
test_orig = pd.read_csv(DATA_DIR + 'Participants_Data\\Data_Test02.csv', encoding=CSV_ENCODING)
FeatureNames = pd.read_csv(DATA_DIR + 'Participants_Data\\FeatureNames02.csv', encoding=CSV_ENCODING)

# Scored datasets written by an earlier model run (file name: 20190930_XGB01_DS).
train_other_models = pd.read_csv(DATA_DIR + 'CV Scrd Trn Datasets\\20190930_XGB01_DS.csv', encoding=CSV_ENCODING)
test_other_models = pd.read_csv(DATA_DIR + 'CV Scrd Tst Datasets\\20190930_XGB01_DS.csv', encoding=CSV_ENCODING)

# Keep only the join key, that model's prediction and (train only) the fold assignment.
train_other_models = train_other_models[['id','Price_Log_Pred','FOLD_NUM']]
test_other_models = test_other_models[['id','Price_Log_Pred']]

# Default (inner) join on 'id': rows without a match on either side are dropped.
train = pd.merge(train_orig, train_other_models, on='id')
test = pd.merge(test_orig, test_other_models, on='id')

# Regression target: log10(Price + 1). The submission cell inverts it with 10**x - 1.
train['Price_Log'] = np.log10(train['Price']+1)
train.hist(column='Price_Log')

# From here on FeatureNames is a plain list of column names (taken from column 'x').
FeatureNames = FeatureNames['x'].values.tolist()

In [None]:
def _english_stopwords():
    """Return the NLTK English stop-word set, reading the corpus only on first use."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def cleaning_text(review, remove_stopwords=False, Lem=False):
    """Normalise a piece of (possibly HTML) text into space-separated lowercase words.

    Steps: strip markup with BeautifulSoup, replace every character outside
    a-z/A-Z with a space, lowercase, split on whitespace, then optionally drop
    English stop words and lemmatise each word with the module-level ``stemmer``.

    Parameters
    ----------
    review : str
        Raw text. ``None`` and float NaN (a missing cell) yield ``''`` instead
        of raising inside BeautifulSoup.
    remove_stopwords : bool, default False
        Drop words found in the NLTK English stop-word list.
    Lem : bool, default False
        Lemmatise every remaining word.

    Returns
    -------
    str
        The cleaned words joined by single spaces (``''`` if nothing is left).
    """
    # Missing values arrive as None or NaN; NaN is the only float unequal to itself.
    if review is None or (isinstance(review, float) and review != review):
        return ''

    review_text = BeautifulSoup(review, "html.parser").get_text()
    # Raw string avoids invalid-escape warnings. No separate whitespace-collapsing
    # pass is needed because str.split() already treats runs of blanks as one separator.
    review_text = re.sub(r'[^a-zA-Z]', ' ', review_text)
    words = review_text.lower().split()
    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]
    if Lem:
        words = [stemmer.lemmatize(w) for w in words]  # Lemmatization
    return ' '.join(words)

In [None]:
# Clean both free-text columns of both tables, with stop-word removal and
# lemmatisation switched on, and store each result beside the raw text as "<column>2".
for frame in (train, test):
    for raw_col in ('Synopsis', 'Title'):
        frame[raw_col + '2'] = frame[raw_col].apply(cleaning_text, args=(True, True))

In [None]:
def labelize_text(text,label):
    """Wrap every entry of ``text`` in a ``LabeledSentence``.

    Each entry is split on whitespace to form the word list, and is tagged with
    a single label of the form ``"<label>_<index value>"``, where the index
    value is taken from ``text.index``.

    Returns the wrapped documents as a list, in the order of ``text``.
    """
    return [
        LabeledSentence(doc.split(), [label + '_%s' % idx])
        for idx, doc in zip(text.index, text)
    ]

In [None]:
# Corpora used to fit the embedding models: train and test text stacked together.
# ignore_index=True renumbers the rows 0..n-1; without it the original train and
# test index values are kept, and labelize_text (which builds tags from the index)
# would emit the same 'ALL_<i>' tag for a train document and a test document.
all_x1 = pd.concat([train.Synopsis2,test.Synopsis2], ignore_index=True)
all_x2 = pd.concat([train.Title2,test.Title2], ignore_index=True)

all_x1_w2v = labelize_text(all_x1, 'ALL')
all_x2_w2v = labelize_text(all_x2, 'ALL')

# Per-split document lists; later cells only read their .words when building features.
x_train1 = labelize_text(train.Synopsis2, 'TRAIN')
x_validation1 = labelize_text(test.Synopsis2, 'TEST')

x_train2 = labelize_text(train.Title2, 'TRAIN')
x_validation2 = labelize_text(test.Title2, 'TEST')

In [None]:
from gensim.models.word2vec import Word2Vec
from tqdm import tqdm
from sklearn import utils
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Doc2Vec
import multiprocessing

In [None]:
%time
cores = multiprocessing.cpu_count()

model_dbow1 = Doc2Vec(dm=0, size=300, negative=5, min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)
model_dbow1.build_vocab([x for x in tqdm(all_x1_w2v)])
model_dbow1.train(utils.shuffle([x for x in tqdm(all_x1_w2v)]), total_examples=len(all_x1_w2v), epochs=1)

model_dbow2 = Doc2Vec(dm=0, size=300, negative=5, min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)
model_dbow2.build_vocab([x for x in tqdm(all_x2_w2v)])
model_dbow2.train(utils.shuffle([x for x in tqdm(all_x2_w2v)]), total_examples=len(all_x2_w2v), epochs=1)

In [None]:
%time
vectorizer1 = TfidfVectorizer(analyzer=lambda x: x, min_df=1)
matrix1 = vectorizer1.fit_transform([x.words for x in all_x1_w2v])
tfidf1 = dict(zip(vectorizer1.get_feature_names(), vectorizer1.idf_))

vectorizer2 = TfidfVectorizer(analyzer=lambda x: x, min_df=1)
matrix2 = vectorizer2.fit_transform([x.words for x in all_x2_w2v])
tfidf2 = dict(zip(vectorizer2.get_feature_names(), vectorizer2.idf_))

In [None]:
def build_doc_Vector1(tokens, size, model=None, tfidf=None):
    """Combine the word vectors of ``tokens`` into one IDF-weighted row vector.

    For each token present in both ``model`` and ``tfidf`` the vector
    ``model[token]`` is multiplied by ``tfidf[token]`` and accumulated. The sum
    is divided by the number of contributing tokens (not by the sum of weights).

    Parameters
    ----------
    tokens : iterable of str
        Words of one document.
    size : int
        Dimensionality of the word vectors.
    model : optional
        Object supporting ``model[word]``; defaults to the global ``model_dbow1``.
    tfidf : mapping, optional
        word -> weight; defaults to the global ``tfidf1``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, size)``; all zeros when no token contributes.
    """
    # Globals are resolved lazily so explicit arguments work without them existing.
    if model is None:
        model = model_dbow1
    if tfidf is None:
        tfidf = tfidf1

    vec = np.zeros((1, size))
    count = 0
    for word in tokens:
        try:
            weighted = model[word].reshape((1, size)) * tfidf[word]
        except KeyError:
            # Word unknown to the embedding model or to the IDF table: skip it.
            continue
        vec += weighted
        count += 1
    if count:
        vec /= count
    return vec

def build_doc_Vector2(tokens, size, model=None, tfidf=None):
    """Combine the word vectors of ``tokens`` into one IDF-weighted row vector.

    For each token present in both ``model`` and ``tfidf`` the vector
    ``model[token]`` is multiplied by ``tfidf[token]`` and accumulated. The sum
    is divided by the number of contributing tokens (not by the sum of weights).

    Parameters
    ----------
    tokens : iterable of str
        Words of one document.
    size : int
        Dimensionality of the word vectors.
    model : optional
        Object supporting ``model[word]``; defaults to the global ``model_dbow2``.
    tfidf : mapping, optional
        word -> weight; defaults to the global ``tfidf2``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, size)``; all zeros when no token contributes.
    """
    # Globals are resolved lazily so explicit arguments work without them existing.
    if model is None:
        model = model_dbow2
    if tfidf is None:
        tfidf = tfidf2

    vec = np.zeros((1, size))
    count = 0
    for word in tokens:
        try:
            weighted = model[word].reshape((1, size)) * tfidf[word]
        except KeyError:
            # Word unknown to the embedding model or to the IDF table: skip it.
            continue
        vec += weighted
        count += 1
    if count:
        vec /= count
    return vec

In [None]:
from sklearn.preprocessing import StandardScaler, scale

# One 300-d row per document, built from the DBOW models' word vectors.
# Standardisation is fitted on the TRAIN rows only and then applied unchanged to
# the TEST rows. Scaling each split with its own mean/std (what a separate
# scale() call per split does) puts train and test features on different scales.
vecs1_trn_dbow = np.concatenate([build_doc_Vector1(z, 300) for z in tqdm(map(lambda x: x.words, x_train1))])
vecs1_tst_dbow = np.concatenate([build_doc_Vector1(z, 300) for z in tqdm(map(lambda x: x.words, x_validation1))])
scaler1_dbow = StandardScaler().fit(vecs1_trn_dbow)
vecs1_trn_dbow = scaler1_dbow.transform(vecs1_trn_dbow)
vecs1_tst_dbow = scaler1_dbow.transform(vecs1_tst_dbow)

vecs2_trn_dbow = np.concatenate([build_doc_Vector2(z, 300) for z in tqdm(map(lambda x: x.words, x_train2))])
vecs2_tst_dbow = np.concatenate([build_doc_Vector2(z, 300) for z in tqdm(map(lambda x: x.words, x_validation2))])
scaler2_dbow = StandardScaler().fit(vecs2_trn_dbow)
vecs2_trn_dbow = scaler2_dbow.transform(vecs2_trn_dbow)
vecs2_tst_dbow = scaler2_dbow.transform(vecs2_tst_dbow)

In [None]:
# Column labels Synopsis_dbow_1..300 and Title_dbow_1..300 (1-based numbering).
synopsis_dbow_cols = ['Synopsis_dbow_' + str(k) for k in range(1, 301)]
title_dbow_cols = ['Title_dbow_' + str(k) for k in range(1, 301)]

# Wrap the raw matrices in labelled DataFrames (the names now refer to DataFrames).
vecs1_trn_dbow = pd.DataFrame(vecs1_trn_dbow, columns=synopsis_dbow_cols)
vecs1_tst_dbow = pd.DataFrame(vecs1_tst_dbow, columns=synopsis_dbow_cols)

vecs2_trn_dbow = pd.DataFrame(vecs2_trn_dbow, columns=title_dbow_cols)
vecs2_tst_dbow = pd.DataFrame(vecs2_tst_dbow, columns=title_dbow_cols)

In [None]:
%time
cores = multiprocessing.cpu_count()

model_dmc1 = Doc2Vec(dm=1, dm_concat=1, size=300, window=2, negative=5, min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)
model_dmc1.build_vocab([x for x in tqdm(all_x1_w2v)])
model_dmc1.train(utils.shuffle([x for x in tqdm(all_x1_w2v)]), total_examples=len(all_x1_w2v), epochs=1)

model_dmc2 = Doc2Vec(dm=1, dm_concat=1, size=300, window=2, negative=5, min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)
model_dmc2.build_vocab([x for x in tqdm(all_x2_w2v)])
model_dmc2.train(utils.shuffle([x for x in tqdm(all_x2_w2v)]), total_examples=len(all_x2_w2v), epochs=1)

In [None]:
def build_doc_Vector1(tokens, size, model=None, tfidf=None):
    """Combine the word vectors of ``tokens`` into one IDF-weighted row vector.

    This definition replaces the earlier ``build_doc_Vector1``; by default it
    reads word vectors from the global ``model_dmc1``.

    For each token present in both ``model`` and ``tfidf`` the vector
    ``model[token]`` is multiplied by ``tfidf[token]`` and accumulated. The sum
    is divided by the number of contributing tokens (not by the sum of weights).

    Parameters
    ----------
    tokens : iterable of str
        Words of one document.
    size : int
        Dimensionality of the word vectors.
    model : optional
        Object supporting ``model[word]``; defaults to the global ``model_dmc1``.
    tfidf : mapping, optional
        word -> weight; defaults to the global ``tfidf1``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, size)``; all zeros when no token contributes.
    """
    # Globals are resolved lazily so explicit arguments work without them existing.
    if model is None:
        model = model_dmc1
    if tfidf is None:
        tfidf = tfidf1

    vec = np.zeros((1, size))
    count = 0
    for word in tokens:
        try:
            weighted = model[word].reshape((1, size)) * tfidf[word]
        except KeyError:
            # Word unknown to the embedding model or to the IDF table: skip it.
            continue
        vec += weighted
        count += 1
    if count:
        vec /= count
    return vec

def build_doc_Vector2(tokens, size, model=None, tfidf=None):
    """Combine the word vectors of ``tokens`` into one IDF-weighted row vector.

    This definition replaces the earlier ``build_doc_Vector2``; by default it
    reads word vectors from the global ``model_dmc2``.

    For each token present in both ``model`` and ``tfidf`` the vector
    ``model[token]`` is multiplied by ``tfidf[token]`` and accumulated. The sum
    is divided by the number of contributing tokens (not by the sum of weights).

    Parameters
    ----------
    tokens : iterable of str
        Words of one document.
    size : int
        Dimensionality of the word vectors.
    model : optional
        Object supporting ``model[word]``; defaults to the global ``model_dmc2``.
    tfidf : mapping, optional
        word -> weight; defaults to the global ``tfidf2``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, size)``; all zeros when no token contributes.
    """
    # Globals are resolved lazily so explicit arguments work without them existing.
    if model is None:
        model = model_dmc2
    if tfidf is None:
        tfidf = tfidf2

    vec = np.zeros((1, size))
    count = 0
    for word in tokens:
        try:
            weighted = model[word].reshape((1, size)) * tfidf[word]
        except KeyError:
            # Word unknown to the embedding model or to the IDF table: skip it.
            continue
        vec += weighted
        count += 1
    if count:
        vec /= count
    return vec

In [None]:
from sklearn.preprocessing import StandardScaler

# One 300-d row per document, built from the DMC models' word vectors.
# Standardisation is fitted on the TRAIN rows only and then applied unchanged to
# the TEST rows. Scaling each split with its own mean/std (what a separate
# scale() call per split does) puts train and test features on different scales.
vecs1_trn_dmc = np.concatenate([build_doc_Vector1(z, 300) for z in tqdm(map(lambda x: x.words, x_train1))])
vecs1_tst_dmc = np.concatenate([build_doc_Vector1(z, 300) for z in tqdm(map(lambda x: x.words, x_validation1))])
scaler1_dmc = StandardScaler().fit(vecs1_trn_dmc)
vecs1_trn_dmc = scaler1_dmc.transform(vecs1_trn_dmc)
vecs1_tst_dmc = scaler1_dmc.transform(vecs1_tst_dmc)

vecs2_trn_dmc = np.concatenate([build_doc_Vector2(z, 300) for z in tqdm(map(lambda x: x.words, x_train2))])
vecs2_tst_dmc = np.concatenate([build_doc_Vector2(z, 300) for z in tqdm(map(lambda x: x.words, x_validation2))])
scaler2_dmc = StandardScaler().fit(vecs2_trn_dmc)
vecs2_trn_dmc = scaler2_dmc.transform(vecs2_trn_dmc)
vecs2_tst_dmc = scaler2_dmc.transform(vecs2_tst_dmc)

In [None]:
# Column labels Synopsis_dmc_1..300 and Title_dmc_1..300 (1-based numbering).
synopsis_dmc_cols = ['Synopsis_dmc_' + str(k) for k in range(1, 301)]
title_dmc_cols = ['Title_dmc_' + str(k) for k in range(1, 301)]

# Wrap the raw matrices in labelled DataFrames (the names now refer to DataFrames).
vecs1_trn_dmc = pd.DataFrame(vecs1_trn_dmc, columns=synopsis_dmc_cols)
vecs1_tst_dmc = pd.DataFrame(vecs1_tst_dmc, columns=synopsis_dmc_cols)

vecs2_trn_dmc = pd.DataFrame(vecs2_trn_dmc, columns=title_dmc_cols)
vecs2_tst_dmc = pd.DataFrame(vecs2_tst_dmc, columns=title_dmc_cols)

In [None]:
cores = multiprocessing.cpu_count()

# Shared settings for both "dmm" models: dm=1 with dm_mean=1 and window=4.
# alpha and min_alpha are passed the same value; training is a single epoch.
dmm_settings = dict(dm=1, dm_mean=1, size=300, window=4, negative=5, min_count=1,
                    workers=cores, alpha=0.065, min_alpha=0.065)

# Synopsis corpus.
model_dmm1 = Doc2Vec(**dmm_settings)
model_dmm1.build_vocab(list(tqdm(all_x1_w2v)))
model_dmm1.train(utils.shuffle(list(tqdm(all_x1_w2v))), total_examples=len(all_x1_w2v), epochs=1)

# Title corpus.
model_dmm2 = Doc2Vec(**dmm_settings)
model_dmm2.build_vocab(list(tqdm(all_x2_w2v)))
model_dmm2.train(utils.shuffle(list(tqdm(all_x2_w2v))), total_examples=len(all_x2_w2v), epochs=1)

In [None]:
def build_doc_Vector1(tokens, size, model=None, tfidf=None):
    """Combine the word vectors of ``tokens`` into one IDF-weighted row vector.

    This definition replaces the earlier ``build_doc_Vector1``; by default it
    reads word vectors from the global ``model_dmm1``.

    For each token present in both ``model`` and ``tfidf`` the vector
    ``model[token]`` is multiplied by ``tfidf[token]`` and accumulated. The sum
    is divided by the number of contributing tokens (not by the sum of weights).

    Parameters
    ----------
    tokens : iterable of str
        Words of one document.
    size : int
        Dimensionality of the word vectors.
    model : optional
        Object supporting ``model[word]``; defaults to the global ``model_dmm1``.
    tfidf : mapping, optional
        word -> weight; defaults to the global ``tfidf1``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, size)``; all zeros when no token contributes.
    """
    # Globals are resolved lazily so explicit arguments work without them existing.
    if model is None:
        model = model_dmm1
    if tfidf is None:
        tfidf = tfidf1

    vec = np.zeros((1, size))
    count = 0
    for word in tokens:
        try:
            weighted = model[word].reshape((1, size)) * tfidf[word]
        except KeyError:
            # Word unknown to the embedding model or to the IDF table: skip it.
            continue
        vec += weighted
        count += 1
    if count:
        vec /= count
    return vec

def build_doc_Vector2(tokens, size, model=None, tfidf=None):
    """Combine the word vectors of ``tokens`` into one IDF-weighted row vector.

    This definition replaces the earlier ``build_doc_Vector2``; by default it
    reads word vectors from the global ``model_dmm2``.

    For each token present in both ``model`` and ``tfidf`` the vector
    ``model[token]`` is multiplied by ``tfidf[token]`` and accumulated. The sum
    is divided by the number of contributing tokens (not by the sum of weights).

    Parameters
    ----------
    tokens : iterable of str
        Words of one document.
    size : int
        Dimensionality of the word vectors.
    model : optional
        Object supporting ``model[word]``; defaults to the global ``model_dmm2``.
    tfidf : mapping, optional
        word -> weight; defaults to the global ``tfidf2``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, size)``; all zeros when no token contributes.
    """
    # Globals are resolved lazily so explicit arguments work without them existing.
    if model is None:
        model = model_dmm2
    if tfidf is None:
        tfidf = tfidf2

    vec = np.zeros((1, size))
    count = 0
    for word in tokens:
        try:
            weighted = model[word].reshape((1, size)) * tfidf[word]
        except KeyError:
            # Word unknown to the embedding model or to the IDF table: skip it.
            continue
        vec += weighted
        count += 1
    if count:
        vec /= count
    return vec

In [None]:
from sklearn.preprocessing import StandardScaler

# One 300-d row per document, built from the DMM models' word vectors.
# Standardisation is fitted on the TRAIN rows only and then applied unchanged to
# the TEST rows. Scaling each split with its own mean/std (what a separate
# scale() call per split does) puts train and test features on different scales.
vecs1_trn_dmm = np.concatenate([build_doc_Vector1(z, 300) for z in tqdm(map(lambda x: x.words, x_train1))])
vecs1_tst_dmm = np.concatenate([build_doc_Vector1(z, 300) for z in tqdm(map(lambda x: x.words, x_validation1))])
scaler1_dmm = StandardScaler().fit(vecs1_trn_dmm)
vecs1_trn_dmm = scaler1_dmm.transform(vecs1_trn_dmm)
vecs1_tst_dmm = scaler1_dmm.transform(vecs1_tst_dmm)

vecs2_trn_dmm = np.concatenate([build_doc_Vector2(z, 300) for z in tqdm(map(lambda x: x.words, x_train2))])
vecs2_tst_dmm = np.concatenate([build_doc_Vector2(z, 300) for z in tqdm(map(lambda x: x.words, x_validation2))])
scaler2_dmm = StandardScaler().fit(vecs2_trn_dmm)
vecs2_trn_dmm = scaler2_dmm.transform(vecs2_trn_dmm)
vecs2_tst_dmm = scaler2_dmm.transform(vecs2_tst_dmm)

In [None]:
# Column labels Synopsis_dmm_1..300 and Title_dmm_1..300 (1-based numbering).
synopsis_dmm_cols = ['Synopsis_dmm_' + str(k) for k in range(1, 301)]
title_dmm_cols = ['Title_dmm_' + str(k) for k in range(1, 301)]

# Wrap the raw matrices in labelled DataFrames (the names now refer to DataFrames).
vecs1_trn_dmm = pd.DataFrame(vecs1_trn_dmm, columns=synopsis_dmm_cols)
vecs1_tst_dmm = pd.DataFrame(vecs1_tst_dmm, columns=synopsis_dmm_cols)

vecs2_trn_dmm = pd.DataFrame(vecs2_trn_dmm, columns=title_dmm_cols)
vecs2_tst_dmm = pd.DataFrame(vecs2_tst_dmm, columns=title_dmm_cols)

In [None]:
# Embedding blocks in the order they are appended: dbow, dmc, dmm (synopsis, then title).
train_vec_blocks = [vecs1_trn_dbow, vecs2_trn_dbow, vecs1_trn_dmc, vecs2_trn_dmc, vecs1_trn_dmm, vecs2_trn_dmm]
test_vec_blocks = [vecs1_tst_dbow, vecs2_tst_dbow, vecs1_tst_dmc, vecs2_tst_dmc, vecs1_tst_dmm, vecs2_tst_dmm]

# Column-wise concat aligns on the row index. Running this cell a second time
# would append the same columns again.
train = pd.concat([train] + train_vec_blocks, axis=1)
test = pd.concat([test] + test_vec_blocks, axis=1)
print("Train Shape : ", train.shape)
print("Test Shape : ", test.shape)

In [None]:
# Names of every embedding column, in the same block order used when they were
# attached to train/test (dbow, dmc, dmm; synopsis before title in each pair).
FeatureNames2 = [
    col
    for block in (vecs1_trn_dbow, vecs2_trn_dbow, vecs1_trn_dmc, vecs2_trn_dmc, vecs1_trn_dmm, vecs2_trn_dmm)
    for col in block.columns
]
FeatureNames2

In [None]:
# Distinct fold identifiers found in train, in ascending order.
fold_list = sorted(train.FOLD_NUM.unique())
fold_list

In [None]:
# Column selection, the test matrix and the hyper-parameters are identical for
# every fold, so they are built once instead of once per iteration.
feature_cols = FeatureNames + FeatureNames2
test_tf = test[feature_cols].values

param = {
    'learning_rate': 0.01,
    'max_depth': 14,
    'min_data_in_leaf': 3,
    # NOTE(review): bagging_freq is 0 while bagging_fraction is set; check the
    # LightGBM parameter docs on whether bagging_fraction has any effect then.
    'bagging_freq': 0,
    'bagging_fraction': 0.95,
    'feature_fraction': 0.2,
    'boost': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'seed': 392
}

fold_frames = []                        # scored validation fold of each iteration
sub_data_preds = np.zeros(len(test))    # running SUM of the per-fold test predictions
for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # Explicit copy: a prediction column is added below, and writing into a
    # filtered slice triggers pandas' SettingWithCopy warning.
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    temp_train_tf = temp_train[feature_cols].values
    temp_val_tf = temp_val[feature_cols].values

    trn_data = lgb.Dataset(temp_train_tf, label=temp_train['Price_Log'])
    val_data = lgb.Dataset(temp_val_tf, label=temp_val['Price_Log'])

    # 2000000 is only an upper bound on boosting rounds: training stops after
    # 300 rounds without improvement on the held-out fold.
    model = lgb.train(param, trn_data, 2000000, valid_sets=val_data, verbose_eval=50, early_stopping_rounds=300)

    temp_val['Price_Log_Pred_LGB'] = model.predict(temp_val_tf, num_iteration=model.best_iteration)

    print("Fold RMSLE = ", sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_LGB'])))

    fold_frames.append(temp_val)
    sub_data_preds = sub_data_preds + model.predict(test_tf, num_iteration=model.best_iteration)

# One concat at the end instead of growing the frame inside the loop.
# sub_data_preds is left as a sum; a later cell divides it by len(fold_list).
CV_SCORED_DATA = pd.concat(fold_frames).reset_index(drop=True)

In [None]:
# Root-mean-squared error between target and LightGBM predictions over all
# validation folds (both columns are on the Price_Log scale).
cv_rmsle = sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LGB']))
print("CV RMSLE = ", cv_rmsle)
print("CV 1-RMSLE = ", 1 - cv_rmsle)

In [None]:
# Skeleton table that the per-model validation predictions are merged onto:
# row id, fold number, raw price and log-price target.
training_cv_predictions = train.loc[:, ['id','FOLD_NUM','Price','Price_Log']]

In [None]:
# Attach the LightGBM validation-fold predictions to the stacking table by id.
lgb_fold_preds = CV_SCORED_DATA[['id','Price_Log_Pred_LGB']]
training_cv_predictions = training_cv_predictions.merge(lgb_fold_preds, on='id')
print(training_cv_predictions.head())

# Turn the summed per-fold test predictions into their mean. This cell is not
# re-runnable: a second run would divide the already-averaged values again.
fold_count = len(fold_list)
sub_data_preds = sub_data_preds / fold_count
test['Price_Log_Pred_LGB'] = sub_data_preds
test['Price_Log_Pred_LGB'].describe()

In [None]:
# Number of missing values per model feature in train; show only affected columns.
chk1 = train[FeatureNames+FeatureNames2].isnull().sum()
chk1[chk1 > 0]

In [None]:
# Number of missing values per model feature in test; show only affected columns.
chk1 = test[FeatureNames+FeatureNames2].isnull().sum()
chk1[chk1 > 0]

In [None]:
from sklearn.linear_model import Ridge

# Column selection and the test matrix are identical for every fold: build them once.
feature_cols = FeatureNames + FeatureNames2
test_tf = test[feature_cols].values

fold_frames = []                        # scored validation fold of each iteration
sub_data_preds = np.zeros(len(test))    # running SUM of the per-fold test predictions
for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # Explicit copy: a prediction column is added below, and writing into a
    # filtered slice triggers pandas' SettingWithCopy warning.
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    temp_train_tf = temp_train[feature_cols].values
    temp_val_tf = temp_val[feature_cols].values

    # NOTE(review): `normalize=` may not be accepted by newer scikit-learn
    # releases — confirm against the version this notebook is pinned to.
    ridgereg = Ridge(alpha=0.45,normalize=True)
    ridgereg.fit(temp_train_tf,temp_train['Price_Log'])

    temp_val['Price_Log_Pred_RIDGE'] = ridgereg.predict(temp_val_tf)

    print('Completed for Fold - ',fold_num)
    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_RIDGE'])))

    fold_frames.append(temp_val)
    sub_data_preds = sub_data_preds + ridgereg.predict(test_tf)

# One concat at the end instead of growing the frame inside the loop.
# sub_data_preds is left as a sum; a later cell divides it by len(fold_list).
CV_SCORED_DATA = pd.concat(fold_frames).reset_index(drop=True)

print("RIDGE 01 CV RMSLE = ",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_RIDGE'])))
print("CV 1-RMSLE = ",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_RIDGE'])))

In [None]:
# Attach the Ridge validation-fold predictions to the stacking table by id.
ridge_fold_preds = CV_SCORED_DATA[['id','Price_Log_Pred_RIDGE']]
training_cv_predictions = training_cv_predictions.merge(ridge_fold_preds, on='id')
training_cv_predictions.head()

In [None]:
# Mean of the per-fold Ridge test predictions (the loop accumulated their sum).
# Not re-runnable: a second run would divide the averaged values again.
fold_count = len(fold_list)
sub_data_preds = sub_data_preds / fold_count
test['Price_Log_Pred_RIDGE'] = sub_data_preds
test['Price_Log_Pred_RIDGE'].describe()

In [None]:
from sklearn.linear_model import Lasso

# Column selection and the test matrix are identical for every fold: build them once.
feature_cols = FeatureNames + FeatureNames2
test_tf = test[feature_cols].values

fold_frames = []                        # scored validation fold of each iteration
sub_data_preds = np.zeros(len(test))    # running SUM of the per-fold test predictions
for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # Explicit copy: a prediction column is added below, and writing into a
    # filtered slice triggers pandas' SettingWithCopy warning.
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    temp_train_tf = temp_train[feature_cols].values
    temp_val_tf = temp_val[feature_cols].values

    # max_iter is an iteration count, so it is given as an int (1e6 is a float).
    # NOTE(review): `normalize=` may not be accepted by newer scikit-learn
    # releases — confirm against the version this notebook is pinned to.
    lassoreg = Lasso(alpha=0.00001,normalize=True, max_iter=1000000)
    lassoreg.fit(temp_train_tf,temp_train['Price_Log'])

    temp_val['Price_Log_Pred_LASSO'] = lassoreg.predict(temp_val_tf)

    print('Completed for Fold - ',fold_num)
    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_LASSO'])))

    fold_frames.append(temp_val)
    sub_data_preds = sub_data_preds + lassoreg.predict(test_tf)

# One concat at the end instead of growing the frame inside the loop.
# sub_data_preds is left as a sum; a later cell divides it by len(fold_list).
CV_SCORED_DATA = pd.concat(fold_frames).reset_index(drop=True)

print("LASSO 01 CV RMSLE = ",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LASSO'])))
print("CV 1-RMSLE = ",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LASSO'])))

In [None]:
# Attach the Lasso validation-fold predictions to the stacking table by id.
lasso_fold_preds = CV_SCORED_DATA[['id','Price_Log_Pred_LASSO']]
training_cv_predictions = training_cv_predictions.merge(lasso_fold_preds, on='id')
training_cv_predictions.head()

In [None]:
# Mean of the per-fold Lasso test predictions (the loop accumulated their sum).
# Not re-runnable: a second run would divide the averaged values again.
fold_count = len(fold_list)
sub_data_preds = sub_data_preds / fold_count
test['Price_Log_Pred_LASSO'] = sub_data_preds
test['Price_Log_Pred_LASSO'].describe()

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Column selection and the test matrix are identical for every fold: build them once.
feature_cols = FeatureNames + FeatureNames2
test_tf = test[feature_cols].values

fold_frames = []                        # scored validation fold of each iteration
sub_data_preds = np.zeros(len(test))    # running SUM of the per-fold test predictions
for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # Explicit copy: a prediction column is added below, and writing into a
    # filtered slice triggers pandas' SettingWithCopy warning.
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    temp_train_tf = temp_train[feature_cols].values
    temp_val_tf = temp_val[feature_cols].values

    KNN = KNeighborsRegressor(n_neighbors = 10)
    KNN.fit(temp_train_tf,temp_train['Price_Log'])

    temp_val['Price_Log_Pred_KNN'] = KNN.predict(temp_val_tf)

    print('Completed for Fold - ',fold_num)
    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_KNN'])))

    fold_frames.append(temp_val)
    sub_data_preds = sub_data_preds + KNN.predict(test_tf)

# One concat at the end instead of growing the frame inside the loop.
# sub_data_preds is left as a sum; a later cell divides it by len(fold_list).
CV_SCORED_DATA = pd.concat(fold_frames).reset_index(drop=True)

print("KNN 01 CV RMSLE = ",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_KNN'])))
print("CV 1-RMSLE = ",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_KNN'])))

In [None]:
# Attach the KNN validation-fold predictions to the stacking table by id.
knn_fold_preds = CV_SCORED_DATA[['id','Price_Log_Pred_KNN']]
training_cv_predictions = training_cv_predictions.merge(knn_fold_preds, on='id')
training_cv_predictions.head()

In [None]:
# Mean of the per-fold KNN test predictions (the loop accumulated their sum).
# Not re-runnable: a second run would divide the averaged values again.
fold_count = len(fold_list)
sub_data_preds = sub_data_preds / fold_count
test['Price_Log_Pred_KNN'] = sub_data_preds
test['Price_Log_Pred_KNN'].describe()

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Column selection and the test matrix are identical for every fold: build them
# once. The test features are passed as a plain array, matching how the model is
# fitted (it was previously fitted on an array but asked to predict on a DataFrame).
feature_cols = FeatureNames + FeatureNames2
test_tf = test[feature_cols].values

fold_frames = []                        # scored validation fold of each iteration
sub_data_preds = np.zeros(len(test))    # running SUM of the per-fold test predictions
for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # Explicit copy: a prediction column is added below, and writing into a
    # filtered slice triggers pandas' SettingWithCopy warning.
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    temp_train_tf = temp_train[feature_cols].values
    temp_val_tf = temp_val[feature_cols].values

    RF = RandomForestRegressor(n_estimators = 100, min_samples_leaf = 3, max_features = 30, random_state = 412,
                               verbose = 0, max_depth = 40)
    RF.fit(temp_train_tf,temp_train['Price_Log'])

    temp_val['Price_Log_Pred_RF'] = RF.predict(temp_val_tf)

    print('Completed for Fold - ',fold_num)
    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_RF'])))

    fold_frames.append(temp_val)
    sub_data_preds = sub_data_preds + RF.predict(test_tf)

# One concat at the end instead of growing the frame inside the loop.
# sub_data_preds is left as a sum; a later cell divides it by len(fold_list).
CV_SCORED_DATA = pd.concat(fold_frames).reset_index(drop=True)

In [None]:
# Root-mean-squared error between target and random-forest predictions over all
# validation folds (both columns are on the Price_Log scale).
rf_cv_rmsle = sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_RF']))
print("RF 01 CV RMSLE = ", rf_cv_rmsle)
print("CV 1-RMSLE = ", 1 - rf_cv_rmsle)

In [None]:
# Attach the random-forest validation-fold predictions to the stacking table by id.
rf_fold_preds = CV_SCORED_DATA[['id','Price_Log_Pred_RF']]
training_cv_predictions = training_cv_predictions.merge(rf_fold_preds, on='id')
training_cv_predictions.head()

In [None]:
# Mean of the per-fold random-forest test predictions (the loop accumulated their sum).
# Not re-runnable: a second run would divide the averaged values again.
fold_count = len(fold_list)
sub_data_preds = sub_data_preds / fold_count
test['Price_Log_Pred_RF'] = sub_data_preds
test['Price_Log_Pred_RF'].describe()

In [None]:
# Persist the base-model predictions: the stacking table for train and the full
# test frame (features plus prediction columns). Row index is not written.
trn_scored_path = "C:\\Kaggle\\BooksPrice\\CV Scrd Trn Datasets\\20191023_Stack21_DS.csv"
tst_scored_path = "C:\\Kaggle\\BooksPrice\\CV Scrd Tst Datasets\\20191023_Stack21_DS.csv"
training_cv_predictions.to_csv(trn_scored_path, index=False)
test.to_csv(tst_scored_path, index=False)

In [None]:
# Everything after the first four columns (id, FOLD_NUM, Price, Price_Log) is a
# base-model prediction column and becomes an input of the ensemble model.
feature_names_ensemble = training_cv_predictions.columns[4:].tolist()
feature_names_ensemble

In [None]:
import seaborn as sns

# Pairwise correlation between the base-model predictions and the target.
corr_columns = feature_names_ensemble + ['Price_Log']
corr = training_cv_predictions[corr_columns].corr()

# Colour scale fixed to [0, 1] and centred on 0.5.
heat_palette = sns.diverging_palette(20, 220, n=200)
ax = sns.heatmap(corr, vmin=0, vmax=1, center=0.5, cmap=heat_palette, square=True)

# Slant the x tick labels; the trailing ';' hides the returned list of labels.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right');

In [None]:
# Second-level model: a Ridge regression on the base-model prediction columns.
# (The output column carries the historical name 'Price_Log_Pred_LGB_ENS'.)
test_ens = test[feature_names_ensemble]   # identical for every fold

fold_frames = []                        # scored validation fold of each iteration
sub_data_preds = np.zeros(len(test))    # running SUM of the per-fold test predictions
for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = training_cv_predictions[training_cv_predictions['FOLD_NUM'] != fold_num]
    # Explicit copy: a prediction column is added below, and writing into a
    # filtered slice triggers pandas' SettingWithCopy warning.
    temp_val = training_cv_predictions[training_cv_predictions['FOLD_NUM'] == fold_num].copy()

    # NOTE(review): `normalize=` may not be accepted by newer scikit-learn
    # releases — confirm against the version this notebook is pinned to.
    ridgereg_e = Ridge(alpha=0.00001,normalize=True)
    ridgereg_e.fit(temp_train[feature_names_ensemble],temp_train['Price_Log'])

    temp_val['Price_Log_Pred_LGB_ENS'] = ridgereg_e.predict(temp_val[feature_names_ensemble])

    print("Fold RMSLE = ",sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_LGB_ENS'])))

    fold_frames.append(temp_val)
    sub_data_preds = sub_data_preds + ridgereg_e.predict(test_ens)

# One concat at the end instead of growing the frame inside the loop.
# sub_data_preds is left as a sum; a later cell divides it by len(fold_list).
CV_SCORED_DATA = pd.concat(fold_frames).reset_index(drop=True)

In [None]:
# Root-mean-squared error between target and ensemble predictions over all
# validation folds (both columns are on the Price_Log scale).
ens_cv_rmsle = sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LGB_ENS']))
print("LGB ENS CV RMSLE = ", ens_cv_rmsle)
print("CV 1-RMSLE = ", 1 - ens_cv_rmsle)

In [None]:
# Attach the ensemble validation-fold predictions to the stacking table by id.
ens_fold_preds = CV_SCORED_DATA[['id','Price_Log_Pred_LGB_ENS']]
training_cv_predictions = training_cv_predictions.merge(ens_fold_preds, on='id')
training_cv_predictions.head()

In [None]:
# Mean of the per-fold ensemble test predictions (the loop accumulated their sum).
# Not re-runnable: a second run would divide the averaged values again.
fold_count = len(fold_list)
sub_data_preds = sub_data_preds / fold_count
test['Price_Log_Pred_LGB_ENS'] = sub_data_preds
test['Price_Log_Pred_LGB_ENS'].describe()

In [None]:
# Write both tables again, now including the ensemble column. These are the same
# two file paths used by the earlier export cell, so those files are overwritten.
trn_scored_path = "C:\\Kaggle\\BooksPrice\\CV Scrd Trn Datasets\\20191023_Stack21_DS.csv"
tst_scored_path = "C:\\Kaggle\\BooksPrice\\CV Scrd Tst Datasets\\20191023_Stack21_DS.csv"
training_cv_predictions.to_csv(trn_scored_path, index=False)
test.to_csv(tst_scored_path, index=False)

In [None]:
# Template workbook supplying the submission layout. No `encoding` argument is
# passed: read_excel does not accept one in current pandas, and a workbook is
# not a plain-text file that would need one.
submission = pd.read_excel('C:/Kaggle/BooksPrice/Participants_Data/Sample_Submission.xlsx')

# Undo the target transform log10(Price + 1) to get prices back.
sub_data_preds2 = (10**test['Price_Log_Pred_LGB_ENS'].values) - 1
print(pd.DataFrame(sub_data_preds2).describe())

# Values are assigned by position, so the template rows are assumed to be in the
# same order as `test` — TODO confirm. A length mismatch raises here.
submission['Price'] = sub_data_preds2
submission.to_excel('C:\\Kaggle\\BooksPrice\\Submissions\\20191023_Stack21_DS.xlsx', index=False)