In [None]:
import warnings
warnings.filterwarnings('ignore')
import re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import lightgbm as lgb
from nltk.stem import WordNetLemmatizer
stemmer = WordNetLemmatizer()
from scipy import sparse
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
from sklearn.metrics import mean_squared_error
from math import sqrt
import gc
from skopt.space import Real, Integer
from skopt.utils import use_named_args
import itertools
from skopt import gp_minimize

In [None]:
def cleaning_text(review, remove_stopwords=False, Lem=False):
    """Normalise a raw (possibly HTML) text snippet into lowercase words.

    The text is stripped of HTML markup with BeautifulSoup, every character
    outside ``a-zA-Z`` is replaced by a space, whitespace is collapsed, the
    result is lowercased and split into words, and the words are optionally
    filtered against the English stopword list and/or lemmatized.

    Parameters
    ----------
    review : str
        Raw text. ``None`` or a float NaN (a missing pandas cell) is treated
        as empty text.
    remove_stopwords : bool, default False
        If true, drop words that appear in NLTK's English stopword list.
    Lem : bool, default False
        If true, lemmatize every word with the module-level ``stemmer``
        (a ``WordNetLemmatizer``).

    Returns
    -------
    str
        The remaining words joined by single spaces (``''`` if none remain).
    """
    # Missing cells arrive as None / NaN rather than str; treat them as empty
    # text instead of handing a non-string to the HTML parser.
    if review is None or (isinstance(review, float) and review != review):
        return ''

    review_text = BeautifulSoup(review, "html.parser").get_text()
    review_text = re.sub(r'[^a-zA-Z]', ' ', review_text)
    # Raw string: '\s' inside a plain string literal is an invalid escape sequence.
    review_text = re.sub(r'\s+', ' ', review_text)
    words = review_text.lower().split()

    if remove_stopwords:
        # Load the stopword list once and reuse it; this function is typically
        # applied row by row, and the list does not change between calls.
        stops = getattr(cleaning_text, '_english_stops', None)
        if stops is None:
            stops = cleaning_text._english_stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    if Lem:
        words = [stemmer.lemmatize(w) for w in words]  # Lemmatization

    return ' '.join(words)

In [None]:
# Competition inputs: the train table, the test table and the FeatureNames table,
# all read as ISO-8859-1 encoded CSV.
train_orig, test_orig, FeatureNames = (
    pd.read_csv(csv_path, encoding='ISO-8859-1')
    for csv_path in (
        'C:\\Kaggle\\BooksPrice\\Participants_Data\\Data_Train02.csv',
        'C:\\Kaggle\\BooksPrice\\Participants_Data\\Data_Test02.csv',
        'C:\\Kaggle\\BooksPrice\\Participants_Data\\FeatureNames02.csv',
    )
)

# Previously scored train / test datasets (file names suggest an earlier XGBoost
# run -- presumably; confirm against the producing notebook).
train_other_models, test_other_models = (
    pd.read_csv(csv_path, encoding='ISO-8859-1')
    for csv_path in (
        'C:\\Kaggle\\BooksPrice\\CV Scrd Trn Datasets\\20190930_XGB01_DS.csv',
        'C:\\Kaggle\\BooksPrice\\CV Scrd Tst Datasets\\20190930_XGB01_DS.csv',
    )
)

In [None]:
# From the previously scored datasets keep only the join key, the stored
# prediction and (train only) the fold assignment.
train_other_models = train_other_models.loc[:, ['id', 'Price_Log_Pred', 'FOLD_NUM']]
test_other_models = test_other_models.loc[:, ['id', 'Price_Log_Pred']]

# Join those columns onto the competition tables by `id`
# (default join type, i.e. only ids present on both sides are kept).
train = train_orig.merge(train_other_models, on='id')
test = test_orig.merge(test_other_models, on='id')

In [None]:
# Regression target: log10(Price + 1); the histogram shows its distribution.
train['Price_Log'] = np.log10(1 + train['Price'])
train.hist(column='Price_Log')

In [None]:
# Replace the FeatureNames table by a plain list of the values in its 'x' column.
# The isinstance guard keeps the cell re-runnable: once FeatureNames is already
# a list, indexing it with 'x' again would raise a TypeError.
if isinstance(FeatureNames, pd.DataFrame):
    FeatureNames = FeatureNames['x'].values.tolist()
FeatureNames

In [None]:
# Cleaned synopsis text: markup and non-letters removed, stopwords dropped,
# no lemmatization. Keyword arguments are forwarded to cleaning_text by apply().
train['Synopsis2'] = train['Synopsis'].apply(cleaning_text, remove_stopwords=True, Lem=False)
test['Synopsis2'] = test['Synopsis'].apply(cleaning_text, remove_stopwords=True, Lem=False)

In [None]:
%%time
# Count vectorizer over 1- to 3-grams, fitted on the cleaned training synopses.
vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    max_features=2000,
    min_df=3,
    max_df=0.4,
)
vectorizer.fit(train['Synopsis2'])

# Test design matrix: n-gram counts followed by the FeatureNames columns.
test_tf = sparse.hstack((
    vectorizer.transform(test['Synopsis2']),
    sparse.csr_matrix(np.asmatrix(test[FeatureNames].values)),
))
print(test_tf.shape)

In [None]:
# Distinct FOLD_NUM values in ascending order; the CV loop runs once per value.
fold_list = sorted(train['FOLD_NUM'].unique())
fold_list

In [None]:
# LightGBM parameters passed unchanged to lgb.train for every CV fold.
param = dict(
    learning_rate=0.001,
    max_depth=14,
    min_data_in_leaf=3,
    bagging_freq=5,
    bagging_fraction=0.95,
    feature_fraction=0.1,
    boost='gbdt',
    objective='regression',
    metric='rmse',
    seed=392,
)
# Scores recorded by the author (presumably for this parameter set):
#CV RMSLE =  0.22106535470883548
#CV 1-RMSLE =  0.7789346452911645

In [None]:
def _fold_design_matrix(frame):
    """Return the sparse model input for `frame`: n-gram counts of `Synopsis2` followed by the `FeatureNames` columns."""
    text_counts = vectorizer.transform(frame['Synopsis2'])
    extra_features = sparse.csr_matrix(np.asmatrix(frame[FeatureNames].values))
    return sparse.hstack((text_counts, extra_features))


# Cross-validation over the predefined folds. For each fold a model is trained
# on the other folds, the held-out fold is scored, and the test set is scored.
# Results of this cell:
#   CV_SCORED_DATA - all training rows with their out-of-fold 'Price_Log_Pred_LGB'
#   sub_data_preds - SUM over folds of the test predictions (not yet averaged)
if len(fold_list) == 0:
    raise ValueError("fold_list is empty: nothing to cross-validate")

scored_folds = []      # one scored validation frame per fold, concatenated once at the end
sub_data_preds = None  # running sum of per-fold test predictions

for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # Explicit copy: a prediction column is added to this frame below, and
    # writing into a filtered selection of `train` is chained assignment.
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    temp_train_tf = _fold_design_matrix(temp_train)
    temp_val_tf = _fold_design_matrix(temp_val)

    trn_data = lgb.Dataset(temp_train_tf, label=temp_train['Price_Log'])
    val_data = lgb.Dataset(temp_val_tf, label=temp_val['Price_Log'])

    # The held-out fold is also the early-stopping set: training stops after
    # 300 rounds without improvement, with progress logged every 50 rounds.
    # NOTE(review): newer LightGBM releases take these two settings as callbacks
    # (lgb.early_stopping / lgb.log_evaluation) instead of keyword arguments --
    # confirm against the installed version before upgrading.
    model = lgb.train(param, trn_data, 2000000, valid_sets=val_data, verbose_eval=50, early_stopping_rounds=300)

    temp_val['Price_Log_Pred_LGB'] = model.predict(temp_val_tf, num_iteration=model.best_iteration)

    print("Fold RMSLE = ", sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_LGB'])))

    scored_folds.append(temp_val)

    fold_test_preds = model.predict(test_tf, num_iteration=model.best_iteration)
    if sub_data_preds is None:
        sub_data_preds = fold_test_preds
    else:
        sub_data_preds = sub_data_preds + fold_test_preds

# Single concatenation with a fresh 0..n-1 index instead of growing the frame
# (and resetting its index) inside the loop.
CV_SCORED_DATA = pd.concat(scored_folds, ignore_index=True)

In [None]:
# Pooled out-of-fold RMSE on the log10(Price + 1) scale, and its complement.
cv_rmsle = sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LGB']))
print("CV RMSLE = ", cv_rmsle)
print("CV 1-RMSLE = ", 1 - cv_rmsle)

In [None]:
# Save the training rows together with their out-of-fold predictions (no index column).
CV_SCORED_DATA.to_csv(
    path_or_buf="C:\\Kaggle\\BooksPrice\\CV Scrd Trn Datasets\\20191003_LGB04_DS.csv",
    index=False,
)

In [None]:
# Convert the per-fold sum of test predictions into their mean and attach it to `test`.
# NOTE: this rebinds sub_data_preds, so running the cell a second time divides again.
sub_data_preds = np.true_divide(sub_data_preds, len(fold_list))
test['Price_Log_Pred_LGB'] = sub_data_preds
test['Price_Log_Pred_LGB'].describe()

In [None]:
# Save the test rows together with their averaged predictions (no index column).
test.to_csv(
    path_or_buf="C:\\Kaggle\\BooksPrice\\CV Scrd Tst Datasets\\20191003_LGB04_DS.csv",
    index=False,
)

In [None]:
# Load the sample submission workbook that will receive the predicted prices.
# No `encoding` argument: an .xlsx workbook is not a text file, so a text
# encoding has no effect, and pandas >= 1.1 rejects the keyword with a TypeError.
submission = pd.read_excel('C:/Kaggle/BooksPrice/Participants_Data/Sample_Submission.xlsx')

In [None]:
# Undo the log10(Price + 1) target transform to get predictions on the price scale.
sub_data_preds2 = np.power(10, sub_data_preds) - 1
pd.DataFrame(sub_data_preds2).describe()

In [None]:
# Write the back-transformed predictions into the submission's 'Price' column.
# NOTE(review): the values are assigned by position -- this assumes the submission
# rows are in the same order (and number) as the rows of `test`; confirm.
submission['Price'] = sub_data_preds2

In [None]:
# Write the filled-in submission workbook (no index column).
submission.to_excel(
    excel_writer='C:\\Kaggle\\BooksPrice\\Submissions\\20191003_LGB04_DS.xlsx',
    index=False,
)