In [None]:
import warnings
warnings.filterwarnings('ignore')
import re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
stemmer = WordNetLemmatizer()
from scipy import sparse
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
from sklearn.metrics import mean_squared_error
from math import sqrt
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.decomposition import TruncatedSVD

In [None]:
def cleaning_text(review, remove_stopwords=False, Lem=False):
    """Normalise a raw text field into lower-case, space-separated words.

    Steps: strip HTML markup with BeautifulSoup, replace every character that
    is not an ASCII letter with a space, lower-case, split on whitespace,
    optionally drop English stop words and optionally lemmatize each word.

    Parameters
    ----------
    review : str
        Raw text, possibly containing HTML.  ``None`` and float NaN (how
        pandas represents a missing text cell) are treated as empty text.
    remove_stopwords : bool, default False
        Drop words found in NLTK's English stop-word list.
    Lem : bool, default False
        Lemmatize each remaining word with the module-level ``stemmer``.

    Returns
    -------
    str
        The cleaned words joined by single spaces ('' when nothing is left).
    """
    # Missing cells arrive as None / NaN; return empty text instead of
    # handing a non-text value to the HTML parser.
    if review is None or (isinstance(review, float) and review != review):
        return ''

    review_text = BeautifulSoup(review, "html.parser").get_text()
    # Raw-string pattern. Every non-letter (including all whitespace) becomes
    # a space; runs of spaces are collapsed by split() below, so no separate
    # whitespace-squeezing pass is needed.
    review_text = re.sub(r'[^a-zA-Z]', ' ', review_text)
    words = review_text.lower().split()

    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    if Lem:
        words = [stemmer.lemmatize(w) for w in words]  # Lemmatization

    return ' '.join(words)

In [None]:
def _load_csv(path):
    """Read one CSV file with the ISO-8859-1 encoding used for every input here."""
    return pd.read_csv(path, encoding='ISO-8859-1')


# Competition data and the table listing the engineered feature columns.
train_orig = _load_csv('C:\\Kaggle\\BooksPrice\\Participants_Data\\Data_Train02.csv')
test_orig = _load_csv('C:\\Kaggle\\BooksPrice\\Participants_Data\\Data_Test02.csv')
FeatureNames = _load_csv('C:\\Kaggle\\BooksPrice\\Participants_Data\\FeatureNames02.csv')

# Scored datasets from an earlier model (the file name suggests XGBoost).
# Only the join key, that model's prediction and - for train - the CV fold
# assignment are kept.
train_other_models = _load_csv(
    'C:\\Kaggle\\BooksPrice\\CV Scrd Trn Datasets\\20190930_XGB01_DS.csv'
)[['id','Price_Log_Pred','FOLD_NUM']]
test_other_models = _load_csv(
    'C:\\Kaggle\\BooksPrice\\CV Scrd Tst Datasets\\20190930_XGB01_DS.csv'
)[['id','Price_Log_Pred']]

# Inner join on 'id' (the default join type).
train = train_orig.merge(train_other_models, on='id')
test = test_orig.merge(test_other_models, on='id')

# Regression target: log10(Price + 1); the histogram is the cell output.
train['Price_Log'] = np.log10(train['Price'] + 1)
train.hist(column='Price_Log')

In [None]:
# Turn the feature-name table into a plain list of column names (column 'x').
# Guarded so the cell can be re-run: after the first run FeatureNames is
# already a list, and indexing a list with 'x' would raise TypeError.
if isinstance(FeatureNames, pd.DataFrame):
    FeatureNames = FeatureNames['x'].values.tolist()
FeatureNames

In [None]:
# Scale every column listed in FeatureNames by its maximum over the training
# rows; the same training maxima are applied to the test rows.
FNAMES_MAX_VECTOR = train[FeatureNames].max()
# A column whose training maximum is 0 would become NaN (0/0) or inf after
# the division and poison the network inputs; divide such columns by 1.
FNAMES_MAX_VECTOR = FNAMES_MAX_VECTOR.replace(0, 1)
test[FeatureNames] = test[FeatureNames] / FNAMES_MAX_VECTOR
train[FeatureNames] = train[FeatureNames] / FNAMES_MAX_VECTOR

In [None]:
# Scaling factor for the target. It is fixed at 1, so the log-price target is
# effectively left unscaled; the alternative that was tried and disabled is
# train['Price_Log'].max().
MAX_PRICE_LOG = 1
print("Maximum Price Log10 : ",MAX_PRICE_LOG)
train['Price_Log'] = train['Price_Log'].div(MAX_PRICE_LOG)

In [None]:
# Cleaned synopsis text for both datasets: stop words removed, no lemmatization.
train['Synopsis2'] = train['Synopsis'].apply(cleaning_text, remove_stopwords=True, Lem=False)
test['Synopsis2'] = test['Synopsis'].apply(cleaning_text, remove_stopwords=True, Lem=False)

In [None]:
%%time
# Count vectorizer over the cleaned training synopses: 1- to 3-grams, at most
# 2000 features, ignoring terms seen in fewer than 3 documents or in more than
# 70% of them.
vectorizer = CountVectorizer(
    max_features=2000,
    min_df=3,
    max_df=0.7,
    ngram_range=(1, 3),
)

# Learn the vocabulary and build the training count matrix in a single call.
train_tf_for_pca = vectorizer.fit_transform(train['Synopsis2'])

In [None]:
# Fit a 300-component truncated SVD on the training count matrix.
svd = TruncatedSVD(
    n_components=300,
    n_iter=10,
    random_state=42,
).fit(train_tf_for_pca)
svd  # echo the fitted estimator as the cell output

In [None]:
# SVD diagnostics, one per line: per-component explained-variance ratio,
# their total, and the singular values.
print(
    svd.explained_variance_ratio_,
    svd.explained_variance_ratio_.sum(),
    svd.singular_values_,
    sep='\n',
)

In [None]:
# Distinct values of the FOLD_NUM column, in ascending order.
fold_list = sorted(train.FOLD_NUM.unique())
fold_list

In [None]:
def nn_model(input_shape):
    """Build the (uncompiled) feed-forward regression network.

    Architecture: three hidden ReLU layers of 300, 150 and 50 units, each
    followed by Dropout (rates 0.5, 0.3 and 0.1 respectively) and
    BatchNormalization, then a single output unit with no activation.

    Parameters
    ----------
    input_shape : int
        Number of input features; forwarded to the first layer as ``input_dim``.

    Returns
    -------
    The ``Sequential`` model. Compiling is left to the caller.
    """
    # (units, dropout rate) for each hidden block, in order.
    hidden_blocks = ((300, 0.5), (150, 0.3), (50, 0.1))

    network = Sequential()
    for position, (units, drop_rate) in enumerate(hidden_blocks):
        # Only the very first layer is told the input width.
        first_layer_args = {'input_dim': input_shape} if position == 0 else {}
        network.add(Dense(units, activation='relu', **first_layer_args))
        network.add(Dropout(drop_rate))
        network.add(BatchNormalization())
    network.add(Dense(1))
    return network

In [None]:
def _nn_inputs(frame):
    """Dense network input for ``frame``.

    Columns are the SVD-reduced n-gram counts of ``Synopsis2`` followed by the
    scaled ``FeatureNames`` columns. Both parts are already dense, so they are
    stacked with numpy directly instead of going through scipy.sparse.
    """
    text_part = svd.transform(vectorizer.transform(frame['Synopsis2']))
    return np.hstack((text_part, frame[FeatureNames].values))


# Cross-validated training: one network per fold, validated on that fold.
# NOTE(review): `vectorizer` and `svd` were fitted on all training rows, so
# each validation fold has already influenced the text features - confirm
# this is acceptable for the reported CV score.
keras01_Models = []   # checkpoint path of the best model of every fold
scored_folds = []     # validation frames carrying their out-of-fold predictions
batch_size = 64
for IterationNum, fold_num in enumerate(fold_list, start=1):
    print("Running CV Iteration Num :", IterationNum)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # Explicit copy: columns are assigned on this frame below, which must not
    # happen on a filtered view of `train`.
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    temp_train_tf = _nn_inputs(temp_train)
    temp_val_tf = _nn_inputs(temp_val)

    model = nn_model(temp_train_tf.shape[1])
    model.compile(loss = 'mean_squared_error', optimizer='adam',metrics = ['mean_squared_error'])
    # summary() prints by itself; wrapping it in print() added a stray "None".
    model.summary()

    model_weights_save_path = 'C:/Kaggle/BooksPrice/KerasModels/'
    file_name = "201901001_Keras03_Model_Weights_Fold_"+str(fold_num)+'.h5'
    final_path = model_weights_save_path+file_name
    print("Model Weights File Name : ",final_path)
    keras01_Models.append(final_path)

    # Both callbacks watch the validation loss (the Keras default, spelled out
    # here): stop after 10 epochs without improvement and keep only the best
    # model on disk.
    es = EarlyStopping(monitor='val_loss',
                       mode='min',
                       verbose=1,
                       patience=10)

    checkpointer = ModelCheckpoint(filepath=final_path,
                                   monitor='val_loss',
                                   mode='min',
                                   verbose=1,
                                   save_best_only=True)

    # NOTE(review): shuffle='batch' is documented by Keras as a special mode
    # for HDF5 data that shuffles in batch-sized chunks; confirm it is
    # intended here rather than shuffle=True.
    history = model.fit(temp_train_tf, temp_train['Price_Log'].values,
                        epochs = 500,
                        batch_size = batch_size,
                        verbose = 1,
                        shuffle = 'batch',
                        validation_data = (temp_val_tf, temp_val['Price_Log'].values),
                        callbacks = [es,checkpointer])

    # Score the fold with the best checkpoint, not the last-epoch weights.
    loaded_model = load_model(final_path)
    # predict() returns shape (n, 1); flatten to one value per row and undo
    # the target scaling.
    Y_temp_val_pred = loaded_model.predict(temp_val_tf).ravel() * MAX_PRICE_LOG
    temp_val['Price_Log_Pred_Keras'] = Y_temp_val_pred
    temp_val['Price_Log'] = temp_val['Price_Log'] * MAX_PRICE_LOG

    print('Completed for Fold - ',fold_num)
    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_Keras'])))

    scored_folds.append(temp_val)

# Concatenate once after the loop instead of re-concatenating the growing
# frame on every iteration; the index is renumbered 0..n-1.
CV_SCORED_DATA = pd.concat(scored_folds, ignore_index=True)

In [None]:
print(keras01_Models)

# Root mean squared error between the stored log-price target and the
# out-of-fold predictions, pooled over all folds. Computed once and reused.
cv_rmsle = sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'],
                                   CV_SCORED_DATA['Price_Log_Pred_Keras']))
print("Keras 01 CV RMSLE = ",cv_rmsle)
print("CV 1-RMSLE = ",1-cv_rmsle)
# LB SCORE : 0.7679

In [None]:
# Save the validation rows together with their out-of-fold predictions.
cv_scored_path = "C:\\Kaggle\\BooksPrice\\CV Scrd Trn Datasets\\20191003_Keras03_DS.csv"
CV_SCORED_DATA.to_csv(cv_scored_path, index=False)

In [None]:
%%time
# Test-set input matrix with the same column layout the fold models were
# trained on: SVD-reduced synopsis n-gram counts followed by the scaled
# FeatureNames columns. Both parts are dense, so they are stacked with numpy
# directly rather than converted to sparse and back.
test_text_part = svd.transform(vectorizer.transform(test['Synopsis2']))
test_tf = np.hstack((test_text_part, test[FeatureNames].values))
print(test_tf.shape)

# Without any saved fold model the average below would be 0/0, i.e. NaN
# predictions written to disk without any error; fail loudly instead.
if not keras01_Models:
    raise ValueError("keras01_Models is empty - run the cross-validation cell first")

# Sum the predictions of every fold model, then average them.
test_preds = np.zeros((test.shape[0],1))

for fname in keras01_Models:
    print("Running for : ",fname)
    loaded_model = load_model(fname)
    test_preds = test_preds + (loaded_model.predict(test_tf) * MAX_PRICE_LOG)

test_preds = test_preds / len(keras01_Models)

# test_preds keeps its (n, 1) shape for later cells; the column gets the
# flattened values.
test['Price_Log_Pred_Keras'] = test_preds.ravel()
test.to_csv("C:\\Kaggle\\BooksPrice\\CV Scrd Tst Datasets\\20191003_Keras03_DS.csv", index=False)

In [None]:
# Summary statistics of the fold-averaged test predictions; these are still on
# the log10(Price + 1) scale of the training target.
test['Price_Log_Pred_Keras'].describe()

In [None]:
# Load the sample submission workbook as the template for the final file.
# No `encoding` argument: an .xlsx workbook is not decoded through a text
# codec, and newer pandas releases reject that keyword in read_excel.
submission = pd.read_excel('C:/Kaggle/BooksPrice/Participants_Data/Sample_Submission.xlsx')

In [None]:
# Undo the log10(Price + 1) target transform: raise 10 to the prediction,
# then subtract the 1 that was added before taking the log.
test_preds2 = 10 ** test_preds
test_preds2 = test_preds2 - 1
pd.DataFrame(test_preds2).describe()

In [None]:
# Put the back-transformed predictions into the template's Price column and
# write the submission workbook.
submission_path = 'C:\\Kaggle\\BooksPrice\\Submissions\\20191003_Keras03_DS.xlsx'
submission['Price'] = test_preds2
submission.to_excel(submission_path, index=False)