In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Bidirectional, GlobalMaxPool1D, Conv1D, Dropout, GlobalAveragePooling1D, Flatten, concatenate, Input
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.layers.normalization import BatchNormalization
import re
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras import regularizers
from keras.models import Model
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sb
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
stemmer = WordNetLemmatizer()
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
from sklearn.metrics import mean_squared_error
from math import sqrt

In [None]:
# Raw train/test tables and the table of feature names, all stored as ISO-8859-1 CSVs.
train_orig, test_orig, FeatureNames = (
    pd.read_csv(csv_name, encoding='ISO-8859-1')
    for csv_name in ('Data_Train02.csv', 'Data_Test02.csv', 'FeatureNames02.csv')
)

# Outputs of an earlier model run (file names say XGB01 -- presumably XGBoost
# predictions, TODO confirm). Only the join key, the prediction and, for the
# training rows, the fold assignment are kept.
train_other_models = pd.read_csv('20190930_XGB01_TRN_DS.csv', encoding='ISO-8859-1')[['id', 'Price_Log_Pred', 'FOLD_NUM']]
test_other_models = pd.read_csv('20190930_XGB01_TST_DS.csv', encoding='ISO-8859-1')[['id', 'Price_Log_Pred']]

# Default (inner) join on id: rows without a match in the other-model file are dropped.
train = train_orig.merge(train_other_models, on='id')
test = test_orig.merge(test_other_models, on='id')

# Regression target: log10(Price + 1).
train['Price_Log'] = np.log10(train['Price'] + 1)
train.hist(column='Price_Log')

In [None]:
# Turn the loaded feature-name table into a plain list of the values in its 'x' column.
# The name is rebound from DataFrame to list, so the conversion is guarded:
# re-running this cell after it has already executed is then a no-op instead of
# failing with "list indices must be integers".
if isinstance(FeatureNames, pd.DataFrame):
    FeatureNames = FeatureNames['x'].values.tolist()

In [None]:
def cleaning_text(review, remove_stopwords=False, Lem=False):
    """Normalise a raw text field into lower-case words separated by single spaces.

    Processing order: strip HTML markup, turn every non-word character into a
    space, drop stand-alone single letters, collapse whitespace, delete the
    remaining non-letter characters (digits, underscores, non-ASCII letters),
    lower-case and split into words. Stop-word removal and lemmatisation are
    optional and applied last.

    Parameters
    ----------
    review : str
        Raw text, possibly containing HTML. ``None`` or a float NaN (how pandas
        represents a missing cell) yields an empty string; any other non-string
        value is converted with ``str``.
    remove_stopwords : bool, default False
        Drop words found in NLTK's English stop-word list.
    Lem : bool, default False
        Lemmatise each word with the module-level ``stemmer``.

    Returns
    -------
    str
        The cleaned words joined by single spaces ('' when nothing is left).
    """
    # Missing values carry no text; NaN is the only float that differs from itself.
    if review is None or (isinstance(review, float) and review != review):
        return ''

    review_text = BeautifulSoup(str(review), "html.parser").get_text()
    # Punctuation and other special characters become spaces. (A second
    # '[^\w\s]' pass would be a no-op after this one, so it is not repeated.)
    review_text = re.sub(r'\W', ' ', review_text)
    # Single letters standing between whitespace.
    review_text = re.sub(r'\s+[a-zA-Z]\s+', ' ', review_text)
    # Single letter at the very start. The anchor must be an unescaped '^':
    # the previous '\^' looked for a literal caret and therefore never matched.
    review_text = re.sub(r'^[a-zA-Z]\s+', ' ', review_text)
    # Collapse runs of whitespace.
    review_text = re.sub(r'\s+', ' ', review_text)
    # Remove everything that is not a letter BUT keep whitespace. Deleting the
    # spaces as well glued the whole document into one token, which made the
    # split, stop-word filter, lemmatiser and downstream tokenizer useless.
    review_text = re.sub(r'[^a-zA-Z\s]', '', review_text)

    words = review_text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    if Lem:
        words = [stemmer.lemmatize(w) for w in words]  # Lemmatization
    return ' '.join(words)

In [None]:
# Add cleaned copies of the two free-text columns to both frames.
# cleaning_text is called with remove_stopwords=True and Lem=False.
for frame in (train, test):
    for raw_col, clean_col in (('Synopsis', 'Synopsis2'), ('Title', 'Title2')):
        frame[clean_col] = frame[raw_col].apply(cleaning_text, args=(True, False))

In [None]:
# Vocabulary for the cleaned synopsis text, learned from the training rows only.
# max_fatures_syn is passed as the Tokenizer's num_words and later as the
# embedding's input size for the synopsis branch.
max_fatures_syn = 3000
synopsis_corpus = train['Synopsis2'].values
tokenizer_syn = Tokenizer(num_words=max_fatures_syn, split=' ')
tokenizer_syn.fit_on_texts(synopsis_corpus)

# Full word -> index mapping seen during fitting.
word_index_syn = tokenizer_syn.word_index
n_unique_syn = len(word_index_syn)
print('Found %s unique tokens.' % n_unique_syn)

In [None]:
# Vocabulary for the cleaned title text, learned from the training rows only.
# max_fatures_tit is passed as the Tokenizer's num_words and later as the
# embedding's input size for the title branch.
max_fatures_tit = 3000
title_corpus = train['Title2'].values
tokenizer_tit = Tokenizer(num_words=max_fatures_tit, split=' ')
tokenizer_tit.fit_on_texts(title_corpus)

# Full word -> index mapping seen during fitting.
word_index_tit = tokenizer_tit.word_index
n_unique_tit = len(word_index_tit)
print('Found %s unique tokens.' % n_unique_tit)

In [None]:
def nn_model(nlp_input_shape,meta_input_shape,max_fatures,dim,nlp_input_shape2,max_fatures2,dim2):
    """Build (without compiling) a three-input network with a single linear output.

    Two token-sequence inputs each go through an embedding, two stacked 1-D
    convolutions (200 filters, kernel sizes 3 then 5, ReLU) and global max
    pooling. The pooled vectors are concatenated with the meta-feature input and
    passed through a head of BatchNorm -> Dense(linear) -> Dropout blocks.

    Parameters
    ----------
    nlp_input_shape : int
        Sequence length of the first text input ('nlp_input').
    meta_input_shape : int
        Number of columns of the meta-feature input ('meta_input').
    max_fatures : int
        Embedding input size (vocabulary size) for the first text input.
    dim : int
        Embedding output dimension for the first text input.
    nlp_input_shape2, max_fatures2, dim2 : int
        Same three settings for the second text input ('nlp_input2').

    Returns
    -------
    Model
        Keras model taking ``[nlp_input, meta_input, nlp_input2]`` and producing
        one value per sample.
    """
    def _conv_text_branch(token_ids, vocab_size, emb_dim, seq_len):
        # Embedding -> Conv1D(k=3) -> Conv1D(k=5) -> max over the sequence axis.
        embedded = Embedding(vocab_size, emb_dim, input_length = seq_len)(token_ids)
        conv_k3 = Conv1D(200, 3, padding='valid', activation='relu', strides=1)(embedded)
        conv_k5 = Conv1D(200, 5, padding='valid', activation='relu', strides=1)(conv_k3)
        return GlobalMaxPool1D()(conv_k5)

    nlp_input = Input(shape=(nlp_input_shape,), name='nlp_input')
    meta_input = Input(shape=(meta_input_shape,), name='meta_input')
    nlp_input2 = Input(shape=(nlp_input_shape2,), name='nlp_input2')

    first_text_vec = _conv_text_branch(nlp_input, max_fatures, dim, nlp_input_shape)
    second_text_vec = _conv_text_branch(nlp_input2, max_fatures2, dim2, nlp_input_shape2)

    # Head: the Dense layers are linear, so the only non-linearities in the
    # network are the ReLU convolutions above.
    hidden = concatenate([first_text_vec, meta_input, second_text_vec])
    for units, drop_rate in ((100, 0.2), (50, 0.1)):
        hidden = BatchNormalization()(hidden)
        hidden = Dense(units, activation='linear')(hidden)
        hidden = Dropout(drop_rate)(hidden)
    hidden = BatchNormalization()(hidden)
    prediction = Dense(1)(hidden)

    return Model(inputs=[nlp_input , meta_input, nlp_input2], outputs=[prediction])

In [None]:
# Columns fed to the network's meta-feature input: the names held in FeatureNames.
META_FEATURES_VARNAMES = FeatureNames

# Distinct fold identifiers found in the training data, in ascending order.
fold_list = sorted(train.FOLD_NUM.unique())
fold_list

In [None]:
# Cross-validated training.
# For every fold id in fold_list: train a fresh network on all other folds,
# checkpoint the best epoch (lowest validation loss) to an .h5 file, reload that
# checkpoint and predict the held-out fold. After the loop:
#   keras01_Models            -> checkpoint file names, one per fold, in fold order
#   training_cv_preds_keras01 -> all held-out rows with 'Price_Log_Pred_Keras' added
keras01_Models = []
batch_size = 64
fold_val_frames = []  # held-out frame of each fold; concatenated once after the loop

for fold_num in fold_list:
    print("Running for : ",fold_num)

    temp_train = train[train['FOLD_NUM'] != fold_num]
    # .copy(): a prediction column is added to this frame below, and writing to
    # a filtered slice of `train` is chained assignment.
    temp_val = train[train['FOLD_NUM'] == fold_num].copy()

    # Synopsis text -> integer sequences, padded/truncated to length 100.
    X_temp_train = tokenizer_syn.texts_to_sequences(temp_train['Synopsis2'].values)
    X_temp_train = pad_sequences(X_temp_train, maxlen=100, padding='post')

    X_temp_val = tokenizer_syn.texts_to_sequences(temp_val['Synopsis2'].values)
    X_temp_val = pad_sequences(X_temp_val, maxlen=100, padding='post')

    # Meta features as plain arrays.
    X_temp_train_meta = np.array(temp_train[META_FEATURES_VARNAMES])
    X_temp_val_meta = np.array(temp_val[META_FEATURES_VARNAMES])

    # Title text -> integer sequences, same padding as the synopsis.
    X_temp_train2 = tokenizer_tit.texts_to_sequences(temp_train['Title2'].values)
    X_temp_train2 = pad_sequences(X_temp_train2, maxlen=100, padding='post')

    X_temp_val2 = tokenizer_tit.texts_to_sequences(temp_val['Title2'].values)
    X_temp_val2 = pad_sequences(X_temp_val2, maxlen=100, padding='post')

    Y_temp_train = temp_train['Price_Log'].values
    Y_temp_val = temp_val['Price_Log'].values

    print("Y train/val Shapes : ",Y_temp_train.shape,Y_temp_val.shape)
    print("X train/val Shapes : ",X_temp_train.shape,X_temp_val.shape)

    model = nn_model(X_temp_train.shape[1],X_temp_train_meta.shape[1],max_fatures_syn,300,X_temp_train2.shape[1],max_fatures_tit,300)
    model.compile(loss = 'mean_squared_error', optimizer='adam',metrics = ['mean_squared_error'])
    print(model.summary())

    file_name = "20190930_Keras01_Concat_Model_Weights_Fold_"+str(fold_num)+'.h5'
    final_path = file_name
    print("Model Weights File Name : ",final_path)
    keras01_Models.append(final_path)

    es = EarlyStopping(mode='min',
                       verbose=1,
                       patience=10)
    checkpointer = ModelCheckpoint(filepath=final_path,
                                   mode='min',
                                   verbose=1,
                                   save_best_only=True)
    reduce_lr = ReduceLROnPlateau(factor=0.2,
                                  patience=3,
                                  min_lr=0.0000001,
                                  verbose=1)
    # reduce_lr was previously constructed but left out of the callback list,
    # so the learning-rate schedule never ran; it is now passed to fit().
    history = model.fit([X_temp_train,X_temp_train_meta,X_temp_train2], Y_temp_train,
                        epochs = 200,
                        batch_size = batch_size,
                        verbose = 1,
                        validation_data = ([X_temp_val,X_temp_val_meta,X_temp_val2], Y_temp_val),
                        callbacks = [es,checkpointer,reduce_lr])

    print("Loading Model for Prediction")

    # Predict with the best checkpoint rather than the last-epoch weights.
    loaded_model = load_model(final_path)
    Y_temp_val_pred = loaded_model.predict([X_temp_val,X_temp_val_meta,X_temp_val2])

    # The model has a single output unit, so predict() returns shape (n, 1);
    # flatten it to a 1-D column.
    temp_val['Price_Log_Pred_Keras'] = Y_temp_val_pred.ravel()

    print('Completed for Fold - ',fold_num)
    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_Keras'])))

    fold_val_frames.append(temp_val)

# One concat after the loop. The earlier `if fold_num == 1` seeding assumed the
# first fold id is exactly 1; with any other numbering the first concat hit an
# undefined name (or silently reused a frame left over from a previous run).
training_cv_preds_keras01 = pd.concat(fold_val_frames, ignore_index=True)

In [None]:
# Checkpoint files written during training, then the pooled out-of-fold error:
# square root of the MSE between the Price_Log target and the Keras predictions.
print(keras01_Models)
cv_mse = mean_squared_error(
    training_cv_preds_keras01['Price_Log'],
    training_cv_preds_keras01['Price_Log_Pred_Keras'],
)
print("Keras 01 CV RMSLE = ",sqrt(cv_mse))

In [None]:
# Persist the out-of-fold predictions (all training columns plus
# 'Price_Log_Pred_Keras') without the row index.
cv_preds_csv = "20190930_Keras01_TRN_CV_DS.csv"
training_cv_preds_keras01.to_csv(cv_preds_csv, index=False)

In [None]:
%%time
# Encode the test rows the same way as the training folds: tokenised text padded
# to length 100 (post-padding) plus the meta-feature columns.
X_test = pad_sequences(tokenizer_syn.texts_to_sequences(test['Synopsis2'].values),
                       maxlen=100, padding='post')
X_test_meta = np.array(test[META_FEATURES_VARNAMES])
X_test2 = pad_sequences(tokenizer_tit.texts_to_sequences(test['Title2'].values),
                        maxlen=100, padding='post')
test_inputs = [X_test, X_test_meta, X_test2]

# Running sum of the per-fold predictions, one row per test sample.
test_preds = np.zeros((len(test), 1))

for fname in keras01_Models:
    print("Running for : ",fname)
    loaded_model = load_model(fname)
    Y_test = loaded_model.predict(test_inputs)
    test_preds += Y_test

# Equal-weight average over all fold models.
test_preds /= len(keras01_Models)

test['Price_Log_Pred_Keras'] = test_preds
test.to_csv("20190930_Keras01_TST_CV_DS.csv", index=False)

In [None]:
# Load the submission template. read_excel has no `encoding` parameter: on
# pandas >= 1.0 passing one raises TypeError, and on older versions it had no
# effect on how the workbook was read, so it is dropped.
submission = pd.read_excel('Sample_Submission.xlsx')

In [None]:
# Undo the log10(Price + 1) target transform to get predictions on the price scale,
# then show summary statistics as a quick sanity check.
test_preds2 = np.power(10, test_preds) - 1
pd.DataFrame(test_preds2).describe()

In [None]:
# Fill the template's Price column with the back-transformed predictions and
# write the submission workbook without the row index.
# NOTE(review): relies on `test` and the template having the same row order -- confirm.
submission_xlsx = '20190930_Keras01_DS.xlsx'
submission['Price'] = test_preds2
submission.to_excel(submission_xlsx, index=False)