## LSTM-Conv1D Experiment

#### Import Libraries

In [None]:
# Reload modules
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
#supress warnings
import warnings
warnings.filterwarnings('ignore')

#numpy and pandas for data manipulation
import pandas as pd
import numpy as np
from numpy import median
from scipy.stats import norm
import re
import math

#matplotlib and seaborn for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
sns.set(style='darkgrid')

import plotly
from plotly.offline import iplot
import plotly.graph_objects as go

#file system management
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn import preprocessing
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import GridSearchCV
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor

import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from lightgbm import LGBMRegressor
import joblib

import keras
import tensorflow as tf
from keras import optimizers, callbacks
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import Model
from keras.layers import Input, Dropout, Dense

from keras.layers import BatchNormalization, Dense, Dropout, Input, Embedding, LSTM, Flatten
from keras.layers import Conv1D
from keras.models import Model, Sequential
from keras.layers.merge import concatenate
from keras.preprocessing.sequence import pad_sequences
import pickle

seed = 42

#### Read Data

In [None]:
# Training data
df = pd.read_table('/content/drive/My Drive/data/train.tsv')

print('Training data shape: ', df.shape)

Training data shape:  (1482535, 8)


#### Processing Data

In [None]:
#stopwords without no, not, etc
STOPWORDS = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"]

In [None]:
def handle_missing_values(input_data):
    """
    Fills the nan/missing values with 'missing' for text columns
    """
    input_data.fillna({'name': 'missing', 'item_description': 'missing', 'category_name': 'other'}, inplace=True)
    
    return input_data

In [None]:
# https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(sentence):
    """
    Remove emojis from the string
    """
    pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"
                           u"\U0001F300-\U0001F5FF"
                           u"\U0001F680-\U0001F6FF"
                           u"\U0001F1E0-\U0001F1FF"
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    
    return pattern.sub(r'', sentence)

In [None]:
def remove_punctuation(sentence):
    """
    Remove all puntuations from the string
    """
    import string
    regular_punct = list(string.punctuation)
    
    for punc in regular_punct:
        if punc in sentence:
            sentence = sentence.replace(punc, ' ')

    return sentence.strip()

In [None]:
# https://www.appliedaicourse.com/
def decontracted(phrase):
    """
    Expand and create common english contractions in the text
    """
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    
    return phrase

In [None]:
# https://www.appliedaicourse.com/
def process_text(input_data, cols):
    """
    Take the text columns and process the data. Expand contractions, use regex to remove symbols/numbers, remove emojis, punctuations
    and stopwords and convert text to lowercase
    """
    for col in cols:
        
        processed_data = []
        
        for sent in input_data[col].values:
            
            sent = decontracted(sent)
            sent = sent.replace('\\r', ' ')
            sent = sent.replace('\\"', ' ')
            sent = sent.replace('\\n', ' ')
            sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
            sent = remove_emoji(sent)
            sent = remove_punctuation(sent)
            sent = ' '.join(e for e in sent.split() if e not in STOPWORDS)
            processed_data.append(sent.lower().strip())
            
        input_data[col] = processed_data
        
    return input_data

In [None]:
def process_category(input_data):
    """
    Split the category_name into 3 parts as category_0, category_1 and category_2
    """
    for i in range(3):
        
        def get_categories(ele):
            
            if type(ele) != str:
                return np.nan
        
            cat = ele.split('/')
            
            if i >= len(cat):
                return np.nan
            else:
                return cat[i]

        col_name = 'category_' + str(i)
        
        input_data[col_name] = input_data['category_name'].apply(get_categories)
        
        input_data.fillna({'category_name': 'Other'}, inplace = True)
    
    return input_data

In [None]:
#nlp features
def get_text_features(input_data):
    """
    NLP features derived from the text columns
    """
    input_data['has_brand_name'] = (input_data['brand_name'].isnull()).astype(np.int8) #if brand_name is present, 1 else 0
    
    input_data['has_price'] = np.where(input_data['item_description'].str.contains(' rm ', na = False), 1, 0) #if item_description has [rm] which is price string removed, 1 else 0

    input_data['reversed_item_condition_id'] = 5 - input_data['item_condition_id']

    luxury_brands = ["MCM", "MCM Worldwide", "Louis Vuitton", "Burberry", "Burberry London", "Burberry Brit", "HERMES", "Tieks",
                     "Rolex", "Apple", "Gucci", "Valentino", "Valentino Garavani", "RED Valentino", "Cartier", "Christian Louboutin",
                     "Yves Saint Laurent", "Saint Laurent", "YSL Yves Saint Laurent", "Georgio Armani", "Armani Collezioni", "Emporio Armani"]
    
    input_data['is_luxurious'] = (input_data['brand_name'].isin(luxury_brands)).astype(np.int8)

    expensive_brands = ["Michael Kors", "Louis Vuitton", "Lululemon", "LuLaRoe", "Kendra Scott", "Tory Burch", "Apple", "Kate Spade",
                  "UGG Australia", "Coach", "Gucci", "Rae Dunn", "Tiffany & Co.", "Rock Revival", "Adidas", "Beats", "Burberry",
                  "Christian Louboutin", "David Yurman", "Ray-Ban", "Chanel"]

    input_data['is_expensive'] = (input_data['brand_name'].isin(expensive_brands)).astype(np.int8)

    cheap_brands = ["FOREVER 21", "Old Navy", "Carter's", "Elmers", "NYX", "Maybelline", "Disney", "American Eagle", "PopSockets", "Wet n Wild", "Hollister", "Pokemon", "Hot Topic", "Konami",
                      "Charlotte Russe", "H&M", "e.l.f.", "Bath & Body Works", "Gap"]

    input_data['is_cheap'] = (input_data['brand_name'].isin(cheap_brands)).astype(np.int8)

    input_data['len_name'] = input_data['name'].str.len()
    input_data['len_item_description'] = input_data['item_description'].str.len()
    input_data['len'] = input_data['len_name'] + input_data['len_item_description']

    input_data['token_count_name'] = input_data['name'].apply(lambda x: len(x.split(' ')))
    input_data['token_count_item_description'] = input_data['item_description'].apply(lambda x: len(x.split(' ')))
    input_data['token_count'] = input_data['token_count_name'] + input_data['token_count_item_description']
    input_data['token_count_ratio'] = input_data['token_count_name']/input_data['token_count_item_description']

    input_data["name_words"] = input_data["name"].str.count("(\s|^)[a-z]+(\s|$)")
    input_data["item_description_words"] = input_data["item_description"].str.count("(\s|^)[a-z]+(\s|$)")
    input_data["words"] = input_data["name_words"] + input_data["item_description_words"]

    input_data["name_numbers"] = input_data["name"].str.count("(\s|^)[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?(\s|$)")
    input_data["item_description_numbers"] = input_data["item_description"].str.count("(\s|^)[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?(\s|$)")
    input_data["numbers"] = input_data["name_numbers"] + input_data["item_description_numbers"]

    input_data["name_letters"] = input_data["name"].str.count("[a-zA-Z]")
    input_data["item_description_letters"] = input_data["item_description"].str.count("[a-zA-Z]")
    input_data["letters"] = input_data["name_letters"] + input_data["item_description_letters"]

    input_data["name_digits"] = input_data["name"].str.count("[0-9]")
    input_data["item_description_digits"] = input_data["item_description"].str.count("[0-9]")
    input_data["digits"] = input_data["name_digits"] + input_data["item_description_digits"]

    return input_data

In [None]:
def preprocess(input_data):
    """
    Process the data by handling missing values, process category_name, process text
    """
    input_data = input_data[(input_data['price'] >= 3) & (input_data['price'] <= 2000)]
    
    input_data['price'] = np.log1p(input_data['price'])

    input_data = handle_missing_values(input_data)
    
    #input_data = process_category(input_data)
    
    input_data = process_text(input_data, ['name', 'item_description', 'category_name'])

    return input_data

In [None]:
data = preprocess(df)

#data.fillna({'category_0': 'other', 'category_1': 'other', 'category_2': 'other'}, inplace = True)

In [None]:
#NLP features
data = get_text_features(data)

data.fillna({'brand_name': ' '}, inplace = True)

#concatenate text features
data['name'] = data['name'] + ' ' + data['brand_name'] + ' ' + data['category_name']
data['text'] = data['name'] + ' ' + data['item_description']

data = data.drop(columns = ['brand_name', 'item_description', 'category_name'], axis = 1)

#### Modeling

In [None]:
"""
Taking necessary features for modeling
"""
data = data[['price', 'name', 'shipping', 'item_condition_id', 'is_expensive', 'is_luxurious', 'text']]

In [None]:
"""
Split the dataset into train and test
"""
y = data['price']

X = data.drop('price', axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = seed)

In [None]:
def get_rmsle(y_true, y_pred):
    """
    Get RMSLE score by passing actual values and the predictions from models
    """
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

## Deep Learning

#### Name

In [None]:
#https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
from keras.preprocessing.text import Tokenizer

t = Tokenizer(num_words = 1000)
t.fit_on_texts(X_train['name'].tolist())

seq_tr = t.texts_to_sequences(X_train['name'])
seq_te = t.texts_to_sequences(X_test['name'])

wordIdx = t.word_index

max_length = 350
encoded_tr = pad_sequences(seq_tr, maxlen=max_length, padding='post')
encoded_te = pad_sequences(seq_te, maxlen=max_length, padding='post')

In [None]:
# Loading Embedding File
f = open('/content/drive/My Drive/data/glove_vectors', 'rb')
glove_words = pickle.load(f)

In [None]:
MAX_VOCAB_SIZE = 1000
num_words = min(MAX_VOCAB_SIZE, len(wordIdx) + 1)
embedding_matrix = np.zeros((num_words, 300))
for word, i in wordIdx.items():
    if i < MAX_VOCAB_SIZE:
        embedding_vector = glove_words.get(word)
        if embedding_vector is not None:
          # words not found in embedding index will be all zeros.
          embedding_matrix[i] = embedding_vector

In [None]:
MAX_SEQUENCE_LENGTH = 350
embedding_layer = Embedding(num_words, 300, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)

input_text = Input(shape=(MAX_SEQUENCE_LENGTH,))
x = embedding_layer(input_text)
x = LSTM(4, return_sequences=True)(x)
flatten_1 = Flatten()(x)

#### Text

In [None]:
#https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
from keras.preprocessing.text import Tokenizer

t = Tokenizer(num_words = 1000)
t.fit_on_texts(X_train['text'].tolist())

seq_tr = t.texts_to_sequences(X_train['text'])
seq_te = t.texts_to_sequences(X_test['text'])

wordIdx = t.word_index

max_length = 350
encoded_text_tr = pad_sequences(seq_tr, maxlen=max_length, padding='post')
encoded_text_te = pad_sequences(seq_te, maxlen=max_length, padding='post')

In [None]:
# Loading Embedding File
f = open('/content/drive/My Drive/data/glove_vectors', 'rb')
glove_words = pickle.load(f)

In [None]:
MAX_VOCAB_SIZE = 1000
num_words = min(MAX_VOCAB_SIZE, len(wordIdx) + 1)
embedding_matrix = np.zeros((num_words, 300))
for word, i in wordIdx.items():
    if i < MAX_VOCAB_SIZE:
        embedding_vector = glove_words.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all zeros.
            embedding_matrix[i] = embedding_vector

In [None]:
MAX_SEQUENCE_LENGTH = 350
embedding_layer = Embedding(num_words, 300, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)

input_text_tr = Input(shape=(MAX_SEQUENCE_LENGTH,))
x = embedding_layer(input_text)
x = LSTM(16, return_sequences=True)(x)
flatten_2 = Flatten()(x)

#### shipping, item_condition_id, is_expensive, is_luxurious

In [None]:
# Now we will prepare numerical features for our model
num_tr_ship = X_train['shipping'].values.reshape(-1, 1)
num_tr_cond = X_train['item_condition_id'].values.reshape(-1, 1)
num_tr_exp = X_train['is_expensive'].values.reshape(-1, 1)
num_tr_lux = X_train['is_luxurious'].values.reshape(-1, 1)

num_te_ship = X_test['shipping'].values.reshape(-1, 1)
num_te_cond = X_test['item_condition_id'].values.reshape(-1, 1)
num_te_exp = X_test['is_expensive'].values.reshape(-1, 1)
num_te_lux = X_test['is_luxurious'].values.reshape(-1, 1)

num_train = np.concatenate((num_tr_ship, num_tr_cond, num_tr_exp, num_tr_lux), axis=1)
num_test = np.concatenate((num_te_ship, num_te_cond, num_te_exp, num_te_lux), axis=1)

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
norm_train = scaler.fit_transform(num_train)
norm_test = scaler.transform(num_test)

In [None]:
expand_tr = np.expand_dims(norm_train, 2)
expand_te = np.expand_dims(norm_test, 2)

#### Model

In [None]:
"""
Early Stopping is to stop training when a monitored metric has stopped improving

monitor -  Quantity to be monitored
patience - Number of epochs with no improvement
"""
callbacks = [
    EarlyStopping(monitor = 'val_loss', patience = 3)
]

In [None]:
#https://keras.io/layers/convolutional/

inp_conv = Input(shape=(4, 1))

x1 = Conv1D(filters=128, kernel_size=2, activation='relu')(inp_conv)
x1 = Conv1D(filters=128, kernel_size=2, activation='relu')(x1)
x2 = Flatten()(x1)

In [None]:
x_concatenate = concatenate([flatten_1, flatten_2, x2])

x = Dense(32, activation="relu")(x_concatenate)

x = Dense(32, activation="relu")(x)

x = Dense(16, activation="relu")(x)

output = Dense(1)(x)

model = Model(inputs=[input_text, input_text_tr, inp_conv], outputs=[output])

In [None]:
train_data = [encoded_tr, encoded_text_tr, expand_tr]
test_data = [encoded_te, encoded_text_te, expand_te]

In [None]:
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 350)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 4, 1)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 350, 300)     300000      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 350, 300)     300000      input_1[0][0]                    
_______________________________________________________________________________________

In [None]:
model.compile(optimizer='Adam', loss = 'mean_squared_error')

In [None]:
model.fit(train_data, y_train, batch_size=64, epochs=20, validation_data=(test_data, y_test), callbacks=callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20


<tensorflow.python.keras.callbacks.History at 0x7f83c0225a20>

In [37]:
predictions = model.predict(test_data)

print(get_rmsle(np.expm1(y_test), np.expm1(predictions)))

0.5133484696910033


### Observation/Conclusions
- Test RMSLE with LSTM-Conv1D and Word2Vec word embeddings came out to be 0.513
- This model does not overfit on validation set

In [2]:
###

In [3]:
###