## Using an LSTM for text classification

In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
import re
from nltk.corpus import stopwords
import keras.preprocessing.text
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import keras
# from keras.datasets import cifar10
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D, SpatialDropout1D, Embedding, LSTM
from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping

In [2]:
# Kaggle kernel paths, kept for reference:
# train = pd.read_csv("/kaggle/input/uw-cs480-fall20/train.csv")
# test = pd.read_csv("/kaggle/input/uw-cs480-fall20/test.csv")

# Local copies of the two CSV files.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# Train rows first, then test rows, so the tokenizer and label encoding are
# fitted on both; later cells split the combined array back at len(train).
data = pd.concat([train, test])

# text processing

# Raw strings: the patterns contain regex escapes (\[ \] \|) that are invalid
# escape sequences in a normal string literal and trigger warnings on modern Python.
# Separator-like punctuation that clean_text turns into a single space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything that is not a digit, lower-case ASCII letter, space, '#', '+' or '_' is removed.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stopwords held in a set so clean_text can do fast membership tests.
# NOTE: needs the NLTK "stopwords" corpus to be available locally
# (e.g. fetched once with nltk.download('stopwords')).
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        Normalise one raw description string before tokenisation.

        text: a string

        return: the lower-cased string with REPLACE_BY_SPACE_RE matches turned
        into spaces, BAD_SYMBOLS_RE matches deleted, and STOPWORDS dropped;
        the remaining words are joined by single spaces
    """
    lowered = text.lower()
    # Separator punctuation becomes a space so adjacent words do not merge.
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    # Every other disallowed character is deleted outright.
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    # split() with no argument also collapses runs of whitespace.
    kept_words = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept_words)

# put gender/color etc info into 'text'
# The categorical attributes are prepended to the noisy description so the
# sequence model sees them as ordinary tokens.
data["text"] = (
    data['gender'] + " "
    + data["baseColour"] + " "
    + data["season"] + " "
    + data["usage"] + " "
    + data['noisyTextDescription']
)

data['TextDescription'] = data['text'].apply(clean_text)
# regex=True must be explicit: since pandas 2.0 the default is a literal
# replacement, which would silently leave every digit in place.
data['TextDescription'] = data['TextDescription'].str.replace(r'\d+', '', regex=True)
# data.head()

# Vocabulary cap: passed to the Tokenizer as num_words (keep only the most
# frequent words) and used as the input dimension of the Embedding layers.
MAX_NB_WORDS = 50000
# Length every token sequence is padded/truncated to by pad_sequences
# (i.e. the number of tokens kept per text description).
MAX_SEQUENCE_LENGTH = 20
# Size of each learned word-embedding vector (output dimension of Embedding).
EMBEDDING_DIM = 200

# Fit the vocabulary on the cleaned text of train + test combined.
# `filters` is a raw string: it contains the two characters backslash and ']',
# and "\]" is an invalid escape sequence in a normal string literal.
tokenizer = Tokenizer(
    num_words=MAX_NB_WORDS,
    filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
    lower=True,
)
tokenizer.fit_on_texts(data['TextDescription'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Convert each text to a list of word ids, then pad/truncate to a fixed length.
X = tokenizer.texts_to_sequences(data['TextDescription'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)


# One-hot encode the category labels (one column per category, sorted by name).
# dtype is pinned to uint8: pandas >= 2.0 would otherwise return booleans,
# whereas earlier versions returned uint8, and the network expects numeric targets.
Y = pd.get_dummies(data['category'], dtype='uint8').values
print('Shape of label tensor:', Y.shape)

Found 7200 unique tokens.
Shape of data tensor: (43255, 20)
Shape of label tensor: (43255, 27)


In [13]:
# Category names in the same column order as the one-hot label matrix `Y`,
# which is built from the same pd.get_dummies(data['category']) call.
pd.get_dummies(data['category']).columns

Index(['Accessories', 'Apparel Set', 'Bags', 'Belts', 'Bottomwear',
       'Cufflinks', 'Dress', 'Eyewear', 'Flip Flops', 'Fragrance',
       'Free Gifts', 'Headwear', 'Innerwear', 'Jewellery', 'Lips',
       'Loungewear and Nightwear', 'Makeup', 'Nails', 'Sandal', 'Saree',
       'Scarves', 'Shoes', 'Socks', 'Ties', 'Topwear', 'Wallets', 'Watches'],
      dtype='object')

In [9]:
import pickle

# Persist the fitted tokenizer so the same word -> id mapping can be reused later.
with open('tokenizer.pickle', 'wb') as fh:
    pickle.dump(tokenizer, fh, protocol=pickle.HIGHEST_PROTOCOL)

# To restore it (only unpickle files you created yourself):
# with open('tokenizer.pickle', 'rb') as fh:
#     tokenizer = pickle.load(fh)

In [3]:
N_FOLDS = 5


def _split_fold(frame, fold, fold_size, n_folds=N_FOLDS):
    """Return ``(train_part, val_part)`` of ``frame`` for the zero-based ``fold``.

    The validation part is the contiguous row range
    ``[fold * fold_size, (fold + 1) * fold_size)``; the last fold runs to the end
    of ``frame`` so it absorbs the remainder rows. The training part is every
    other row, in the original order.
    """
    start = fold * fold_size
    stop = len(frame) if fold == n_folds - 1 else start + fold_size
    val_part = frame.iloc[start:stop]
    # Drop empty edge slices so the first and last folds stay plain slices
    # instead of a concat with an empty frame.
    kept = [part for part in (frame.iloc[:start], frame.iloc[stop:]) if len(part)]
    if not kept:
        train_part = frame.iloc[:0]
    elif len(kept) == 1:
        train_part = kept[0]
    else:
        train_part = pd.concat(kept)
    return train_part, val_part


# `train` must still be the training DataFrame here (not the training function
# defined in a later cell): the first len(train) rows of X / Y are the labelled
# rows, the rest belong to the test set.
n_train = len(train)
X_train_total, X_test = pd.DataFrame(X[:n_train]), pd.DataFrame(X[n_train:])
Y_train_total, Y_test = pd.DataFrame(Y[:n_train]), pd.DataFrame(Y[n_train:])

val_size = n_train // N_FOLDS

# Fold k validates on the k-th contiguous block of rows, so concatenating the
# five validation predictions in fold order restores the original row order.
(
    (X_train1, X_val1),
    (X_train2, X_val2),
    (X_train3, X_val3),
    (X_train4, X_val4),
    (X_train5, X_val5),
) = [_split_fold(X_train_total, fold, val_size) for fold in range(N_FOLDS)]

(
    (Y_train1, Y_val1),
    (Y_train2, Y_val2),
    (Y_train3, Y_val3),
    (Y_train4, Y_val4),
    (Y_train5, Y_val5),
) = [_split_fold(Y_train_total, fold, val_size) for fold in range(N_FOLDS)]

print(Y_train5.shape,Y_val5.shape)
# print(X_test.shape,Y_test.shape)

(17300, 27) (4327, 27)


In [13]:
def train(X_train, Y_train, X_val, Y_val, k, epochs=3, batch_size=64):
    """
    Build, compile and fit one Embedding -> SpatialDropout1D -> LSTM -> softmax
    classifier for cross-validation fold ``k``.

    NOTE: defining this function rebinds the notebook-level name ``train``, which
    the data-loading and fold-splitting cells use for the training DataFrame
    (``len(train)``). Run those cells before this one, and reload the DataFrame
    before re-running them.

    X_train, Y_train: padded token-id sequences and one-hot labels to fit on
    X_val, Y_val: held-out fold used for validation and checkpoint selection
    k: fold identifier; only used in the checkpoint file name
    epochs: number of passes over the training data (default 3)
    batch_size: mini-batch size (default 64)

    return: the fitted model, holding the weights of the LAST epoch. The weights
    with the best ``val_accuracy`` are written to ``best_model_<k>.h5``.
    """
    # Take the dimensions from the arguments so the function does not depend on
    # the notebook-global X or on a hard-coded class count.
    seq_len = X_train.shape[1]
    n_classes = Y_train.shape[1]

    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=seq_len))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(200, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # create a callback that will save the best model while training
    save_best_model = ModelCheckpoint("best_model_" + str(k) + '.h5', monitor='val_accuracy', mode='max', save_best_only=True, verbose=1)

    model.fit(X_train, Y_train, batch_size=batch_size,
              epochs=epochs, validation_data=(X_val, Y_val), shuffle=True, callbacks=[save_best_model])

    return model

In [12]:
# Fold 1: validate on the first val_size rows, fit on the remainder.
model_1 = train(
    X_train=X_train1,
    Y_train=Y_train1,
    X_val=X_val1,
    Y_val=Y_val1,
    k=1,
)
# Reload the checkpoint with the best val_accuracy instead of using the
# last-epoch weights held by model_1.
saved_model1 = load_model('best_model_1.h5')

scores_cnn = saved_model1.evaluate(x=X_val1, y=Y_val1, verbose=1)
# Out-of-fold class probabilities, consumed by the stacking cell.
X_pred_1 = saved_model1.predict(X_val1)

Epoch 1/10
Epoch 00001: val_accuracy improved from -inf to 0.87630, saving model to best_model_1.h5
Epoch 2/10
Epoch 00002: val_accuracy improved from 0.87630 to 0.89757, saving model to best_model_1.h5
Epoch 3/10
Epoch 00003: val_accuracy improved from 0.89757 to 0.90058, saving model to best_model_1.h5
Epoch 4/10
Epoch 00004: val_accuracy did not improve from 0.90058
Epoch 5/10
Epoch 00005: val_accuracy did not improve from 0.90058
Epoch 6/10
Epoch 00006: val_accuracy did not improve from 0.90058
Epoch 7/10
Epoch 00007: val_accuracy did not improve from 0.90058
Epoch 8/10
Epoch 00008: val_accuracy did not improve from 0.90058
Epoch 9/10
Epoch 00009: val_accuracy did not improve from 0.90058
Epoch 10/10
Epoch 00010: val_accuracy did not improve from 0.90058


In [14]:
# Fold 2: validate on the second block of val_size rows, fit on the remainder.
model_2 = train(
    X_train=X_train2,
    Y_train=Y_train2,
    X_val=X_val2,
    Y_val=Y_val2,
    k=2,
)
# Reload the checkpoint with the best val_accuracy instead of using the
# last-epoch weights held by model_2.
saved_model2 = load_model('best_model_2.h5')
scores_cnn = saved_model2.evaluate(x=X_val2, y=Y_val2, verbose=2)
# Out-of-fold class probabilities, consumed by the stacking cell.
X_pred_2 = saved_model2.predict(X_val2)

Epoch 1/3
Epoch 00001: val_accuracy improved from -inf to 0.86844, saving model to best_model_2.h5
Epoch 2/3
Epoch 00002: val_accuracy improved from 0.86844 to 0.90150, saving model to best_model_2.h5
Epoch 3/3
Epoch 00003: val_accuracy improved from 0.90150 to 0.90936, saving model to best_model_2.h5
136/136 - 1s - loss: 0.3638 - accuracy: 0.9094


In [15]:
# Fold 3: validate on the third block of val_size rows, fit on the remainder.
model_3 = train(X_train3, Y_train3, X_val3, Y_val3, 3)
# Reload the checkpoint with the best val_accuracy instead of using the
# last-epoch weights held by model_3.
saved_model3 = load_model('best_model_3.h5')
# `verbose` is a display mode (0, 1 or 2), not the fold number; 2 prints one
# summary line with the loss and accuracy.
scores_cnn = saved_model3.evaluate(X_val3, Y_val3, verbose=2)
# Out-of-fold class probabilities, consumed by the stacking cell.
X_pred_3 = saved_model3.predict(X_val3)

Epoch 1/3
Epoch 00001: val_accuracy improved from -inf to 0.87052, saving model to best_model_3.h5
Epoch 2/3
Epoch 00002: val_accuracy improved from 0.87052 to 0.88994, saving model to best_model_3.h5
Epoch 3/3
Epoch 00003: val_accuracy improved from 0.88994 to 0.90405, saving model to best_model_3.h5


In [16]:
# Fold 4: validate on the fourth block of val_size rows, fit on the remainder.
model_4 = train(X_train4, Y_train4, X_val4, Y_val4, 4)
# Reload the checkpoint with the best val_accuracy instead of using the
# last-epoch weights held by model_4.
saved_model4 = load_model('best_model_4.h5')
# `verbose` is a display mode (0, 1 or 2), not the fold number; 2 prints one
# summary line with the loss and accuracy.
scores_cnn = saved_model4.evaluate(X_val4, Y_val4, verbose=2)
# Out-of-fold class probabilities, consumed by the stacking cell.
X_pred_4 = saved_model4.predict(X_val4)

Epoch 1/3
Epoch 00001: val_accuracy improved from -inf to 0.87168, saving model to best_model_4.h5
Epoch 2/3
Epoch 00002: val_accuracy improved from 0.87168 to 0.89642, saving model to best_model_4.h5
Epoch 3/3
Epoch 00003: val_accuracy improved from 0.89642 to 0.90382, saving model to best_model_4.h5


In [17]:
# Fold 5: validate on the last block of rows (including the remainder rows),
# fit on the first four blocks.
model_5 = train(X_train5, Y_train5, X_val5, Y_val5, 5)
# Reload the checkpoint with the best val_accuracy instead of using the
# last-epoch weights held by model_5.
saved_model5 = load_model('best_model_5.h5')
# `verbose` is a display mode (0, 1 or 2), not the fold number; 2 prints one
# summary line with the loss and accuracy.
scores_cnn = saved_model5.evaluate(X_val5, Y_val5, verbose=2)
# Out-of-fold class probabilities, consumed by the stacking cell.
X_pred_5 = saved_model5.predict(X_val5)

Epoch 1/3
Epoch 00001: val_accuracy improved from -inf to 0.87266, saving model to best_model_5.h5
Epoch 2/3
Epoch 00002: val_accuracy improved from 0.87266 to 0.90270, saving model to best_model_5.h5
Epoch 3/3
Epoch 00003: val_accuracy improved from 0.90270 to 0.90294, saving model to best_model_5.h5


In [8]:
# First-stage LSTM fitted on ALL labelled rows (no hold-out); its test-set
# predictions become the first-stage features for the test data.
# NOTE(review): this model uses LSTM(100), 5 epochs and batch size 128, whereas
# the per-fold train() function uses LSTM(200), 3 epochs and batch size 64 --
# confirm the mismatch is intentional.
model_lstm = Sequential([
    Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(27, activation='softmax'),
])
model_lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

epochs = 5
batch_size = 128

# There is no validation data in this fit, so the checkpoint tracks the
# training accuracy.
save_best_model = ModelCheckpoint(
    "model/best_model_lstm.h5",
    monitor='accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

history = model_lstm.fit(
    X_train_total,
    Y_train_total,
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    callbacks=[save_best_model],
)

Epoch 1/5

Epoch 00001: accuracy improved from -inf to 0.58783, saving model to model\best_model_lstm.h5
Epoch 2/5

Epoch 00002: accuracy improved from 0.58783 to 0.89823, saving model to model\best_model_lstm.h5
Epoch 3/5

Epoch 00003: accuracy improved from 0.89823 to 0.95140, saving model to model\best_model_lstm.h5
Epoch 4/5

Epoch 00004: accuracy improved from 0.95140 to 0.97272, saving model to model\best_model_lstm.h5
Epoch 5/5

Epoch 00005: accuracy improved from 0.97272 to 0.98511, saving model to model\best_model_lstm.h5


In [25]:
# First-stage predictions for the test rows, taken from the in-memory
# model_lstm (last-epoch weights) rather than the saved checkpoint.
test_pred = model_lstm.predict(X_test)

# One feature column per class probability: lstm_0 ... lstm_26.
# `c` is reused by the stacking cell for the train-side features.
c = ["lstm_" + str(i) for i in range(27)]

test_pred_df = pd.DataFrame(test_pred)
test_pred_df.columns = c
test_pred_df.to_csv("test_lstm_X.csv", index=False)

In [26]:
# stacking
# Each X_pred_k holds predictions for the k-th contiguous validation block, so
# stacking them in fold order yields one out-of-fold row per training row, in
# the original row order.
train_pred = np.concatenate(
    [X_pred_1, X_pred_2, X_pred_3, X_pred_4, X_pred_5],
    axis=0,
)
train_pred_df = pd.DataFrame(train_pred, columns=c)
train_pred_df.to_csv("train_lstm_X.csv", index=False)