# Code for evaluating the best Deep Learning Models for Text Classification

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from IPython.display import display
plt.style.use('ggplot')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
from keras.models import Sequential
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasClassifier
import os
# Show what is available under the kernel's relative input directory.
input_entries = os.listdir("../input")
print(input_entries)

In [None]:
import pandas as pd
# Load the token table used by every experiment below. Later cells read its
# `tokens`, `state` and `project_id` columns.
project_text_tokens_forLSTMBagging = pd.read_csv(
    filepath_or_buffer="/kaggle/input/bilstmbagging/project-text-tokens-for-LSTM-Bagging.csv",
)

In [None]:
# Turn each `tokens` cell from its string form back into a Python object.
# NOTE(review): presumably every cell is the repr of a list of tokens -- confirm
# against the CSV. ast.literal_eval only accepts Python literals, so (unlike
# eval) it cannot execute code that might be embedded in the data file.
import ast

project_text_tokens_forLSTMBagging['tokens'] = (
    project_text_tokens_forLSTMBagging['tokens'].apply(ast.literal_eval)
)
# Last expression of the cell: displays the column dtypes in the notebook.
project_text_tokens_forLSTMBagging.dtypes

In [None]:
# Plain Python list of the per-row token entries; used to fit every Tokenizer below.
token_list = list(project_text_tokens_forLSTMBagging['tokens'])

In [None]:
from keras import Input, Model
from keras.layers import Embedding, GlobalAveragePooling1D, Dense
class FastText(object):
    """Builder for a FastText-style classifier.

    The network is: embedding -> global average pooling over time -> dense
    output layer.

    Args:
        maxlen: Length of every input sequence.
        max_features: Input dimension of the embedding layer.
        embedding_dims: Output dimension of the embedding layer.
        class_num: Number of units in the output layer.
        last_activation: Activation of the output layer.
    """

    def __init__(self, maxlen, max_features, embedding_dims,
                 class_num=1,
                 last_activation='sigmoid'):
        self.maxlen, self.max_features = maxlen, max_features
        self.embedding_dims = embedding_dims
        self.class_num, self.last_activation = class_num, last_activation

    def get_model(self):
        """Assemble and return the (uncompiled) Keras functional model."""
        token_ids = Input((self.maxlen,))

        embedded = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen)(token_ids)
        pooled = GlobalAveragePooling1D()(embedded)

        scores = Dense(self.class_num, activation=self.last_activation)(pooled)
        return Model(inputs=token_ids, outputs=scores)

In [None]:
import numpy as np
from keras.callbacks import EarlyStopping
from keras.preprocessing import sequence
def create_ngram_set(input_list, ngram_value=2):
    """
    Return the set of distinct n-grams (as tuples) found in a list of integers.

    The n-grams are the windows of `ngram_value` consecutive items; duplicates
    collapse because the result is a set.
    # >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
    {(4, 9), (4, 1), (1, 4), (9, 4)}
    # >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
    {(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)}
    """
    # One copy of the list per offset 0..n-1; zipping them walks all windows
    # and stops at the shortest copy, i.e. at the last complete n-gram.
    shifted_views = [input_list[offset:] for offset in range(ngram_value)]
    windows = zip(*shifted_views)
    return set(windows)
def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment the input list of list (sequences) by appending n-grams values.

    For every sequence, each n-gram (n = 2 .. ngram_range) of the ORIGINAL
    tokens that is a key of `token_indice` contributes its mapped id, appended
    after the original tokens. Ids appended for shorter n-grams are never
    re-scanned, so they cannot take part in a longer n-gram.

    Args:
        sequences: iterable of token-id sequences; the inputs are not modified.
        token_indice: dict mapping an n-gram tuple to its integer id.
        ngram_range: largest n-gram order to look up (values below 2 add nothing).

    Returns:
        A new list with one augmented list per input sequence.

    Example: adding bi-gram
    # >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    # >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    # >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
    Example: adding tri-gram
    # >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    # >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    # >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        # Scan a frozen copy of the original tokens; appending goes to a
        # separate list so the scan window never covers synthetic n-gram ids.
        tokens = list(input_list)
        new_list = list(tokens)
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(tokens) - ngram_value + 1):
                ngram = tuple(tokens[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences
# FastText experiment: tokenise, optionally add n-gram features, pad, train
# and evaluate. Leaves `model`, `history` and `result` for later cells.
# Set parameters:
# ngram_range = 2 will add bi-grams features
ngram_range = 1
max_features = 40000
maxlen = 240
batch_size = 32
embedding_dims = 60
epochs = 5
# NOTE(review): the tokenizer is fitted on token_list, which is built from the
# whole table, i.e. it also sees the rows that end up in test_set.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(token_list)
# Seeded 80/20 split; test_set is every row that was not sampled into train_set.
proj_copy = project_text_tokens_forLSTMBagging.copy()
train_set = proj_copy.sample(frac=0.8, random_state=2)
test_set = proj_copy.drop(train_set.index)
X_train = tokenizer.texts_to_sequences(train_set.tokens)
X_test = tokenizer.texts_to_sequences(test_set.tokens)
y_train = train_set.state
y_test = test_set.state
print('Loading data...')
if ngram_range > 1:
    print('Adding {}-gram features'.format(ngram_range))
    # Create set of unique n-gram from the training set.
    ngram_set = set()
    for input_list in X_train:
        for i in range(2, ngram_range + 1):
            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
            ngram_set.update(set_of_ngram)

    # Dictionary mapping n-gram token to a unique integer.
    # Integer values are greater than max_features in order
    # to avoid collision with existing features.
    start_index = max_features + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    # max_features is the highest integer that could be found in the dataset.
    max_features = np.max(list(indice_token.keys())) + 1

    # Augmenting x_train and x_test with n-grams features
    X_train = add_ngram(X_train, token_indice, ngram_range)
    X_test = add_ngram(X_test, token_indice, ngram_range)
    print('Average train sequence length: {}'.format(np.mean(list(map(len, X_train)), dtype=int)))
    print('Average test sequence length: {}'.format(np.mean(list(map(len, X_test)), dtype=int)))

print('Pad sequences (samples x time)...')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('x_train shape:', X_train.shape)
print('x_test shape:', X_test.shape)

print('Build model...')
model = FastText(maxlen, max_features, embedding_dims,1,'sigmoid').get_model()
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
model.summary()
print('Train...')
# Monitor 'val_accuracy', the same key the other experiments in this file use
# for the 'accuracy' metric. The previous 'val_acc' does not match that key, so
# early stopping had nothing to watch.
# NOTE(review): validation_data is the test split, so early stopping is driven
# by test performance.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, mode='max')
history = model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=(X_test, y_test),
          verbose=1)

print('Test...')
# Raw model outputs for the test split; exported to CSV in a later cell.
result = model.predict(X_test)
print(result)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:
from keras import Input, Model
from keras.layers import Embedding, Dense, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout

class TextCNN(object):
    """Builder for a multi-kernel convolutional text classifier.

    The network is: embedding -> three parallel Conv1D branches (kernel sizes
    3, 4 and 5, 128 filters each) with global max pooling -> concatenation ->
    dense output layer.

    Args:
        maxlen: Length of every input sequence.
        max_features: Input dimension of the embedding layer.
        embedding_dims: Output dimension of the embedding layer.
        class_num: Number of units in the output layer.
        last_activation: Activation of the output layer.
    """

    def __init__(self, maxlen, max_features, embedding_dims,
                 class_num=1,
                 last_activation='sigmoid'):
        self.maxlen, self.max_features = maxlen, max_features
        self.embedding_dims = embedding_dims
        self.class_num, self.last_activation = class_num, last_activation

    def get_model(self):
        """Assemble and return the (uncompiled) Keras functional model."""
        token_ids = Input((self.maxlen,))

        # Embedding part can try multichannel as same as origin paper
        embedded = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen)(token_ids)

        # One conv + pooling branch per kernel width, all fed by the same embedding.
        branches = []
        for width in (3, 4, 5):
            feature_map = Conv1D(128, width, activation='relu')(embedded)
            branches.append(GlobalMaxPooling1D()(feature_map))
        merged = Concatenate()(branches)

        scores = Dense(self.class_num, activation=self.last_activation)(merged)
        return Model(inputs=token_ids, outputs=scores)

In [None]:
from keras.callbacks import EarlyStopping
from keras.preprocessing import sequence

# TextCNN experiment: tokenise, pad, train and evaluate.
# Leaves `model2`, `history` and `result2` for later cells.

# Hyper-parameters for this run.
max_features, maxlen = 32000, 240
batch_size, embedding_dims, epochs = 20, 360, 2

# Vocabulary is fitted on token_list, i.e. on every row of the table.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(token_list)

# Seeded 80/20 split; the test set is whatever was not sampled for training.
proj_copy = project_text_tokens_forLSTMBagging.copy()
train_set = proj_copy.sample(frac=0.8, random_state=2)
test_set = proj_copy.drop(train_set.index)
x_train = tokenizer.texts_to_sequences(train_set.tokens)
x_test = tokenizer.texts_to_sequences(test_set.tokens)
y_train, y_test = train_set.state, test_set.state

print('Pad sequences (samples x time)...')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model2 = TextCNN(maxlen, max_features, embedding_dims).get_model()
model2.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
model2.summary()

print('Train...')
# The test split doubles as validation data for early stopping.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, mode='max')
history = model2.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[early_stopping],
    validation_data=(x_test, y_test),
)

print('Test...')
# Raw model outputs for the test split; exported to CSV in a later cell.
result2 = model2.predict(x_test)
print(result2)
loss, accuracy = model2.evaluate(x_train, y_train, verbose=True)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model2.evaluate(x_test, y_test, verbose=True)
print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:
from keras import Model
from keras.layers import Embedding, Dense, LSTM

class TextRNN(Model):
    """Subclassed Keras model: embedding -> LSTM(128) -> dense output layer.

    Args:
        maxlen: Required length of axis 1 of the input.
        max_features: Input dimension of the embedding layer.
        embedding_dims: Output dimension of the embedding layer.
        class_num: Number of units in the output layer.
        last_activation: Activation of the output layer.
    """

    def __init__(self,
                 maxlen,
                 max_features,
                 embedding_dims,
                 class_num=1,
                 last_activation='sigmoid'):
        super().__init__()
        self.maxlen, self.max_features = maxlen, max_features
        self.embedding_dims = embedding_dims
        self.class_num, self.last_activation = class_num, last_activation
        self.embedding = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen)
        self.rnn = LSTM(128)  # LSTM or GRU
        self.classifier = Dense(self.class_num, activation=self.last_activation)

    def call(self, inputs):
        """Run the forward pass.

        Raises:
            ValueError: if `inputs` is not rank 2 or its axis 1 differs from
                `maxlen`.
        """
        shape = inputs.get_shape()
        rank = len(shape)
        if rank != 2:
            raise ValueError('The rank of inputs of TextRNN must be 2, but now is %d' % rank)
        if shape[1] != self.maxlen:
            raise ValueError('The maxlen of inputs of TextRNN must be %d, but now is %d' % (self.maxlen, shape[1]))
        embedded = self.embedding(inputs)
        encoded = self.rnn(embedded)
        return self.classifier(encoded)

In [None]:
import tensorflow
from keras.callbacks import EarlyStopping
from keras.preprocessing import sequence

# TextRNN experiment: tokenise, pad, train and evaluate.
# Leaves `model3` and `result3` for later cells.
max_features = 30000
maxlen = 200
batch_size = 64
embedding_dims = 60
epochs = 2
# NOTE(review): the tokenizer is fitted on token_list, which is built from the
# whole table, i.e. it also sees the rows that end up in test_set.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(token_list)
# Seeded 80/20 split; test_set is every row that was not sampled into train_set.
proj_copy = project_text_tokens_forLSTMBagging.copy()
train_set = proj_copy.sample(frac=0.8, random_state=2)
test_set = proj_copy.drop(train_set.index)
x_train = tokenizer.texts_to_sequences(train_set.tokens)
x_test = tokenizer.texts_to_sequences(test_set.tokens)
y_train = train_set.state
y_test = test_set.state

print('Pad sequences (samples x time)...')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_test shape:', y_test.shape)
print('Build model...')
model3 = TextRNN(maxlen, max_features, embedding_dims)
model3.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
print('Train...')
# NOTE(review): validation_data is the test split, so early stopping is driven
# by test performance.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, mode='max')
model3.fit(x_train, y_train.values,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=(x_test,y_test.values))
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line). It is called after fit().
model3.summary()
print('Test...')
# Raw model outputs for the test split; exported to CSV in a later cell.
result3 = model3.predict(x_test)
print(result3)
loss, accuracy = model3.evaluate(x_train, y_train.values, verbose=True)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model3.evaluate(x_test, y_test.values, verbose=True)
print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional

class TextBiRNN(Model):
    """Subclassed Keras model: embedding -> Bidirectional(LSTM(128)) -> dense output.

    Args:
        maxlen: Required length of axis 1 of the input.
        max_features: Input dimension of the embedding layer.
        embedding_dims: Output dimension of the embedding layer.
        class_num: Number of units in the output layer.
        last_activation: Activation of the output layer.
    """

    def __init__(self,
                 maxlen,
                 max_features,
                 embedding_dims,
                 class_num=1,
                 last_activation='sigmoid'):
        super().__init__()
        self.maxlen, self.max_features = maxlen, max_features
        self.embedding_dims = embedding_dims
        self.class_num, self.last_activation = class_num, last_activation
        self.embedding = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen)
        self.bi_rnn = Bidirectional(LSTM(128))  # LSTM or GRU
        self.classifier = Dense(self.class_num, activation=self.last_activation)

    def call(self, inputs):
        """Run the forward pass.

        Raises:
            ValueError: if `inputs` is not rank 2 or its axis 1 differs from
                `maxlen`.
        """
        shape = inputs.get_shape()
        rank = len(shape)
        if rank != 2:
            raise ValueError('The rank of inputs of TextBiRNN must be 2, but now is %d' % rank)
        if shape[1] != self.maxlen:
            raise ValueError('The maxlen of inputs of TextBiRNN must be %d, but now is %d' % (self.maxlen, shape[1]))
        embedded = self.embedding(inputs)
        encoded = self.bi_rnn(embedded)
        return self.classifier(encoded)

In [None]:
import tensorflow
from keras.callbacks import EarlyStopping
from keras.preprocessing import sequence

# TextBiRNN experiment: tokenise, pad, train and evaluate.
# Leaves `model4` and `result4` for later cells.
max_features = 40000
maxlen = 220
batch_size = 40
embedding_dims = 60
epochs = 2
# NOTE(review): the tokenizer is fitted on token_list, which is built from the
# whole table, i.e. it also sees the rows that end up in test_set.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(token_list)
# Seeded 80/20 split; test_set is every row that was not sampled into train_set.
proj_copy = project_text_tokens_forLSTMBagging.copy()
train_set = proj_copy.sample(frac=0.8, random_state=2)
test_set = proj_copy.drop(train_set.index)
x_train = tokenizer.texts_to_sequences(train_set.tokens)
x_test = tokenizer.texts_to_sequences(test_set.tokens)
y_train = train_set.state
y_test = test_set.state

print('Pad sequences (samples x time)...')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_test shape:', y_test.shape)
print('Build model...')
model4 = TextBiRNN(maxlen, max_features, embedding_dims)
model4.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
print('Train...')
# NOTE(review): validation_data is the test split, so early stopping is driven
# by test performance.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, mode='max')
model4.fit(x_train, y_train.values,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=(x_test,y_test.values))
# Summarise the model trained in this cell (model4, not the earlier model3).
# summary() prints the table itself and returns None, so it is not wrapped in
# print().
model4.summary()
print('Test...')
# Raw model outputs for the test split; exported to CSV in a later cell.
result4 = model4.predict(x_test)
print(result4)
loss, accuracy = model4.evaluate(x_train, y_train.values, verbose=True)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model4.evaluate(x_test, y_test.values, verbose=True)
print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:
# summary() prints the layer table itself and returns None; wrapping it in
# print() would only add a stray "None" line to the output.
model4.summary()

In [None]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN, Lambda, Concatenate, Conv1D, GlobalMaxPooling1D


class RCNN(Model):
    """Subclassed Keras model combining recurrent context with a convolution.

    Takes three token-id inputs (current, left context, right context) that
    share one embedding layer. The left context goes through a forward
    SimpleRNN; the right context goes through a backward SimpleRNN whose
    output is then reversed along axis 1. Both are concatenated with the
    current embedding on the last axis, followed by a kernel-size-1 Conv1D
    (tanh), global max pooling and a dense output layer.

    Args:
        maxlen: Required length of axis 1 of each of the three inputs.
        max_features: Input dimension of the embedding layer.
        embedding_dims: Output dimension of the embedding layer.
        class_num: Number of units in the output layer.
        last_activation: Activation of the output layer.
    """

    def __init__(self,
                 maxlen,
                 max_features,
                 embedding_dims,
                 class_num=1,
                 last_activation='sigmoid'):
        super().__init__()
        self.maxlen, self.max_features = maxlen, max_features
        self.embedding_dims = embedding_dims
        self.class_num, self.last_activation = class_num, last_activation
        self.embedding = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen)
        self.forward_rnn = SimpleRNN(128, return_sequences=True)
        self.backward_rnn = SimpleRNN(128, return_sequences=True, go_backwards=True)
        self.reverse = Lambda(lambda x: tf.reverse(x, axis=[1]))
        self.concatenate = Concatenate(axis=2)
        self.conv = Conv1D(64, kernel_size=1, activation='tanh')
        self.max_pooling = GlobalMaxPooling1D()
        self.classifier = Dense(self.class_num, activation=self.last_activation)

    def call(self, inputs):
        """Run the forward pass on `[current, left, right]`.

        Raises:
            ValueError: if `inputs` does not hold exactly three tensors, if any
                of them is not rank 2, or if any axis-1 length differs from
                `maxlen`.
        """
        n_inputs = len(inputs)
        if n_inputs != 3:
            raise ValueError('The length of inputs of RCNN must be 3, but now is %d' % n_inputs)
        input_current, input_left, input_right = inputs[0], inputs[1], inputs[2]
        triple = (input_current, input_left, input_right)

        ranks = tuple(len(t.get_shape()) for t in triple)
        if ranks != (2, 2, 2):
            raise ValueError('The rank of inputs of RCNN must be (2, 2, 2), but now is (%d, %d, %d)' % ranks)

        lengths = tuple(t.get_shape()[1] for t in triple)
        if any(length != self.maxlen for length in lengths):
            raise ValueError('The maxlen of inputs of RCNN must be (%d, %d, %d), but now is (%d, %d, %d)' % ((self.maxlen,) * 3 + lengths))

        # Shared embedding, applied in the order current, left, right.
        embedding_current = self.embedding(input_current)
        embedding_left = self.embedding(input_left)
        embedding_right = self.embedding(input_right)

        left_context = self.forward_rnn(embedding_left)
        # Reverse the backward RNN's output along axis 1 before concatenating.
        right_context = self.reverse(self.backward_rnn(embedding_right))

        features = self.concatenate([left_context, embedding_current, right_context])
        pooled = self.max_pooling(self.conv(features))
        return self.classifier(pooled)

In [None]:
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing import sequence

# RCNN experiment: tokenise, pad, build the shifted left/right context inputs,
# train and evaluate. Leaves `model5` and `result5` for later cells.
max_features = 32000
maxlen = 240
batch_size = 20
embedding_dims = 360
epochs = 3
# NOTE(review): the tokenizer is fitted on token_list, which is built from the
# whole table, i.e. it also sees the rows that end up in test_set.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(token_list)
# Seeded 80/20 split; test_set is every row that was not sampled into train_set.
proj_copy = project_text_tokens_forLSTMBagging.copy()
train_set = proj_copy.sample(frac=0.8, random_state=2)
test_set = proj_copy.drop(train_set.index)
x_train = tokenizer.texts_to_sequences(train_set.tokens)
x_test = tokenizer.texts_to_sequences(test_set.tokens)
# Unlike the earlier cells, the labels are converted to arrays right here.
y_train = train_set.state.values
y_test = test_set.state.values
print('Loading data...')
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Pad sequences (samples x time)...')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('Prepare input for model...')
# Left context: each row shifted right by one position, with its first column
# duplicated at the front. Right context: each row shifted left by one
# position, with its last column duplicated at the end.
x_train_current = x_train
x_train_left = np.hstack([np.expand_dims(x_train[:, 0], axis=1), x_train[:, 0:-1]])
x_train_right = np.hstack([x_train[:, 1:], np.expand_dims(x_train[:, -1], axis=1)])
x_test_current = x_test
x_test_left = np.hstack([np.expand_dims(x_test[:, 0], axis=1), x_test[:, 0:-1]])
x_test_right = np.hstack([x_test[:, 1:], np.expand_dims(x_test[:, -1], axis=1)])
print('x_train_current shape:', x_train_current.shape)
print('x_train_left shape:', x_train_left.shape)
print('x_train_right shape:', x_train_right.shape)
print('x_test_current shape:', x_test_current.shape)
print('x_test_left shape:', x_test_left.shape)
print('x_test_right shape:', x_test_right.shape)
print('Build model...')
model5 = RCNN(maxlen, max_features, embedding_dims)
model5.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
print('Train...')
# NOTE(review): validation_data is the test split, so early stopping is driven
# by test performance.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, mode='max')
model5.fit([x_train_current, x_train_left, x_train_right], y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=([x_test_current, x_test_left, x_test_right], y_test))
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line). It is called after fit().
model5.summary()
print('Test...')
# Raw model outputs for the test split; exported to CSV in a later cell.
result5 = model5.predict([x_test_current, x_test_left, x_test_right])
print(result5)
loss, accuracy = model5.evaluate([x_train_current, x_train_left, x_train_right], y_train, verbose=True)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model5.evaluate([x_test_current, x_test_left, x_test_right], y_test, verbose=True)
print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:
# Display the test labels as an array. y_test is a pandas Series after the
# earlier experiment cells but already a NumPy array after the RCNN cell, where
# `.values` would raise AttributeError; np.asarray handles both.
np.asarray(y_test)

In [None]:
# Pair each test-set project id with the flattened FastText prediction
# (`result`), write the table to CSV and preview the first rows.
outcome = pd.DataFrame(
    data={
        'id': list(test_set.project_id),
        'state': list(result.flatten()),
    }
)
outcome.to_csv('fast-text-res.csv', index=False)
outcome.head(n=5)

In [None]:
# Pair each test-set project id with the flattened TextCNN prediction
# (`result2`), write the table to CSV and preview the first rows.
outcome2 = pd.DataFrame(
    data={
        'id': list(test_set.project_id),
        'state': list(result2.flatten()),
    }
)
outcome2.to_csv('text-cnn-res.csv', index=False)
outcome2.head(n=5)

In [None]:
# Pair each test-set project id with the flattened TextRNN prediction
# (`result3`), write the table to CSV and preview the first rows.
outcome3 = pd.DataFrame(
    data={
        'id': list(test_set.project_id),
        'state': list(result3.flatten()),
    }
)
outcome3.to_csv('text-rnn-res.csv', index=False)
outcome3.head(n=5)

In [None]:
# Pair each test-set project id with the flattened TextBiRNN prediction
# (`result4`), write the table to CSV and preview the first rows.
outcome4 = pd.DataFrame(
    data={
        'id': list(test_set.project_id),
        'state': list(result4.flatten()),
    }
)
outcome4.to_csv('text-birnn-res.csv', index=False)
outcome4.head(n=5)

In [None]:
# Pair each test-set project id with the flattened RCNN prediction
# (`result5`), write the table to CSV and preview the first rows.
outcome5 = pd.DataFrame(
    data={
        'id': list(test_set.project_id),
        'state': list(result5.flatten()),
    }
)
outcome5.to_csv('text-rcnn-res.csv', index=False)
outcome5.head(n=5)