# RNN and LSTM Train
This notebook trains RNN and LSTM models on the Patent Classification and Bankings77 datasets under different regularization settings. It loads the raw data files, preprocesses the text, and then trains each model once for every regularization value.

In [None]:
import json
import nltk
import numpy as np
import os
import pandas as pd
import pickle
import random
import re
import tensorflow as tf
from dropconnect_tensorflow import DropConnect
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.regularizers import l1, l2

## Initialization

In [None]:
# Fetch the NLTK stopword corpus that get_preprocessed_data reads through
# stopwords.words('english').
nltk.download('stopwords')

In [None]:
# Use one fixed seed for every random number generator touched by the notebook.
seed = 22
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here may only affect child processes -- confirm it has the intended effect.
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)  # Python's `random`, used by the data loaders' random.shuffle
np.random.seed(seed)  # NumPy's global generator
# Keras seeding helper; presumably also covers TensorFlow's own generator -- see its docs.
tf.keras.utils.set_random_seed(seed)

In [None]:
def load_patent_data(data_path, max_samples=20000):
    """Load the patent-classification abstracts and labels in shuffled order.

    Reads ``train_data.txt``, ``val_data.txt`` and ``test_data.txt`` from
    ``data_path`` (one JSON object per line), pools the three files, shuffles
    the pooled records with the module-level ``random`` generator and keeps
    at most ``max_samples`` of them.

    Args:
        data_path: Directory containing the three JSON-lines files.
        max_samples: Upper bound on the number of records returned.

    Returns:
        A ``(texts, labels)`` pair of equal-length tuples holding each kept
        record's ``'abstract'`` and ``'label'`` values. Both tuples are empty
        when the files contain no records.
    """
    records = []
    for file_name in ('train_data.txt', 'val_data.txt', 'test_data.txt'):
        # The context manager closes each handle as soon as the file is read.
        with open(f'{data_path}/{file_name}', 'rb') as f:
            # Blank lines (e.g. a trailing newline) are not valid JSON; skip them.
            records.extend(json.loads(line) for line in f if line.strip())
    if not records:
        # zip(*[]) cannot be unpacked into two names, so handle "no data" explicitly.
        return (), ()
    pairs = [(record['abstract'], record['label']) for record in records]
    random.shuffle(pairs)
    text, labels = zip(*pairs)
    return text[:max_samples], labels[:max_samples]

In [None]:
def load_bankings_data(data_path):
    """Load the Bankings77 queries and categories in shuffled order.

    Reads ``train.csv`` and ``test.csv`` from ``data_path``, concatenates
    them, shuffles the (text, category) pairs with the module-level
    ``random`` generator and keeps the first 8000 pairs.

    Args:
        data_path: Directory containing ``train.csv`` and ``test.csv``; both
            must provide ``text`` and ``category`` columns.

    Returns:
        A ``(texts, labels)`` pair of equal-length lists.
    """
    train_frame = pd.read_csv(f'{data_path}/train.csv')
    test_frame = pd.read_csv(f'{data_path}/test.csv')
    combined = pd.concat([train_frame, test_frame])

    # Shuffle texts and labels together so every text keeps its own label.
    pairs = list(zip(combined['text'].tolist(), combined['category'].tolist()))
    random.shuffle(pairs)

    shuffled_texts, shuffled_labels = map(list, zip(*pairs))
    return shuffled_texts[:8000], shuffled_labels[:8000]

In [None]:
def load_article_data(data_path):
    """Load article texts and categories from ``bbc-text.csv``.

    Args:
        data_path: Directory containing ``bbc-text.csv``, which must provide
            ``text`` and ``category`` columns.

    Returns:
        A ``(texts, labels)`` pair of equal-length lists in file order.
    """
    data = pd.read_csv(f'{data_path}/bbc-text.csv')
    # Read the two columns directly. Iterating with ``iterrows`` builds a
    # Series per row and does not preserve dtypes across columns.
    return data['text'].tolist(), data['category'].tolist()

In [None]:
def load_data(data_path):
    """Load texts and labels for the dataset stored in ``data_path``.

    The loader is chosen from the final path component of ``data_path``:
    ``patent_classification``, ``bankings_77`` or ``article_classification``.

    Args:
        data_path: Directory of one of the supported datasets.

    Returns:
        The ``(texts, labels)`` pair produced by the matching loader.

    Raises:
        ValueError: If the directory name matches no supported dataset.
    """
    # normpath drops a trailing separator, so './Data/bankings_77/' still resolves.
    dataset_name = os.path.basename(os.path.normpath(data_path))
    if dataset_name == 'patent_classification':
        texts, labels = load_patent_data(data_path)
    elif dataset_name == 'bankings_77':
        texts, labels = load_bankings_data(data_path)
    elif dataset_name == 'article_classification':
        texts, labels = load_article_data(data_path)
    else:
        # Fail with a clear message instead of an UnboundLocalError on return.
        raise ValueError(
            f'Unsupported dataset directory {dataset_name!r}; expected one of '
            "'patent_classification', 'bankings_77', 'article_classification'"
        )
    return texts, labels

In [None]:
def get_preprocessed_data(texts, labels):
    """Normalise raw texts for tokenization.

    Each text is lower-cased, the punctuation matched by ``special_symbols``
    is replaced with a space, every character outside ``0-9a-z #+_`` is
    removed, and English stopwords are dropped.

    Args:
        texts: Iterable of raw text strings.
        labels: Labels aligned with ``texts``; returned unchanged.

    Returns:
        A ``(processed_texts, labels)`` pair where ``processed_texts`` is a
        list with one cleaned string per input text.
    """
    # Raw string: the pattern contains backslashes that are not valid Python
    # string escapes, so a plain literal triggers an invalid-escape warning.
    special_symbols = re.compile(r'[/(){}\[\]\|@,;]')
    bad_symbols = re.compile('[^0-9a-z #+_]')
    stopwords_set = set(stopwords.words('english'))
    processed_texts = []
    for text in texts:
        processed_text = special_symbols.sub(' ', text.lower())
        processed_text = bad_symbols.sub('', processed_text)
        processed_texts.append(' '.join(word for word in processed_text.split() if word not in stopwords_set))
    return processed_texts, labels

In [None]:
def split_data(texts, labels, validation_size, test_size):
    """Split texts and labels into train, validation and test parts.

    Both splits are made with ``shuffle=False``. The test split is taken
    first; ``validation_size`` is then applied to what remains, not to the
    full dataset.

    Args:
        texts: Sequence of texts.
        labels: Sequence of labels aligned with ``texts``.
        validation_size: ``test_size`` argument of the second split.
        test_size: ``test_size`` argument of the first split.

    Returns:
        Two lists, ``[train, validation, test]`` texts and
        ``[train, validation, test]`` labels.
    """
    remaining_texts, test_texts, remaining_labels, held_out_labels = train_test_split(
        texts, labels, test_size=test_size, shuffle=False
    )
    train_texts, val_texts, train_targets, val_targets = train_test_split(
        remaining_texts, remaining_labels, test_size=validation_size, shuffle=False
    )
    text_splits = [train_texts, val_texts, test_texts]
    label_splits = [train_targets, val_targets, held_out_labels]
    return text_splits, label_splits

In [None]:
def tokenize_data(X, y, labels, vocab_size, oov_token, padding_size, padding_type):
    """Turn text splits into padded id sequences and labels into integer ids.

    The word tokenizer is fitted on the training texts (``X[0]``) only; the
    label encoder is fitted on ``labels``.

    Args:
        X: ``[train, validation, test]`` lists of texts.
        y: ``[train, validation, test]`` lists of labels.
        labels: Labels used to fit the label encoder.
        vocab_size: ``num_words`` for the Keras ``Tokenizer``.
        oov_token: Token substituted for out-of-vocabulary words.
        padding_size: ``maxlen`` passed to ``pad_sequences``.
        padding_type: ``padding`` passed to ``pad_sequences``.

    Returns:
        Two lists, the padded ``[train, validation, test]`` sequences and the
        encoded ``[train, validation, test]`` labels.
    """
    sequence_tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
    sequence_tokenizer.fit_on_texts(X[0])

    def encode_texts(split_texts):
        # Map words to ids, then pad to a common length.
        ids = sequence_tokenizer.texts_to_sequences(split_texts)
        return pad_sequences(ids, maxlen=padding_size, padding=padding_type)

    encoded_texts = [encode_texts(X[index]) for index in range(3)]

    label_tokenizer = LabelEncoder()
    label_tokenizer.fit(labels)
    encoded_labels = [label_tokenizer.transform(y[index]) for index in range(3)]

    return encoded_texts, encoded_labels

In [None]:
def get_model(vocab_size, hidden_layer_size, activation, num_labels, weight_decay, dropout_rate, dropconnect_rate, model_type):
    """Build an Embedding -> bidirectional recurrent -> Dense classifier.

    Args:
        vocab_size: Input dimension of the embedding layer.
        hidden_layer_size: Embedding width, recurrent units and hidden Dense units.
        activation: Activation of the recurrent layer.
        num_labels: Number of output classes (softmax units).
        weight_decay: ``kernel_regularizer`` for the recurrent layer, or ``None``.
        dropout_rate: ``dropout`` argument of the recurrent layer.
        dropconnect_rate: When greater than 0, the bidirectional layer is
            wrapped in ``DropConnect`` with this ``prob``.
        model_type: ``'rnn'`` for ``SimpleRNN`` or ``'lstm'`` for ``LSTM``.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.

    Raises:
        ValueError: If ``model_type`` is neither ``'rnn'`` nor ``'lstm'``.
    """
    # Validate first: an unknown type would otherwise reach Bidirectional
    # with an unbound local variable.
    if model_type not in ('rnn', 'lstm'):
        raise ValueError(f"Unknown model_type {model_type!r}; expected 'rnn' or 'lstm'")
    if model_type == 'rnn':
        sequential_layer = tf.keras.layers.SimpleRNN(hidden_layer_size, activation, kernel_regularizer=weight_decay, dropout=dropout_rate)
    else:
        sequential_layer = tf.keras.layers.LSTM(hidden_layer_size, activation, kernel_regularizer=weight_decay, dropout=dropout_rate)
    sequential_layer = tf.keras.layers.Bidirectional(sequential_layer)
    if dropconnect_rate > 0:
        sequential_layer = DropConnect(sequential_layer, prob=dropconnect_rate)
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, hidden_layer_size),
        sequential_layer,
        tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
        tf.keras.layers.Dense(num_labels, activation='softmax')
    ])
    return model

In [None]:
def run_model(X, y, model, loss, optimizer, metrics, epochs, batch_size, save_path, model_name):
    """Compile, train, evaluate and persist a model.

    The model is saved to ``{save_path}/{model_name}.h5`` and its training
    history dict is pickled to ``{save_path}/{model_name}.pickle``. The value
    returned by ``model.evaluate`` on the test split is not stored.

    Args:
        X: ``[train, validation, test]`` inputs.
        y: ``[train, validation, test]`` targets.
        model: Model exposing ``compile``, ``fit``, ``evaluate`` and ``save``.
        loss: Loss passed to ``compile``.
        optimizer: Optimizer passed to ``compile``.
        metrics: Metrics passed to ``compile``.
        epochs: Number of training epochs.
        batch_size: Batch size for training and evaluation.
        save_path: Output directory; created if it does not exist.
        model_name: File stem for the saved model and history.
    """
    # Create the output directory up front, so a missing folder cannot make
    # the save fail after all the training time has been spent.
    os.makedirs(save_path, exist_ok=True)
    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
    history = model.fit(X[0], y[0], batch_size=batch_size, epochs=epochs, validation_data=(X[1], y[1]))
    model.evaluate(X[2], y[2], batch_size=batch_size)
    model.save(f'{save_path}/{model_name}.h5')
    with open(f'{save_path}/{model_name}.pickle', 'wb') as f:
        pickle.dump(history.history, f)

In [None]:
# Settings shared by every dataset and model in the notebook.
save_path = './Saved Models'  # directory run_model writes the .h5 and .pickle files to
validation_size = 0.15  # split_data applies this to the data left after the test split
test_size = 0.2  # test_size of split_data's first split
vocab_size = 5000  # Tokenizer num_words and Embedding input dimension
oov_token = '<OOV>'  # token the Tokenizer substitutes for out-of-vocabulary words
padding_size = 200  # maxlen passed to pad_sequences
padding_type = 'post'  # padding mode passed to pad_sequences

## Patent Classification Dataset Training

In [None]:
# Prepare the patent dataset: load -> clean -> split -> tokenize/encode.
data_path = './Data/patent_classification'
texts, labels = load_data(data_path)
processed_text, labels = get_preprocessed_data(texts, labels)
X, y = split_data(processed_text, labels, validation_size, test_size)
# The full `labels` list is passed so tokenize_data fits its label encoder on it.
X, y = tokenize_data(X, y, labels, vocab_size, oov_token, padding_size, padding_type)

In [None]:
# Hyperparameters and regularization grids for the RNN runs on the patent dataset.
hidden_layer_size = 128
num_labels = len(set(labels))  # one output unit per distinct label
loss = 'sparse_categorical_crossentropy'
optimizer = 'adam'
metrics = ['accuracy']
epochs = 10
batch_size = 128
weight_decays_l1 = [0, 0.001, 0.01, 0.1]  # 0 trains without a regularizer
weight_decays_l2 = [0.001, 0.01, 0.1]
dropout_rates = [0.1, 0.2, 0.3]
dropconnect_rates = [0.1, 0.2, 0.3]
dataset = os.path.basename(data_path)  # dataset name used in log lines and file names

In [None]:
# RNN sweep over L1 kernel-regularization strengths, with dropout and
# DropConnect both set to 0. A strength of 0 passes no regularizer.
for weight_decay_l1 in weight_decays_l1:
    print(f'Training RNN on {dataset} with {weight_decay_l1} L1 Weight Decay...')
    weight_decay = l1(weight_decay_l1) if weight_decay_l1 > 0 else None  
    rnn_model = get_model(vocab_size, hidden_layer_size, 'relu', num_labels, weight_decay, 0, 0, 'rnn')
    run_model(X, y, rnn_model, loss, optimizer, metrics, epochs, batch_size, save_path, f'rnn_l1_weight_decay_{weight_decay_l1}_{dataset}')

In [None]:
# RNN sweep over L2 kernel-regularization strengths, with dropout and
# DropConnect both set to 0.
for weight_decay_l2 in weight_decays_l2:
    print(f'Training RNN on {dataset} with {weight_decay_l2} L2 Weight Decay...')
    weight_decay = l2(weight_decay_l2) if weight_decay_l2 > 0 else None
    rnn_model = get_model(vocab_size, hidden_layer_size, 'relu', num_labels, weight_decay, 0, 0, 'rnn')
    run_model(X, y, rnn_model, loss, optimizer, metrics, epochs, batch_size, save_path, f'rnn_l2_weight_decay_{weight_decay_l2}_{dataset}')

In [None]:
# RNN sweep over dropout rates, with no kernel regularizer and DropConnect set to 0.
for dropout_rate in dropout_rates:
    print(f'Training RNN on {dataset} with {dropout_rate} Dropout...')
    rnn_model = get_model(vocab_size, hidden_layer_size, 'relu', num_labels, None, dropout_rate, 0, 'rnn')
    run_model(X, y, rnn_model, loss, optimizer, metrics, epochs, batch_size, save_path, f'rnn_dropout_{dropout_rate}_{dataset}')

In [None]:
# Stand-alone repeat of the 0.2-dropout RNN run. It writes to the same file
# names as the 0.2 iteration of the dropout sweep, so it replaces those results.
# The file name is built from `dataset`, like every other training cell, so it
# always matches the data currently loaded.
rnn_model = get_model(vocab_size, hidden_layer_size, 'relu', num_labels, None, 0.2, 0, 'rnn')
run_model(X, y, rnn_model, loss, optimizer, metrics, epochs, batch_size, save_path, f'rnn_dropout_0.2_{dataset}')

In [None]:
# RNN sweep over DropConnect rates, with no kernel regularizer and dropout set to 0.
for dropconnect_rate in dropconnect_rates:
    print(f'Training RNN on {dataset} with {dropconnect_rate} Dropconnect...')
    rnn_model = get_model(vocab_size, hidden_layer_size, 'relu', num_labels, None, 0, dropconnect_rate, 'rnn')
    run_model(X, y, rnn_model, loss, optimizer, metrics, epochs, batch_size, save_path, f'rnn_dropconnect_{dropconnect_rate}_{dataset}')

In [None]:
# Hyperparameters and regularization grids for the LSTM runs on the patent dataset.
# These rebind the same global names the RNN cells used.
hidden_layer_size = 128
num_labels = len(set(labels))  # one output unit per distinct label
loss = 'sparse_categorical_crossentropy'
optimizer = 'adam'
metrics = ['accuracy']
epochs = 10
batch_size = 64
weight_decays_l1 = [0, 0.001, 0.01, 0.1]  # 0 trains without a regularizer
weight_decays_l2 = [0.001, 0.01, 0.1]
dropout_rates = [0.1, 0.2, 0.3]
dropconnect_rates = [0.1, 0.2, 0.3]
dataset = os.path.basename(data_path)  # dataset name used in log lines and file names

In [None]:
# LSTM sweep over L1 kernel-regularization strengths, with dropout and
# DropConnect both set to 0. A strength of 0 passes no regularizer.
for weight_decay_l1 in weight_decays_l1:
    print(f'Training LSTM on {dataset} with {weight_decay_l1} L1 Weight Decay...')
    weight_decay = l1(weight_decay_l1) if weight_decay_l1 > 0 else None
    lstm_model = get_model(vocab_size, hidden_layer_size, 'tanh', num_labels, weight_decay, 0, 0, 'lstm')
    run_model(X, y, lstm_model, loss, optimizer, metrics, epochs, batch_size, save_path, f'lstm_l1_weight_decay_{weight_decay_l1}_{dataset}')

In [None]:
# LSTM sweep over L2 kernel-regularization strengths, with dropout and
# DropConnect both set to 0.
for weight_decay_l2 in weight_decays_l2:
    print(f'Training LSTM on {dataset} with {weight_decay_l2} L2 Weight Decay...')
    weight_decay = l2(weight_decay_l2) if weight_decay_l2 > 0 else None
    lstm_model = get_model(vocab_size, hidden_layer_size, 'tanh', num_labels, weight_decay, 0, 0, 'lstm')
    run_model(X, y, lstm_model, loss, optimizer, metrics, epochs, batch_size, save_path, f'lstm_l2_weight_decay_{weight_decay_l2}_{dataset}')

In [None]:
# LSTM sweep over dropout rates, with no kernel regularizer and DropConnect set to 0.
for dropout_rate in dropout_rates:
    print(f'Training LSTM on {dataset} with {dropout_rate} Dropout...')
    lstm_model = get_model(vocab_size, hidden_layer_size, 'tanh', num_labels, None, dropout_rate, 0, 'lstm')
    run_model(X, y, lstm_model, loss, optimizer, metrics, epochs, batch_size, save_path, f'lstm_dropout_{dropout_rate}_{dataset}')

In [None]:
# LSTM sweep over DropConnect rates, with no kernel regularizer and dropout set to 0.
for dropconnect_rate in dropconnect_rates:
    print(f'Training LSTM on {dataset} with {dropconnect_rate} Dropconnect...')
    lstm_model = get_model(vocab_size, hidden_layer_size, 'tanh', num_labels, None, 0, dropconnect_rate, 'lstm')
    run_model(X, y, lstm_model, loss, optimizer, metrics, epochs, batch_size, save_path, f'lstm_dropconnect_{dropconnect_rate}_{dataset}')

## Bankings77 Dataset Training

In [None]:
# Prepare the Bankings77 dataset: load -> clean -> split -> tokenize/encode.
# This rebinds data_path, texts, labels, X and y, replacing the patent data.
data_path = './Data/bankings_77'
texts, labels = load_data(data_path)
processed_text, labels = get_preprocessed_data(texts, labels)
X, y = split_data(processed_text, labels, validation_size, test_size)
# The full `labels` list is passed so tokenize_data fits its label encoder on it.
X, y = tokenize_data(X, y, labels, vocab_size, oov_token, padding_size, padding_type)

In [None]:
# Hyperparameters and regularization grids for the RNN runs on the Bankings77 dataset.
hidden_layer_size = 128
num_labels = len(set(labels))  # one output unit per distinct label
loss = 'sparse_categorical_crossentropy'
optimizer = 'adam'
metrics = ['accuracy']
epochs = 60
batch_size = 128
weight_decays_l1 = [0, 0.001, 0.01, 0.1]  # 0 trains without a regularizer
weight_decays_l2 = [0.001, 0.01, 0.1]
dropout_rates = [0.1, 0.2, 0.3]
dropconnect_rates = [0.1, 0.2, 0.3]
dataset = os.path.basename(data_path)  # dataset name used in log lines and file names

In [None]:
# RNN sweep over L1 kernel-regularization strengths, with dropout and
# DropConnect both set to 0. A strength of 0 passes no regularizer.
for weight_decay_l1 in weight_decays_l1:
    print(f'Training RNN on {dataset} with {weight_decay_l1} L1 Weight Decay...')
    weight_decay = l1(weight_decay_l1) if weight_decay_l1 > 0 else None  
    rnn_model = get_model(vocab_size, hidden_layer_size, 'relu', num_labels, weight_decay, 0, 0, 'rnn')
    run_model(X, y, rnn_model, loss, optimizer, metrics, epochs, batch_size, save_path, f'rnn_l1_weight_decay_{weight_decay_l1}_{dataset}')

In [None]:
# RNN sweep over L2 kernel-regularization strengths, with dropout and
# DropConnect both set to 0.
for weight_decay_l2 in weight_decays_l2:
    print(f'Training RNN on {dataset} with {weight_decay_l2} L2 Weight Decay...')
    weight_decay = l2(weight_decay_l2) if weight_decay_l2 > 0 else None
    rnn_model = get_model(vocab_size, hidden_layer_size, 'relu', num_labels, weight_decay, 0, 0, 'rnn')
    run_model(X, y, rnn_model, loss, optimizer, metrics, epochs, batch_size, save_path, f'rnn_l2_weight_decay_{weight_decay_l2}_{dataset}')

In [None]:
# RNN sweep over dropout rates, with no kernel regularizer and DropConnect set to 0.
for dropout_rate in dropout_rates:
    print(f'Training RNN on {dataset} with {dropout_rate} Dropout...')
    rnn_model = get_model(vocab_size, hidden_layer_size, 'relu', num_labels, None, dropout_rate, 0, 'rnn')
    run_model(X, y, rnn_model, loss, optimizer, metrics, epochs, batch_size, save_path, f'rnn_dropout_{dropout_rate}_{dataset}')

In [None]:
# RNN sweep over DropConnect rates, with no kernel regularizer and dropout set to 0.
for dropconnect_rate in dropconnect_rates:
    print(f'Training RNN on {dataset} with {dropconnect_rate} Dropconnect...')
    rnn_model = get_model(vocab_size, hidden_layer_size, 'relu', num_labels, None, 0, dropconnect_rate, 'rnn')
    run_model(X, y, rnn_model, loss, optimizer, metrics, epochs, batch_size, save_path, f'rnn_dropconnect_{dropconnect_rate}_{dataset}')

In [None]:
# Hyperparameters and regularization grids for the LSTM runs on the Bankings77 dataset.
# These rebind the same global names the RNN cells used.
hidden_layer_size = 64
num_labels = len(set(labels))  # one output unit per distinct label
loss = 'sparse_categorical_crossentropy'
optimizer = 'adam'
metrics = ['accuracy']
epochs = 80
batch_size = 64
weight_decays_l1 = [0, 0.001, 0.01, 0.1]  # 0 trains without a regularizer
weight_decays_l2 = [0.001, 0.01, 0.1]
dropout_rates = [0.1, 0.2, 0.3]
dropconnect_rates = [0.1, 0.2, 0.3]
dataset = os.path.basename(data_path)  # dataset name used in log lines and file names

In [None]:
# LSTM sweep over L1 kernel-regularization strengths, with dropout and
# DropConnect both set to 0. A strength of 0 passes no regularizer.
for weight_decay_l1 in weight_decays_l1:
    print(f'Training LSTM on {dataset} with {weight_decay_l1} L1 Weight Decay...')
    weight_decay = l1(weight_decay_l1) if weight_decay_l1 > 0 else None
    lstm_model = get_model(vocab_size, hidden_layer_size, 'tanh', num_labels, weight_decay, 0, 0, 'lstm')
    run_model(X, y, lstm_model, loss, optimizer, metrics, epochs, batch_size, save_path, f'lstm_l1_weight_decay_{weight_decay_l1}_{dataset}')

In [None]:
# LSTM sweep over L2 kernel-regularization strengths, with dropout and
# DropConnect both set to 0.
for weight_decay_l2 in weight_decays_l2:
    print(f'Training LSTM on {dataset} with {weight_decay_l2} L2 Weight Decay...')
    weight_decay = l2(weight_decay_l2) if weight_decay_l2 > 0 else None
    lstm_model = get_model(vocab_size, hidden_layer_size, 'tanh', num_labels, weight_decay, 0, 0, 'lstm')
    run_model(X, y, lstm_model, loss, optimizer, metrics, epochs, batch_size, save_path, f'lstm_l2_weight_decay_{weight_decay_l2}_{dataset}')

In [None]:
# LSTM sweep over dropout rates, with no kernel regularizer and DropConnect set to 0.
for dropout_rate in dropout_rates:
    print(f'Training LSTM on {dataset} with {dropout_rate} Dropout...')
    lstm_model = get_model(vocab_size, hidden_layer_size, 'tanh', num_labels, None, dropout_rate, 0, 'lstm')
    run_model(X, y, lstm_model, loss, optimizer, metrics, epochs, batch_size, save_path, f'lstm_dropout_{dropout_rate}_{dataset}')

In [None]:
# LSTM sweep over DropConnect rates, with no kernel regularizer and dropout set to 0.
for dropconnect_rate in dropconnect_rates:
    print(f'Training LSTM on {dataset} with {dropconnect_rate} Dropconnect...')
    lstm_model = get_model(vocab_size, hidden_layer_size, 'tanh', num_labels, None, 0, dropconnect_rate, 'lstm')
    run_model(X, y, lstm_model, loss, optimizer, metrics, epochs, batch_size, save_path, f'lstm_dropconnect_{dropconnect_rate}_{dataset}')

### References
The following references and resources were used while developing the code.
- https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17
- https://towardsdatascience.com/multi-class-text-classification-with-lstm-using-tensorflow-2-0-d88627c10a35
- https://huggingface.co/docs/datasets/index