### Install dependency packages

In [2]:
# !pip install gensim==4.1.2
# !pip3 install torch 

In [3]:
# import libraries to be used
from sklearn import preprocessing
import pandas as pd
import numpy as np
import re
import os
import gensim
from collections import Counter
from tqdm.notebook import tqdm
from string import punctuation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
import logging
from torch.utils.tensorboard import SummaryWriter
from torch.optim import AdamW
import io
import pickle


# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
logging.basicConfig(level=logging.INFO,
                    filename='fasttext_lstm.log',
                    filemode='w',
                    format='%(asctime)s - %(message)s')

### 1. Read date

In [4]:
def read_data(file):
    data = pd.read_csv(file)
    return list(data['text']), list(data['label'])

In [5]:
train_texts, train_labels = read_data("train.csv")
test_texts, test_labels = read_data("test.csv")

In [6]:
texts = train_texts + test_texts
labels = train_labels + test_labels

In [7]:
Counter(labels)

Counter({0: 10000, 4: 10000})

### 2. Code-conversion of labels, category mapping

In [8]:
label2id = {item: idx for idx, item in enumerate(sorted(set(labels)))}
id2label = {v: k for k, v in label2id.items()}

In [9]:
y_cate = np.asarray([label2id[i] for i in labels])

### 3. Load FastText

In [10]:
def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = map(float, tokens[1:])
    return data

In [11]:
vector = load_vectors('wiki-news-300d-1M.vec')

In [12]:
# construct a vocab.pklist with all words
with open('vocab.pkl', 'rb') as f:
    vocab_list = pickle.load(f)

In [13]:
# initialize matrix that stores all vectors
embeddings_matrix = np.zeros((len(vocab_list) + 1, 300))
# populate dictionary and matrix above
for i in range(len(vocab_list)):
    word = vocab_list[i]
    try:
        embeddings_matrix[i] = vector[word]
    except:
        embeddings_matrix[i] = np.zeros(300)
vocab_size = len(vocab_list) + 1

### 4. Tokenisation and build word-id dictionary

In [14]:
X = []
maxlen = 94
for text in texts:
    temp = []
    for word in text.split(' '):
        temp.append(vocab_list.index(word))
    if len(temp) < maxlen:
        temp += [vocab_size - 1] * (maxlen - len(temp))
    else:
        temp = temp[:maxlen]
    X.append(temp)
X = np.array(X)

### 5. Divide training and testing set

In [15]:
X_train = X[:len(train_texts)]
y_train = y_cate[:len(train_texts)]
X_val = X[len(train_texts):]
y_val = y_cate[len(train_texts):]

In [16]:
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

(18000, 90) (18000,) (2000, 90) (2000,)


### 6. Build model

In [17]:
class Model(nn.Module):
    def __init__(self,
                 embeddings_matrix,
                 num_classes=2,
                 embed_size=300,
                 hidden_size=300,
                 inner_size=100,
                 num_layers=1,
                 dropout=0.2):
        super(Model, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embeddings_matrix,
                                                      freeze=False)
        self.lstm = nn.LSTM(embed_size,
                            hidden_size,
                            num_layers,
                            bidirectional=False,
                            batch_first=True,
                            dropout=dropout)
        self.fc = nn.Linear(hidden_size, inner_size)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.classifier = nn.Linear(inner_size, num_classes)

    def forward(self, x):
        out = self.embedding(x)  # [batch_size, seq_len, embeding]
        output, (last_hidden, c) = self.lstm(out)
        out = self.fc(torch.max(output, dim=1)[0])
        out = self.activation(out)
        out = self.dropout(out)
        out = self.classifier(out)
        return out

In [18]:
model = Model(torch.from_numpy(embeddings_matrix).float(),
              num_classes=len(label2id))
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device(
    'cpu')  # use cpu or gpu
model.to(device)
model.train()

  "num_layers={}".format(dropout, num_layers))


Model(
  (embedding): Embedding(21704, 300)
  (lstm): LSTM(300, 300, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=300, out_features=100, bias=True)
  (activation): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
  (classifier): Linear(in_features=100, out_features=2, bias=True)
)

### 7. Build dataset and dataloader

In [19]:
X_train = torch.from_numpy(X_train)
X_val = torch.from_numpy(X_val)
y_train = torch.as_tensor(y_train, dtype=torch.int64)
y_val = torch.as_tensor(y_val, dtype=torch.int64)

batch_size = 64
train_data = TensorDataset(X_train, y_train)
train_dataloader = DataLoader(train_data,
                              shuffle=True,
                              batch_size=batch_size,
                              drop_last=False)

valid_data = TensorDataset(X_val, y_val)
valid_dataloader = DataLoader(valid_data, batch_size=batch_size)

### 8. Calculate evaluation metrics

In [20]:
def compute_metrics(labels, preds):
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='macro')
    recall = recall_score(labels, preds, average='macro')
    f1 = f1_score(labels, preds, average='macro')
    report = classification_report(labels, preds, digits=4)
    logging.info(f'accuracy: {accuracy}')
    logging.info(f'precision: {precision}')
    logging.info(f'recall: {recall}')
    logging.info(f'f1: {f1}')
    logging.info(f'report: {report}\n')
    return accuracy, precision, recall, f1

### 9. Model evaluation function

In [21]:
@torch.no_grad()
def eval_model(model, eval_loader):
    model.eval()
    labels = []
    preds = []
    for idx, batch in enumerate(eval_loader):
        x = batch[0].to(device)
        labels.extend(batch[1].numpy())
        outputs = model(x)  # output all probabilities
        preds.extend(torch.argmax(outputs, dim=-1).cpu().numpy())  # get label
    accuracy, precision, recall, f1 = compute_metrics(labels, preds)
    model.train()
    return accuracy, precision, recall, f1

### 10. Model training

In [22]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [{
    'params':
    [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay_rate':
    0.01
}, {
    'params':
    [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay_rate':
    0.0
}]
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5) #6e-3

In [23]:
step = 0
best_acc = 0
epoch = 30
model_path = 'model_lstm_best'
writer = SummaryWriter(log_dir=model_path)
loss_func = nn.CrossEntropyLoss()
for epoch in tqdm(range(epoch), desc='Epoch'):
    losses = []
    for idx, batch in tqdm(enumerate(train_dataloader),
                           total=len(X_train) // batch_size,
                           desc='Batch'):
        optimizer.zero_grad()
        x = batch[0].to(device)
        y = batch[1].to(device)
        outputs = model(x)
        loss = loss_func(outputs, y)  # caculate loss
        logging.info(
            f'Epoch-{epoch}, Step-{step}, Loss: {loss.cpu().detach().numpy()}')
        step += 1
        loss.backward()
        optimizer.step()
        losses.append(loss)
    writer.add_scalar('train_loss', loss.mean().item(), epoch)
    logging.info(
        f'Epoch {epoch}, present best acc: {best_acc}, start evaluating.')
    accuracy, precision, recall, f1 = eval_model(model,
                                                 valid_dataloader)  # evaluate model
    writer.add_scalar('dev_accuracy', accuracy, epoch)
    writer.add_scalar('dev_precision', precision, epoch)
    writer.add_scalar('dev_recall', recall, epoch)
    writer.add_scalar('dev_f1', f1, epoch)
    if accuracy > best_acc:
        torch.save(model.state_dict(), os.path.join(model_path, 'model.ckpt'))
        best_acc = accuracy

Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]

Batch:   0%|          | 0/281 [00:00<?, ?it/s]