# Text classification with CNN advanced

220603

- [ref](https://towardsdatascience.com/text-classification-with-cnns-in-pytorch-1113df31e79f)

In [None]:
## download data
# !git clone https://github.com/airobotlab/lecture_NLP_basic
# !mv lecture_NLP_basic/data .
# !rm -rf lecture_NLP_basic

In [1]:
## import
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import time
import re
import random
import nltk
from nltk.tokenize import word_tokenize
import torch
import torch.nn.functional as F
import torch.backends.cudnn as cudnn

## fix seed
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
np.random.seed(0)
cudnn.benchmark = False
cudnn.deterministic = True
random.seed(0)

In [2]:
# download dataset, https://www.kaggle.com/competitions/nlp-getting-started/data

model_type = 'CNN'
# model_type = 'RNN'


data_train = 'data/nlp_disaster_tweets/train.csv'
data_test = 'data/nlp_disaster_tweets/test.csv'

class CNN_config_define:
    def __init__(self):
        self.num_words= 2000
        self.max_seq_len = 35
        
        self.embedding_size= 100
        self.out_size= 32
        self.stride= 2
        
        self.batch_size = 12
        self.epochs = 20
        self.learning_rate= 0.001

class LSTM_config_define:
    def __init__(self):
        self.num_words= 2000
        self.max_seq_len = 35
        
        self.embedding_size= 100
        self.hidden_dim= 32  
        self.lstm_layers= 3
        
        self.batch_size = 12
        self.epochs = 20
        self.learning_rate= 0.001
        
if model_type == 'CNN':
    config = CNN_config_define()
elif model_type == 'RNN':
    config = LSTM_config_define()

In [3]:
## prepare dataset
df = pd.read_csv(data_train)
df.drop(['id', 'keyword', 'location'], axis=1, inplace=True)
x_raw = df['text'].values
y = df['target'].values

# 1) clean_text
x_raw_clean = [x.lower() for x in x_raw]
x_raw_clean = [re.sub(r'[^A-Za-z]+', ' ', x) for x in x_raw_clean]

# 2) text_tokenization
x_raw_tokenization = [word_tokenize(x) for x in x_raw_clean]

print('%s \n -> %s'%(x_raw_clean[1], x_raw_tokenization[1]))

# 3) build_vocabulary
def build_vocabulary(x_raw_tokenization, num_words=2000):
    # Builds the vocabulary and keeps the "x" most frequent word
    vocabulary = dict()
    fdist = nltk.FreqDist()

    for sentence in x_raw_tokenization:
        for word in sentence:
            fdist[word] += 1

    common_words = fdist.most_common(num_words)

    for idx, word in enumerate(common_words):
        vocabulary[word[0]] = (idx+1)
    return vocabulary

vocabulary = build_vocabulary(x_raw_tokenization, num_words=2000)

# 4) word_to_idx
def word_to_idx(vocabulary, x_raw_tokenization):
    # By using the dictionary (vocabulary), it is transformed
    # each token into its index based representatio
    x_tokenized = list()

    for sentence in x_raw_tokenization:
        temp_sentence = list()
        for word in sentence:
            if word in vocabulary.keys():
                temp_sentence.append(vocabulary[word])
        x_tokenized.append(temp_sentence)
        
    return x_tokenized

x_tokenized = word_to_idx(vocabulary, x_raw_tokenization)
print(' -> %s'%(x_tokenized[1]))

# 5) padding_sentences
def padding_sentences(x_tokenized, seq_len=35):
    # Each sentence which does not fulfill the required le
    # it's padded with the index 0
    pad_idx = 0
    x_padded = list()

    for sentence in x_tokenized:
        while len(sentence) < seq_len:
            sentence.insert(len(sentence), pad_idx)
        x_padded.append(sentence)

    x_padded = np.array(x_padded)
    return x_padded
    
x_padded = padding_sentences(x_tokenized, seq_len=config.max_seq_len)
print(' -> %s'%(x_padded[1]))

# 6) split data
x_train, x_test, y_train, y_test = train_test_split(x_padded, y, test_size=0.2, random_state=42)
print('train: %s / %s  - valid: %s / %s' %(x_train.shape, y_train.shape, x_test.shape, y_test.shape))

data = {}
data['x_train'] = x_train
data['y_train'] = y_train
data['x_test']  = x_test
data['y_test']  = y_test

forest fire near la ronge sask canada 
 -> ['forest', 'fire', 'near', 'la', 'ronge', 'sask', 'canada']
 -> [208, 49, 240, 710, 1198]
 -> [ 208   49  240  710 1198    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0]
train: (6090, 35) / (6090,)  - valid: (1523, 35) / (1523,)


In [4]:
# make dataloader
import torch
from torch.utils.data import Dataset, DataLoader
class DatasetMapper(Dataset):

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]
    

# Initialize dataset maper
dataset_train = DatasetMapper(data['x_train'], data['y_train'])
dataset_test = DatasetMapper(data['x_test'], data['y_test'])

In [5]:
# functions
def count_parameters(model):
    print('model parameters: %d'%(sum(p.numel() for p in model.parameters() if p.requires_grad)))
    return

def evaluation(model, loader_test):
    # Set the model in evaluation mode
    model.eval()
    predictions = []

    # Starst evaluation phase
    with torch.no_grad():
        for x_batch, y_batch in loader_test:
            y_pred = model(x_batch)
            predictions += list(y_pred.detach().numpy())
    return predictions

def calculate_accuray(grand_truth, predictions):
    true_positives = 0
    true_negatives = 0

    # Gets frequency  of true positives and true negatives
    # The threshold is 0.5
    for true, pred in zip(grand_truth, predictions):
        if (pred >= 0.5) and (true == 1):
            true_positives += 1
        elif (pred < 0.5) and (true == 0):
            true_negatives += 1
        else:
            pass
    # Return accuracy
    return (true_positives+true_negatives) / len(grand_truth)

In [6]:
# define CNN model
import torch
import torch.nn as nn
import math

class CNN1D_Classifier(nn.ModuleList):
    def __init__(self, config):
        super(CNN1D_Classifier, self).__init__()

        # Parameters regarding text preprocessing
        self.seq_len = config.seq_len
        self.num_words = config.num_words
        self.embedding_size = config.embedding_size

        # Dropout definition
        self.dropout = nn.Dropout(0.25)

        # CNN parameters definition
        # Kernel sizes
        self.kernel_1 = 2
        self.kernel_2 = 3
        self.kernel_3 = 4
        self.kernel_4 = 5

        # Output size for each convolution
        self.out_size = config.out_size
        # Number of strides for each convolution
        self.stride = config.stride

        # Embedding layer definition
        self.embedding = nn.Embedding(
            self.num_words + 1, self.embedding_size, padding_idx=0)

        # Convolution layers definition
        self.conv_1 = nn.Conv1d(
            self.seq_len, self.out_size, self.kernel_1, self.stride)
        self.conv_2 = nn.Conv1d(
            self.seq_len, self.out_size, self.kernel_2, self.stride)
        self.conv_3 = nn.Conv1d(
            self.seq_len, self.out_size, self.kernel_3, self.stride)
        self.conv_4 = nn.Conv1d(
            self.seq_len, self.out_size, self.kernel_4, self.stride)

        # Max pooling layers definition
        self.pool_1 = nn.MaxPool1d(self.kernel_1, self.stride)
        self.pool_2 = nn.MaxPool1d(self.kernel_2, self.stride)
        self.pool_3 = nn.MaxPool1d(self.kernel_3, self.stride)
        self.pool_4 = nn.MaxPool1d(self.kernel_4, self.stride)

        # Fully connected layer definition
        self.fc = nn.Linear(self.in_features_fc(), 1)

        
    def in_features_fc(self):
        '''Calculates the number of output features after Convolution + Max pooling

        Convolved_Features = ((embedding_size + (2 * padding) - dilation * (kernel - 1) - 1) / stride) + 1
        Pooled_Features = ((embedding_size + (2 * padding) - dilation * (kernel - 1) - 1) / stride) + 1

        source: https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
        '''
        # Calcualte size of convolved/pooled features for convolution_1/max_pooling_1 features
        out_conv_1 = ((self.embedding_size - 1 *
                      (self.kernel_1 - 1) - 1) / self.stride) + 1
        out_conv_1 = math.floor(out_conv_1)
        out_pool_1 = (
            (out_conv_1 - 1 * (self.kernel_1 - 1) - 1) / self.stride) + 1
        out_pool_1 = math.floor(out_pool_1)

        # Calcualte size of convolved/pooled features for convolution_2/max_pooling_2 features
        out_conv_2 = ((self.embedding_size - 1 *
                      (self.kernel_2 - 1) - 1) / self.stride) + 1
        out_conv_2 = math.floor(out_conv_2)
        out_pool_2 = (
            (out_conv_2 - 1 * (self.kernel_2 - 1) - 1) / self.stride) + 1
        out_pool_2 = math.floor(out_pool_2)

        # Calcualte size of convolved/pooled features for convolution_3/max_pooling_3 features
        out_conv_3 = ((self.embedding_size - 1 *
                      (self.kernel_3 - 1) - 1) / self.stride) + 1
        out_conv_3 = math.floor(out_conv_3)
        out_pool_3 = (
            (out_conv_3 - 1 * (self.kernel_3 - 1) - 1) / self.stride) + 1
        out_pool_3 = math.floor(out_pool_3)

        # Calcualte size of convolved/pooled features for convolution_4/max_pooling_4 features
        out_conv_4 = ((self.embedding_size - 1 *
                      (self.kernel_4 - 1) - 1) / self.stride) + 1
        out_conv_4 = math.floor(out_conv_4)
        out_pool_4 = (
            (out_conv_4 - 1 * (self.kernel_4 - 1) - 1) / self.stride) + 1
        out_pool_4 = math.floor(out_pool_4)

        # Returns "flattened" vector (input for fully connected layer)
        return (out_pool_1 + out_pool_2 + out_pool_3 + out_pool_4) * self.out_size


    def forward(self, x):

        # Sequence of tokes is filterd through an embedding layer
        x = self.embedding(x)

        # Convolution layer 1 is applied
        x1 = self.conv_1(x)
        x1 = torch.relu(x1)
        x1 = self.pool_1(x1)

        # Convolution layer 2 is applied
        x2 = self.conv_2(x)
        x2 = torch.relu((x2))
        x2 = self.pool_2(x2)

        # Convolution layer 3 is applied
        x3 = self.conv_3(x)
        x3 = torch.relu(x3)
        x3 = self.pool_3(x3)

        # Convolution layer 4 is applied
        x4 = self.conv_4(x)
        x4 = torch.relu(x4)
        x4 = self.pool_4(x4)

        # The output of each convolutional layer is concatenated into a unique vector
        union = torch.cat((x1, x2, x3, x4), 2)
        union = union.reshape(union.size(0), -1)

        # The "flattened" vector is passed through a fully connected layer
        out = self.fc(union)
        # Dropout is applied
        out = self.dropout(out)
        # Activation function is applied
        out = torch.sigmoid(out)

        return out.squeeze()
    
model = CNN1D_Classifier(config)
count_parameters(model)

model parameters: 218917


In [7]:
## train!!
# Initialize loaders
loader_train = DataLoader(dataset_train, batch_size=config.batch_size)
loader_test = DataLoader(dataset_test, batch_size=config.batch_size)

# Define optimizer
optimizer = torch.optim.RMSprop(model.parameters(), lr=config.learning_rate)

# Starts training phase
start_time = time.time()
for epoch in range(config.epochs):
    # Set model in training model
    model.train()
    predictions = []
    # Starts batch training
    for batch_idx, (x_batch, y_batch) in enumerate(loader_train):

        y_batch = y_batch.type(torch.FloatTensor)

        # Feed the model
        y_pred = model(x_batch)

        # Loss calculation
        loss = F.binary_cross_entropy(y_pred, y_batch)

        # Clean gradientes
        optimizer.zero_grad()

        # Gradients calculation
        loss.backward()

        # Gradients update
        optimizer.step()

        # Save predictions
        predictions += list(y_pred.detach().numpy())

    # Evaluation phase
    test_predictions = evaluation(model, loader_test)

    # Metrics calculation
    train_accuary = calculate_accuray(data['y_train'], predictions)
    test_accuracy = calculate_accuray(data['y_test'], test_predictions)
    print("Epoch: %d, loss: %.5f, Train accuracy: %.5f, Test accuracy: %.5f" % (
        epoch+1, loss.item(), train_accuary, test_accuracy))
    
print('train done!!, %.1f sec'%(time.time()-start_time))

Epoch: 1, loss: 0.72825, Train accuracy: 0.55435, Test accuracy: 0.59357
Epoch: 2, loss: 0.63252, Train accuracy: 0.61182, Test accuracy: 0.69993
Epoch: 3, loss: 0.63067, Train accuracy: 0.65846, Test accuracy: 0.66645
Epoch: 4, loss: 0.64781, Train accuracy: 0.69327, Test accuracy: 0.66316
Epoch: 5, loss: 0.54090, Train accuracy: 0.71051, Test accuracy: 0.72554
Epoch: 6, loss: 0.50347, Train accuracy: 0.73383, Test accuracy: 0.74590
Epoch: 7, loss: 0.44245, Train accuracy: 0.73990, Test accuracy: 0.73670
Epoch: 8, loss: 0.44165, Train accuracy: 0.76437, Test accuracy: 0.73014
Epoch: 9, loss: 0.33231, Train accuracy: 0.77258, Test accuracy: 0.74261
Epoch: 10, loss: 0.29586, Train accuracy: 0.78309, Test accuracy: 0.74787
train done!!, 1 sec


# 2) Text classification with LSTM advanced

220603

- [ref](https://towardsdatascience.com/text-classification-with-pytorch-7111dae111a6)

In [14]:
# model_type = 'CNN'
model_type = 'RNN'


data_train = 'data/nlp_disaster_tweets/train.csv'
data_test = 'data/nlp_disaster_tweets/test.csv'

class CNN_config_define:
    def __init__(self):
        self.num_words= 2000
        self.max_seq_len = 35
        
        self.embedding_size= 100
        self.out_size= 32
        self.stride= 2
        
        self.batch_size = 12
        self.epochs = 20
        self.learning_rate= 0.001

class LSTM_config_define:
    def __init__(self):
        self.num_words= 2000
        self.max_seq_len = 35
        
        self.embedding_size= 100
        self.hidden_dim= 32  
        self.lstm_layers= 3
        
        self.batch_size = 12
        self.epochs = 20
        self.learning_rate= 0.001
        
if model_type == 'CNN':
    config = CNN_config_define()
elif model_type == 'RNN':
    config = LSTM_config_define()

In [25]:
# define LSTM model
import torch
import torch.nn as nn
import torch.nn.functional as F


class LSTM_Classifier(nn.ModuleList):
    def __init__(self, config):
        super(LSTM_Classifier, self).__init__()

        # Hyperparameters
        self.batch_size = config.batch_size
        self.hidden_dim = config.hidden_dim
        self.LSTM_layers = config.lstm_layers
        self.input_size = config.num_words

        self.dropout = nn.Dropout(0.5)
        # Embedding layer definition
        self.embedding = nn.Embedding(
            self.input_size+ 1, self.hidden_dim, padding_idx=0)
        
        # LSTM
        self.lstm = nn.LSTM(input_size=self.hidden_dim, hidden_size=self.hidden_dim,
                            num_layers=self.LSTM_layers, batch_first=True)
        # FC
        self.fc1 = nn.Linear(in_features=self.hidden_dim,
                             out_features=self.hidden_dim*2)
        self.fc2 = nn.Linear(self.hidden_dim*2, 1)

    def forward(self, x):

        # Hidden and cell state definion
        h = torch.zeros((self.LSTM_layers, x.size(0), self.hidden_dim))
        c = torch.zeros((self.LSTM_layers, x.size(0), self.hidden_dim))

        # Initialization fo hidden and cell states
        torch.nn.init.xavier_normal_(h)
        torch.nn.init.xavier_normal_(c)

        # Each sequence "x" is passed through an embedding layer
        out = self.embedding(x)
        # Feed LSTMs
        out, (hidden, cell) = self.lstm(out, (h, c))
        out = self.dropout(out)
        # The last hidden state is taken
        out = torch.relu_(self.fc1(out[:, -1, :]))
        out = self.dropout(out)
        out = torch.sigmoid(self.fc2(out))

        return out.squeeze()
    
model = LSTM_Classifier(config)
count_parameters(model)

model parameters: 83105


In [26]:
## train!!
# Initialize loaders
loader_train = DataLoader(dataset_train, batch_size=config.batch_size)
loader_test = DataLoader(dataset_test, batch_size=config.batch_size)

# Define optimizer
optimizer = torch.optim.RMSprop(model.parameters(), lr=config.learning_rate)

# Starts training phase
start_time = time.time()
for epoch in range(config.epochs):
    # Set model in training model
    model.train()
    predictions = []
    # Starts batch training
    for batch_idx, (x_batch, y_batch) in enumerate(loader_train):

        y_batch = y_batch.type(torch.FloatTensor)

        # Feed the model
        y_pred = model(x_batch)

        # Loss calculation
        loss = F.binary_cross_entropy(y_pred, y_batch)

        # Clean gradientes
        optimizer.zero_grad()

        # Gradients calculation
        loss.backward()

        # Gradients update
        optimizer.step()

        # Save predictions
        predictions += list(y_pred.detach().numpy())

    # Evaluation phase
    test_predictions = evaluation(model, loader_test)

    # Metrics calculation
    train_accuary = calculate_accuray(data['y_train'], predictions)
    test_accuracy = calculate_accuray(data['y_test'], test_predictions)
    print("Epoch: %d, loss: %.5f, Train accuracy: %.5f, Test accuracy: %.5f" % (
        epoch+1, loss.item(), train_accuary, test_accuracy))
    
print('train done!!, %.1f sec'%(time.time()-start_time))

Epoch: 1, loss: 0.66488, Train accuracy: 0.56502, Test accuracy: 0.57387
Epoch: 2, loss: 0.61992, Train accuracy: 0.58325, Test accuracy: 0.63559
Epoch: 3, loss: 0.61311, Train accuracy: 0.64680, Test accuracy: 0.64741
Epoch: 4, loss: 0.60675, Train accuracy: 0.65550, Test accuracy: 0.65660
Epoch: 5, loss: 0.38914, Train accuracy: 0.70164, Test accuracy: 0.72357
Epoch: 6, loss: 0.43435, Train accuracy: 0.75747, Test accuracy: 0.74261
Epoch: 7, loss: 0.35550, Train accuracy: 0.78522, Test accuracy: 0.74984
Epoch: 8, loss: 0.25207, Train accuracy: 0.80837, Test accuracy: 0.74787
Epoch: 9, loss: 0.37398, Train accuracy: 0.82906, Test accuracy: 0.73145
Epoch: 10, loss: 0.17866, Train accuracy: 0.84204, Test accuracy: 0.76428
train done!!, 7 sec


In [34]:
print('train done!!, %.1f sec'%(time.time()-start_time))

train done!!, 818.1 sec


In [28]:
time.time()-start_time

788.2275884151459