In [None]:
from torchtext import data
import random
import matplotlib.pyplot as plt
import seaborn as sns
import re

def cleanup_text(texts):
    """Normalise punctuation and whitespace in every string of ``texts``.

    For each string the characters ``! # ? , . : " ; -`` and newlines are
    replaced by a single space, and afterwards every run of spaces is
    collapsed into one space.

    Args:
        texts: Iterable of strings (``get_files`` passes this function as the
            ``preprocessing`` hook of the text field).

    Returns:
        list[str]: The cleaned strings, in input order.
    """
    cleaned_text = []
    for text in texts:
        # replace punctuation with a space
        text = re.sub('[!#?,.:";]|-', ' ', text)
        # replace newlines with a space
        text = re.sub(r'\n', ' ', text)
        # Collapse runs of spaces last, so that the spaces introduced by the
        # two substitutions above are collapsed as well.
        text = re.sub(r' +', ' ', text)
        cleaned_text.append(text)
    return cleaned_text

def get_files(path, dev_size, max_document_length, seed, data_type, tokenizer):
    """Load the train/test TSV files and split a validation set off the train set.

    Reads ``<data_type>_train.tsv`` and ``<data_type>_test.tsv`` from ``path``
    (no header row is skipped); the first column is mapped to the ``text``
    field and the second to the ``labels`` field.

    Args:
        path: Directory containing the TSV files.
        dev_size: Value forwarded as ``split_ratio`` to ``Dataset.split``.
            Per the torchtext documentation a single float is the share of
            examples kept in the *training* part, so despite its name this is
            the train fraction, not the validation fraction.
        max_document_length: ``fix_length`` of the text field; every example is
            padded/truncated to this many tokens.
        seed: Seed for the ``random`` module used to shuffle before splitting.
        data_type: File-name prefix of the TSV files.
        tokenizer: Callable turning a raw string into a list of tokens.

    Returns:
        tuple: ``(train_data, valid_data, test_data, Text, Label)``.
    """
    # include_lengths=True makes batch.text a tuple (padded numericalised
    # tensor, actual lengths); fix_length pads every batch to the same length.
    Text = data.Field(preprocessing=cleanup_text, tokenize=tokenizer, batch_first=True,
                      include_lengths=True, fix_length=max_document_length)
    Label = data.Field(sequential=False, use_vocab=False, pad_token=None, unk_token=None)

    # Column order of the TSV files.
    fields = [('text', Text), ('labels', Label)]

    train_data, test_data = data.TabularDataset.splits(
        path=path,
        train=data_type + '_train.tsv',
        test=data_type + '_test.tsv',
        format='tsv',
        fields=fields,
        skip_header=False
    )

    # random.seed() returns None, so it must not be passed as random_state.
    # Seed first, then hand the resulting generator state to split() explicitly.
    random.seed(seed)
    train_data, valid_data = train_data.split(split_ratio=dev_size, random_state=random.getstate())
    print(f'Number of training examples: {len(train_data)}')
    print(f'Number of validation examples: {len(valid_data)}')
    print(f'Number of testing examples: {len(test_data)}')
    return train_data, valid_data, test_data, Text, Label

In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class LSTM(nn.Module):
    """LSTM text classifier: embedding -> LSTM -> ReLU -> linear -> dropout -> linear.

    ``forward`` returns raw, un-normalised class scores (logits); the softmax
    stored in ``self.act`` is not applied.

    NOTE(review): ReLU is applied to the LSTM hidden state *before* ``fc1`` and
    there is no non-linearity between ``fc1`` and ``fc2`` - confirm this is the
    intended architecture.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim1, hidden_dim2, output_dim, n_layers,
                 bidirectional, dropout, pad_index):
        """Create the layers.

        Args:
            vocab_size: Number of rows of the embedding table.
            embedding_dim: Size of each embedding vector.
            hidden_dim1: Hidden size of the LSTM (per direction).
            hidden_dim2: Output size of the first linear layer.
            output_dim: Number of scores produced per example.
            n_layers: Number of stacked LSTM layers.
            bidirectional: Whether the LSTM runs in both directions.
            dropout: Drop probability of the dropout layer placed after ``fc1``.
            pad_index: Vocabulary index of the padding token.
        """
        super().__init__()

        # embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)

        # lstm layer
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim1,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            batch_first=True)

        # The classifier input is the final hidden state of the top layer:
        # two concatenated states when bidirectional, a single one otherwise.
        num_directions = 2 if bidirectional else 1
        self.fc1 = nn.Linear(hidden_dim1 * num_directions, hidden_dim2)
        self.fc2 = nn.Linear(hidden_dim2, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        # Softmax over the class dimension; kept for callers that want
        # probabilities, but not used inside forward().
        self.act = nn.Softmax(dim=1)

    def forward(self, text, text_lengths):
        """Compute class scores for a padded batch.

        Args:
            text: Token indices of shape ``[batch size, sent_len]``.
            text_lengths: Un-padded length of every sequence in ``text``
                (tensor or list of ints, in any order).

        Returns:
            Tensor of shape ``[batch size, output_dim]`` with raw scores.
        """
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]

        # pack_padded_sequence needs the lengths on the CPU, even when the
        # batch itself lives on the GPU.
        lengths = torch.as_tensor(text_lengths, dtype=torch.int64).cpu()
        # enforce_sorted=False lets PyTorch sort internally (and restore the
        # original order), so callers need not sort the batch by length.
        packed_embedded = pack_padded_sequence(embedded, lengths, batch_first=True,
                                               enforce_sorted=False)

        # The per-step output (a PackedSequence) and the cell state are unused.
        _, (hidden, _) = self.lstm(packed_embedded)
        # hidden shape = (num_layers * num_directions, batch, hidden_size)

        if self.lstm.bidirectional:
            # concat the final forward and backward hidden state of the top layer
            final_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # single direction: the last entry is the top layer's final state
            final_hidden = hidden[-1, :, :]

        rel = self.relu(final_hidden)
        dense1 = self.fc1(rel)

        drop = self.dropout(dense1)
        preds = self.fc2(drop)

        # No final activation: raw scores are returned.
        return preds

    # TODO notes (translated from Turkish):
    #   - sequence representation / classification: should return the hidden state
    #   - sequence1_hidden = sequence_model text1
    #   - sequence2_hidden = sequence_model text2
    #   - move the final concat step outside (everything after "41" -
    #     presumably a line number in an earlier version; confirm)

In [None]:
import torch
from torch.autograd import Variable
from torchtext import data

def accuracy(probs, target):
    """Return the fraction of rows of ``probs`` whose arg-max equals ``target``.

    Args:
        probs: 2-D tensor of per-class scores, one row per example.
        target: Tensor of class indices, one per example. Any shape holding
            one entry per example is accepted (e.g. ``(N,)`` or ``(N, 1)``).

    Returns:
        0-dim float tensor with the share of correct predictions.
    """
    winners = probs.argmax(dim=1)
    # Flatten the target so a (N, 1) tensor is compared element-wise instead
    # of being broadcast against the (N,) predictions into an (N, N) matrix.
    flat_target = target.reshape(-1)
    corrects = (winners == flat_target)
    return corrects.sum().float() / float(flat_target.size(0))

######################################## Using torchText ######################################

def create_iterator(train_data, valid_data, test_data, batch_size, device):
    """Wrap the three datasets in ``BucketIterator`` objects.

    BucketIterator batches examples of similar length together to keep the
    amount of padding small; ``sort_within_batch=True`` additionally sorts the
    examples inside every batch by the same key.

    Args:
        train_data, valid_data, test_data: Datasets to iterate over.
        batch_size: Number of examples per batch (same for all three).
        device: Device the batch tensors are created on.

    Returns:
        tuple: ``(train_iterator, valid_iterator, test_iterator)``.
    """
    def text_length(example):
        # Sort key: number of tokens in the example's text.
        return len(example.text)

    datasets = (train_data, valid_data, test_data)
    train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
        datasets,
        batch_size=batch_size,
        sort_key=text_length,
        sort_within_batch=True,
        device=device,
    )
    return train_iterator, valid_iterator, test_iterator


def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return the mean batch loss and accuracy.

    Args:
        model: Module called as ``model(text, text_lengths)``.
        iterator: Iterable of batches exposing ``batch.text`` (a
            ``(tensor, lengths)`` pair) and ``batch.labels``.
        optimizer: Optimizer stepping the model parameters.
        criterion: Loss called as ``criterion(predictions, labels)``.

    Returns:
        tuple[float, float]: Loss and accuracy, each averaged over the number
        of batches (not weighted by batch size).
    """
    epoch_loss = 0
    epoch_acc = 0

    # evaluate() leaves the model in eval mode; switch back explicitly so that
    # dropout is active in every epoch, not only the first one.
    model.train()

    for batch in iterator:
        optimizer.zero_grad()
        # retrieve text and no. of words
        text, text_lengths = batch.text

        predictions = model(text, text_lengths)

        # Flatten to (batch,) for both loss and accuracy. Unlike squeeze(),
        # reshape(-1) keeps a 1-element batch one-dimensional.
        labels = batch.labels.reshape(-1)
        loss = criterion(predictions, labels)

        acc = accuracy(predictions, labels)

        # perform backpropagation
        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    """Score ``model`` on every batch of ``iterator`` without gradient tracking.

    The model is switched to eval mode and is left in that mode on return.

    Args:
        model: Module called as ``model(text, text_lengths)``.
        iterator: Iterable of batches exposing ``batch.text`` (a
            ``(tensor, lengths)`` pair) and ``batch.labels``.
        criterion: Loss called as ``criterion(predictions, labels)``.

    Returns:
        tuple[float, float]: Loss and accuracy, each averaged over the number
        of batches.
    """
    model.eval()

    batch_losses = []
    batch_accs = []

    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.text
            scores = model(tokens, lengths).squeeze(1)

            batch_losses.append(criterion(scores, batch.labels).item())
            batch_accs.append(accuracy(scores, batch.labels).item())

    mean_loss = sum(batch_losses) / len(iterator)
    mean_acc = sum(batch_accs) / len(iterator)
    return mean_loss, mean_acc


def run_train(epochs, model, train_iterator, valid_iterator, optimizer, criterion, model_type):
    """Train for ``epochs`` epochs, checkpointing on every validation-loss improvement.

    After each epoch the model is evaluated on ``valid_iterator``. Whenever the
    validation loss is lower than any seen before, the model's ``state_dict``
    is written to ``saved_weights_<model_type>.pt`` in the working directory.
    Train and validation metrics are printed once per epoch.

    Args:
        epochs: Number of passes over ``train_iterator``.
        model: Model to optimise.
        train_iterator: Batches used for training.
        valid_iterator: Batches used for validation.
        optimizer: Optimizer forwarded to ``train``.
        criterion: Loss forwarded to ``train`` and ``evaluate``.
        model_type: String used as suffix of the checkpoint file name.
    """
    lowest_valid_loss = float('inf')

    for _ in range(epochs):
        # one optimisation pass followed by one validation pass
        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

        # keep only the weights with the best validation loss so far
        if lowest_valid_loss > valid_loss:
            lowest_valid_loss = valid_loss
            checkpoint_path = 'saved_weights_' + model_type + '.pt'
            torch.save(model.state_dict(), checkpoint_path)

        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc * 100:.2f}%')

In [None]:
import torch
import torch.nn as nn
import os

if __name__ == "__main__":
    # Experiment setup: device, hyper-parameters, datasets, vocabulary,
    # iterators and loss. The names defined here are read by the next cell.

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    path = 'TUBITAK_Project'
    path_data = os.path.join(path, "data")

    # parameters
    model_type = "LSTM"
    data_type = "morph" # or: "token"

    char_based = True
    if char_based:
        tokenizer = lambda s: list(s) # char-based
    else:
        tokenizer = lambda s: s.split() # word-based

    # hyper-parameters:
    lr = 1e-4
    batch_size = 50
    # Passed to the model as the dropout (drop) probability of nn.Dropout.
    dropout_keep_prob = 0.5
    embedding_size = 300
    # Every example is padded/truncated to this many tokens
    # (characters when char_based is True, words otherwise).
    max_document_length = 100
    # Forwarded as split_ratio to Dataset.split in get_files, i.e. the share
    # of the train file that stays in the training set.
    dev_size = 0.8
    max_size = 5000 # maximum vocabulary size
    seed = 1
    num_classes = 3

    # Seed PyTorch as well, so that weight initialisation and dropout are
    # reproducible; get_files() only seeds Python's `random` for the split.
    torch.manual_seed(seed)

    train_data, valid_data, test_data, Text, Label = get_files(path_data, dev_size, max_document_length, seed, data_type, tokenizer)

    # Build the token -> index mapping from the training split only, capped at
    # max_size entries. No pretrained vectors are requested here.
    Text.build_vocab(train_data, max_size=max_size)
    # NOTE(review): Label is created with use_vocab=False in get_files, so
    # this vocabulary looks unused - confirm before removing.
    Label.build_vocab(train_data)
    vocab_size = len(Text.vocab)

    train_iterator, valid_iterator, test_iterator = create_iterator(train_data, valid_data, test_data, batch_size, device)

    # loss function
    loss_func = nn.CrossEntropyLoss()

In [None]:
if __name__ == "__main__":
    # Build, optionally train, and test the LSTM classifier using the names
    # defined by the setup cell (device, Text, iterators, loss_func, ...).
    if model_type == "LSTM":

        num_hidden_nodes = 93
        hidden_dim2 = 128
        num_layers = 2  # LSTM layers
        bi_directional = True
        num_epochs = 7

        to_train = True
        pad_index = Text.vocab.stoi[Text.pad_token]

        # Build the model and move it to the same device the iterators put
        # their batches on; otherwise a GPU run fails with a device mismatch.
        lstm_model = LSTM(vocab_size, embedding_size, num_hidden_nodes, hidden_dim2, num_classes, num_layers,
                          bi_directional, dropout_keep_prob, pad_index).to(device)

        # optimization algorithm (created after the move so it tracks the
        # parameters on their final device)
        optimizer = torch.optim.Adam(lstm_model.parameters(), lr=lr)

        # train and evaluation
        if to_train:
            run_train(num_epochs, lstm_model, train_iterator, valid_iterator, optimizer, loss_func, model_type)

        # Load the best checkpoint; the name mirrors the one run_train()
        # writes, and map_location lets weights saved on one device load on
        # another.
        weights_path = 'saved_weights_' + model_type + '.pt'
        lstm_model.load_state_dict(torch.load(weights_path, map_location=device))
        # predict
        test_loss, test_acc = evaluate(lstm_model, test_iterator, loss_func)
        print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')