In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import os
import numpy as np
import pickle
from collections import defaultdict
import sys, re
import pandas as pd
import re
from tqdm import tqdm
import matplotlib.pyplot as plt

home_dir = '/content/drive/My Drive/11747/'

# dataset path
train_path = home_dir + 'topicclass/topicclass_train.txt'
valid_path = home_dir + 'topicclass/topicclass_valid.txt'
test_path = home_dir + 'topicclass/topicclass_test.txt'

# Data Pre-process

In [0]:
def load_data(filename):
  text = []
  labels = []
  length_vec = []
  word_dict = defaultdict(int)
  with open(filename, "r") as f:
      for line in f:
          tag, words = line.strip().split(" ||| ")
          # remove weird words as such @
          s = re.sub(r"[^A-Za-z0-9()!#$%&*+,./:;<=>?\[\\\]{|~}–\"\-\'\`]", " ", words)
          text.append(s)
          labels.append(tag)
          s_split = s.split()
          for w in s_split:
              word_dict[w] += 1
          length_vec.append(len(s_split))
      return text, labels, word_dict, length_vec

In [14]:
x_train, y_train, word_dict_train, length_vec_train = load_data(train_path)
print("total train set: ", len(x_train))
print('number of all vocab', len(word_dict_train))

total train set:  253909
number of all vocab 131801


In [5]:
num_classes = len(set(y_train))
label2idx_dict = {}
idx2label = {}
i = 0
for label in set(y_train):
  label2idx_dict[label] = i
  idx2label[i] = label
  i += 1

label2idx_dict

{'Agriculture, food and drink': 4,
 'Art and architecture': 7,
 'Engineering and technology': 14,
 'Geography and places': 10,
 'History': 5,
 'Language and literature': 1,
 'Mathematics': 6,
 'Media and drama': 2,
 'Miscellaneous': 0,
 'Music': 13,
 'Natural sciences': 3,
 'Philosophy and religion': 11,
 'Social sciences and society': 15,
 'Sports and recreation': 12,
 'Video games': 9,
 'Warfare': 8}

In [6]:
idx2label

{0: 'Miscellaneous',
 1: 'Language and literature',
 2: 'Media and drama',
 3: 'Natural sciences',
 4: 'Agriculture, food and drink',
 5: 'History',
 6: 'Mathematics',
 7: 'Art and architecture',
 8: 'Warfare',
 9: 'Video games',
 10: 'Geography and places',
 11: 'Philosophy and religion',
 12: 'Sports and recreation',
 13: 'Music',
 14: 'Engineering and technology',
 15: 'Social sciences and society'}

In [16]:
def get_word2idx(dic, min_freq=3):
  index_dic = {}
  i = 0
  index_dic['<pad>'] = i # 0 for pad
  i += 1
  index_dic['<unk>'] = i # 1 for unk
  i += 1
  for k,v in dic.items():
    if v >= min_freq:
      index_dic[k] = i
      i += 1
  return index_dic

word2idx_train_orig = get_word2idx(word_dict_train)
print("length of indexed vocab", len(word2idx_train_orig))

length of indexed vocab 61867


In [439]:
n_features = 300

# encode sentence to a list of int without padding
def text_to_vector_no_pad(text):
  # remove weird characters
  s = re.sub(r"[^A-Za-z0-9()!#$%&*+,./:;<=>?\[\\\]{|~}\"\-\'\`]", " ", text)
  sent = s.split()
  ret = [1]*len(sent)
  for i in range(len(sent)):
    wd = sent[i]
    if wd in word2idx_train:
      ret[i] = word2idx_train[wd]
  return ret



def get_feature_no_pad(data):
  feat = []
  for sent in data:
    v = text_to_vector_no_pad(sent)
    feat.append(v)
  return np.array(feat)

feature_train = get_feature_no_pad(x_train)
feature_train

array([list([2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 8, 16, 17, 18, 19, 20, 13, 21, 22, 14, 23, 24, 18, 19, 20, 13, 25, 26, 27, 28, 29, 30, 18, 19, 20, 13, 14, 31, 9, 18, 32, 19, 20, 13, 33, 34, 35, 14, 36, 37, 38]),
       list([39, 40, 41, 42, 35, 43, 3, 44, 45, 46, 47, 18, 48, 49, 35, 14, 50, 51, 13, 52, 53, 54, 35, 55, 38]),
       list([56, 57, 58, 59, 13, 60, 61, 62, 18, 35, 63, 64, 3, 65, 66, 27, 67, 14, 68, 69, 30, 46, 19, 37, 70, 3, 71, 72, 32, 73, 74, 3, 19, 75, 76, 3, 77, 1, 38]),
       ...,
       list([56, 4292, 165, 2368, 13, 1471, 3823, 27, 2645, 38, 3330, 35, 30, 3125, 32, 2003, 3823, 27, 801, 38, 3870, 35, 30, 12017, 32, 8808, 4954, 13, 19, 933, 38]),
       list([3018, 229, 18, 19, 31330, 3, 9055, 6585, 12696, 344, 35, 5435, 32, 4571, 18, 46, 54844, 4517, 2740, 34041, 18, 29669, 54633, 18, 1, 14608, 952, 54633, 18, 32, 47634, 54633, 107, 35, 380, 63, 67, 14, 1049, 3, 2974, 763, 199, 1635, 38]),
       list([284, 4597, 4919, 1072, 19, 2787, 197, 724, 18, 811, 

In [440]:
def encode_label(label):
  encode = []
  for l in label:
    if l == 'Media and darama': l = 'Media and drama'
    encode.append(label2idx_dict[l])
  return np.array(encode)

y_train_encode = encode_label(y_train)
y_train_encode

array([ 2,  9,  2, ...,  3, 11,  8])

In [442]:
x_valid, y_valid, _, length_vec_valid = load_data(valid_path)
print("total valid set: ", len(x_valid))

feature_valid = get_feature_no_pad(x_valid)
y_valid_encode = encode_label(y_valid)

total valid set:  643


In [443]:
x_test, y_test_fake, _, _ = load_data(test_path)
print("total test set: ", len(x_test))

feature_test = get_feature_no_pad(x_test)

total test set:  697


In [0]:
# save and load data
def save(data, title):
  pickle.dump(data, open( home_dir + title + ".pkl", "wb" ))

def load(title):
  data = pickle.load(open( home_dir + title + ".pkl", "rb" )) 
  return data

In [0]:
def get_word_embedding_matrix(word2idx_dict):
    # randomize embedding
    M = (np.random.rand(len(word2idx_dict), 300)-0.5)/2
    pretrain_used = 0
    with open(home_dir + 'cc.en.300.vec', 'r') as f:
      total_pretrained_vec, dim = f.readline().split()
      print("total pretrained vec:", total_pretrained_vec, "dim:", dim)
      for line in f:
        tokens = line.rstrip().split(' ')
        word = tokens[0]
        if word in word2idx_dict:
          M[word2idx_dict[word]] = np.array(tokens[1:])
          pretrain_used += 1

      print("%s out of %s (%f%%) words has pretrained embedding" % (pretrain_used, len(word2idx_dict), pretrain_used/len(word2idx_dict)) )

      return M

In [0]:
embedding = get_word_embedding_matrix(word2idx_train)
save(embedding, "embedding")

In [9]:
embedding = load("embedding")
embedding.shape

(61867, 300)

# Model Train

### Cite: https://www.analyticsvidhya.com/blog/2020/01/first-text-classification-in-pytorch/

In [0]:
import torch   
from torchtext import data    
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field

#Cuda algorithms
torch.backends.cudnn.deterministic = True  

In [0]:
# train dataframe to csv
df_train = pd.DataFrame(list(zip(x_train, y_train_encode)), columns =['text', 'label'])
df_train.to_csv(home_dir+"train_topicclass.csv")

In [171]:
# load data from csv
tokenize = lambda x: x.split()

TEXT = Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True)
LABEL = Field(sequential=False, use_vocab=False)

fields = [(None, None), ('text',TEXT),('label', LABEL)]
train_data=data.TabularDataset(path = home_dir+'train_topicclass.csv',format = 'csv',fields = fields,skip_header = True)

print(vars(train_data.examples[0]))

{'text': ['several', 'of', 'these', 'rights', 'regulate', 'pre', '-', 'trial', 'procedure', ':', 'access', 'to', 'a', 'non', '-', 'excessive', 'bail', ',', 'the', 'right', 'to', 'indictment', 'by', 'a', 'grand', 'jury', ',', 'the', 'right', 'to', 'an', 'information', '(', 'charging', 'document', ')', ',', 'the', 'right', 'to', 'a', 'speedy', 'trial', ',', 'and', 'the', 'right', 'to', 'be', 'tried', 'in', 'a', 'specific', 'venue', '.'], 'label': '2'}


In [0]:
# valid dataframe to csv
df_valid = pd.DataFrame(list(zip(x_valid, y_valid_encode)), columns =['text', 'label'])
df_valid.to_csv(home_dir+"valid_topicclass.csv")

In [172]:
# load from csv
valid_data=data.TabularDataset(path = home_dir+'valid_topicclass.csv',format = 'csv',fields = fields,skip_header = True)
print(vars(valid_data.examples[0]))

{'text': ['the', 'm', 'ori', 'players', 'initially', 'provoked', 'curiosity', 'due', 'to', 'their', 'race', ',', 'but', 'the', 'british', 'press', 'subsequently', 'expressed', 'some', 'surprise', 'that', 'the', 'side', 'was', 'not', 'as', '"', 'm', 'ori', '"', 'as', 'they', 'had', 'expected', '.'], 'label': '9'}


In [0]:
# test dataframe to csv
df_test = pd.DataFrame(list(zip(x_test, [16]*len(x_test))), columns =['text', 'label'])
df_test.to_csv(home_dir+"test_topicclass.csv")

In [173]:
# load from csv
test_data=data.TabularDataset(path = home_dir+'test_topicclass.csv',format = 'csv',fields = fields,skip_header = True)
print(vars(test_data.examples[0]))

{'text': ['ny', '93', 'was', 'moved', 'onto', 'ny', '104', 'and', 'junction', 'road', 'in', 'cambria', 'in', 'the', '1940s', ',', 'and', 'altered', 'to', 'bypass', 'lockport', 'to', 'the', 'south', 'on', 'a', 'new', 'highway', 'and', 'robinson', 'and', 'dysinger', 'roads', 'in', '1991', '.'], 'label': '16'}


In [174]:
# glove embeddings
TEXT.build_vocab(train_data,min_freq=3, vectors = "glove.6B.300d")  
# TEXT.build_vocab(train_data, min_freq=3)  
LABEL.build_vocab(train_data)

# number of unique tokens
print("Size of text vocab:",len(TEXT.vocab))

# number of unique label
print("Size of label:",len(LABEL.vocab)) 

Size of text vocab: 54083
Size of label: 17


In [0]:
word2idx_train = TEXT.vocab.stoi

In [0]:

def get_word_embedding_matrix(word2idx_dict):
    # randomize embedding
    M = (np.random.rand(len(word2idx_dict), 300)-0.5)/2
    pretrain_used = 0
    with open(home_dir + 'cc.en.300.vec', 'r') as f:
      total_pretrained_vec, dim = f.readline().split()
      print("total pretrained vec:", total_pretrained_vec, "dim:", dim)
      for line in f:
        tokens = line.rstrip().split(' ')
        word = tokens[0]
        if word in word2idx_dict:
          M[word2idx_dict[word]] = np.array(tokens[1:])
          pretrain_used += 1

      print("%s out of %s (%f) words has pretrained embedding" % (pretrain_used, len(word2idx_dict), pretrain_used/len(word2idx_dict)) )
      return M

embedding = get_word_embedding_matrix(word2idx_train)


In [48]:
save(embedding, "embedding_no_pad_lower")
embedding.shape

(54083, 300)

In [0]:
# check cuda
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

#set batch size
BATCH_SIZE = 32

#Load an iterator
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch=True,
    device = device)

In [0]:
class classifier(nn.Module):
    
    #define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        
        #Constructor
        super().__init__()          
        
        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #lstm layer
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        
        #dense layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        #activation function
        self.act = nn.Sigmoid()
        
    def forward(self, text, text_lengths):
        
        text = torch.transpose(text, 0, 1) # [batch size, sent_length]
        
        embedded = self.embedding(text) # [batch size, sent_len, emb dim]
        
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        
        # concat the final forward and backward hidden state
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim = 1) # [batch size, hid dim * num directions]
        
        dense_outputs = self.fc(hidden)

        # Final activation function
        outputs=self.act(dense_outputs)

        return outputs

In [188]:
#define hyperparameters
n_features = 300
num_classes = 16

size_of_vocab = len(TEXT.vocab)
embedding_dim = n_features
num_output_nodes = num_classes
num_hidden_nodes = 128
num_layers = 2
dropout = 0.3
lr = 0.0005

#instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes,num_output_nodes, num_layers, 
                   bidirectional=True, dropout = dropout)
print(model)

classifier(
  (embedding): Embedding(54083, 300)
  (lstm): LSTM(300, 128, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (fc): Linear(in_features=256, out_features=16, bias=True)
  (act): Sigmoid()
)


In [189]:
#Initialize the pretrained embedding
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)
print(pretrained_embeddings.shape)


# embedding = load("embedding_no_pad_lower")
# embedding = torch.from_numpy(embedding)
# model.embedding.weight.data.copy_(embedding)
# embedding.shape

torch.Size([54083, 300])


In [0]:
#define optimizer and loss
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

# define metric
def calculate_accuracy(preds, y):
    predict = torch.argmax(preds, dim=-1)
    acc = torch.sum(predict == y).float()/len(predict)
    return acc
    
# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [0]:
def train(model, iterator, optimizer, criterion):
    
    #initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    #set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        #resets the gradients after every batch
        optimizer.zero_grad()   
        
        #retrieve text and no. of words
        text, text_lengths = batch.text
        
        #convert to 1D tensor
        predictions = model(text, text_lengths).squeeze()  
        
        #compute the loss
        loss = criterion(predictions, batch.label)        
        
        #compute the binary accuracy
        acc = calculate_accuracy(predictions, batch.label)   
        
        #backpropage the loss and compute the gradients
        loss.backward()       
        
        #update the weights
        optimizer.step()      
        
        #loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
def save_model(model, ep):
  torch.save(model.state_dict(), home_dir+"no_pad_ass1_epoch"+str(ep)+".pt")

In [0]:
def evaluate(model, iterator, criterion):
    
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #deactivating dropout layers
    model.eval()
    
    #deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            #retrieve text and no. of words
            text, text_lengths = batch.text
            
            #convert to 1d tensor
            predictions = model(text, text_lengths).squeeze()
            
            #compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = calculate_accuracy(predictions, batch.label)
            
            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [192]:
N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    #train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    #evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), home_dir+'no_pad_best_epoch.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    save_model(model, epoch)
    print("saving model", epoch)

	Train Loss: 2.118 | Train Acc: 57.45%
	 Val. Loss: 2.008 |  Val. Acc: 79.91%
saving model 0
	Train Loss: 2.013 | Train Acc: 78.40%
	 Val. Loss: 2.006 |  Val. Acc: 80.11%
saving model 1
	Train Loss: 1.985 | Train Acc: 82.96%
	 Val. Loss: 2.001 |  Val. Acc: 83.48%
saving model 2
	Train Loss: 1.971 | Train Acc: 84.99%
	 Val. Loss: 2.002 |  Val. Acc: 80.06%
saving model 3
	Train Loss: 1.960 | Train Acc: 86.41%
	 Val. Loss: 2.014 |  Val. Acc: 80.41%
saving model 4
	Train Loss: 1.953 | Train Acc: 87.38%
	 Val. Loss: 2.006 |  Val. Acc: 80.95%
saving model 5
	Train Loss: 1.948 | Train Acc: 88.15%
	 Val. Loss: 2.026 |  Val. Acc: 78.47%
saving model 6
	Train Loss: 1.943 | Train Acc: 88.77%
	 Val. Loss: 2.034 |  Val. Acc: 78.92%
saving model 7
	Train Loss: 1.940 | Train Acc: 89.36%
	 Val. Loss: 2.023 |  Val. Acc: 78.92%
saving model 8
	Train Loss: 1.937 | Train Acc: 89.80%
	 Val. Loss: 2.017 |  Val. Acc: 81.25%
saving model 9


In [193]:
def model_load(name):
  model =  classifier(size_of_vocab, embedding_dim, num_hidden_nodes,num_output_nodes, num_layers, 
                   bidirectional=True, dropout = dropout)
  model.load_state_dict(torch.load(home_dir+name))
  model.eval()
  model.to(device)
  return model


model = model_load("no_pad_best_epoch.pt")

valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
print(valid_acc)

0.8348214285714286


In [0]:
#Load an iterator
test_iterator = data.BucketIterator(
    test_data, 
    batch_size = 64,
    sort_key = lambda x: len(x.text),
    sort_within_batch=False,
    device = device,
    train=False, shuffle = None)

In [0]:
TEXT_to_word = TEXT.vocab.itos
word_to_TEXT = TEXT.vocab.stoi

In [0]:
with open(home_dir+"dev_results.txt", "w") as f:
  for test in valid_data:
    t = torch.LongTensor([[word_to_TEXT[w]] for w in test.text]).cuda()
    
    text_lengths = torch.LongTensor([len(test.text)])
    predictions = model(t, text_lengths).squeeze()
    predict = torch.argmax(predictions, dim=-1)
    f.write(idx2label[predict.item()]+"\n")

In [0]:
with open(home_dir+"test_results.txt", "w") as f:
  for test in test_data:
    t = torch.LongTensor([[word_to_TEXT[w]] for w in test.text]).cuda()
    
    text_lengths = torch.LongTensor([len(test.text)])
    predictions = model(t, text_lengths).squeeze()
    predict = torch.argmax(predictions, dim=-1)
    f.write(idx2label[predict.item()]+"\n")