In [1]:
import torch
import pandas as pd
import numpy as np
import nltk 
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords as sw
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(pattern=r"\s|[\,']", gaps=True)
import re 
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report
import gensim.downloader as api
from gensim.models import Word2Vec

torch.manual_seed(1)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ananda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ananda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<torch._C.Generator at 0x23c8d517310>

# Load Dataset

In [2]:
train = pd.read_csv("train.csv", header = 0)
test = pd.read_csv("test_without_labels.csv", header = 0)
val = pd.read_csv("val.csv", header = 0)

train_sent = train['sents'].tolist()
train_labels = train['labels'].tolist()

test_sent = test['sents'].tolist()

val_sent = val['sents'].tolist()
val_labels = val['labels'].tolist()

# Preprocessing

## Lower-Case

In [3]:
train_sent = [x.lower() for x in train_sent]
test_sent = [x.lower() for x in test_sent]
val_sent = [x.lower() for x in val_sent]

# Tokenize

In [4]:
train_sent_token = [tokenizer.tokenize(s) for s in train_sent]
train_labels_token = [tokenizer.tokenize(s) for s in train_labels]
test_sent_token = [tokenizer.tokenize(s) for s in test_sent]
val_sent_token = [tokenizer.tokenize(s) for s in val_sent]
val_labels_token = [tokenizer.tokenize(s) for s in val_labels]

# Generate Word and Tag Index

In [5]:
word_to_ix = {}
for sentence in train_sent_token+val_sent_token+test_sent_token:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
word_list = list(word_to_ix.keys())

START_TAG = "<START>"
STOP_TAG = "<STOP>"
tag_to_ix = {START_TAG:0, STOP_TAG:1}
for tags in train_labels_token+val_labels_token:
    for tag in tags:
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)

## Create index tokens

In [6]:
def to_index(sent, to_ix):
    input_index_list = []
    for word in sent:
        input_index_list.append([to_ix[w] for w in word])
    return input_index_list

train_input_index =  to_index(train_sent_token,word_to_ix)
train_output_index = to_index(train_labels_token,tag_to_ix)
val_input_index = to_index(val_sent_token,word_to_ix)
val_output_index = to_index(val_labels_token,tag_to_ix)
test_input_index = to_index(test_sent_token,word_to_ix)

# Embedding

## Aspect 1: Syntatic Feature (PoS Tagging, Dependancy Path, etc)

In [7]:
patterns = [
        (r'.*ing$', 'VBG'),               # gerunds
        (r'.*ed$', 'VBD'),                # simple past
        (r'.*es$', 'VBZ'),                # 3rd singular present
        (r'.*ould$', 'MD'),               # modals
        (r'.*\'s$', 'NN$'),               # possessive nouns
        (r'.*s$', 'NNS'),                 # plural nouns
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')                     # nouns (default)
]

regexp_tagger = nltk.RegexpTagger(patterns)

pos_tag = []

for token in train_sent_token+val_sent_token+test_sent_token:
    tagged = regexp_tagger.tag(token)
    pos_tag.append(tagged)

## Aspect 2: Semantic Textual Feature (Word2Vec)

#### Word2Vec with dataset

###### Word2Vec only

In [7]:
embed_train_sent = train_sent_token+val_sent_token+test_sent_token

word_set = set()

for paragraph in embed_train_sent:
  for words in paragraph:
    word_set.add(words)

word_list = list(word_set)
word_list.sort()

word_index = {}
ind = 0
for word in word_list:
  word_index[word] = ind
  ind += 1


word2vec_emb_model = Word2Vec(sentences=embed_train_sent, window=5, min_count=5, workers=2, sg=0)


emb_dim = word2vec_emb_model.vector_size
EMBEDDING_DIM = word2vec_emb_model.vector_size
embedding_matrix = []

for word in word_list:
    try:
        embedding_matrix.append(word2vec_emb_model.wv[word])
    except:
        embedding_matrix.append([0]*EMBEDDING_DIM)
embedding_matrix = np.array(embedding_matrix)
embedding_matrix.shape

(11242, 100)

###### Word2Vac concat with POS tag

In [8]:
import gensim.downloader as api

embed_train_sent = pos_tag

word_set = set()

for paragraph in embed_train_sent:
  for words in paragraph:
    word_set.add(words)

word_list = list(word_set)
word_list.sort()

word_index = {}
ind = 0
for word in word_list:
  word_index[word] = ind
  ind += 1

from gensim.models import Word2Vec

word2vec_emb_model = Word2Vec(sentences=embed_train_sent, window=5, min_count=5, workers=2, sg=0)

import numpy as np
emb_dim = word2vec_emb_model.vector_size
EMBEDDING_DIM = word2vec_emb_model.vector_size
embedding_matrix = []

for word in word_list:
    try:
        embedding_matrix.append(word2vec_emb_model.wv[word])
    except:
        embedding_matrix.append([0]*EMBEDDING_DIM)
embedding_matrix = np.array(embedding_matrix)
embedding_matrix.shape

(11242, 100)

## Aspect 3: Domain Feature (Dota Heroes List)

In [6]:
heroes_df = pd.read_json('heroes.json')
# heroes_df['sid'] = heroes_df.index
# heroes_df['color'] = heroes_df.roles.map(color)
# N_HEROES = len(heroes_df)
heroes_df.head()

Unnamed: 0,id,name,localized_name,primary_attr,attack_type,roles,legs
0,1,npc_dota_hero_antimage,Anti-Mage,agi,Melee,"[Carry, Escape, Nuker]",2
1,2,npc_dota_hero_axe,Axe,str,Melee,"[Initiator, Durable, Disabler, Jungler, Carry]",2
2,3,npc_dota_hero_bane,Bane,int,Ranged,"[Support, Disabler, Nuker, Durable]",4
3,4,npc_dota_hero_bloodseeker,Bloodseeker,agi,Melee,"[Carry, Disabler, Jungler, Nuker, Initiator]",2
4,5,npc_dota_hero_crystal_maiden,Crystal Maiden,int,Ranged,"[Support, Disabler, Nuker, Jungler]",2


In [7]:
list_heroes_names = heroes_df['localized_name'].tolist()

In [8]:
embed_train_sent = train_sent_token+val_sent_token+test_sent_token

word_set = set()

for paragraph in embed_train_sent:
  for words in paragraph:
    word_set.add(words)

word_list = list(word_set)
word_list.sort()

word_index = {}
ind = 0
for word in word_list:
  word_index[word] = ind
  ind += 1


word2vec_emb_model = Word2Vec(sentences=list_heroes_names, window=5, min_count=5, workers=2, sg=0)


emb_dim = word2vec_emb_model.vector_size
EMBEDDING_DIM = word2vec_emb_model.vector_size
embedding_matrix = []

for word in word_list:
    try:
        embedding_matrix.append(word2vec_emb_model.wv[word])
    except:
        embedding_matrix.append([0]*EMBEDDING_DIM)
embedding_matrix = np.array(embedding_matrix)
embedding_matrix.shape

(14014, 100)

# Slot Filling/Tagging Model

## Baseline model

#### Model & Helper Functions

In [10]:
## Helper Function

def argmax(vec):
    _, idx = torch.max(vec, 1)
    return idx.item()


def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)



def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))


def cal_acc(model, input_index, output_index):
  ground_truth = []
  predicted = []

  for i, index in enumerate(input_index):
    ground_truth += output_index[i]

    _, preds = model(torch.tensor(index, dtype=torch.long).to(device))

    predicted += preds
  accuracy = sum(np.array(ground_truth) == np.array(predicted))/len(ground_truth)


  return predicted, ground_truth, accuracy

#### Bi-LSTM CRF baseline Model with different embeddings

In [11]:
def argmax(vec):
    _, idx = torch.max(vec, 1)
    return idx.item()


def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.word_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)


        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):

        init_alphas = torch.full((1, self.tagset_size), -10000.)

        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.


        forward_var = init_alphas

        for feat in feats:
            alphas_t = []  
            for next_tag in range(self.tagset_size):
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                trans_score = self.transitions[next_tag].view(1, -1)
                next_tag_var = forward_var + trans_score + emit_score
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = [] 
            viterbivars_t = []

            for next_tag in range(self.tagset_size):
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):
        lstm_feats = self._get_lstm_features(sentence)
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

#### Training and testing the model 

#### word2vec only

In [17]:
import datetime

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_DIM = 100

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001, weight_decay=1e-4)

for epoch in range(2):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model.eval()

    _, _, train_acc = cal_acc(model,train_input_index,train_output_index)

    print("Epoch:%d, Training loss: %.2f, train acc: %.4f" %(epoch+1, train_loss,train_acc))

Epoch:1, Training loss: 127374.40, train acc: 0.6775
Epoch:2, Training loss: 87147.43, train acc: 0.8131


In [18]:
y_pred, y_true, _ = cal_acc(model,val_input_index,val_output_index)

def decode_output(output_list):
    ix_to_tag = {v:k for k,v in tag_to_ix.items()}
    return [ix_to_tag[output] for output in output_list]

y_true_decode = decode_output(y_true)
y_pred_decode = decode_output(y_pred)

print(classification_report(y_true_decode,y_pred_decode,digits=4))

              precision    recall  f1-score   support

           C     0.5455    0.0037    0.0073      1641
           D     0.0000    0.0000    0.0000       398
           O     0.7986    0.9955    0.8862     18985
           P     0.7500    0.6448    0.6934      3936
           S     0.7666    0.6180    0.6843      3322
        SEPA     0.9956    0.9981    0.9968      3603
           T     0.5000    0.0014    0.0027      1469

    accuracy                         0.8123     33354
   macro avg     0.6223    0.4659    0.4673     33354
weighted avg     0.7758    0.8123    0.7626     33354



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
torch.save(model, 'bilstm_crf_w2v.pt')

model = torch.load('bilstm_crf_w2v.pt')

#### word2vec on dataset and POS tag

In [12]:
import datetime

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_DIM = 100

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001, weight_decay=1e-4)

for epoch in range(2):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model.eval()
    # Call the cal_acc functions you implemented as required
    _, _, train_acc = cal_acc(model,train_input_index,train_output_index)
    _, _, val_acc = cal_acc(model,val_input_index,val_output_index)

    val_loss = 0
    for i, idxs in enumerate(val_input_index):
        tags_index = val_output_index[i]
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        loss = model.neg_log_likelihood(sentence_in, targets)
        val_loss+=loss.item()
    time2 = datetime.datetime.now()

    print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))
  

Epoch:1, Training loss: 127977.05, train acc: 0.6776, val loss: 33864.08, val acc: 0.6798, time: 404.47s
Epoch:2, Training loss: 87314.67, train acc: 0.8129, val loss: 21772.78, val acc: 0.8111, time: 405.23s


In [13]:
y_pred, y_true, _ = cal_acc(model,val_input_index,val_output_index)

def decode_output(output_list):
    ix_to_tag = {v:k for k,v in tag_to_ix.items()}
    return [ix_to_tag[output] for output in output_list]

y_true_decode = decode_output(y_true)
y_pred_decode = decode_output(y_pred)

print(classification_report(y_true_decode,y_pred_decode,digits=4))

              precision    recall  f1-score   support

           C     0.4000    0.0024    0.0048      1641
           D     0.0000    0.0000    0.0000       398
           O     0.8064    0.9963    0.8913     18985
           P     0.7347    0.6634    0.6972      3936
           S     0.7223    0.5756    0.6406      3322
        SEPA     0.9737    0.9967    0.9851      3603
           T     0.0000    0.0000    0.0000      1469

    accuracy                         0.8105     33354
   macro avg     0.5196    0.4620    0.4599     33354
weighted avg     0.7425    0.8105    0.7601     33354



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
torch.save(model, 'bilstm_crf_w2v_pos.pt')

model = torch.load('bilstm_crf_w2v_pos.pt')

#### word2vec (heroes only)

In [12]:
import datetime

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_DIM = 100

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001, weight_decay=1e-4)

for epoch in range(2):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model.eval()

    _, _, train_acc = cal_acc(model,train_input_index,train_output_index)

    print("Epoch:%d, Training loss: %.2f, train acc: %.4f" %(epoch+1, train_loss,train_acc))

Epoch:1, Training loss: 126924.80, train acc: 0.6748
Epoch:2, Training loss: 97976.24, train acc: 0.7042


In [13]:
y_pred, y_true, _ = cal_acc(model,val_input_index,val_output_index)

def decode_output(output_list):
    ix_to_tag = {v:k for k,v in tag_to_ix.items()}
    return [ix_to_tag[output] for output in output_list]

y_true_decode = decode_output(y_true)
y_pred_decode = decode_output(y_pred)

print(classification_report(y_true_decode,y_pred_decode,digits=4))

              precision    recall  f1-score   support

           C     0.0000    0.0000    0.0000      1641
           D     0.0000    0.0000    0.0000       398
           O     0.6671    0.9952    0.7988     18985
           P     0.5385    0.0640    0.1144      3936
           S     0.8636    0.2477    0.3850      3322
        SEPA     0.9961    0.9989    0.9975      3603
           T     0.0000    0.0000    0.0000      1469

    accuracy                         0.7066     33354
   macro avg     0.4379    0.3294    0.3280     33354
weighted avg     0.6369    0.7066    0.6143     33354



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
torch.save(model, 'bilstm_crf_w2v_dataset_heroes.pt')

model = torch.load('bilstm_crf_w2v_dataset_heroes.pt')