In [1]:
import json
import os
import timeit

import numpy as np
import pandas as pd
import string

from scipy import stats

import nltk

from collections import Counter

import time

### load samples

In [2]:
data_dir = ".."
train_path = os.path.join(data_dir, "training.json")
dev_path = os.path.join(data_dir, "development.json")
test_path = os.path.join(data_dir, "test.json")

In [3]:
with open(train_path) as fin:
    j = json.load(fin)
    
train_raw = j["data"]

with open(dev_path) as fin:
    j = json.load(fin)
    
dev_raw = j["data"]

In [4]:
tokset_stopwords = set(nltk.corpus.stopwords.words("english"))
tokset_punct = set(string.punctuation)

tokset_stopwords.update(tokset_punct)
tokset_stopwords = sorted(tokset_stopwords)

sample_dim = len(tokset_stopwords) + 1 # for label
print(sample_dim)

212


### generate np dataset

In [5]:
def gen_np_data(data_raw):
    
    qids = set()
    
    for passage in data_raw:
        title = passage['title']
        paragraphs = passage['paragraphs']
        for paragraph in paragraphs:
            context = paragraph['context']
            qas = paragraph['qas']
            for qa in qas:
                question = qa['question']
                tokset_question = set(nltk.tokenize.word_tokenize(question.lower()))
                qid = qa['id']
                qids.add(qid)
    
    sample_num = len(qids)
    
    data = np.zeros((sample_num, sample_dim))
    
    i = 0
    for passage in data_raw:
        title = passage['title']
        paragraphs = passage['paragraphs']
        for paragraph in paragraphs:
            context = paragraph['context']
            qas = paragraph['qas']
            for qa in qas:
                question = qa['question']
                tokset_question = set(nltk.tokenize.word_tokenize(question.lower()))
                tokset_local = [tok for tok in tokset_question if tok in tokset_stopwords]
                toknum_local = len(tokset_local) if tokset_local else 1
                c = Counter(tokset_local)

                for j, k in enumerate(tokset_stopwords):
                    data[i, j] = c.get(k, 0) / toknum_local
                data[i, -1] = int(qa["is_impossible"])

                if i % 10000 == 0:
                    print(i)
                i += 1    
    
    return data

### nn

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [7]:
class NNClassifier(nn.Module):
    def __init__(self, hidden_num=1, dropout_p=None,
                 input_dim=212, hidden_dim=256, class_num=2):
        super(NNClassifier, self).__init__()
        # loss
        self.loss = nn.CrossEntropyLoss()
        # hidden-hidden fcs
        self.hiddens = [nn.Linear(input_dim, hidden_dim) for _ in range(hidden_num-1)]
        # insert input-hidden fc
        self.hiddens.insert(0, nn.Linear(input_dim, hidden_dim))
        # dropout layers
        self.dropout_p = dropout_p
        if dropout_p is not None:
            self.drops = [nn.Dropout(p=dropout_p) for _ in range(hidden_num)]
        # output layer
        self.out = nn.Linear(hidden_dim, class_num)
        # dropout
        
        
    def forward(self, x):
        for i in range(len(self.hiddens)):
            x = F.relu(self.hiddens[i](x))
            if self.dropout_p is not None:
                x = self.drops[i](x)
        x = self.out(x)
        x = x.squeeze()
        val, idx = torch.max(x, dim=1)
        return x, idx
    
    def compute_loss(self, pred_vec, gold_vec):
        return self.loss(pred_vec, gold_vec)

In [25]:
def make_a_try(X_train, X_test, Y_train, Y_test,
               hidden_num, dropout_p, lr, epoch_num,
               batch_size, debug_mode=True):
    debug_report_seg = epoch_num // 10
    train_size, input_dim = X_train.shape
    model = NNClassifier(input_dim=input_dim, dropout_p=dropout_p)
#     optimizer = optim.SGD(model.parameters(), lr=lr)
    optimizer = optim.Adam(model.parameters())
    optimizer.zero_grad()
    model.train()
    start_train = time.time()
    batch_num = train_size // batch_size
    for epoch in range(epoch_num):
        acc = 0
        loss = 0
        correct = 0
        p_total = 0
        r_total = 0
        for batch in range(batch_num):
            X_batch = X_train[batch*batch_size:(batch+1)*batch_size]
            Y_batch = Y_train[batch*batch_size:(batch+1)*batch_size]
            optimizer.zero_grad()
            inputs = torch.tensor(X_batch).float()
            golds  = torch.tensor(Y_batch).long()
            pred_vals, pred_labels  = model(inputs)
            batch_loss = model.compute_loss(pred_vals, golds)
            batch_loss.backward()
            optimizer.step()
            
            if debug_mode and epoch % debug_report_seg == 0:
#                 # debug only
#                 print((pred_labels == golds and pred_labels == 1).sum())
#                 raise RuntimeError("DEBUGGING")
                
                
                acc += golds.eq(pred_labels).sum().float() / batch_size
                loss += batch_loss
                pred_labels = pred_labels.numpy()
                golds = golds.numpy()
                correct += (np.logical_and(pred_labels == golds, pred_labels == 1)).sum()
                p_total += (pred_labels == 1).sum()
                r_total += (golds == 1).sum()

        if debug_mode and epoch % debug_report_seg == 0:
            p = correct / p_total
            r = correct / r_total
            f = 2*p*r / (p+r)
            print("epoch {:>5d}, loss = {:.4f}, p = {:.4f}, r = {:.4f}, f = {:.4f}, acc = {:.6f}"\
                  .format(epoch, loss/batch_num, p, r, f, acc/batch_num))


#     acc = golds.eq(pred_labels).sum().float() / train_size
#     print("training: loss = {}, acc = {}".format(loss, acc))

    model.eval()

    inputs = torch.tensor(X_train).float()
    golds  = torch.tensor(Y_train).long()
    pred_vals, pred_labels  = model(inputs)
    loss = model.compute_loss(pred_vals, golds)
    acc = golds.eq(pred_labels).sum().float() / train_size
    print("training: loss = {}, acc = {}".format(loss, acc))

    
    test_size, input_dim = X_test.shape
    inputs = torch.tensor(X_test).float()
    golds  = torch.tensor(Y_test).long()
    pred_vals, pred_labels  = model(inputs)
    loss = model.compute_loss(pred_vals, golds)
    acc = golds.eq(pred_labels).sum().float() / test_size
    print("test: loss = {}, acc = {}".format(loss, acc))
    
    return model

### get data

In [9]:
train_np = gen_np_data(train_raw)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000


In [10]:
print(set(train_np[:, -1].tolist()))
print(train_np.shape)

{0.0, 1.0}
(100814, 212)


In [11]:
dev_np = gen_np_data(dev_raw)

0
10000
20000


In [12]:
print(set(dev_np[:, -1].tolist()))
print(dev_np.shape)

{0.0, 1.0}
(29505, 212)


In [13]:
X_train = train_np[:, :-1]
Y_train = train_np[:, -1]
X_dev = dev_np[:, :-1]
Y_dev = dev_np[:, -1]

In [26]:
# 1 layer, no dropout
model_ch = make_a_try(X_train, X_dev, Y_train, Y_dev, 
                      hidden_num=1, dropout_p=None, lr=1, epoch_num=50,
                      batch_size=5000, debug_mode=True)

epoch     0, loss = 0.6866, p = 0.2641, r = 0.1624, f = 0.2011, acc = 0.567940




epoch     5, loss = 0.6387, p = nan, r = 0.0000, f = nan, acc = 0.665100
epoch    10, loss = 0.6373, p = nan, r = 0.0000, f = nan, acc = 0.665100
epoch    15, loss = 0.6371, p = nan, r = 0.0000, f = nan, acc = 0.665100


KeyboardInterrupt: 

---

### count of stop words

In [15]:
from collections import Counter

c0 = Counter()
tot0 = 0
c1 = Counter()
tot1 = 0

i = 0

for passage in data:
    title = passage['title']
    paragraphs = passage['paragraphs']
    for paragraph in paragraphs:
        context = paragraph['context']
        qas = paragraph['qas']
        for qa in qas:
            question = qa['question']
            tokset_question = set(nltk.tokenize.word_tokenize(question.lower()))
            qid = qa['id']
            
            c = Counter((tok for tok in tokset_question if tok in tokset_stopwords))
            
            label = int(qa["is_impossible"])
            if label == 1:
                c1 += c
                tot1 += 1
            else:
                c0 += c
                tot0 += 1

            if i % 1000 == 0:
                print(i)
            i += 1

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000


In [22]:
keys = set(c0.keys())
keys.update(c1.keys())
keys = sorted(keys)

for k in keys:
    print(f"{k:>10s} : {c0[k]/tot0:.4f}, {c1[k]/tot1:.4f}, {c0[k]/tot0-c1[k]/tot1:.4f}")

         ! : 0.0002, 0.0001, 0.0001
         # : 0.0001, 0.0003, -0.0001
         $ : 0.0005, 0.0013, -0.0008
         % : 0.0016, 0.0037, -0.0020
         & : 0.0014, 0.0012, 0.0002
         ' : 0.0094, 0.0043, 0.0051
         ( : 0.0014, 0.0011, 0.0003
         ) : 0.0014, 0.0011, 0.0002
         * : 0.0000, 0.0000, 0.0000
         + : 0.0001, 0.0001, -0.0000
         , : 0.0672, 0.0397, 0.0274
         - : 0.0003, 0.0003, 0.0001
         . : 0.0042, 0.0023, 0.0019
         / : 0.0000, 0.0000, 0.0000
         : : 0.0007, 0.0008, -0.0001
         ; : 0.0003, 0.0003, -0.0000
         < : 0.0000, 0.0000, 0.0000
         = : 0.0000, 0.0001, -0.0000
         > : 0.0002, 0.0001, 0.0001
         ? : 0.9892, 0.9922, -0.0030
         [ : 0.0001, 0.0000, 0.0001
         \ : 0.0000, 0.0000, -0.0000
         ] : 0.0001, 0.0000, 0.0001
         ` : 0.0000, 0.0000, 0.0000
         a : 0.1168, 0.1128, 0.0040
     about : 0.0126, 0.0122, 0.0004
     above : 0.0008, 0.0011, -0.0003
     after : 0.016

In [21]:
len(tokset_stopwords)

211

### load word2vec

In [2]:
import gensim



In [3]:
WORD2VEC_PATH = "we/GoogleNews-vectors-negative300.bin"

In [4]:
w2v = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

In [5]:
we_len = 300
def get_vector(word):
    global w2v, we_len
    return w2v.get_vector(word) if word in w2v.vocab \
            else np.zeros((we_len, ))