In [1]:
import pandas as pd
import numpy as np
import datetime

In [2]:
# Load the pickled GAP splits in train, test, valid order.
# NOTE(review): these files are presumably written by an earlier preprocessing
# notebook -- confirm. Unpickling executes code stored in the file, so only
# load pickles that come from a trusted source.
gap_train, gap_test, gap_valid = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        './temp_result/train_kaggle_processed_PPA_PCA_PPA',
        './temp_result/test_kaggle_processed_PPA_PCA_PPA',
        './temp_result/valid_kaggle_processed_PPA_PCA_PPA',
    )
)

In [3]:
# Number of examples (rows) in each split.
# len() counts rows directly. DataFrame.count() instead counts the non-null
# cells of every column, so taking its first entry under-reports whenever the
# first column holds a null (and scans all columns just to get one number).
NUM_TRAIN = len(gap_train)
NUM_TEST = len(gap_test)
NUM_VALID = len(gap_valid)
def label(A, B):
    """Encode the two coreference flags as a single class id.

    Args:
        A: Boolean flag for candidate A (Python ``bool`` or ``numpy.bool_``).
        B: Boolean flag for candidate B (Python ``bool`` or ``numpy.bool_``).

    Returns:
        int: 0 when ``A`` is true, otherwise 1 when ``B`` is true, otherwise 2.
    """
    def _is_true(flag):
        # `flag is True` is an identity test that only the Python singleton
        # passes; numpy.bool_(True) would silently fall through to class 2.
        # Accept both boolean types but keep rejecting other truthy values
        # (1, "TRUE", ...), exactly as the identity test did.
        return isinstance(flag, (bool, np.bool_)) and bool(flag)

    if _is_true(A):
        return 0
    if _is_true(B):
        return 1
    return 2
def switch_label(l):
    """Return the class id obtained when candidates A and B trade places.

    Class 2 is returned unchanged; any other value ``l`` becomes ``1 - l``,
    so 0 and 1 are exchanged.
    """
    return 2 if l == 2 else 1 - l

In [4]:
def process_prediction(pred):
    """Average the first half of the rows with the column-swapped second half.

    The rows of ``pred`` are split at ``pred.shape[0] // 2``. Columns 0 and 1
    of the second half are exchanged (column 2 stays in place) and the result
    is averaged element-wise with the first half.

    NOTE(review): presumably the second half holds predictions for the
    A/B-swapped copies appended by ``switch_A_B`` -- confirm with the caller.

    Args:
        pred: 2-D array with three columns, indexable as ``pred[rows, cols]``.

    Returns:
        The element-wise mean of the two halves.
    """
    half = pred.shape[0] // 2
    first_half, second_half = pred[:half, :], pred[half:, :]
    # Put the second half back into the first half's column order before averaging.
    realigned = second_half[:, [1, 0, 2]]
    return (first_half + realigned) / 2

In [5]:
def switch_A_B(df):
    """Append a copy of ``df`` in which candidates A and B trade places.

    In the appended copy each A-side column holds the data of its B-side
    partner and vice versa, and ``label`` is remapped with ``switch_label``.
    The columns are swapped by name, so the result does not depend on the
    column order of ``df``, and columns that belong to neither candidate
    (e.g. ``pron_pos``, ``pron_vector``) are carried over unchanged.

    The previous implementation relabelled the reordered columns positionally
    (``df2.columns = df.columns``), which only worked when ``df`` had exactly
    the eleven expected columns in one specific order.

    Args:
        df (pd.DataFrame): Feature frame containing a ``label`` column.

    Returns:
        pd.DataFrame: ``df`` followed by the swapped copy. Row index values of
        the original are repeated in the second half.
    """
    swap_pairs = [
        ("A_dist", "B_dist"),
        ("A_pos", "B_pos"),
        ("A_vector", "B_vector"),
        ("product_vector_A", "product_vector_B"),
        # NOTE(review): the index lists must follow their candidate too,
        # otherwise they would point at the wrong mention -- confirm.
        ("A_idx", "B_idx"),
    ]
    rename_map = {}
    for a_col, b_col in swap_pairs:
        if a_col in df.columns and b_col in df.columns:
            rename_map[a_col] = b_col
            rename_map[b_col] = a_col

    # rename() applies the mapping to every label at once, so A<->B is a true
    # swap; re-selecting df.columns restores the original column order.
    swapped = df.rename(columns=rename_map)[list(df.columns)].copy()
    swapped["label"] = swapped["label"].map(switch_label)
    return pd.concat([df, swapped], axis=0, sort=False)

In [6]:
import torch
def compute_loss(sub_df, test_data):
    """Mean negative log-likelihood of predicted probabilities against labels.

    Args:
        sub_df (pd.DataFrame): Frame with probability columns ``A``, ``B`` and
            ``NEITHER`` (read in that order, one row per example).
        test_data: Object exposing a ``label`` attribute that iterates over
            integer class ids indexing those three columns.

    Returns:
        float: ``torch.nn.NLLLoss`` (default mean reduction) applied to the
        natural log of the probabilities. A probability of exactly 0 yields
        ``-inf`` in the log, so callers should clip beforehand.
    """
    probabilities = sub_df.loc[:, ['A', 'B', 'NEITHER']].values
    log_probs = torch.Tensor(np.log(probabilities))
    targets = torch.LongTensor(list(test_data.label))
    criterion = torch.nn.NLLLoss()
    return criterion(log_probs, targets).item()

In [7]:
# Build the training feature frame from the raw split:
#   1. drop the text/offset/bookkeeping columns,
#   2. collapse each A/B/pronoun vector entry with a mean over axis 0
#      (presumably averaging the token vectors of a mention -- confirm),
#   3. add element-wise products of each candidate vector with the pronoun vector,
#   4. derive the class id from the two coreference flags, then drop the flags.
train_data = (
    gap_train
    .drop(columns=['ID', 'Text', 'Pronoun', 'Pronoun-offset', 'A', 'A-offset',
                   'B', 'B-offset', 'URL', 'tokens', 'token_map',
                   'sentence_map', 'pron_idx'])
    .assign(
        A_vector=lambda frame: frame["A_vector"].map(lambda vec: np.mean(vec, axis=0)),
        B_vector=lambda frame: frame["B_vector"].map(lambda vec: np.mean(vec, axis=0)),
        pron_vector=lambda frame: frame["pron_vector"].map(lambda vec: np.mean(vec, axis=0)),
    )
    .assign(
        product_vector_A=lambda frame: frame["A_vector"] * frame["pron_vector"],
        product_vector_B=lambda frame: frame["B_vector"] * frame["pron_vector"],
    )
    .assign(
        label=lambda frame: frame.apply(
            lambda row: label(row["A-coref"], row["B-coref"]), axis=1
        )
    )
    .drop(columns=["A-coref", "B-coref"])
)

In [8]:
# Build the test feature frame from the raw split:
#   1. drop the text/offset/bookkeeping columns,
#   2. collapse each A/B/pronoun vector entry with a mean over axis 0
#      (presumably averaging the token vectors of a mention -- confirm),
#   3. add element-wise products of each candidate vector with the pronoun vector,
#   4. derive the class id from the two coreference flags, then drop the flags.
test_data = (
    gap_test
    .drop(columns=['ID', 'Text', 'Pronoun', 'Pronoun-offset', 'A', 'A-offset',
                   'B', 'B-offset', 'URL', 'tokens', 'token_map',
                   'sentence_map', 'pron_idx'])
    .assign(
        A_vector=lambda frame: frame["A_vector"].map(lambda vec: np.mean(vec, axis=0)),
        B_vector=lambda frame: frame["B_vector"].map(lambda vec: np.mean(vec, axis=0)),
        pron_vector=lambda frame: frame["pron_vector"].map(lambda vec: np.mean(vec, axis=0)),
    )
    .assign(
        product_vector_A=lambda frame: frame["A_vector"] * frame["pron_vector"],
        product_vector_B=lambda frame: frame["B_vector"] * frame["pron_vector"],
    )
    .assign(
        label=lambda frame: frame.apply(
            lambda row: label(row["A-coref"], row["B-coref"]), axis=1
        )
    )
    .drop(columns=["A-coref", "B-coref"])
)

In [9]:
# Build the validation feature frame from the raw split:
#   1. drop the text/offset/bookkeeping columns,
#   2. collapse each A/B/pronoun vector entry with a mean over axis 0
#      (presumably averaging the token vectors of a mention -- confirm),
#   3. add element-wise products of each candidate vector with the pronoun vector,
#   4. derive the class id from the two coreference flags, then drop the flags.
valid_data = (
    gap_valid
    .drop(columns=['ID', 'Text', 'Pronoun', 'Pronoun-offset', 'A', 'A-offset',
                   'B', 'B-offset', 'URL', 'tokens', 'token_map',
                   'sentence_map', 'pron_idx'])
    .assign(
        A_vector=lambda frame: frame["A_vector"].map(lambda vec: np.mean(vec, axis=0)),
        B_vector=lambda frame: frame["B_vector"].map(lambda vec: np.mean(vec, axis=0)),
        pron_vector=lambda frame: frame["pron_vector"].map(lambda vec: np.mean(vec, axis=0)),
    )
    .assign(
        product_vector_A=lambda frame: frame["A_vector"] * frame["pron_vector"],
        product_vector_B=lambda frame: frame["B_vector"] * frame["pron_vector"],
    )
    .assign(
        label=lambda frame: frame.apply(
            lambda row: label(row["A-coref"], row["B-coref"]), axis=1
        )
    )
    .drop(columns=["A-coref", "B-coref"])
)

In [10]:
import torch.nn as nn

In [11]:
import torch.nn.functional as F
def masked_softmax(logits, mask, dim=-1, log_softmax=False):
    """Softmax over ``dim`` that gives (near-)zero weight to masked positions.

    Positions where ``mask`` is 0 have their logit replaced by ``-1e30``
    before the softmax, so they receive essentially no probability mass.

    Args:
        logits (torch.Tensor): Inputs to the softmax function.
        mask (torch.Tensor): Tensor that multiplies element-wise with
            ``logits`` (same shape, or broadcastable to it); 1 keeps a
            position, 0 suppresses it. It is cast to ``float32``.
        dim (int): Dimension over which to normalise.
        log_softmax (bool): Return log-probabilities (``F.log_softmax``)
            instead of probabilities, e.g. for ``F.nll_loss``.

    Returns:
        torch.Tensor: Masked softmax (or log-softmax) of ``logits``.
    """
    keep = mask.type(torch.float32)
    # Kept positions retain their logit; suppressed ones become -1e30.
    masked_logits = keep * logits + (1 - keep) * -1e30
    if log_softmax:
        return F.log_softmax(masked_logits, dim)
    return F.softmax(masked_logits, dim)
class BiDAFAttention(nn.Module):
    """BiDAF-style bidirectional attention followed by a per-position scorer.

    The context attends to the query and the query attends to the context
    (the attention flow of https://arxiv.org/abs/1611.01603). The features
    ``[c, a, c * a, c * b]`` -- where ``a`` is the context-to-query attention
    and ``b`` the query-to-context attention -- are concatenated into a
    ``4 * hidden_size`` vector per context position, which a two-layer MLP
    (``4 * hidden_size -> 64 -> 1``) reduces to one scalar score.

    The output of ``forward`` therefore has shape ``(batch_size, c_len)``.

    Args:
        hidden_size (int): Size of the last dimension of both inputs.
        drop_prob (float): Probability of zero-ing out activations; shared by
            all four dropout layers.
    """
    def __init__(self, hidden_size, drop_prob=0.6):
        super(BiDAFAttention, self).__init__()
        self.drop_prob = drop_prob

        # Parameters of the trilinear similarity function (see
        # get_similarity_matrix). Creation and initialisation order is kept
        # fixed so seeded runs and saved state dicts stay comparable.
        self.c_weight = nn.Parameter(torch.zeros(hidden_size, 1))
        self.q_weight = nn.Parameter(torch.zeros(hidden_size, 1))
        self.cq_weight = nn.Parameter(torch.zeros(1, 1, hidden_size))
        nn.init.xavier_uniform_(self.c_weight)
        nn.init.xavier_uniform_(self.q_weight)
        nn.init.xavier_uniform_(self.cq_weight)
        self.bias = nn.Parameter(torch.zeros(1))

        # drop1/drop2 act on context/query before the similarity matrix;
        # drop3/drop4 act before the first/second scoring layer.
        self.drop1 = nn.Dropout(self.drop_prob)
        self.drop2 = nn.Dropout(self.drop_prob)
        self.output_layer1 = nn.Linear(4 * hidden_size, 64)
        self.drop3 = nn.Dropout(self.drop_prob)
        self.output_layer2 = nn.Linear(64, 1)
        self.drop4 = nn.Dropout(self.drop_prob)
        for layer in (self.output_layer1, self.output_layer2):
            nn.init.xavier_uniform_(layer.weight)

    def forward(self, c, q, c_mask, q_mask):
        """Score every context position against the query.

        Args:
            c (torch.Tensor): Context, shape (batch_size, c_len, hidden_size).
            q (torch.Tensor): Query, shape (batch_size, q_len, hidden_size).
            c_mask (torch.Tensor): Context mask with batch_size * c_len
                elements; 0 marks positions to ignore.
            q_mask (torch.Tensor): Query mask with batch_size * q_len
                elements; 0 marks positions to ignore.

        Returns:
            torch.Tensor: Unnormalised scores of shape (batch_size, c_len).
        """
        batch_size, c_len, _ = c.shape
        q_len = q.size(1)

        similarity = self.get_similarity_matrix(c, q)          # (bs, c_len, q_len)
        context_mask = c_mask.view(batch_size, c_len, 1)       # (bs, c_len, 1)
        query_mask = q_mask.view(batch_size, 1, q_len)         # (bs, 1, q_len)
        over_query = masked_softmax(similarity, query_mask, dim=2)
        over_context = masked_softmax(similarity, context_mask, dim=1)

        # Context-to-query: (bs, c_len, q_len) x (bs, q_len, hid) => (bs, c_len, hid)
        a = torch.bmm(over_query, q)
        # Query-to-context: (bs, c_len, c_len) x (bs, c_len, hid) => (bs, c_len, hid)
        b = torch.bmm(torch.bmm(over_query, over_context.transpose(1, 2)), c)

        features = torch.cat([c, a, c * a, c * b], dim=2)      # (bs, c_len, 4 * hid)
        hidden = F.relu(self.output_layer1(self.drop3(features)))
        scores = self.output_layer2(self.drop4(hidden))        # (bs, c_len, 1)
        return scores.squeeze(-1)

    def get_similarity_matrix(self, c, q):
        """Compute the BiDAF "similarity matrix" between context and query.

        Rather than concatenating ``[c, q, c * q]`` and projecting with one
        weight matrix, the three contributions are computed separately and
        summed, which needs less memory for the same result.

        See Also:
            Equation 1 in https://arxiv.org/abs/1611.01603

        Returns:
            torch.Tensor: Similarities of shape (batch_size, c_len, q_len).
        """
        c_len, q_len = c.size(1), q.size(1)
        c = self.drop1(c)  # (bs, c_len, hid_size)
        q = self.drop2(q)  # (bs, q_len, hid_size)

        # Each term is brought to shape (batch_size, c_len, q_len).
        context_term = torch.matmul(c, self.c_weight).expand([-1, -1, q_len])
        query_term = torch.matmul(q, self.q_weight).transpose(1, 2).expand([-1, c_len, -1])
        interaction_term = torch.matmul(c * self.cq_weight, q.transpose(1, 2))
        return context_term + query_term + interaction_term + self.bias


In [12]:
# Training hyper-parameters.
EPOCHS = 150
batch_size = 25

# Attention model with hidden_size=256, which must equal the last dimension of
# the token vectors it is fed. Placed on the GPU (a CUDA device is required).
bidaf = BiDAFAttention(256)
bidaf = bidaf.cuda()

# Weighted cross-entropy over the three classes; class 2 counts ten times as
# much as classes 0 and 1.
loss_fn = torch.nn.CrossEntropyLoss(
    weight=torch.Tensor([1.0, 1.0, 10.0]),
).cuda()
opt = torch.optim.Adam(bidaf.parameters(), lr=0.001)

In [13]:
# Preview the first five rows of the validation features as a sanity check.
valid_data.head(n=5)

Unnamed: 0,vector,A_dist,B_dist,A_pos,B_pos,pron_pos,A_idx,B_idx,A_vector,B_vector,pron_vector,product_vector_A,product_vector_B,label
0,"[[0.85816926, -0.45762736, -0.08299036, -0.275...",0.018,0.004,0.693548,0.806452,0.83871,"[52, 53, 54, 55, 56, 57, 58, 59]","[63, 64, 65]","[0.20037952, 0.2805402, -0.11013528, -0.492183...","[0.6041415, -0.13538514, 0.15207358, -0.076126...","[0.8586822, -1.2192798, 0.09194927, -0.4327072...","[0.17206234, -0.342057, -0.010126858, 0.212971...","[0.51876557, 0.16507237, 0.013983054, 0.032940...",2
1,"[[0.2511816, 0.24685939, -0.3399855, -0.624934...",0.03,0.012,0.511111,0.711111,0.844444,"[34, 35]","[46, 47, 48]","[0.067682624, 0.42009318, -0.009991955, -0.659...","[-0.438681, 0.48596978, -0.02240046, 0.0839553...","[-0.67510414, -0.39920956, 0.07794422, 0.43378...","[-0.04569282, -0.16770521, -0.0007788151, -0.2...","[0.29615536, -0.19400378, -0.0017459863, 0.036...",1
2,"[[0.8131293, 0.4440992, 0.7549018, 0.4792695, ...",0.022,0.01,0.405128,0.435897,0.461538,"[89, 90, 91, 92, 93, 94, 95]","[99, 100]","[-0.40055174, -0.06318157, 0.08668772, -0.0661...","[0.67819536, 0.21544528, -0.39938855, 0.618967...","[0.72358626, -1.0747175, 0.050702423, -0.89089...","[-0.28983372, 0.06790234, 0.0043952777, 0.0588...","[0.49073285, -0.23154281, -0.020249967, -0.551...",1
3,"[[1.033058, -0.108201064, -0.1557167, 0.394294...",0.012,0.004,0.201521,0.21673,0.224335,"[66, 67, 68]","[73, 74, 75]","[-0.33478984, -0.63129133, -0.5514479, 0.59391...","[0.50680375, 0.71950006, -0.25772676, 0.375783...","[1.2850071, 0.049930945, -0.26918668, 0.253453...","[-0.43020734, -0.031520974, 0.14844243, 0.1505...","[0.6512464, 0.035925318, 0.06937661, 0.0952435...",0
4,"[[1.3460386, 0.037629873, 0.55879486, -0.42726...",0.056,0.042,0.321839,0.362069,0.482759,"[71, 72, 73, 74]","[80, 81, 82, 83]","[0.25996214, -0.15495634, -0.5617872, -0.04960...","[0.2925861, 0.2669702, 0.03382411, -0.3578499,...","[-0.87992877, -0.07835998, 0.12989467, -0.2304...","[-0.22874817, 0.012142375, -0.07297316, 0.0114...","[-0.2574549, -0.020919777, 0.0043935715, 0.082...",1


In [14]:
from torch.nn.utils.rnn import pad_sequence
# Train `bidaf` for EPOCHS passes over train_data in fixed-order mini-batches
# (no shuffling). Each example is turned into three scores -- candidate A,
# candidate B, neither -- which are trained with `loss_fn`.
for e in range(EPOCHS):
    for b in range(0,train_data.shape[0],batch_size):
        bidaf.train()  # enable dropout (re-asserted on every batch)
        batch_data = train_data.vector[b:b+batch_size]
        batch_label = train_data.label[b:b+batch_size]
        batch_pron = train_data.pron_vector[b:b+batch_size]
        # Pronoun vectors become a length-1 "query" sequence: (batch, 1, dim).
        batch_pron = torch.Tensor(np.array(list(batch_pron.values))).unsqueeze(1).cuda()
        # pad_sequence returns (max_len, batch, dim) with zero padding by
        # default; transpose to (batch, max_len, dim).
        # assumes each `vector` entry is a (num_tokens, dim) array -- TODO confirm
        batch_data = pad_sequence([torch.Tensor(v) for v in batch_data]).cuda().transpose(0,1)
        # Extra position prepended at index 0: the mean over all positions of
        # the padded sequence (padding rows included). Its score is used as
        # the "neither" score below.
        batch_padding = batch_data.mean(dim=1,keepdim = True)
        batch_data = torch.cat([batch_padding,batch_data],dim = 1)
        batch_label = torch.LongTensor(list(batch_label)).cuda()
        # A position is treated as real when the mean of its features is
        # non-zero; zero-padded rows therefore get mask value False. A real
        # token whose features average exactly 0 would be masked as well.
        c_mask = torch.zeros_like(batch_data.mean(-1,keepdim = True)) != batch_data.mean(-1,keepdim = True)
        q_mask = torch.zeros_like(batch_pron.mean(-1,keepdim = True)) != batch_pron.mean(-1,keepdim = True)
        c_mask = c_mask.cuda()
        q_mask = q_mask.cuda()
        # One scalar score per context position: (batch, max_len + 1).
        output = bidaf(batch_data,batch_pron,c_mask,q_mask)
        # Indices are shifted by +1 because of the position prepended above.
        mask_A = [np.array(v)+1 for v in list(train_data.A_idx[b:b+batch_size])]
        mask_B = [np.array(v)+1 for v in list(train_data.B_idx[b:b+batch_size])]
        prob_list = []
        for i,(v_A,v_B) in enumerate(zip(mask_A,mask_B)):
            # Despite the "*_prob" names these are unnormalised scores: the
            # sum of the model outputs over the positions listed for A / B.
            v_A = torch.LongTensor(v_A).cuda()
            A_prob_ = output[i,v_A].sum()
            v_B = torch.LongTensor(v_B).cuda()
            B_prob_ = output[i,v_B].sum()
            # "Neither" score = score of the prepended position 0.
            other_prob = output[i,0].sum()
            prob_list.append(torch.cat([A_prob_.view(1,1),B_prob_.view(1,1),other_prob.view(1,1)]).view(-1,3))
        # (batch, 3) scores in the column order A, B, neither.
        pred_train = torch.cat(prob_list,dim = 0)

        loss = loss_fn(pred_train,batch_label)
        opt.zero_grad()
        loss.backward()
        opt.step()

    # Reports the loss of the LAST mini-batch of the epoch only, not an epoch
    # average, so the printed values are noisy.
    print('epoch [{}/{}], loss:{:.4f}'.format(e + 1, EPOCHS, loss.item()))

epoch [1/150], loss:1.0879
epoch [2/150], loss:1.1244
epoch [3/150], loss:0.8314
epoch [4/150], loss:0.6250
epoch [5/150], loss:0.9473
epoch [6/150], loss:0.9767
epoch [7/150], loss:0.4623
epoch [8/150], loss:0.6780
epoch [9/150], loss:0.6725
epoch [10/150], loss:0.5636
epoch [11/150], loss:0.5046
epoch [12/150], loss:0.8632
epoch [13/150], loss:0.4844
epoch [14/150], loss:0.4487
epoch [15/150], loss:0.6339
epoch [16/150], loss:0.6400
epoch [17/150], loss:0.6340
epoch [18/150], loss:0.2961
epoch [19/150], loss:0.5056
epoch [20/150], loss:0.4926
epoch [21/150], loss:0.4274
epoch [22/150], loss:0.4474
epoch [23/150], loss:0.4875
epoch [24/150], loss:0.4926
epoch [25/150], loss:0.6400
epoch [26/150], loss:0.5452
epoch [27/150], loss:0.4732
epoch [28/150], loss:0.5090
epoch [29/150], loss:0.2909
epoch [30/150], loss:0.5318
epoch [31/150], loss:0.9475
epoch [32/150], loss:0.5565
epoch [33/150], loss:0.3499
epoch [34/150], loss:0.3302
epoch [35/150], loss:0.4981
epoch [36/150], loss:0.2509
e

In [16]:
# Predict class probabilities (columns A, B, neither) for every training
# example with the trained model, batch by batch.
pred_bidaf_train = []
bidaf.eval()  # disable dropout; the mode does not change inside the loop
# Inference needs no gradients. Without no_grad() every batch's autograd graph
# is kept alive by the score tensors collected in pred_bidaf_train.
with torch.no_grad():
    for b in range(0, train_data.shape[0], batch_size):
        batch_data = train_data.vector[b:b+batch_size]
        batch_pron = train_data.pron_vector[b:b+batch_size]
        # Pronoun vectors become a length-1 "query" sequence: (batch, 1, dim).
        batch_pron = torch.Tensor(np.array(list(batch_pron.values))).unsqueeze(1).cuda()
        # (max_len, batch, dim) zero-padded -> (batch, max_len, dim).
        batch_data = pad_sequence([torch.Tensor(v) for v in batch_data]).cuda().transpose(0, 1)
        # Prepend the mean over positions as index 0; its score is "neither".
        batch_padding = batch_data.mean(dim=1, keepdim=True)
        batch_data = torch.cat([batch_padding, batch_data], dim=1)
        # Positions whose feature mean is non-zero count as real tokens;
        # zero-padded rows are masked out.
        c_mask = batch_data.mean(-1, keepdim=True) != 0
        q_mask = batch_pron.mean(-1, keepdim=True) != 0
        output = bidaf(batch_data, batch_pron, c_mask, q_mask)  # (batch, max_len + 1)
        # +1 compensates for the position prepended above.
        mask_A = [np.array(v)+1 for v in list(train_data.A_idx[b:b+batch_size])]
        mask_B = [np.array(v)+1 for v in list(train_data.B_idx[b:b+batch_size])]
        prob_list = []
        for i, (v_A, v_B) in enumerate(zip(mask_A, mask_B)):
            # Unnormalised scores: summed model output over each candidate's positions.
            v_A = torch.LongTensor(v_A).cuda()
            A_prob_ = output[i, v_A].sum()
            v_B = torch.LongTensor(v_B).cuda()
            B_prob_ = output[i, v_B].sum()
            other_prob = output[i, 0].sum()
            prob_list.append(torch.cat([A_prob_.view(1, 1), B_prob_.view(1, 1), other_prob.view(1, 1)]).view(-1, 3))
        pred_bidaf_ = torch.cat(prob_list, dim=0)
        pred_bidaf_train.append(pred_bidaf_)
# Normalise the three scores of each example into probabilities: (N, 3) ndarray.
pred_bidaf_train = torch.nn.Softmax(dim=1)(torch.cat(pred_bidaf_train, dim=0)).detach().cpu().numpy()

In [17]:
# Predict class probabilities (columns A, B, neither) for every test example
# with the trained model, batch by batch.
pred_bidaf = []
bidaf.eval()  # disable dropout; the mode does not change inside the loop
# Inference needs no gradients. Without no_grad() every batch's autograd graph
# is kept alive by the score tensors collected in pred_bidaf.
with torch.no_grad():
    for b in range(0, test_data.shape[0], batch_size):
        batch_data = test_data.vector[b:b+batch_size]
        batch_pron = test_data.pron_vector[b:b+batch_size]
        # Pronoun vectors become a length-1 "query" sequence: (batch, 1, dim).
        batch_pron = torch.Tensor(np.array(list(batch_pron.values))).unsqueeze(1).cuda()
        # (max_len, batch, dim) zero-padded -> (batch, max_len, dim).
        batch_data = pad_sequence([torch.Tensor(v) for v in batch_data]).cuda().transpose(0, 1)
        # Prepend the mean over positions as index 0; its score is "neither".
        batch_padding = batch_data.mean(dim=1, keepdim=True)
        batch_data = torch.cat([batch_padding, batch_data], dim=1)
        # Positions whose feature mean is non-zero count as real tokens;
        # zero-padded rows are masked out.
        c_mask = batch_data.mean(-1, keepdim=True) != 0
        q_mask = batch_pron.mean(-1, keepdim=True) != 0
        output = bidaf(batch_data, batch_pron, c_mask, q_mask)  # (batch, max_len + 1)
        # +1 compensates for the position prepended above.
        mask_A = [np.array(v)+1 for v in list(test_data.A_idx[b:b+batch_size])]
        mask_B = [np.array(v)+1 for v in list(test_data.B_idx[b:b+batch_size])]
        prob_list = []
        for i, (v_A, v_B) in enumerate(zip(mask_A, mask_B)):
            # Unnormalised scores: summed model output over each candidate's positions.
            v_A = torch.LongTensor(v_A).cuda()
            A_prob_ = output[i, v_A].sum()
            v_B = torch.LongTensor(v_B).cuda()
            B_prob_ = output[i, v_B].sum()
            other_prob = output[i, 0].sum()
            prob_list.append(torch.cat([A_prob_.view(1, 1), B_prob_.view(1, 1), other_prob.view(1, 1)]).view(-1, 3))
        pred_bidaf_ = torch.cat(prob_list, dim=0)
        pred_bidaf.append(pred_bidaf_)
# Normalise the three scores of each example into probabilities: (N, 3) ndarray.
pred_bidaf = torch.nn.Softmax(dim=1)(torch.cat(pred_bidaf, dim=0)).detach().cpu().numpy()

In [18]:
import pickle
# Persist both prediction arrays in one file: train predictions first, then
# test predictions. Reading them back takes two successive pickle.load calls
# in the same order. The context manager closes the file even when a dump
# raises, which the previous open()/close() pair did not guarantee.
with open("./temp_result/biDAF_result_base", "wb") as f:
    pickle.dump(pred_bidaf_train, f)
    pickle.dump(pred_bidaf, f)

In [19]:
# Bound the probabilities to [1e-15, 1 - 1e-15] so that taking their log
# (as compute_loss does) never produces -inf.
pred_bidaf = pred_bidaf.clip(1e-15, 1 - 1e-15)

In [20]:
# Fill the sample-submission template with the predicted probabilities; the
# rows of pred_bidaf must be in the same order as the template rows.
sub_df = pd.read_csv("./test_and_submit/sample_submission_stage_1.csv")
sub_df.loc[:, ['A','B','NEITHER']] = pred_bidaf

# Write a timestamped copy so earlier submissions are never overwritten.
# NOTE(review): str(datetime.now()) contains a space and colons, which are not
# valid in file names on every platform.
submission_stamp = str(datetime.datetime.now())
submission_path = "./test_and_submit/submission+model+bidaf@" + submission_stamp + ".csv"
sub_df.to_csv(submission_path, index=False)

sub_df.head(n=38)


Unnamed: 0,ID,A,B,NEITHER
0,development-1,0.9126874,0.04990904,0.037404
1,development-2,0.9975595,0.0005647483,0.001876
2,development-3,0.01082062,0.9656751,0.023504
3,development-4,0.1281967,0.4960704,0.375733
4,development-5,2.326543e-07,0.9984231,0.001577
5,development-6,0.9995282,0.0003123801,0.000159
6,development-7,0.3293931,0.1664367,0.50417
7,development-8,0.0144415,0.9832012,0.002357
8,development-9,7.350466e-05,0.9980374,0.001889
9,development-10,0.1202069,0.870706,0.009087


In [21]:
# Mean negative log-likelihood of the submitted probabilities against the
# test labels.
compute_loss(sub_df=sub_df, test_data=test_data)

0.5191367864608765