In [1]:
import os
import time
import math
import json
import joblib
import random
import argparse
import numpy as np
import tensorflow as tf
import sys
import csv

from tqdm import tqdm
from functools import partial
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split

from classification_model.opt import adam, warmup_cosine, warmup_linear, warmup_constant
#from classification_model.analysis import rocstories as rocstories_analysis
from classification_model.text_utils import TextEncoder
from classification_model.utils import encode_dataset, flatten, iter_data, find_trainable_variables, convert_gradient_to_tensor, shape_list, ResultLogger, assign_to_gpu, average_grads, make_path

In [2]:
# CSV file names; rc() and analyz() join them with `data_dir` before reading.
# NOTE(review): the training file is named 'entailment_test.csv' — confirm this is the intended training split.
train_dataset = 'entailment_test.csv'
test_dataset = 'cluster_0_news_unprocessed.csv'

In [3]:
def gelu(x):
    """Tanh-based GELU activation, applied element-wise to tensor ``x``."""
    half_x = 0.5*x
    cubed = tf.pow(x, 3)
    inner = math.sqrt(2/math.pi)*(x+0.044715*cubed)
    return half_x*(1+tf.tanh(inner))

def swish(x):
    """Swish activation: ``x`` multiplied by its own sigmoid."""
    gate = tf.nn.sigmoid(x)
    return x*gate

# Name -> callable lookup tables, indexed with the --opt, --afn and
# --lr_schedule option values.
opt_fns = dict(adam=adam)

act_fns = dict(relu=tf.nn.relu, swish=swish, gelu=gelu)

lr_schedules = dict(
    warmup_cosine=warmup_cosine,
    warmup_linear=warmup_linear,
    warmup_constant=warmup_constant,
)

def _norm(x, g=None, b=None, e=1e-5, axis=[1]):
    """Normalise ``x`` to zero mean and unit variance over ``axis``.

    Args:
        x: input tensor.
        g: optional gain; applied only when ``b`` is also given.
        b: optional bias; applied only when ``g`` is also given.
        e: small constant added to the variance before the reciprocal sqrt.
        axis: axes to reduce over. The default list is only read, never
            mutated, so sharing it between calls is safe.

    Returns:
        The normalised tensor, scaled and shifted when both ``g`` and ``b``
        are provided.
    """
    # `keepdims` replaces the deprecated `keep_dims` keyword of tf.reduce_mean.
    u = tf.reduce_mean(x, axis=axis, keepdims=True)
    s = tf.reduce_mean(tf.square(x-u), axis=axis, keepdims=True)
    x = (x - u) * tf.rsqrt(s + e)
    if g is not None and b is not None:
        x = x*g + b
    return x

def norm(x, scope, axis=[-1]):
    """Normalise ``x`` over ``axis`` with a learned gain ``g`` and bias ``b``.

    Both variables are created under ``scope`` and are sized to the last
    dimension of ``x``.
    """
    with tf.variable_scope(scope):
        width = shape_list(x)[-1]
        gain = tf.get_variable("g", [width], initializer=tf.constant_initializer(1))
        bias = tf.get_variable("b", [width], initializer=tf.constant_initializer(0))
        normalised = _norm(x, gain, bias, axis=axis)
    return normalised

def dropout(x, pdrop, train):
    """Apply dropout with drop probability ``pdrop`` while training.

    Returns ``x`` untouched when ``train`` is falsy or ``pdrop`` is not
    positive.
    """
    if not (train and pdrop > 0):
        return x
    return tf.nn.dropout(x, 1-pdrop)

def mask_attn_weights(w):
    """Mask attention scores ``w`` with a lower-triangular band.

    Entries above the diagonal of the last two axes are replaced by -1e9;
    the remaining entries are left unchanged.
    """
    size = shape_list(w)[-1]
    lower_tri = tf.matrix_band_part(tf.ones([size, size]), -1, 0)
    mask = tf.reshape(lower_tri, [1, 1, size, size])
    return w*mask + -1e9*(1-mask)

def _attn(q, k, v, train=False, scale=False):
    """Masked attention: softmax(mask(q @ k)) @ v.

    When ``scale`` is set, the scores are multiplied by the reciprocal square
    root of the last dimension of ``v``. Dropout with the global
    ``attn_pdrop`` is applied to the attention weights while training.
    """
    scores = tf.matmul(q, k)
    if scale:
        scores = scores*tf.rsqrt(tf.cast(shape_list(v)[-1], tf.float32))

    probs = tf.nn.softmax(mask_attn_weights(scores))
    probs = dropout(probs, attn_pdrop, train)
    return tf.matmul(probs, v)

def split_states(x, n):
    """Reshape the last axis of ``x`` (size m) into two axes ``[n, m//n]``."""
    dims = shape_list(x)
    last = dims[-1]
    return tf.reshape(x, dims[:-1]+[n, last//n])

def merge_states(x):
    """Collapse the last two axes of ``x`` into a single axis."""
    dims = shape_list(x)
    merged = np.prod(dims[-2:])
    return tf.reshape(x, dims[:-2]+[merged])

def split_heads(x, n, k=False):
    """Split the last axis of ``x`` into ``n`` heads and move the head axis to position 1.

    With ``k=True`` the last two axes are additionally swapped (permutation
    [0, 2, 3, 1]) so the result can be right-multiplied in ``_attn``.
    """
    perm = [0, 2, 3, 1] if k else [0, 2, 1, 3]
    return tf.transpose(split_states(x, n), perm)

def merge_heads(x):
    """Inverse of ``split_heads`` (k=False): swap axes 1 and 2, then merge the last two axes."""
    swapped = tf.transpose(x, [0, 2, 1, 3])
    return merge_states(swapped)

def conv1d(x, scope, nf, rf, w_init=tf.random_normal_initializer(stddev=0.02), b_init=tf.constant_initializer(0), pad='VALID', train=False):
    """1-D convolution with ``nf`` output features and receptive field ``rf``.

    Creates the variables ``w`` ([rf, nx, nf]) and ``b`` ([nf]) under
    ``scope``. For ``rf == 1`` the convolution is computed as a reshaped
    matrix multiplication; otherwise ``tf.nn.conv1d`` is used. ``train`` is
    accepted but not used here.
    """
    with tf.variable_scope(scope):
        in_width = shape_list(x)[-1]
        kernel = tf.get_variable("w", [rf, in_width, nf], initializer=w_init)
        bias = tf.get_variable("b", [nf], initializer=b_init)
        if rf != 1:
            # General path (was used to train the LM).
            return tf.nn.conv1d(x, kernel, stride=1, padding=pad)+bias
        # Faster 1x1 convolution: flatten, matmul, restore leading dims.
        flat_x = tf.reshape(x, [-1, in_width])
        flat_kernel = tf.reshape(kernel, [-1, nf])
        projected = tf.matmul(flat_x, flat_kernel)+bias
        return tf.reshape(projected, shape_list(x)[:-1]+[nf])

def attn(x, scope, n_state, n_head, train=False, scale=False):
    """Multi-head attention sub-layer.

    Projects ``x`` to queries/keys/values with one ``conv1d``, runs ``_attn``
    per head, merges the heads, projects back to ``n_state`` features and
    applies residual dropout (global ``resid_pdrop``).
    """
    assert n_state%n_head==0
    with tf.variable_scope(scope):
        qkv = conv1d(x, 'c_attn', n_state*3, 1, train=train)
        query, key, value = tf.split(qkv, 3, 2)
        query = split_heads(query, n_head)
        key = split_heads(key, n_head, k=True)
        value = split_heads(value, n_head)
        mixed = merge_heads(_attn(query, key, value, train=train, scale=scale))
        projected = conv1d(mixed, 'c_proj', n_state, 1, train=train)
        return dropout(projected, resid_pdrop, train)

def mlp(x, scope, n_state, train=False):
    """Two-layer position-wise feed-forward sub-layer.

    Expands to ``n_state`` features, applies the activation selected by the
    global ``afn``, projects back to the input width and applies residual
    dropout.
    """
    with tf.variable_scope(scope):
        width = shape_list(x)[-1]
        activation = act_fns[afn]
        hidden = activation(conv1d(x, 'c_fc', n_state, 1, train=train))
        out = conv1d(hidden, 'c_proj', width, 1, train=train)
        return dropout(out, resid_pdrop, train)

def block(x, scope, train=False, scale=False):
    """One transformer block: attention and MLP, each followed by a residual add and ``norm``."""
    with tf.variable_scope(scope):
        width = shape_list(x)[-1]
        attended = attn(x, 'attn', width, n_head, train=train, scale=scale)
        post_attn = norm(x+attended, 'ln_1')
        fed_forward = mlp(post_attn, 'mlp', width*4, train=train)
        return norm(post_attn+fed_forward, 'ln_2')

def embed(X, we):
    """Look up the rows of ``we`` indexed by ``X`` and sum them over axis 2."""
    table = convert_gradient_to_tensor(we)
    looked_up = tf.gather(table, X)
    return tf.reduce_sum(looked_up, 2)

def clf(x, ny, w_init=tf.random_normal_initializer(stddev=0.02), b_init=tf.constant_initializer(0), train=False):
    """Linear classification head: ``x @ w + b`` with ``ny`` outputs.

    Variables live under the ``clf`` scope. ``train`` is accepted but not
    used here.
    """
    with tf.variable_scope('clf'):
        in_width = shape_list(x)[-1]
        weights = tf.get_variable("w", [in_width, ny], initializer=w_init)
        bias = tf.get_variable("b", [ny], initializer=b_init)
        logits = tf.matmul(x, weights)+bias
    return logits

def model(X, M, Y, train=False, reuse=False):
    """Build the transformer with a language-modelling loss and a two-way classification head.

    Args:
        X: int tensor reshaped here to [-1, n_ctx, 2]; channel 0 is read as
            token ids and both channels are embedded and summed by embed().
        M: float mask reshaped here to [-1, n_ctx].
        Y: labels passed to the classification cross-entropy.
        train: enables the dropout calls.
        reuse: forwarded to the 'model' variable scope.

    Returns:
        (clf_logits, clf_losses, lm_losses).
    """
    with tf.variable_scope('model', reuse=reuse):
        # One shared embedding matrix with n_vocab+n_special+n_ctx rows.
        we = tf.get_variable("we", [n_vocab+n_special+n_ctx, n_embd], initializer=tf.random_normal_initializer(stddev=0.02))
        we = dropout(we, embd_pdrop, train)

        # Fold the per-example pair of sequences into the leading axis.
        X = tf.reshape(X, [-1, n_ctx, 2])
        M = tf.reshape(M, [-1, n_ctx])

        h = embed(X, we)
        for layer in range(n_layer):
            h = block(h, 'h%d'%layer, train=train, scale=True)

        # Language-model loss: state at position t scores token t+1, using `we` as output weights.
        lm_h = tf.reshape(h[:, :-1], [-1, n_embd])
        lm_logits = tf.matmul(lm_h, we, transpose_b=True)
        lm_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=lm_logits, labels=tf.reshape(X[:, 1:, 0], [-1]))
        lm_losses = tf.reshape(lm_losses, [shape_list(X)[0], shape_list(X)[1]-1])
        # Masked mean over positions, per sequence.
        lm_losses = tf.reduce_sum(lm_losses*M[:, 1:], 1)/tf.reduce_sum(M[:, 1:], 1)

        # Pick the hidden state at the position where the token equals clf_token
        # (argmax over the equality mask), via a flat index into [-1, n_embd].
        clf_h = tf.reshape(h, [-1, n_embd])
        pool_idx = tf.cast(tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1), tf.int32)
        clf_h = tf.gather(clf_h, tf.range(shape_list(X)[0], dtype=tf.int32)*n_ctx+pool_idx)

        clf_h = tf.reshape(clf_h, [-1, 2, n_embd])
        if train and clf_pdrop > 0:
            # Noise shape with size 1 on axis 1: the same dropout mask is used for both sequences of a pair.
            shape = shape_list(clf_h)
            shape[1] = 1
            clf_h = tf.nn.dropout(clf_h, 1-clf_pdrop, shape)
        clf_h = tf.reshape(clf_h, [-1, n_embd])
        # One scalar logit per sequence, regrouped into [-1, 2] so the softmax runs over the pair.
        clf_logits = clf(clf_h, 1, train=train)
        clf_logits = tf.reshape(clf_logits, [-1, 2])

        clf_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=clf_logits, labels=Y)
        return clf_logits, clf_losses, lm_losses

def mgpu_train(*xs):
    """Build the multi-GPU training graph.

    Each input tensor is split into ``n_gpu`` shards along axis 0; one copy
    of model() is built per shard, gradients are averaged with
    average_grads() and passed to the optimiser selected by the global
    ``opt``.

    Returns:
        A list ``[train_op, clf_logits, clf_losses, lm_losses]`` where the
        last three are the per-shard outputs concatenated along axis 0.
    """
    gpu_ops = []
    gpu_grads = []
    xs = (tf.split(x, n_gpu, 0) for x in xs)
    # NOTE: the loop variable rebinds `xs` to the current shard tuple.
    for i, xs in enumerate(zip(*xs)):
        # Variables are created by the first tower (reuse=None) and reused by the others.
        do_reuse = True if i > 0 else None
        with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(tf.get_variable_scope(), reuse=do_reuse):
            clf_logits, clf_losses, lm_losses = model(*xs, train=True, reuse=do_reuse)
            # The language-model loss is an auxiliary objective weighted by lm_coef.
            if lm_coef > 0:
                train_loss = tf.reduce_mean(clf_losses) + lm_coef*tf.reduce_mean(lm_losses)
            else:
                train_loss = tf.reduce_mean(clf_losses)
            params = find_trainable_variables("model")
            grads = tf.gradients(train_loss, params)
            grads = list(zip(grads, params))
            gpu_grads.append(grads)
            gpu_ops.append([clf_logits, clf_losses, lm_losses])
    ops = [tf.concat(op, 0) for op in zip(*gpu_ops)]
    grads = average_grads(gpu_grads)
    grads = [g for g, p in grads]
    # `params` here is the value left over from the last loop iteration.
    train = opt_fns[opt](params, grads, lr, partial(lr_schedules[lr_schedule], warmup=lr_warmup), n_updates_total, l2=l2, max_grad_norm=max_grad_norm, vector_l2=vector_l2, b1=b1, b2=b2, e=e)
    return [train]+ops

def mgpu_predict(*xs):
    """Build the multi-GPU inference graph.

    Splits every input into ``n_gpu`` shards along axis 0, runs model() with
    reused variables and ``train=False`` on each shard, and returns
    ``[clf_logits, clf_losses, lm_losses]`` concatenated along axis 0.
    """
    per_gpu = []
    shards = (tf.split(tensor, n_gpu, 0) for tensor in xs)
    for gpu_idx, shard in enumerate(zip(*shards)):
        with tf.device(assign_to_gpu(gpu_idx, "/gpu:0")), tf.variable_scope(tf.get_variable_scope(), reuse=True):
            shard_logits, shard_clf_losses, shard_lm_losses = model(*shard, train=False, reuse=True)
            per_gpu.append([shard_logits, shard_clf_losses, shard_lm_losses])
    return [tf.concat(parts, 0) for parts in zip(*per_gpu)]

def transform_roc(X1, X2):
    """Pack encoded text pairs into fixed-size id and mask arrays.

    For every pair two sequences are built, one per ordering:
    ``[start] a [delimiter] b [clf_token]`` and
    ``[start] b [delimiter] a [clf_token]``, with each text truncated to
    ``max_len`` tokens.

    Returns:
        (ids, mask): ``ids`` is int32 of shape (len(X1), 2, n_ctx, 2) with
        token ids in channel 0 and the indices
        n_vocab+n_special .. n_vocab+n_special+n_ctx-1 in channel 1;
        ``mask`` is float32 of shape (len(X1), 2, n_ctx) with 1 over the
        filled positions.
    """
    n_examples = len(X1)
    ids = np.zeros((n_examples, 2, n_ctx, 2), dtype=np.int32)
    mask = np.zeros((n_examples, 2, n_ctx), dtype=np.float32)
    start, delimiter = encoder['_start_'], encoder['_delimiter_']
    for row, (first, second) in enumerate(zip(X1, X2)):
        head, tail = first[:max_len], second[:max_len]
        orderings = (
            [start]+head+[delimiter]+tail+[clf_token],
            [start]+tail+[delimiter]+head+[clf_token],
        )
        for slot, seq in enumerate(orderings):
            ids[row, slot, :len(seq), 0] = seq
            mask[row, slot, :len(seq)] = 1
    ids[:, :, :, 1] = np.arange(n_vocab+n_special, n_vocab+n_special+n_ctx)
    return ids, mask

def iter_apply(Xs, Ms, Ys):
    """Evaluate logits and classification loss over a dataset in batches.

    Full batches run on the multi-GPU eval graph; a smaller final batch runs
    on the single-graph placeholders. Each fetched value is multiplied by the
    batch size before aggregation (so the returned logits are scaled by it).

    Returns:
        [concatenated logits, summed cost as a float].
    """
    reducers = [lambda x:np.concatenate(x, 0), lambda x:float(np.sum(x))]
    collected = []
    for xmb, mmb, ymb in iter_data(Xs, Ms, Ys, n_batch=n_batch_train, truncate=False, verbose=True):
        n = len(xmb)
        if n == n_batch_train:
            fetches = [eval_mgpu_logits, eval_mgpu_clf_loss]
            feed = {X_train:xmb, M_train:mmb, Y_train:ymb}
        else:
            fetches = [eval_logits, eval_clf_loss]
            feed = {X:xmb, M:mmb, Y:ymb}
        collected.append([r*n for r in sess.run(fetches, feed)])
    return [fn(column) for column, fn in zip(zip(*collected), reducers)]

def iter_predict(Xs, Ms):
    """Run inference over ``Xs``/``Ms`` in batches and return all logits concatenated along axis 0.

    Full batches use the multi-GPU eval graph; a smaller final batch uses the
    single-graph placeholders.
    """
    batches = []
    for xmb, mmb in iter_data(Xs, Ms, n_batch=n_batch_train, truncate=False, verbose=True):
        if len(xmb) == n_batch_train:
            batch_logits = sess.run(eval_mgpu_logits, {X_train:xmb, M_train:mmb})
        else:
            batch_logits = sess.run(eval_logits, {X:xmb, M:mmb})
        batches.append(batch_logits)
    return np.concatenate(batches, 0)

def save(path):
    """Fetch the current values of ``params`` from the session and dump them to ``path`` with joblib."""
    values = sess.run(params)
    target = make_path(path)
    joblib.dump(values, target)

def log():
    """Evaluate on the first ``n_valid`` training examples and on the validation set.

    Logs and prints cost/accuracy for both. When ``submit`` is set and the
    validation accuracy beats ``best_score``, the parameters are saved as
    'best_params.jl'.
    """
    global best_score
    head_X, head_M, head_Y = trX[:n_valid], trM[:n_valid], trY[:n_valid]
    tr_logits, tr_cost = iter_apply(head_X, head_M, head_Y)
    va_logits, va_cost = iter_apply(vaX, vaM, vaY)
    tr_cost = tr_cost/len(head_Y)
    va_cost = va_cost/n_valid
    tr_acc = accuracy_score(head_Y, np.argmax(tr_logits, 1))*100.
    va_acc = accuracy_score(vaY, np.argmax(va_logits, 1))*100.
    logger.log(n_epochs=n_epochs, n_updates=n_updates, tr_cost=tr_cost, va_cost=va_cost, tr_acc=tr_acc, va_acc=va_acc)
    print('%d %d %.3f %.3f %.2f %.2f'%(n_epochs, n_updates, tr_cost, va_cost, tr_acc, va_acc))
    if not submit:
        return
    if va_acc > best_score:
        best_score = va_acc
        save(os.path.join(save_dir, desc, 'best_params.jl'))

def ed(*splits, encoder):
    """Encode the text fields of every dataset split.

    Only the FIRST positional argument is used: it must be an iterable of
    splits, each split being an iterable of fields (the notebook calls
    ``ed(rc(data_dir), encoder=text_encoder)``).

    Args:
        *splits: positional arguments; ``splits[0]`` holds the splits.
        encoder: object with an ``encode(list_of_str)`` method.

    Returns:
        A list with one list of fields per split. Fields whose first element
        is a ``str`` are replaced by ``encoder.encode(field)``; every other
        field (labels, empty fields) is passed through unchanged.
    """
    encoded_splits = []
    for split in splits[0]:
        fields = []
        for field in split:
            # Guard against empty fields: indexing field[0] on them would raise IndexError.
            if len(field) > 0 and isinstance(field[0], str):
                field = encoder.encode(field)
            fields.append(field)
        encoded_splits.append(fields)
    return encoded_splits

def _rc(path):
    """Read a CSV file and return its text columns plus placeholder labels.

    The first row is skipped. For every other row, column 1 and column 2 are
    collected as the two text fields.

    NOTE(review): labels are hard-coded to 0 (reading ``int(line[3])`` was
    disabled in the original), so any metric computed against them, e.g. in
    analyz(), compares predictions with all-zero labels.

    Args:
        path: path of the CSV file.

    Returns:
        (st, ct1, y): column-1 strings, column-2 strings, and a list of zeros
        of the same length.
    """
    # newline='' is what the csv module expects, so that newlines embedded in
    # quoted fields are parsed correctly.
    with open(path, newline='') as handle:
        rows = list(csv.reader(handle))

    st = []
    ct1 = []
    y = []
    for i, line in enumerate(tqdm(rows, ncols=80, leave=False)):
        if i == 0:
            continue
        st.append(line[1])
        ct1.append(line[2])
        y.append(0)
    return st, ct1, y

def rc(data_dir, n_train=91, n_valid=10):
    """Load the training and test CSVs and split the training data.

    Args:
        data_dir: directory containing ``train_dataset`` and ``test_dataset``.
        n_train: unused; kept for interface compatibility.
        n_valid: passed to train_test_split as ``test_size``.

    Returns:
        ((trX1, trX2, trY), (vaX1, vaX2, vaY), (teX1, teX2)) where the X
        entries are lists of strings and the Y entries are int32 arrays.
    """
    event1, event2, labels = _rc(os.path.join(data_dir, train_dataset))
    teX1, teX2, _ = _rc(os.path.join(data_dir, test_dataset))
    tr_event1, va_event1, tr_event2, va_event2, tr_y, va_y = train_test_split(
        event1, event2, labels, test_size=n_valid, random_state=seed)

    trX1, trX2 = list(tr_event1), list(tr_event2)
    # Bug fix: the validation pairs previously zipped va_event1 with itself,
    # so vaX2 duplicated vaX1 instead of holding the second text column.
    vaX1, vaX2 = list(va_event1), list(va_event2)

    trY = np.asarray(tr_y, dtype=np.int32)
    vaY = np.asarray(va_y, dtype=np.int32)
    return (trX1, trX2, trY), (vaX1, vaX2, vaY), (teX1, teX2)

def analyz(data_dir, pred_path, log_path):
    """Print accuracy, confusion matrix and classification report for saved predictions.

    Predictions come from the 'prediction' column of the tab-separated file
    at ``pred_path``; labels come from ``_rc`` on the test dataset.
    ``log_path`` is accepted but not used.
    """
    import pandas as pd
    from sklearn.metrics import classification_report, confusion_matrix

    pred_table = pd.read_csv(pred_path, delimiter='\t')
    pred_list = pred_table['prediction'].values.tolist()
    _, _, label_list = _rc(os.path.join(data_dir, test_dataset))
    print('Test Accuracy:  %.2f'%(accuracy_score(label_list, pred_list)*100.))

    y_pred = np.array(pred_list)
    y_true = np.array(label_list)
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    
def predict_(dataset_):
    """Predict on the test arrays and write the result as a TSV file.

    Looks up the output file name, prediction function and optional label
    decoder for ``dataset_``, writes an 'index<TAB>prediction' table under
    ``submission_dir`` and returns the predictions.
    """
    filename = filenames[dataset_]
    to_labels = pred_fns[dataset_]
    decoder = label_decoders[dataset_]
    predictions = to_labels(iter_predict(teX, teM))
    if decoder is not None:
        predictions = [decoder[p] for p in predictions]
    path = os.path.join(submission_dir, filename)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    print(len(predictions))
    rows = ['{}\t{}\n'.format('index', 'prediction')]
    rows.extend('{}\t{}\n'.format(i, p) for i, p in enumerate(predictions))
    with open(path, 'w') as f:
        f.writelines(rows)
    return predictions
            

def argmax(x):
    """Return the index of the largest value along axis 1 of ``x``."""
    return np.argmax(x, 1)

# Per-dataset lookup tables used by predict_()/predic_().
_DATASETS = ('rocstories', 'eventpair', 'entailment')

pred_fns = dict(rocstories=argmax, eventpair=argmax, entailment=argmax)

filenames = dict(
    rocstories='ROCStories.tsv',
    eventpair='eventpair.tsv',
    entailment='entailment.tsv',
)

# No dataset needs its predictions decoded.
label_decoders = dict.fromkeys(_DATASETS)
            
# Command-line options, declared as (flag, add_argument keyword arguments)
# in the order they are registered.
_ARG_SPECS = [
    ('--desc', dict(type=str, default='entailment')),
    ('--dataset', dict(type=str, default='entailment')),
    ('--log_dir', dict(type=str, default='classification_model/log/')),
    ('--save_dir', dict(type=str, default='classification_model/save/')),
    ('--data_dir', dict(type=str, default='data')),
    ('--submission_dir', dict(type=str, default='classification_model/submission/')),
    ('--submit', dict(action='store_true', default=True)),
    ('--analysis', dict(action='store_true', default=True)),
    ('--seed', dict(type=int, default=42)),
    ('--n_iter', dict(type=int, default=3)),
    ('--n_batch', dict(type=int, default=8)),
    ('--max_grad_norm', dict(type=int, default=1)),
    ('--lr', dict(type=float, default=6.25e-5)),
    ('--lr_warmup', dict(type=float, default=0.002)),
    ('--n_ctx', dict(type=int, default=512)),
    ('--n_embd', dict(type=int, default=768)),
    ('--n_head', dict(type=int, default=12)),
    ('--n_layer', dict(type=int, default=12)),
    ('--embd_pdrop', dict(type=float, default=0.1)),
    ('--attn_pdrop', dict(type=float, default=0.1)),
    ('--resid_pdrop', dict(type=float, default=0.1)),
    ('--clf_pdrop', dict(type=float, default=0.1)),
    ('--l2', dict(type=float, default=0.01)),
    ('--vector_l2', dict(action='store_true')),
    ('--n_gpu', dict(type=int, default=4)),
    ('--opt', dict(type=str, default='adam')),
    ('--afn', dict(type=str, default='gelu')),
    ('--lr_schedule', dict(type=str, default='warmup_linear')),
    ('--encoder_path', dict(type=str, default='classification_model/model/encoder_bpe_40000.json')),
    ('--bpe_path', dict(type=str, default='classification_model/model/vocab_40000.bpe')),
    ('--n_transfer', dict(type=int, default=12)),
    ('--lm_coef', dict(type=float, default=0.5)),
    ('--b1', dict(type=float, default=0.9)),
    ('--b2', dict(type=float, default=0.999)),
    ('--e', dict(type=float, default=1e-8)),
]

parser = argparse.ArgumentParser()
for _flag, _options in _ARG_SPECS:
    parser.add_argument(_flag, **_options)

# Parse an empty argument list so that only the defaults above apply.
args = parser.parse_args(args=[])
#print(args)
# Expose every parsed option as a module-level global (n_ctx, n_gpu, lr, ...);
# the model and helper functions above read them as globals.
globals().update(args.__dict__)
# Seed Python, NumPy and TensorFlow from the --seed option.
random.seed(seed)
np.random.seed(seed)
tf.set_random_seed(seed)

logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__)
text_encoder = TextEncoder(encoder_path, bpe_path)
encoder = text_encoder.encoder
# Vocabulary size BEFORE the three special tokens are appended below.
n_vocab = len(text_encoder.encoder)

# Load the CSVs and encode their text fields.
(trX1, trX2, trY), (vaX1, vaX2, vaY), (teX1, teX2) = ed(rc(data_dir), encoder=text_encoder)
n_y = 2
# Append the special tokens; each receives the next free id.
encoder['_start_'] = len(encoder)
encoder['_delimiter_'] = len(encoder)
encoder['_classify_'] = len(encoder)
clf_token = encoder['_classify_']

tf.reset_default_graph()

# max_len is derived from the parsed n_ctx, before n_ctx is overridden below.
max_len = n_ctx//2-2
# NOTE(review): `imported_meta` is only referenced by a commented-out restore
# call later on; the path reads "saved_nodel" — confirm the directory name.
imported_meta = tf.train.import_meta_graph("saved_nodel/my_test_model.meta")
l1_ = [len(x1[:max_len])+len(x2[:max_len]) for x1, x2 in zip(trX1, trX2)]
l2_ = [len(x1[:max_len])+len(x2[:max_len]) for x1, x2 in zip(vaX1, vaX2)]
l3_ = [len(x1[:max_len])+len(x2[:max_len]) for x1, x2 in zip(teX1, teX2)]

n_ctx = min(max(l1_+l2_+l3_)+3, n_ctx)
# The data-driven value above is immediately overridden by a fixed 256.
# NOTE(review): presumably this matches the shape of the saved parameters —
# confirm. A pair longer than 256 tokens (incl. the 3 special tokens) would
# not fit into the arrays built by transform_roc.
n_ctx = 256
n_special = 3
n_batch_train = n_batch*n_gpu

trX, trM = transform_roc(trX1, trX2)
vaX, vaM = transform_roc(vaX1, vaX2)
# The test arrays are built unconditionally (the `if submit:` guard is disabled).
#if submit:
teX, teM = transform_roc(teX1, teX2)

n_train = len(trY)
n_valid = len(vaY)
# Same value as assigned above; recomputed here.
n_batch_train = n_batch*n_gpu
n_updates_total = (n_train//n_batch_train)*n_iter

# Fixed-batch placeholders feed the multi-GPU graphs; the None-batch
# placeholders feed the single graph used for a smaller final batch.
X_train = tf.placeholder(tf.int32, [n_batch_train, 2, n_ctx, 2])
M_train = tf.placeholder(tf.float32, [n_batch_train, 2, n_ctx])
X = tf.placeholder(tf.int32, [None, 2, n_ctx, 2])
M = tf.placeholder(tf.float32, [None, 2, n_ctx])

Y_train = tf.placeholder(tf.int32, [n_batch_train])
Y = tf.placeholder(tf.int32, [None])

# Sanity output: dataset sizes, special-token id, context sizes and tensor shapes.
print(len(trX1), len(trX2), len(trY), len(vaX1), len(vaX2), len(vaY), len(teX1), len(teX2))
print(encoder['_delimiter_'])
print(n_ctx)
print(max_len)

print(trX.shape, trM.shape, vaX.shape, vaM.shape, teX.shape, teM.shape)
print(n_train, n_valid, n_batch_train, n_updates_total)

print(X.shape)
print(M.shape)

print(X_train.shape)
print(M_train.shape)
print(Y_train.shape)

                                                                                

5047 5047 5047 10 10 10 4086 4086
40479
256
254
(5047, 2, 256, 2) (5047, 2, 256) (10, 2, 256, 2) (10, 2, 256) (4086, 2, 256, 2) (4086, 2, 256)
5047 10 32 471
(?, 2, 256, 2)
(?, 2, 256)
(32, 2, 256, 2)
(32, 2, 256)
(32,)


In [4]:
print("0")
# Build the multi-GPU training graph (this also creates the model variables).
train, logits, clf_losses, lm_losses = mgpu_train(X_train, M_train, Y_train)
clf_loss = tf.reduce_mean(clf_losses)
print("1")
params = find_trainable_variables('model')
print("2 ", len(params), params[0])
sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
#imported_meta.restore(sess, tf.train.latest_checkpoint('./'))
print("3")
sess.run(tf.global_variables_initializer())
print("4")
# Read the parameter shapes inside a context manager so the file is closed.
with open('classification_model/model/params_shapes.json') as shapes_file:
    shapes = json.load(shapes_file)
# The parameters are stored as ten flat .npy shards: concatenate them, cut
# at the cumulative sizes and reshape each piece to its recorded shape.
offsets = np.cumsum([np.prod(shape) for shape in shapes])
init_params = [np.load('classification_model/model/params_{}.npy'.format(n)) for n in range(10)]
init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
# Build the initial value of `we` as [init_params[1] ; n_special random rows ;
# first n_ctx rows of init_params[0]], matching its n_vocab+n_special+n_ctx rows.
# NOTE(review): presumably init_params[0] holds position embeddings and
# init_params[1] token embeddings — confirm against the parameter files.
init_params[0] = init_params[0][:n_ctx]
init_params[0] = np.concatenate([init_params[1], (np.random.randn(n_special, n_embd)*0.02).astype(np.float32), init_params[0]], 0)
print(len(init_params[0]))
del init_params[1]
print("This is right Here")
print(len(params), params[0], len(init_params), init_params[0])
sess.run([p.assign(ip) for p, ip in zip(params[:n_transfer], init_params[:n_transfer])])
print("Now here we are ")
print(len(params), params[0])
# Overwrite the variables with the saved parameters from best_params.jl.
sess.run([p.assign(ip) for p, ip in zip(params, joblib.load(os.path.join(save_dir, desc, 'best_params.jl')))])
print("Now HERRRRRE")

# Inference graphs: multi-GPU for full batches, single graph for the remainder.
eval_mgpu_logits, eval_mgpu_clf_losses, eval_mgpu_lm_losses = mgpu_predict(X_train, M_train, Y_train)
eval_logits, eval_clf_losses, eval_lm_losses = model(X, M, Y, train=False, reuse=True)
eval_clf_loss = tf.reduce_mean(eval_clf_losses)
eval_mgpu_clf_loss = tf.reduce_mean(eval_mgpu_clf_losses)


0

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
can't train model/we:0 None
can't train model/h0/attn/c_attn/w:0 None
can't train model/h0/attn/c_attn/b:0 None
can't train model/h0/attn/c_proj/w:0 None
can't train model/h0/attn/c_proj/b:0 None
can't train model/h0/ln_1/g:0 None
can't train model/h0/ln_1/b:0 None
can't train model/h0/mlp/c_fc/w:0 None
can't train model/h0/mlp/c_fc/b:0 None
can't train model/h0/mlp/c_proj/w:0 None
can't train model/h0/mlp/c_proj/b:0 None
can't train model/h0/ln_2/g:0 None
can't train model/h0/ln_2/b:0 None
can't train model/h1/attn/c_attn/w:0 None
can't train model/h1/attn/c_attn/b:0 None
can't train model/h1/attn/c_proj/w:0 None
can't train model/h1/attn/c_proj/b:0 None
can't train model/h1/ln_1/g:0 None
can't train model/h1/ln_1/b:0 None
can't train model/h1/mlp/c_fc/w:0 None
can't train m

In [15]:
def predic_(dataset_):
    """Variant of ``predict_`` that also prints and returns the raw logits.

    Writes an 'index<TAB>prediction' table under ``submission_dir`` and
    returns ``(predictions, logits)``.
    """
    filename = filenames[dataset_]
    to_labels = pred_fns[dataset_]
    decoder = label_decoders[dataset_]
    iter_pred = iter_predict(teX, teM)
    print(iter_pred)
    predictions = to_labels(iter_pred)
    if decoder is not None:
        predictions = [decoder[p] for p in predictions]
    path = os.path.join(submission_dir, filename)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    print(len(predictions))
    rows = ['{}\t{}\n'.format('index', 'prediction')]
    rows.extend('{}\t{}\n'.format(i, p) for i, p in enumerate(predictions))
    with open(path, 'w') as f:
        f.writelines(rows)
    return predictions, iter_pred

In [16]:
#sess.run([p.assign(ip) for p, ip in zip(params, joblib.load(os.path.join(save_dir, desc, 'best_params.jl')))])
# Predict on the test arrays: `pred` holds the argmax labels, `iter_pred` the raw two-column logits.
pred, iter_pred = predic_('entailment')
print(pred.shape)
#analyz(data_dir, os.path.join(submission_dir, 'entailment.tsv'), os.path.join(log_dir, 'entailment.jsonl'))

                                                                                

[[-0.55000126 -0.4926156 ]
 [ 0.07413381  0.02719757]
 [-0.19589825 -0.16819851]
 ...
 [-0.811062   -0.7719482 ]
 [ 0.06596909  0.08129586]
 [ 0.00469982  0.0413203 ]]
4086
(4086,)




In [27]:
def get_predictions(iter_pred, threshold=0.1):
    """Turn two-column logits into 0/1 labels using a confidence margin.

    A row is labelled 1 only when its second logit exceeds its first by more
    than ``threshold``; every other row (including near ties) is labelled 0.

    Args:
        iter_pred: sequence or array of rows whose first two entries are the
            logits for label 0 and label 1.
        threshold: minimum margin ``row[1] - row[0]`` required for label 1.

    Returns:
        1-D integer numpy array with one label per row (empty for empty input).
    """
    logits = np.asarray(iter_pred)
    if logits.size == 0:
        return np.zeros(0, dtype=np.int64)
    # |a - b| > t together with a < b is the same as b - a > t.
    margin = logits[:, 1] - logits[:, 0]
    return (margin > threshold).astype(np.int64)

    #print(index, diff.shape)

In [30]:
import pandas as pd

# `index` used to be read here without ever being defined at module level
# (it only existed as a local inside get_predictions), so the cell failed on
# a fresh kernel. Derive it explicitly from the margin-based labels.
pred_labels = get_predictions(iter_pred)
index = np.where(pred_labels == 1)[0]

# Keep `d` as the full table so later cells can still attach one value per
# row; the filtered view gets its own name.
d = pd.read_csv('data/cluster_0_news_unprocessed.csv')
selected = d.iloc[index]
print(selected['sent_id'])

6         6
104     117
106     119
110     123
111     124
113     126
220      38
222      40
225      43
263      94
264      95
313     144
316     147
326     157
355     199
373       9
392      28
395      31
413      50
456      94
563     203
572     212
579     264
585     270
603     288
607     292
714     402
717     405
720     408
732     420
       ... 
3759    103
3780    124
3782    126
3808    152
3812    156
3827    171
3897    241
3898    242
3899    243
3900    244
3901    245
3904    248
3930    274
3936    280
3937    281
3938    282
3939    283
3940    284
3942    286
3943    287
3945    289
3968    312
3970    314
3972    316
3979    323
3980    324
4009    353
4019    363
4047    401
4079    433
Name: sent_id, Length: 146, dtype: int64


In [8]:
print(pred.shape)
# Attach the predictions to a freshly loaded copy of the full table rather
# than to `d`, which an earlier cell may have reduced to a subset of rows
# (assigning one value per test example to it would then fail).
labelled = pd.read_csv('data/cluster_0_news_unprocessed.csv')
if len(labelled) != len(pred):
    raise ValueError('expected one prediction per row: %d rows, %d predictions' % (len(labelled), len(pred)))
labelled['pred'] = pred
labelled.to_csv('data/cluster_0_news_unprocessed_with_labels.csv')

(4086,)


In [9]:
# Count how many times each predicted label occurs.
unique, counts = np.unique(pred, return_counts=True)
print(pred)
print(unique, counts)

[1 0 1 ... 1 1 1]
[0 1] [2043 2043]
