In [1]:
import os
import numpy as np
from scipy.special import logsumexp
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
import pickle
import json
import regex as re

import matplotlib.pyplot as plt
plt.style.use("ggplot")

print(tf.__version__)

2.11.0


In [2]:
TAGS =  pickle.load(open( "tags.pickle", "rb" ))
TAGS.remove("O") 
NUM_TAGS = len(TAGS)

In [20]:
tag2id = {}
for id,label in enumerate(TAGS):
    tag2id[label] = id 

def label2id(labels):
    ret = []
    prev_label = ""
    for label in labels:
        if label == "O":
            ret.append(str(2*NUM_TAGS))
        elif label == prev_label:
            l = [tag2id[t]+ NUM_TAGS for t in label]
            for x in l:
                ret.append(str(x))
        else:
            l =[tag2id[t] for t in label]
            for x in l:
                ret.append(str(x))
        prev_label = label
    return ret 

In [5]:
def get_label(label_id):
    if label_id == (2*NUM_TAGS):
        return "O"
    elif label_id >= NUM_TAGS:
        return [TAGS[label_id-NUM_TAGS]]
    else:
        return TAGS[label_id]

def id2label(labels):
    ret = []
    for label in labels:
        l = [get_label(x) for x in label]
        if len(l) == 1 and l[0] == "O":
            l = "O"
        ret.append(l)
    return ret 

def clean_text(sent):
    ret_sent= []
    for txt in sent:
      fil_txt = re.sub('[^A-Za-z0-9]+', '', str(txt))
      if len(fil_txt) == 0:
        fil_txt  = txt [0]
      ret_sent.append(fil_txt)
    assert(len(ret_sent) == len(sent))
    return ret_sent

In [6]:
NUM_STATES = 2*NUM_TAGS+1
NUM_FEATURES = 0
NUM_INST = 0

# load train data
with open('dev.json') as f:
    data = json.load(f)

a = [d["sent"] for d in data]
set_ = set()
for idx,s in enumerate(a):
    for t in s:
        if len(t)<1:
            set_.add(idx)
data = [data[i] for i in range(len(data)) if i not in set_]

unique_word_set = set()
for x in data:
  for w in x["sent"]:
    unique_word_set.add(w)
words_to_id = {}
for idx, w in enumerate(unique_word_set):
  words_to_id[w] = idx

NUM_FEATURES = len(unique_word_set)
NUM_INST = len(data)

print(NUM_STATES, NUM_FEATURES, NUM_INST)

227 42540 9956


In [12]:
def to_bool_vec(y_id):
    y_bool = np.zeros(2*NUM_TAGS+1, bool)
    num_labels = len(y_id)
    for id in y_id:
          y_bool[int(id)] = 1
    return y_bool

In [22]:
import numpy as np

class SentenceScanner(object):
    def __init__(self, num_states, num_features, max_len, batch_size):
        self._num_states = num_states
        self._num_features = num_features
        self._max_len = max_len
        self._batch_size = batch_size
        self._labeled_states = np.zeros((self._batch_size, self._max_len, self._num_states), dtype=np.int8)
        self._labeled_emits = np.zeros((self._batch_size, self._max_len, self._num_features), dtype=np.int8)
        # Also prepare a suffix mask to know where each sequence ended, a (B, T) tensor.
        # This will let us ignore padded positions in the loss expression.
        self._labeled_masks = np.zeros((self._batch_size, self._max_len), dtype=np.int8)
        
    def __iter__(self):
        return self

    def get_batch(self):
        # Collect instances into ndarrays declared above.
        num_sentence = 0
        for data_point in data:
            num_token = 0
            labels = label2id(data_point["tags"])
            for idx, token in enumerate(data_point["sent"]):
                xid = words_to_id[token]
                yid = to_bool_vec(labels[idx])
                self._labeled_masks[num_sentence, num_token] = 1
                self._labeled_emits[num_sentence, num_token, xid] = 1
                for x in yid:
                  if x:
                    self._labeled_states[num_sentence, num_token, x] = 1
                num_token += 1
                if num_token >= self._max_len:
                    break
            num_sentence += 1
            if num_sentence >= self._batch_size:
                yield (self._labeled_masks, self._labeled_emits, self._labeled_states)
                self._labeled_masks.fill(0)
                self._labeled_emits.fill(0)
                self._labeled_states.fill(0)
                num_sentence = 0
        if num_sentence > 0:
            yield (self._labeled_masks, self._labeled_emits, self._labeled_states)


# Clip at max sequence length T (starting with STATE_INIT).
MAX_LEN = 105
# M states, N instances (sentences), F features.
# Convert training instances into [N, T, M] states tensor and [N, T, F] emission tensor.
# We will generally not be able to hold all this in RAM, so we use batches.
BATCH_SIZE = 51

# TODO  Add code to shuffle sentences randomly and sample into train, dev, test folds.

num_sentences = 0
with tqdm(total=NUM_INST) as pbar:
    ss = SentenceScanner(NUM_STATES, NUM_FEATURES, MAX_LEN, BATCH_SIZE)
    for (_masks, _emits, _states) in ss.get_batch():
        num_sentences += _masks.shape[0]
        pbar.update(BATCH_SIZE)
print(num_sentences)

9996it [00:15, 656.94it/s]

9996





In [54]:
class ChainCRF(object):
    """Implements linear chain CRF."""
    def __init__(self, state_init, num_states, num_features, max_len, batch_size):
        self._num_states = num_states
        self._num_features = num_features
        self._max_len = max_len
        self._batch_size = batch_size
        # Trainable transition weights.  Rows = current state, columns = previous state.
        self._edgew = tf.Variable(tf.random.uniform([self._num_states, self._num_states], 
                                                     dtype=tf.float64, minval=-1., maxval=1.),
                                   trainable=True, name="edgew")   #  (M, P)
        # Trainable emission weights.  For starters we will use only lexicalized features.
        self._nodew = tf.Variable(tf.random.uniform([self._num_states, self._num_features],
                                                     dtype=tf.float64, minval=-1., maxval=1.),
                                   trainable=True, name="nodew")   #  (M, F)
        # Labeled instances.
        # Features may not be 1-hot in general. 1-hot state rep may be wasteful.
        self._masks = tf.compat.v1.placeholder(tf.float64, shape=(self._batch_size, self._max_len),
                                     name="masks")   #  (B, T)
        self._emits = tf.compat.v1.placeholder(tf.float64, shape=(self._batch_size, self._max_len,
                                                        self._num_features), name="emits")  # (B, T, F)
        self._states = tf.compat.v1.placeholder(tf.float64, shape=(self._batch_size, self._max_len,
                                                         self._num_states), name="states")  # (B, T, M)
        self._pad_states_np = np.zeros((self._batch_size, 1, self._num_states))
        self._pad_states_np[:,:, state_init] = 1
        pad_states = tf.constant(self._pad_states_np, dtype=tf.float64)
        self._prev_states = tf.concat([pad_states, self._states[:,:-1,:] ],
                                      axis=1, name="prev_states") # (B, T, P)
        # P = M but we use a distinct symbol to highlight the distinction between previous and current states.
        print(self._nodew)
        print(self._edgew)
        print(self._masks)
        print(self._emits)
        print(self._states)
        print(self._prev_states)

        # To look up w \cdot \varphi(x_t, m, p) for all instances in the batch, we need
        # corresponding tensor wvarphi_t with shape (B, T, M, P).
        # We want wvarphi_t[b, t, p, m] =  ( sum_f nodew[m, f] emits[b, t, f] ) + edgew[p, m]
        # for all possible combinations of m, p in [M] \times [P], not just the gold sequence.
        # The first term results in shape (B, T, M) and the second term results in shape (M, P).
        # These have to be expanded to shape (B, T, M, P).

        var1 = tf.einsum("btf,mf->btm", self._emits, self._nodew, name="var1")  # .... (B, T, M)
        print(var1)
        var2 = tf.expand_dims(var1, axis=3, name="var2")    #  .... (B, T, M, 1)
        print(var2)
        var3 = tf.tile(var2, [1, 1, 1, self._num_states], name="var3")   # .... (B, T, M, P)
        print(var3)

        # edge_weights is (M, P)
        var4 = tf.expand_dims(self._edgew, axis=0, name="var4")   #  (1, M, P)
        print(var4)
        var5 = tf.tile(var4, [self._max_len, 1, 1], name="var5")   #  (T, M, P)
        print(var5)
        var6 = tf.expand_dims(var5, axis=0, name="var6")   #  (1, T, M, P)
        print(var6)
        var7 = tf.tile(var6, [self._batch_size, 1, 1, 1], name="var7")   # ... (B, T, M, P)
        print(var7)

        self._wvarphi_t = tf.add(var3, var7, name="wvarphi_t")    # .... (B, T, M, P)
        print(self._wvarphi_t)
        
        # For given emissions and state labels, find score w \cdot \phi(x, y).
        self._scores_t = tf.einsum("btmp,btp,btm->bt", self._wvarphi_t,
                                   self._prev_states, self._states, name="scores_t")  #  (B,T)
        print(self._scores_t)
        self._scores = tf.reduce_sum(tf.multiply(self._scores_t, self._masks),
                                     axis=1, name="scores")    #  ... (B)
        print(self._scores)
        
        # Alpha recurrence over time steps.
        self._lalpha = tf.Variable(initial_value=np.zeros((self._batch_size, self._num_states)),
                                  trainable=True, name="lalpha_0")   # .... (B, M)
        print(self._lalpha)
        for t in range(self._max_len):
            var8 = tf.tile(tf.expand_dims(self._lalpha, axis=1), [1, self._num_states, 1])  #  (B, M, P)
            next_lalpha = tf.reduce_logsumexp(var8 + self._wvarphi_t[:,t,:,:],  # (B, M, P)
                                              axis=2, name="lalpha_"+str(t+1))
            mask_t = tf.tile(tf.expand_dims(self._masks[:,t], axis=1), [1, self._num_states])
            self._lalpha = tf.multiply(mask_t, next_lalpha) + tf.multiply(1.-mask_t, self._lalpha)
        print(self._lalpha)
        
        # For given emissions, find log Z over all possible state label sequences.
        self._logz = tf.reduce_logsumexp(self._lalpha, axis=1, name="logz")   # ... (B)
        print(self._logz)
        # We have to maximize scores - logZ i.e. minimize logZ - score.
        self._loss = tf.reduce_sum(self._logz - self._scores, name="loss")    # ... (B)
        print(self._loss)
        adamopt = tf.optimizers.Adam(learning_rate=0.1)
        self._tape = tf.GradientTape(persistent=True)
        self._train_op = adamopt.minimize(self._loss, var_list=[self._nodew, self._edgew], tape=self._tape)

        
    def check_np_scores(self, sess, masks, emitss, statess):
        """
        masks, emitss, statess are for a whole batch.
        Calculates w \cdot \phi conventionally using numpy to check correctness.
        """
        _nodew = sess.run(self._nodew)
        _edgew = sess.run(self._edgew)
        ans = np.zeros((self._batch_size))
        for b in range(self._batch_size):
            mask = masks[b,:]
            emits = emitss[b,:,:]
            states = statess[b,:,:]
            prev_states = np.concatenate((self._pad_states_np[b,:,:], states[:-1,:]), axis=0)
            potscore = 0
            for t in range(self._max_len):
                aemit = emits[t,:]
                aprev_state = prev_states[t,:]
                astate = statess[b,t,:]
                nodepot = np.matmul(astate, np.matmul(_nodew, aemit))
                edgepot = np.matmul(astate, np.matmul(_edgew, aprev_state))
                potscore += (nodepot + edgepot)
            ans[b] = potscore
        return ans
    
    def check_tf_scores(self, sess, masks, emitss, statess):
        tf_scores = sess.run(self._scores, feed_dict = {
            self._masks: masks, self._emits: emitss, self._states: statess })
        return tf_scores

    def check_np_logzs(self, sess, masks, emitss, statess):
        """
        Calculates log Z conventionally using numpy to check correctness.
        """
        np_wvarphi_t = sess.run(self._wvarphi_t, feed_dict={
            self._masks: masks, self._emits: emitss, self._states: statess})
        #print("np_wvarphi_t", np_wvarphi_t.shape)   # (B, T, M, P)
        logzs = np.zeros((self._batch_size))
        for b in range(self._batch_size):
            np_lalpha = np.zeros((self._num_states))  # (P) or (M)
            for t in range(self._max_len):
                np_lalpha_next = np.zeros((self._num_states))  # (M)
                for m in range(self._num_states):
                    softsummand = np.zeros((self._num_states))  # (P)
                    for p in range(self._num_states):
                        softsummand[p] = np_wvarphi_t[b,t,m,p] + np_lalpha[p]
                    np_lalpha_next[m] = logsumexp(softsummand)
                np_lalpha = np_lalpha_next
            logzs[b] = logsumexp(np_lalpha)
        return logzs

    def check_tf_logzs(self, sess, masks, emitss, statess):
        tf_logzs = sess.run(self._logz, feed_dict={
            self._masks: masks, self._emits: emitss, self._states: statess})
        return tf_logzs

    def do_train(self, sess, num_epochs=10):
        sess.run(tf.global_variables_initializer())
        # TODO Add code to load any partially trained model for warm-start.
        chart_batches, chart_losses = list(), list()
        fig = plt.figure()
        ax = fig.add_subplot(111)
        #plt.ion()
        fig.show()
        fig.canvas.draw()
        num_batches = 0
        # TODO keep history of loss objectives
        for _ in range(num_epochs):
            with tqdm(total=num_sentences) as pbar:
                ss = SentenceScanner(self._num_states, self._num_features, self._max_len, self._batch_size)
                for (masks, emits, states) in ss.get_batch():
                    num_batches += 1
                    sess.run(self._train_op, feed_dict = { 
                        self._masks: masks, self._emits: emits, self._states: states })
                    _logZ = sess.run(self._logz, feed_dict = { 
                        self._masks: masks, self._emits: emits, self._states: states })
                    _scores = sess.run(self._scores, feed_dict = { 
                        self._masks: masks, self._emits: emits, self._states: states })
                    _loss = np.sum(_logZ - _scores)
                    assert _loss >= 0

                    chart_batches.append(num_batches)
                    chart_losses.append(_loss)
                    ax.clear()
                    ax.plot(chart_batches, chart_losses)
                    fig.canvas.draw()
                    pbar.update(self._batch_size)
                    pbar.set_description("%10g" % _loss)
                    
                    if np.min(_logZ - _scores) < 0:
                        print("tf_logzs - tf_scores", _logZ - _scores)
                        np_scores = self.check_np_scores(sess, masks, emits, states)
                        tf_scores = self.check_tf_scores(sess, masks, emits, states)
                        print("np_scores - tf_scores", np.linalg.norm(np_scores - tf_scores, ord=np.inf))
                        np_logzs = ccrf.check_np_logzs(sess, masks, emits, states)
                        tf_logzs = ccrf.check_tf_logzs(sess, masks, emits, states)
                        print("np_logzs - tf_logzs", np.linalg.norm(np_logzs - tf_logzs, ord=np.inf))
                        return
            # TODO Add code to decide on ending training, saving model checkpoints.
            
    def get_fold_performance():
        """TODO Add code to calculate best labels sequences for current model, compare with gold
        sequences, and return a measure of performance."""
        pass

In [55]:
tf.compat.v1.disable_eager_execution()

In [56]:
STATE_INIT = NUM_STATES-1
ccrf = ChainCRF(STATE_INIT, NUM_STATES, NUM_FEATURES, MAX_LEN, BATCH_SIZE)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    ccrf.do_train(sess)

<tf.Variable 'nodew_8:0' shape=(227, 42540) dtype=float64>
<tf.Variable 'edgew_8:0' shape=(227, 227) dtype=float64>
Tensor("masks_8:0", shape=(51, 105), dtype=float64)
Tensor("emits_8:0", shape=(51, 105, 42540), dtype=float64)
Tensor("states_8:0", shape=(51, 105, 227), dtype=float64)
Tensor("prev_states_7:0", shape=(51, 105, 227), dtype=float64)
Tensor("var1_7/Einsum:0", shape=(51, 105, 227), dtype=float64)
Tensor("var2_7:0", shape=(51, 105, 227, 1), dtype=float64)
Tensor("var3_7:0", shape=(51, 105, 227, 227), dtype=float64)
Tensor("var4_7:0", shape=(1, 227, 227), dtype=float64)
Tensor("var5_7:0", shape=(105, 227, 227), dtype=float64)
Tensor("var6_7:0", shape=(1, 105, 227, 227), dtype=float64)
Tensor("var7_7:0", shape=(51, 105, 227, 227), dtype=float64)
Tensor("wvarphi_t_7:0", shape=(51, 105, 227, 227), dtype=float64)
Tensor("scores_t_7/Einsum_1:0", shape=(51, 105), dtype=float64)
Tensor("scores_7:0", shape=(51,), dtype=float64)
<tf.Variable 'lalpha_0_7:0' shape=(51, 227) dtype=float64

ValueError: ignored