<a href="https://colab.research.google.com/github/albertwujj/gptransfer/blob/master/gptransfer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from os.path import join
from google.colab import drive

ROOT = "/content/drive"
drive.mount(ROOT, force_remount=True)

PROJ = "My Drive/ColabExperiments/"
PROJECT_PATH = join(ROOT, PROJ)
%cd "{PROJECT_PATH}"

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive
/content/drive/My Drive/ColabExperiments


In [0]:
from importlib.machinery import SourceFileLoader
model = SourceFileLoader('model', 'gpt-2/src/model.py').load_module()
encoder = SourceFileLoader('encoder', 'gpt-2/src/encoder.py').load_module()
sample = SourceFileLoader('sample', 'gpt-2/src/sample.py').load_module()

In [0]:
import tensorflow as tf
import numpy as np

from itertools import islice
import json
import os
import random



In [0]:
from model import block, shape_list, positions_for, norm, default_hparams



def custom_model(hparams, X, past=None, scope='model', reuse=False):

    with tf.variable_scope(scope, reuse=reuse):
        results = {}
        batch, sequence = shape_list(X)

        wpe = tf.get_variable('wpe', [hparams.n_ctx, hparams.n_embd],
                             initializer=tf.random_normal_initializer(stddev=0.01))
        wte = tf.get_variable('wte', [hparams.n_vocab, hparams.n_embd],
                             initializer=tf.random_normal_initializer(stddev=0.02))
        past_length = 0 if past is None else tf.shape(past)[-2]
        h = tf.gather(wte, X) + tf.gather(wpe, positions_for(X, past_length))
        
        # Transformer
        presents = []
        pasts = tf.unstack(past, axis=1) if past is not None else [None] * hparams.n_layer
        assert len(pasts) == hparams.n_layer
        for layer, past in enumerate(islice(pasts, hparams.n_layer)):
            h, present = block(h, 'h%d' % layer, past=past, hparams=hparams)
            presents.append(present)
        results['present'] = tf.stack(presents, axis=1)
        h = norm(h, 'ln_f')

        # Language model loss.  Do tokens <n predict token n?
        h_flat = tf.reshape(h, [batch*sequence, hparams.n_embd])
        logits = tf.matmul(h_flat, wte, transpose_b=True)
        logits = tf.reshape(logits, [batch, sequence, hparams.n_vocab])
        results['logits'] = logits

        results['h'] = h
        return results
 
def small_model(hparams, X, **kwargs):
    hparams.n_layer = 5
    return custom_model(hparams, X, **kwargs)
    

In [0]:
batch_size = 4

def read_amazon_kaggle(filename, lines=None):
    
    with open(filename) as f:
        label_x = [(1 if line[9] == '2' else -1, line[10:].strip()) for line in f]
        label_x = random.sample(label_x, lines)
        label, x = zip(*label_x)
        return label, x
      

labels, dataX = read_amazon_kaggle('gptransfer/data/amazonreviews/train.ft.txt', 100000)


size = (len(labels) // batch_size) * batch_size
labels = np.asarray([[elt] for elt in labels[:size]])




In [0]:
from encoder import Encoder
def get_encoder(model_name):
    with open(os.path.join('gpt-2/models', model_name, 'encoder.json'), 'r') as f:
        encoder = json.load(f)
    with open(os.path.join('gpt-2/models', model_name, 'vocab.bpe'), 'r', encoding="utf-8") as f:
        bpe_data = f.read()
    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
    return Encoder(
        encoder=encoder,
        bpe_merges=bpe_merges,
    )

def encode(dataX, model_name='117M'):
    enc = get_encoder(model_name)
    codeX = []
    for i, x in enumerate(dataX):
        vec = enc.encode(x)
        codeX.append(vec)
        
    return sorted(codeX, key=len)


dataX = encode(dataX)


In [0]:
import gc
seed=None

def load_model(model_name='117M'):

    with open(os.path.join('gpt-2/models', model_name, 'hparams.json')) as f:
        hparams = model.default_hparams()
        hparams.override_from_dict(json.load(f))   
        def step(hparams, tokens, past=None):
            lm_output = small_model(hparams=hparams, X=tokens, past=None, reuse=tf.AUTO_REUSE)

            h = lm_output['h']
            return {
                'h': h
            }
        
        X = tf.placeholder(tf.int32, [batch_size, None])
        outputs = step(hparams, X) # (batch, sequence, embedding)
        
        saver = tf.train.Saver()
        ckpt = tf.train.latest_checkpoint(os.path.join('gpt-2/models', model_name))
        saver.restore(sess, ckpt)
        
    return X, outputs

def add_binary_finetune(outputs):
    with tf.variable_scope('binary_finetune'):
        
        final_embd = tf.math.l2_normalize(outputs['h'][:,-1,:], axis=-1)
        w = tf.get_variable('w', [final_embd.shape[-1],1],initializer=tf.contrib.layers.xavier_initializer(uniform=False))
        b = tf.get_variable('b', [final_embd.shape[0],1], initializer=tf.constant_initializer(0))
        sess.run(tf.variables_initializer([w,b]))
        ypred = tf.tanh(tf.matmul(final_embd,w, name='z') + b)
        
        ytrue = tf.placeholder(tf.float32, [batch_size, 1])
        incorrects = tf.not_equal(tf.sign(ytrue), tf.sign(ypred), name='incorrects')
        incorrects = tf.stop_gradient(tf.cast(incorrects, tf.float32))
        
        loss = tf.math.reduce_mean(tf.math.square((ypred - ytrue) * incorrects), name='loss')
        
        global_step = tf.Variable(0, trainable=False)
        starter_learning_rate = 0.01
        learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                           1000, 0.98, staircase=True)
        optim = tf.train.AdamOptimizer(learning_rate=.0003)
        minimize = optim.minimize(loss, global_step=global_step)
        sess.run(tf.variables_initializer(optim.variables() + [global_step]))
        
    return ytrue, ypred, minimize, optim
    
from keras.preprocessing.sequence import pad_sequences

run_options = tf.RunOptions(report_tensor_allocations_upon_oom = True)


def get_train_infer(X, outputs, ytrue, ypred, minimize):
    wt = sess.graph.get_tensor_by_name('binary_finetune/w:0')
    zt = zt = sess.graph.get_tensor_by_name('binary_finetune/z:0')
    losst = sess.graph.get_tensor_by_name('binary_finetune/loss:0')
    incorrectst = sess.graph.get_tensor_by_name('binary_finetune/incorrects:0')
    def train(dataX, labels):
        for i in range(0, len(dataX), batch_size):
            xfeed = pad_sequences(dataX[i:i+batch_size])
            yfeed = labels[i:i+batch_size]
            _, loss, pd, w, z, incorrects = sess.run([minimize,losst, ypred, wt, zt, incorrectst], 
                          options=run_options, feed_dict={X:xfeed, ytrue:yfeed})
            if (i//batch_size) % 1000 == 0:
                gc.collect()
 

    def infer(dataX):
        preds = []
        for i in range(0, len(dataX), batch_size):
            xfeed = pad_sequences(dataX[i:i+batch_size])
            pd, w, z = sess.run([ypred, wt, zt], options=run_options, feed_dict={X: xfeed})
            preds.append(pd)
            if (i//batch_size) % 1000 == 0:
                gc.collect()
                print()
                
        preds = np.concatenate(preds, axis=0)
        return preds

    return train, infer


Using TensorFlow backend.


In [0]:
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(graph=tf.Graph(),config=config)
sess.__enter__()
X, outputs = load_model()
ytrue, ypred, minimize, optim = add_binary_finetune(outputs)


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from gpt-2/models/117M/model.ckpt
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


In [0]:
from sklearn.metrics import precision_recall_fscore_support

out = open('gptransfer/results/out.txt', 'w')
printout = lambda w: out.write(w + '\n')

def evaluate(pred, labels):
    _, _, f1, _ = precision_recall_fscore_support(labels, pred)
    return f1


train, infer = get_train_infer(X, outputs, ytrue, ypred, minimize)

eval_batchsize = 10000
for i in range(0, 100000, eval_batchsize):
    xbatch = dataX[i:i+eval_batchsize]
    ybatch = labels[i:i+eval_batchsize]
    train(xbatch, ybatch)
    predbatch = np.sign(infer(xbatch))

    f1 = evaluate(predbatch, ybatch)
    print(f'f1 score: {f1}')




f1 score: [0.00401365 0.66950789]
