<a href="https://colab.research.google.com/github/albertwujj/gptransfer/blob/master/gptransfer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from os.path import join
from google.colab import drive

ROOT = "/content/drive"
drive.mount(ROOT, force_remount=True)

PROJ = "My Drive/ColabExperiments/"
PROJECT_PATH = join(ROOT, PROJ)
%cd "{PROJECT_PATH}"

In [0]:
from importlib.machinery import SourceFileLoader
model = SourceFileLoader('model', 'gpt-2/src/model.py').load_module()
encoder = SourceFileLoader('encoder', 'gpt-2/src/encoder.py').load_module()
sample = SourceFileLoader('sample', 'gpt-2/src/sample.py').load_module()

In [0]:
import tensorflow as tf
import numpy as np
import sys
np.set_printoptions(precision=3,threshold=sys.maxsize)
import gc

from itertools import islice
import json
import os
import random
from timeit import default_timer as timer

seed = 1957
random.seed(seed)
np.random.seed(seed)
tf.set_random_seed(seed)

In [0]:
# Prepare data cell

batch_size = 20

from sklearn.model_selection import train_test_split
from encoder import Encoder

def read_amazon_kaggle(filename, lines=None):
    with open(filename) as f:
        y_x = [(1 if line[9] == '2' else 0, line[10:].strip()) for line in f]
        if lines:
            y_x = random.sample(y_x, lines)
        y, x = zip(*y_x)
        return x, y

def get_encoder(model_name):
    with open(os.path.join('gpt-2/models', model_name, 'encoder.json'), 'r') as f:
        encoder = json.load(f)
    with open(os.path.join('gpt-2/models', model_name, 'vocab.bpe'), 'r', encoding="utf-8") as f:
        bpe_data = f.read()
    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
    return Encoder(
        encoder=encoder,
        bpe_merges=bpe_merges,
    )

def encode(x, y, model_name='117M'):
    enc = get_encoder(model_name)
    xcode = []
    for i, x in enumerate(x):
        vec = enc.encode(x)
        xcode.append(vec)
    x = xcode
    return zip(*sorted(zip(x, y), key = lambda t: len(t[0])))
      
def get_data(filename, total=None):
    x_all, y_all = read_amazon_kaggle(filename, total)
    x_all, y_all = encode(x_all, y_all)
    x_all, y_all = (np.asarray(x_all), np.asarray(y_all))
    return x_all, y_all

def mod_by_batchsize(x):
    l = (len(x) // batch_size) * batch_size
    x = x[:l]
    return x
    
test_size = 50000
train_size = 20000
val_size = 2500
train_val_size = train_size + val_size

x_all, y_all = get_data('gptransfer/data/amazonreviews/train.ft.txt', test_size+train_size+val_size)

x_train, x_test, y_train, y_test =  train_test_split(x_all, y_all, test_size=test_size,random_state=seed)

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=val_size,random_state=seed)

x_train, x_val, x_test, y_train, y_val, y_test = tuple(mod_by_batchsize(x) for x in (x_train, x_val, x_test, y_train, y_val, y_test))


In [0]:
# Hyper Param definition cell
frozen_layers = 8 # how many GPT2 layers to not train (out of 12)

In [0]:
# This cell contains all code copied from GPT2. Original GPT2 model with slight modifications.

from model import shape_list, positions_for, norm, default_hparams, attn, gelu, conv1d

dropout_prob = None
def mlp(x, layer_num, scope, n_state, *, hparams):
    with tf.variable_scope(scope):
        nx = x.shape[-1].value
        h = gelu(conv1d(x, 'c_fc', n_state))
        h2 = conv1d(h, 'c_proj', nx)
        if layer_num > frozen_layers:
            h2 = tf.nn.dropout(h2, rate=dropout_prob)
        return h2
    
def block(x, layer_num, scope, *, past, hparams):
    with tf.variable_scope(scope):
        nx = x.shape[-1].value
        a, present = attn(norm(x, 'ln_1'), 'attn', nx, past=past, hparams=hparams)
        x = x + a
        m = mlp(norm(x, 'ln_2'), layer_num, 'mlp', nx*4, hparams=hparams)
        x = x + m
        return x, present
    
def custom_model(hparams, X, past=None, scope='model', reuse=False):
    global dropout_prob
    dropout_prob = tf.placeholder_with_default(0.0, shape=())
    with tf.variable_scope(scope, reuse=reuse):
        results = {}
        batch, sequence = shape_list(X)

        wpe = tf.get_variable('wpe', [hparams.n_ctx, hparams.n_embd],
                             initializer=tf.random_normal_initializer(stddev=0.01))
        wte = tf.get_variable('wte', [hparams.n_vocab, hparams.n_embd],
                             initializer=tf.random_normal_initializer(stddev=0.02))
        past_length = 0 if past is None else tf.shape(past)[-2]
        h = tf.gather(wte, X) + tf.gather(wpe, positions_for(X, past_length))
        
        # Transformer
        presents = []
        pasts = tf.unstack(past, axis=1) if past is not None else [None] * hparams.n_layer
        assert len(pasts) == hparams.n_layer
        for layer, past in enumerate(islice(pasts, hparams.n_layer)):
            h, present = block(h, layer, 'h%d' % layer, past=past, hparams=hparams)
            if layer == frozen_layers:
                h = tf.stop_gradient(h)
            presents.append(present)
        results['present'] = tf.stack(presents, axis=1)
        h = norm(h, 'ln_f')

        # Language model loss.  Do tokens <n predict token n?
        h_flat = tf.reshape(h, [batch*sequence, hparams.n_embd])
        logits = tf.matmul(h_flat, wte, transpose_b=True)
        logits = tf.reshape(logits, [batch, sequence, hparams.n_vocab])
        results['logits'] = logits

        results['h'] = h
        return results
 
def small_model(hparams, X, **kwargs):
    hparams.n_layer = 5
    return custom_model(hparams, X, **kwargs)
    

In [0]:
from keras.preprocessing.sequence import pad_sequences
run_options = tf.RunOptions(report_tensor_allocations_upon_oom = True)

def load_model(model_name='117M'):

    with open(os.path.join('gpt-2/models', model_name, 'hparams.json')) as f:
        hparams = model.default_hparams()
        hparams.override_from_dict(json.load(f))   
        def step(hparams, tokens, past=None):
            lm_output = custom_model(hparams=hparams, X=tokens, past=None, reuse=tf.AUTO_REUSE)

            h = lm_output['h']
            return {
                'h': h
            }
        
        X = tf.placeholder(tf.int32, [batch_size, None])
        outputs = step(hparams, X) # (batch, sequence, embedding)
        
        saver = tf.train.Saver()
        ckpt = tf.train.latest_checkpoint(os.path.join('gpt-2/models', model_name))
        saver.restore(sess, ckpt)
        
    return X, outputs

def add_binary_finetune(outputs):
    with tf.variable_scope('binary_finetune'):
        
        final_embd = outputs['h'][:,-1,:]
        w = tf.get_variable('w', (final_embd.shape[-1],), initializer=tf.contrib.layers.xavier_initializer(uniform=False))
        b = tf.get_variable('b', (final_embd.shape[0],), initializer=tf.constant_initializer(0))
        sess.run(tf.variables_initializer([w,b]))
        l2loss = tf.nn.l2_loss(w)
        logits = tf.tensordot(final_embd,w, [[1],[0]], name='z') + b
        ypred = tf.tanh(logits)
        
        ytrue = tf.placeholder(tf.float32, (batch_size,))
        incorrects = tf.not_equal(tf.sign(ytrue), tf.sign(ypred), name='incorrects')
        incorrects = tf.stop_gradient(tf.cast(incorrects, tf.float32))
        
       
        loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=ytrue, logits=logits) + l2loss * .01
        
        global_step = tf.Variable(0, trainable=False)
        starter_learning_rate = 0.001
        learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                           10000, 0.95, staircase=False)
        optim = tf.train.AdamOptimizer(learning_rate=learning_rate)
        minimize = optim.minimize(loss, global_step=global_step)
        sess.run(tf.variables_initializer(optim.variables() + [global_step]))
        
    return ytrue, ypred, minimize, optim


def get_train_infer(X, outputs, ytrue, ypred, minimize):
    
    def train(dataX, labels):
        for i in range(0, len(dataX), batch_size):
            xfeed = pad_sequences(dataX[i:i+batch_size])
            yfeed = labels[i:i+batch_size]
            sess.run(minimize, options=run_options, feed_dict={X:xfeed, ytrue:yfeed, dropout_prob: 0.5})

    def infer(dataX):
        preds = []
        for i in range(0, len(dataX), batch_size):
            xfeed = pad_sequences(dataX[i:i+batch_size])
            predbatch = sess.run(ypred, options=run_options, feed_dict={X: xfeed})
            preds.append(predbatch)
                
        preds = np.concatenate(preds, axis=0)
        return preds

    return train, infer


Using TensorFlow backend.


In [0]:
print(np.count_nonzero(np.sign(y_test - .5) == np.sign(pred_test - .5)) / y_test.shape[0])

0.9246
