<a href="https://colab.research.google.com/github/albertwujj/gptransfer/blob/master/gptransfer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from os.path import join
from google.colab import drive

# Mount Google Drive inside the Colab VM. force_remount=True is passed so the
# mount is redone even if a previous mount is still present.
ROOT = "/content/drive"
drive.mount(ROOT, force_remount=True)

# Switch to the project folder: every path used below ('gpt-2/...',
# 'gptransfer/...') is relative to this directory.
PROJ = "My Drive/ColabExperiments/"
PROJECT_PATH = join(ROOT, PROJ)
%cd "{PROJECT_PATH}"

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive
/content/drive/My Drive/ColabExperiments


In [0]:
# Sanity check: the cells below read from the `gpt-2` and `gptransfer` directories here.
!ls

gpt-2  gptransfer


In [0]:
from importlib.machinery import SourceFileLoader
# Load the GPT-2 source files straight from their paths under gpt-2/src.
# Later cells import from them by module name (`from model import ...`,
# `from encoder import Encoder`), so the names given here matter.
# NOTE(review): `load_module()` is deprecated in importlib; if this is migrated to
# spec_from_file_location/exec_module, the modules must still be registered in
# sys.modules under these names or the by-name imports below will break.
model = SourceFileLoader('model', 'gpt-2/src/model.py').load_module()
encoder = SourceFileLoader('encoder', 'gpt-2/src/encoder.py').load_module()
sample = SourceFileLoader('sample', 'gpt-2/src/sample.py').load_module()

In [0]:
import tensorflow as tf
import numpy as np
import gc

from itertools import islice
import json
import os
import random
from timeit import default_timer as timer

# Seed every RNG in use (Python `random`, NumPy, TensorFlow) with one shared value.
# `seed` is also reused further down as the train/validation split's random_state.
# NOTE(review): tf.set_random_seed acts on the default graph at the time of the
# call — confirm the model is built in that same graph.
seed = 1957
random.seed(seed)
np.random.seed(seed)
tf.set_random_seed(seed)

In [0]:
from model import block, shape_list, positions_for, norm, default_hparams

def custom_model(hparams, X, past=None, scope='model', reuse=False):
    """Build the transformer graph and expose its logits and final hidden states.

    Args:
        hparams: hyperparameters; n_ctx, n_embd, n_vocab and n_layer are read.
        X: token-id tensor for which `shape_list(X)` yields (batch, sequence).
        past: optional cached state; when given it is unstacked along axis 1
            into one entry per layer.
        scope: variable scope under which all variables are created.
        reuse: forwarded to `tf.variable_scope`.

    Returns:
        dict with three entries:
            'present': the per-layer `present` tensors from `block`, stacked on axis 1.
            'logits':  [batch, sequence, n_vocab] scores computed against `wte`.
            'h':       the hidden states after the final `ln_f` normalisation.
    """
    with tf.variable_scope(scope, reuse=reuse):
        batch, sequence = shape_list(X)

        # Position and token embedding tables.
        wpe = tf.get_variable('wpe', [hparams.n_ctx, hparams.n_embd],
                              initializer=tf.random_normal_initializer(stddev=0.01))
        wte = tf.get_variable('wte', [hparams.n_vocab, hparams.n_embd],
                              initializer=tf.random_normal_initializer(stddev=0.02))

        if past is None:
            past_length = 0
        else:
            past_length = tf.shape(past)[-2]
        h = tf.gather(wte, X) + tf.gather(wpe, positions_for(X, past_length))

        # Transformer: one `block` per layer, each fed its own slice of `past`.
        if past is None:
            layer_pasts = [None] * hparams.n_layer
        else:
            layer_pasts = tf.unstack(past, axis=1)
        assert len(layer_pasts) == hparams.n_layer

        layer_presents = []
        for layer_idx, layer_past in enumerate(islice(layer_pasts, hparams.n_layer)):
            h, layer_present = block(h, 'h%d' % layer_idx, past=layer_past, hparams=hparams)
            layer_presents.append(layer_present)
        stacked_presents = tf.stack(layer_presents, axis=1)
        h = norm(h, 'ln_f')

        # Language-model head: project hidden states back onto the token embeddings.
        h_flat = tf.reshape(h, [batch*sequence, hparams.n_embd])
        flat_logits = tf.matmul(h_flat, wte, transpose_b=True)
        logits = tf.reshape(flat_logits, [batch, sequence, hparams.n_vocab])

        return {
            'present': stacked_presents,
            'logits': logits,
            'h': h,
        }
 
def small_model(hparams, X, *, n_layer=5, **kwargs):
    """Build `custom_model` with a reduced number of transformer layers.

    The caller's `hparams` object is left untouched: the layer count is
    overridden on a shallow copy, so building a small model no longer changes
    the depth of any model built afterwards from the same hparams.

    Args:
        hparams: hyperparameters object accepted by `custom_model`.
        X: token-id tensor forwarded to `custom_model`.
        n_layer: number of transformer layers to build (keyword-only, default 5).
        **kwargs: forwarded unchanged to `custom_model` (past, scope, reuse).

    Returns:
        The results dict returned by `custom_model`.
    """
    import copy  # local import: keeps this cell self-contained

    reduced_hparams = copy.copy(hparams)
    reduced_hparams.n_layer = n_layer
    return custom_model(reduced_hparams, X, **kwargs)
    

In [0]:

from sklearn.model_selection import train_test_split
from encoder import Encoder

def read_amazon_kaggle(filename, lines=None):
    """Read a labelled review file into parallel tuples of texts and +/-1 labels.

    Each non-blank line is split by position: the character at index 9 is the
    label digit ('2' maps to +1, anything else to -1) and everything from
    index 10 onwards, stripped, is the review text.

    Args:
        filename: path of the review file; it is decoded as UTF-8.
        lines: optional number of reviews to draw at random (without
            replacement, via `random.sample`). A falsy value keeps every review.

    Returns:
        (x, y): a tuple of review texts and a tuple of labels in the same
        order. Both are empty tuples when the file holds no reviews.

    Raises:
        ValueError: if `lines` is larger than the number of reviews read.
        IndexError: if a non-blank line is shorter than 10 characters.
    """
    labelled = []
    # Explicit encoding so decoding does not depend on the machine's locale.
    with open(filename, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # blank lines (e.g. a trailing newline) carry no review
            label = 1 if line[9] == '2' else -1
            labelled.append((label, line[10:].strip()))

    if lines:
        labelled = random.sample(labelled, lines)
    if not labelled:
        # zip(*[]) yields nothing, so the two-way unpack below would fail.
        return (), ()

    y, x = zip(*labelled)
    return x, y

def get_encoder(model_name):
    """Create the BPE `Encoder` for the model stored in gpt-2/models/<model_name>.

    Reads `encoder.json` (passed to Encoder as `encoder`) and `vocab.bpe`
    (turned into a list of merge tuples, passed as `bpe_merges`).
    """
    model_dir = os.path.join('gpt-2/models', model_name)

    with open(os.path.join(model_dir, 'encoder.json'), 'r') as vocab_file:
        token_table = json.load(vocab_file)

    with open(os.path.join(model_dir, 'vocab.bpe'), 'r', encoding="utf-8") as merges_file:
        merge_lines = merges_file.read().split('\n')

    # The first and last entries are dropped — presumably a header line and the
    # empty string left after the file's trailing newline.
    merges = []
    for merge_line in merge_lines[1:-1]:
        merges.append(tuple(merge_line.split()))

    return Encoder(encoder=token_table, bpe_merges=merges)

def encode(x, y, model_name='117M'):
    """BPE-encode the texts and return them with their labels, shortest first.

    Args:
        x: iterable of texts.
        y: labels aligned with `x`.
        model_name: which model's encoder files `get_encoder` should load.

    Returns:
        An iterator yielding two tuples — the encoded texts and the matching
        labels — both ordered by ascending encoded length (ties keep input order).
    """
    enc = get_encoder(model_name)
    token_ids = [enc.encode(text) for text in x]

    pairs = list(zip(token_ids, y))
    pairs.sort(key=lambda pair: len(pair[0]))
    return zip(*pairs)
      

# Draw 100000 random reviews, BPE-encode them, and hold out 4% for validation.
x_all, y_all = read_amazon_kaggle('gptransfer/data/amazonreviews/train.ft.txt', 100000)
x_all, y_all = encode(x_all, y_all)
# The encoded reviews have different lengths, so they must be stored as an
# object array of lists; without dtype=object newer NumPy versions refuse to
# build an array from ragged sequences.
x_all, y_all = (np.asarray(x_all, dtype=object), np.asarray(y_all))

x_train, x_val, y_train, y_val = train_test_split(x_all, y_all, test_size=0.04,random_state=seed)


In [0]:
from keras.preprocessing.sequence import pad_sequences
# Passed to every sess.run below; requests a report of tensor allocations if a run hits OOM.
run_options = tf.RunOptions(report_tensor_allocations_upon_oom = True)

# Fixed batch size: it is baked into the shapes of the token and label placeholders.
batch_size = 8

def load_model(model_name='117M'):
    """Build the GPT-2 graph as a feature extractor and restore its weights.

    Uses the module-level `sess` (the session to restore into) and
    `batch_size` (first dimension of the token placeholder).

    Args:
        model_name: sub-directory of gpt-2/models holding hparams.json and
            the checkpoint files.

    Returns:
        (X, outputs): `X` is the int32 token placeholder of shape
        [batch_size, None]; `outputs` is a dict whose 'h' entry is the 'h'
        tensor returned by `custom_model`.

    Raises:
        ValueError: if no checkpoint can be found for `model_name`.
    """
    model_dir = os.path.join('gpt-2/models', model_name)

    # Only the JSON read needs the file; close it before building the graph.
    hparams = model.default_hparams()
    with open(os.path.join(model_dir, 'hparams.json')) as f:
        hparams.override_from_dict(json.load(f))

    X = tf.placeholder(tf.int32, [batch_size, None])
    lm_output = custom_model(hparams=hparams, X=X, past=None, reuse=tf.AUTO_REUSE)
    outputs = {'h': lm_output['h']}  # (batch, sequence, embedding)

    # The Saver is created before any other variables are added to the graph,
    # so it covers the variables built by custom_model above.
    saver = tf.train.Saver()
    ckpt = tf.train.latest_checkpoint(model_dir)
    if ckpt is None:
        raise ValueError('No checkpoint found in %r' % model_dir)
    saver.restore(sess, ckpt)

    return X, outputs

def add_binary_finetune(outputs):
    """Attach a tanh binary-classification head, its loss and an Adam train op.

    The head reads the hidden state at the last sequence position,
    L2-normalises it and computes tanh(<embedding, w> + b). Labels are
    expected as +/-1 values fed through the returned `ytrue` placeholder.
    All new variables are initialised in the module-level `sess`.

    Args:
        outputs: dict with an 'h' tensor indexed as [batch, sequence, embedding].

    Returns:
        (ytrue, ypred, minimize, optim): the float32 label placeholder of
        shape (batch_size,), the prediction tensor, the training op and the
        AdamOptimizer instance.
    """
    with tf.variable_scope('binary_finetune'):

        final_embd = tf.math.l2_normalize(outputs['h'][:,-1,:], axis=-1)
        w = tf.get_variable('w', (final_embd.shape[-1],), initializer=tf.contrib.layers.xavier_initializer(uniform=False))
        # One scalar bias shared by every example. It used to be shaped by the
        # batch dimension, which gave each batch slot its own bias.
        b = tf.get_variable('b', (), initializer=tf.constant_initializer(0))
        sess.run(tf.variables_initializer([w,b]))
        ypred = tf.tanh(tf.tensordot(final_embd,w, [[1],[0]], name='z') + b)

        ytrue = tf.placeholder(tf.float32, (batch_size,))
        # 1.0 where the predicted sign disagrees with the label, else 0.0;
        # used as a constant mask (no gradient flows through it).
        incorrects = tf.not_equal(tf.sign(ytrue), tf.sign(ypred), name='incorrects')
        incorrects = tf.stop_gradient(tf.cast(incorrects, tf.float32))

        # Squared error, counted only for examples whose sign is currently wrong.
        loss = tf.math.reduce_mean(tf.math.square((ypred - ytrue) * incorrects), name='loss')

        global_step = tf.Variable(0, trainable=False)
        starter_learning_rate = 0.0003
        learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                           500, 0.94, staircase=False)
        optim = tf.train.AdamOptimizer(learning_rate=learning_rate)
        minimize = optim.minimize(loss, global_step=global_step)
        # Adam's own variables only exist once minimize() has been called.
        sess.run(tf.variables_initializer(optim.variables() + [global_step]))

    return ytrue, ypred, minimize, optim


def get_train_infer(X, outputs, ytrue, ypred, minimize):
    """Create `train` and `infer` closures bound to the given graph tensors.

    Both closures run in the module-level `sess` and cut their input into
    chunks of the module-level `batch_size`, padding each chunk to a common
    length with `pad_sequences`.

    Args:
        X: token placeholder to feed.
        outputs: unused here; kept for interface compatibility.
        ytrue: label placeholder fed during training.
        ypred: prediction tensor evaluated during inference.
        minimize: training op run once per batch.

    Returns:
        (train, infer):
            train(dataX, labels) runs one optimisation step per batch.
            infer(dataX) returns one prediction per input sequence as a 1-D array.
    """

    def train(dataX, labels):
        # NOTE(review): a trailing batch smaller than batch_size is fed as-is;
        # this will fail if the placeholders have a fixed first dimension.
        for i in range(0, len(dataX), batch_size):
            xfeed = pad_sequences(dataX[i:i+batch_size])
            yfeed = labels[i:i+batch_size]
            sess.run(minimize, options=run_options, feed_dict={X:xfeed, ytrue:yfeed})

    def infer(dataX):
        preds = []
        for i in range(0, len(dataX), batch_size):
            xfeed = pad_sequences(dataX[i:i+batch_size])
            n_real = xfeed.shape[0]
            if n_real < batch_size:
                # Fill a short final batch by repeating its last row so the
                # feed always has batch_size rows; the fillers' predictions
                # are discarded below.
                filler = np.repeat(xfeed[-1:], batch_size - n_real, axis=0)
                xfeed = np.concatenate([xfeed, filler], axis=0)
            predbatch = sess.run(ypred, options=run_options, feed_dict={X: xfeed})
            preds.append(predbatch[:n_real])

        if not preds:
            # np.concatenate rejects an empty list; return an empty prediction array.
            return np.empty((0,), dtype=ypred.dtype.as_numpy_dtype)
        return np.concatenate(preds, axis=0)

    return train, infer


In [0]:
from sklearn.metrics import roc_auc_score
from tensorflow.contrib.memory_stats.python.ops.memory_stats_ops import MaxBytesInUse

config = tf.ConfigProto()
config.gpu_options.allow_growth = True

with tf.device('/device:GPU:0'):
    with tf.Session(graph=tf.Graph(),config=config) as sess:
        max_bytes_in_use = MaxBytesInUse()

        X, outputs = load_model()
        ytrue, ypred, minimize, optim = add_binary_finetune(outputs)
        train, infer = get_train_infer(X, outputs, ytrue, ypred, minimize)
           
        eval_batchsize = 500 * batch_size
        start = timer()
        with open('gptransfer/results/out', 'w', buffering=1) as out:
            for i in range(0, x_train.shape[0], eval_batchsize):

                end = min(x_train.shape[0], i+eval_batchsize)
                x_train_b = x_train[i:end]
                y_train_b = y_train[i:end]
                train(x_train_b, y_train_b)

                pred_val = infer(x_val)
                roc_score = roc_auc_score(y_val, pred_val)
                corrects = np.count_nonzero(y_val == np.sign(pred_val)) / y_val.shape[0]

                gb_used = sess.run(max_bytes_in_use) / 1e9
                print(f'roc score: {roc_score}')
                print(f'corrects: {corrects}')

                printout = lambda w: out.write(w + '\n')
                printout(f'roc score {i//eval_batchsize}: {roc_score:.3f}')
                printout(f'corrects {i//eval_batchsize}: {corrects:.3f}')
                printout(f'GPU GB used {i//eval_batchsize}: {gb_used:.2f}')

                print(f'GPU GB used {i//eval_batchsize}: {gb_used:.2f}')
                print(f'elapsed minutes {i//eval_batchsize}: {(timer() - start)//60}')
                
                %cp gptransfer/results/out gptransfer/results/out_backup
 

            