<a href="https://colab.research.google.com/github/albertwujj/gptransfer/blob/master/gpshare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# MOUNT GOOGLE DRIVE
# Mounts Google Drive at ROOT, creates (if missing) a project folder called
# 'ColabExperiments1921' inside 'My Drive', and makes that folder the working directory.

# Approach used in this notebook: the final embedding output is put through a fully-connected
# layer and trained on binary cross-entropy; the earlier layers of the decoder are frozen and
# regularization is added.

import os
from os.path import join
from google.colab import drive

# Start from the filesystem root before changing into the mount point.
%cd /

ROOT = '/content/drive1921/'
# The mount is skipped when the mount point already exists.
if not os.path.exists(ROOT):
    drive.mount(ROOT)
%cd '{ROOT}'

# PROJ is a relative path; it is resolved against ROOT, the current directory at this point.
PROJ = 'My Drive/ColabExperiments1921/'
if not os.path.exists(PROJ):
    !mkdir '{PROJ}'
%cd '{PROJ}'


In [0]:
# Download data+code
# The archive is only fetched when no 'lmtransfer' directory exists in the current folder.
# NOTE(review): presumably gpshare.zip unpacks into 'lmtransfer/' - confirm, because the
# %cd below depends on that directory existing.
if not os.path.exists('lmtransfer'):
    !gdown https://drive.google.com/uc?id=1VxxPnBOLh7a0kl6XCUFundH4yD-McWRu
    !unzip gpshare.zip
    !rm gpshare.zip
%cd lmtransfer


In [0]:
# Setup code modules and path
# Work from a 'src' sub-directory (created on first run) so that '../input/' resolves to the
# data/code folder next to it.
if not os.path.exists('src'):
    !mkdir src
%cd src

from importlib.machinery import SourceFileLoader
# `path` is also read by get_encoder() and load_model() further down.
path = '../input/'
# Load the GPT-2 source files directly from their file paths under the names 'model',
# 'encoder' and 'sample'. Later cells import 'encoder' and 'model' by those names.
# NOTE(review): these assignments bind `model`, `encoder` and `sample` as notebook globals;
# `encoder` is later shadowed locally inside get_encoder().
model = SourceFileLoader('model', path+'gpt-2/src/model.py').load_module()
encoder = SourceFileLoader('encoder', path+'gpt-2/src/encoder.py').load_module()
sample = SourceFileLoader('sample', path+'gpt-2/src/sample.py').load_module()


In [0]:
# import libraries

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import numpy as np
import pandas as pd
import csv
import sys
from sklearn import metrics
np.set_printoptions(precision=3,threshold=sys.maxsize)
import gc

from itertools import islice
import json
import random
from timeit import default_timer as timer



# NOTE(review): `seed` is not read anywhere in the visible notebook; the generators below are
# seeded with the literal 1957 instead.
seed = None
# Seed Python's `random`, NumPy's global generator and TensorFlow's graph-level seed.
random.seed(1957)
np.random.seed(1957)
tf.set_random_seed(1957)

In [0]:
# Prepare data

# Number of examples fed per training / inference step (read by mod_by_batchsize,
# add_binary_finetune and get_train_infer).
batch_size = 10
# NOTE(review): declared `global` inside get_encoder() but never assigned in the visible code.
clf_token = None
from encoder import Encoder


def read_amazon_kaggle(lines=None, test=False):
    """Load one split of the fastText-formatted Amazon reviews as parallel columns.

    Every record is parsed positionally: the character at index 9 is the label
    digit (``'2'`` maps to 1, anything else to 0) and the review text starts at
    index 10. Lines too short to carry a label digit (e.g. blank lines) are
    skipped instead of raising ``IndexError``.

    Args:
        lines: optional number of reviews to draw at random without replacement
            (``random.sample``). A falsy value keeps every review. A value larger
            than the number of available reviews is capped at that number.
        test: read ``test.ft.txt`` when true, ``train.ft.txt`` otherwise.

    Returns:
        ``(x, y, weights, identities)``: ``x`` is a tuple of review strings, ``y``
        a tuple of 0/1 labels, ``weights`` an all-ones array of shape ``(n,)`` and
        ``identities`` an all-zeros array of shape ``(n, 1)``. The last two exist
        only for compatibility with another dataset the author tests on.
    """
    dataset = 'test' if test else 'train'
    # Explicit encoding so decoding does not depend on the platform default.
    with open(f'../input/amazonreviews/{dataset}.ft.txt', encoding='utf-8') as f:
        y_x = [(1 if line[9] == '2' else 0, line[10:].strip())
               for line in f if len(line) > 9]
    if lines:
        # random.sample raises ValueError when asked for more items than exist.
        y_x = random.sample(y_x, min(lines, len(y_x)))
    if y_x:
        y, x = zip(*y_x)
    else:
        # zip(*[]) yields nothing to unpack; return empty columns instead.
        x, y = (), ()
    return x, y, np.ones((len(x),)), np.zeros((len(y), 1))

def get_encoder(model_name):
    """Build the BPE ``Encoder`` for the GPT-2 model stored under ``path``.

    Reads ``encoder.json`` (passed to ``Encoder`` as ``encoder``) and ``vocab.bpe``
    from ``<path>/gpt-2/models/<model_name>/``. The first and last entries of the
    newline-split ``vocab.bpe`` content are dropped; every remaining line is split
    on whitespace into a merge tuple.
    """
    model_dir = os.path.join(path, 'gpt-2/models', model_name)

    with open(os.path.join(model_dir, 'encoder.json'), 'r') as vocab_file:
        token_table = json.load(vocab_file)

    with open(os.path.join(model_dir, 'vocab.bpe'), 'r', encoding="utf-8") as merges_file:
        merge_lines = merges_file.read().split('\n')[1:-1]

    merges = [tuple(entry.split()) for entry in merge_lines]
    return Encoder(encoder=token_table, bpe_merges=merges)

def encode(datapack, model_name='117M'):
    """Tokenise the texts in ``datapack[0]`` and sort all columns by token length.

    ``datapack`` is a tuple of parallel columns whose first column holds the raw
    texts. Each text is BPE-encoded and the id of the long-underscore vocabulary
    entry is appended as the final token. The rows are then sorted by encoded
    length (shortest first) and returned column-wise as a list of NumPy arrays.
    """
    enc = get_encoder(model_name)
    marker = '________________________________________________________________'
    token_rows = [enc.encode(text) + [enc.encoder[marker]] for text in datapack[0]]

    columns = (np.asarray(token_rows),) + datapack[1:]
    rows_by_length = sorted(zip(*columns), key=lambda row: len(row[0]))
    return [np.asarray(column) for column in zip(*rows_by_length)]
      

def mod_by_batchsize(x):
    """Trim ``x`` from the end so its length is a whole multiple of ``batch_size``."""
    usable = len(x) - len(x) % batch_size
    return x[:usable]


def read_test(lines=None):
    """Load and encode the Jigsaw test comments plus the sample-submission frame.

    Args:
        lines: optional number of rows to sample from each frame; falsy keeps all rows.

    Returns:
        ``(x, inds, df_submit)``: the first two come from ``encode`` applied to the
        (comment texts, ids) pair; ``df_submit`` is the (possibly sampled)
        sample-submission DataFrame.
    """
    test_df = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')
    df_submit = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/sample_submission.csv')
    if lines:
        # NOTE(review): the two frames are sampled independently, so the rows kept in
        # df_submit need not correspond to the rows kept in test_df - confirm intent.
        test_df = test_df.sample(lines)
        df_submit = df_submit.sample(lines)

    # Strip double-quote characters from both ends of every comment.
    comments = [text.strip('\"') for text in test_df['comment_text'].tolist()]
    comment_ids = test_df['id'].tolist()
    x, inds = encode((comments, comment_ids))
    return x, inds, df_submit


# Number of training reviews to sample at random from the training split.
lines = 30000
datapack = read_amazon_kaggle(lines=lines)
# encode() returns the columns sorted by encoded length, so x_all / y_all / weights /
# identities stay row-aligned with each other.
x_all, y_all, weights, identities = encode(datapack)
# The whole test split is used; only the (text, label) columns are kept.
x_test, y_test = encode(read_amazon_kaggle(test=True)[:2])
print(len(x_test))
# Rescale the per-example weights so their mean is 1.
weights /= np.mean(weights)


In [0]:
# Global Hyper Param definition 

# Both are assigned from the search `space` dict by run_infer() / objective() before the
# graph is built; attn() and mlp() read `frozen_blocks`, add_binary_finetune() reads `lr`.
frozen_blocks = None
lr = None

In [0]:
# Setup OpenAI GPT-2 for training (add dropout)

from model import *
import model

# Scalar placeholder created by custom_model(); defaults to 0.0 and is fed 1 while training.
is_train = None
# Index of the transformer block currently being built; set by custom_model() and read by
# attn() / mlp() while the graph is constructed.
curr_layer = None
# Active tf.Session; assigned by run_infer() and by run_fold() inside val().
sess = None

def dropout(x, rate=.1, noise_shape=None):
    """Apply dropout whose rate is scaled by the global ``is_train`` value.

    The effective rate is ``rate * is_train``, so with ``is_train`` at 0 the rate
    passed to ``tf.nn.dropout`` is 0.
    """
    effective_rate = rate * is_train
    return tf.nn.dropout(x, noise_shape=noise_shape, rate=effective_rate)

def attn(x, scope, n_state, *, past, hparams):
    """Masked multi-head self-attention with optional dropout.

    Dropout is applied to the attention weights and to the output projection
    unless the global ``curr_layer`` is listed in the global ``frozen_blocks``;
    both globals are read while the graph is being built.

    Args:
        x: rank-3 tensor, [batch, sequence, features].
        scope: variable scope name for this layer's variables.
        n_state: projection width; must be divisible by ``hparams.n_head``.
        past: optional rank-5 tensor of cached keys/values,
            [batch, 2, heads, sequence, features], or None.
        hparams: hyper-parameters; only ``n_head`` is read here.

    Returns:
        ``(a, present)``: the attention output and the stacked [k, v] tensor
        computed from ``x`` (before any ``past`` is concatenated).
    """
    assert x.shape.ndims == 3  # Should be [batch, sequence, features]
    assert n_state % hparams.n_head == 0
    if past is not None:
        assert past.shape.ndims == 5  # Should be [batch, 2, heads, sequence, features], where 2 is [k, v]

    def split_heads(x):
        # From [batch, sequence, features] to [batch, heads, sequence, features]
        return tf.transpose(split_states(x, hparams.n_head), [0, 2, 1, 3])

    def merge_heads(x):
        # Reverse of split_heads
        return merge_states(tf.transpose(x, [0, 2, 1, 3]))

    def mask_attn_weights(w):
        # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
        _, _, nd, ns = shape_list(w)
        b = attention_mask(nd, ns, dtype=w.dtype)
        b = tf.reshape(b, [1, 1, nd, ns])
        # Keep weights where the mask is 1 and push masked positions to -1e10.
        w = w*b - tf.cast(1e10, w.dtype)*(1-b)
        return w

    def multihead_attn(q, k, v):
        # q, k, v have shape [batch, heads, sequence, features]
        w = tf.matmul(q, k, transpose_b=True)
        # Scale by 1/sqrt(size of the last dimension of v).
        w = w * tf.rsqrt(tf.cast(v.shape[-1].value, w.dtype))

        w = mask_attn_weights(w)
        w = softmax(w)
        # Attention-weight dropout is skipped for frozen blocks.
        if curr_layer not in frozen_blocks:
            w = dropout(w)
        a = tf.matmul(w, v)
        return a

    with tf.variable_scope(scope):
        # One projection produces q, k and v, which are then split along the feature axis.
        c = conv1d(x, 'c_attn', n_state*3)
        q, k, v = map(split_heads, tf.split(c, 3, axis=2))
        present = tf.stack([k, v], axis=1)
        if past is not None:
            # Prepend the cached keys/values along the sequence axis.
            pk, pv = tf.unstack(past, axis=1)
            k = tf.concat([pk, k], axis=-2)
            v = tf.concat([pv, v], axis=-2)
        a = multihead_attn(q, k, v)
        a = merge_heads(a)
        a = conv1d(a, 'c_proj', n_state)
        # Output dropout is likewise skipped for frozen blocks.
        if curr_layer not in frozen_blocks:
            a = dropout(a)
        return a, present


def mlp(x, scope, n_state, *, hparams):
    """Two-layer feed-forward block: project to ``n_state``, GELU, project back.

    The result is passed through ``dropout`` unless the global ``curr_layer`` is
    listed in the global ``frozen_blocks``. ``hparams`` is accepted but not read.
    """
    with tf.variable_scope(scope):
        width_in = x.shape[-1].value
        hidden = gelu(conv1d(x, 'c_fc', n_state))
        projected = conv1d(hidden, 'c_proj', width_in)
        return projected if curr_layer in frozen_blocks else dropout(projected)
    
# Replace the `mlp` and `attn` attributes of the loaded GPT-2 `model` module with the
# dropout-enabled versions defined above.
model.mlp = mlp
model.attn = attn



# Block index at which custom_model() starts applying tf.stop_gradient; run_infer() and
# random_search() also use it to build `frozen_blocks` as range(frozen_layer).
frozen_layer = 5
def custom_model(hparams, X, past=None, scope='model', reuse=False):
    """Build the GPT-2 transformer graph with the lower blocks cut off from gradients.

    Side effects: creates the global ``is_train`` placeholder (default 0.0) and
    updates the global ``curr_layer`` while iterating over the blocks, so that
    ``attn`` / ``mlp`` can decide whether to add dropout.

    Gradient flow is cut exactly once, on the input of block ``frozen_layer``.
    That freezes the embeddings and blocks ``0 .. frozen_layer - 1`` while blocks
    ``frozen_layer .. n_layer - 1`` still receive gradients. (The previous
    ``>=`` comparison cut the input of every later block as well, which left only
    the very last block trainable.)

    Args:
        hparams: hyper-parameters (``n_ctx``, ``n_embd``, ``n_vocab``, ``n_layer`` are read).
        X: integer token-id tensor; ``shape_list(X)`` must give (batch, sequence).
        past: optional cached keys/values, unstacked along axis 1 into one entry per layer.
        scope: variable scope name.
        reuse: forwarded to ``tf.variable_scope``.

    Returns:
        dict with keys ``'present'`` (per-layer presents stacked on axis 1),
        ``'logits'`` (language-model logits, [batch, sequence, n_vocab]),
        ``'h'`` (final normalised hidden states) and ``'wte'`` (token embedding matrix).
    """
    global is_train, curr_layer
    is_train = tf.placeholder_with_default(0.0, shape=())
    with tf.variable_scope(scope, reuse=reuse):
        results = {}
        batch, sequence = shape_list(X)

        wpe = tf.get_variable('wpe', [hparams.n_ctx, hparams.n_embd],
                             initializer=tf.random_normal_initializer(stddev=0.01))
        wte = tf.get_variable('wte', [hparams.n_vocab, hparams.n_embd],
                             initializer=tf.random_normal_initializer(stddev=0.02))
        past_length = 0 if past is None else tf.shape(past)[-2]
        h = tf.gather(wte, X) + tf.gather(wpe, positions_for(X, past_length))

        # Transformer
        presents = []
        pasts = tf.unstack(past, axis=1) if past is not None else [None] * hparams.n_layer
        assert len(pasts) == hparams.n_layer
        # `layer_past` avoids rebinding the `past` argument inside the loop.
        for layer, layer_past in enumerate(islice(pasts, hparams.n_layer)):
            curr_layer = layer
            if curr_layer == frozen_layer:
                # Single cut: nothing below this block receives gradients.
                h = tf.stop_gradient(h)
            h, present = block(h, 'h%d' % layer, past=layer_past, hparams=hparams)

            presents.append(present)
        results['present'] = tf.stack(presents, axis=1)
        h = norm(h, 'ln_f')

        # Language model loss.  Do tokens <n predict token n?
        h_flat = tf.reshape(h, [batch*sequence, hparams.n_embd])
        logits = tf.matmul(h_flat, wte, transpose_b=True)
        logits = tf.reshape(logits, [batch, sequence, hparams.n_vocab])
        results['logits'] = logits

        results['h'] = h

        results['wte'] = wte
        return results
 
def small_model(hparams, X, **kwargs):
    """Build ``custom_model`` with the transformer depth forced to 5 layers.

    Note that ``hparams.n_layer`` is overwritten on the object that was passed in.
    """
    truncated_depth = 5
    hparams.n_layer = truncated_depth
    return custom_model(hparams, X, **kwargs)

    

In [0]:
# Add fine-tuning

from keras.preprocessing.sequence import pad_sequences

hparams = None
def load_model(model_name='117M'):
    """Build the GPT-2 graph and restore its pretrained weights into ``sess``.

    Sets the global ``hparams`` to the model defaults overridden by the model's
    ``hparams.json``. Every trainable variable whose name does not contain
    ``'lm_h'`` is restored from the latest checkpoint in the model directory.

    Returns:
        ``(X, outputs)``: the int32 token placeholder of shape [None, None] and the
        result dict produced by ``custom_model``.
    """
    global hparams
    model_dir = os.path.join(path, 'gpt-2/models', model_name)
    with open(os.path.join(model_dir, 'hparams.json')) as config_file:
        hparams = model.default_hparams()
        hparams.override_from_dict(json.load(config_file))

    X = tf.placeholder(tf.int32, [None, None])
    outputs = custom_model(hparams=hparams, X=X, past=None, reuse=tf.AUTO_REUSE)

    restorable = [var for var in tf.trainable_variables() if 'lm_h' not in var.name]
    saver = tf.train.Saver(var_list=restorable)
    ckpt = tf.train.latest_checkpoint(model_dir)
    saver.restore(sess, ckpt)

    return X, outputs
# Per-example loss-weight placeholder; created by add_binary_finetune() and fed by train().
weights_t = None

# Sequence-returning LSTM with 500 units applied on top of the transformer output in
# add_binary_finetune(). It is constructed once here, at module level.
lstm_o1 = tf.keras.layers.CuDNNLSTM(500,time_major=False,return_sequences=True)


def add_binary_finetune(X, outputs):
    """Add the binary-classification head and the combined training loss.

    The transformer output goes through ``lstm_o1`` and dropout; the last time
    step is projected to a single logit. The loss is the batch mean of
    ``0.3 * LM loss + weight * (sigmoid cross-entropy + 0.02 * l2(w))``.

    Side effect: assigns the global ``weights_t`` placeholder.

    Args:
        X: token-id placeholder that produced ``outputs``.
        outputs: dict from ``custom_model``; the keys ``'h'`` and ``'wte'`` are read.

    Returns:
        ``(ytrue, ypred, optim, loss)``: label placeholder, sigmoid predictions,
        the Adam optimizer (not yet minimising anything) and the scalar loss.
    """
    global weights_t
    with tf.variable_scope('binary_finetune'):


        h = outputs['h']
        wte = outputs['wte']

        # also train on LM objective
        # Hidden states at positions 0..T-2 predict the tokens at positions 1..T-1.
        lm_h = outputs['h']
        lm_h = tf.reshape(lm_h[:, :-1, :], [-1, hparams.n_embd])
        lm_logits = tf.matmul(lm_h, wte, transpose_b=True)
        lm_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=lm_logits, labels=tf.reshape(X[:, 1:], [-1]))
        # Sum the per-token losses over the sequence axis: one LM loss per example.
        lm_losses = tf.reduce_sum(tf.reshape(lm_losses, [shape_list(X)[0], shape_list(X)[1]-1]), 1)

        h = lstm_o1.apply(h)
        h = dropout(h)
        final_embd = h[:,-1,:] # last time step: (batch, 500), 500 being the unit count of lstm_o1

        w = tf.get_variable('w', (final_embd.shape[-1],), initializer=tf.contrib.layers.xavier_initializer(uniform=False))
        b = tf.get_variable('b', (1,), initializer=tf.constant_initializer(0))



        l2loss = tf.nn.l2_loss(w)
        logits = tf.tensordot(final_embd,w, [[1],[0]], name='z') + b
        ypred = tf.nn.sigmoid(logits)

        ytrue = tf.placeholder(tf.float32, (None,))

        weights_t = tf.placeholder(tf.float32, (None,))
        # The l2 term sits inside the weighted bracket, so it is scaled by weights_t too.
        loss = tf.reduce_mean(.3 * lm_losses + weights_t * (tf.nn.sigmoid_cross_entropy_with_logits(labels=ytrue, logits=logits) + l2loss * .02))

        # NOTE(review): `learning_rate` below is never used - the optimizer is built with
        # the constant `lr` - and `global_step` is not returned, so the visible callers
        # cannot pass it to minimize(). The decay schedule therefore has no effect
        # (its end_learning_rate of 0.3 also looks unintended) - confirm before wiring it in.
        global_step = tf.Variable(0, trainable=False)
        starter_learning_rate = lr
        learning_rate = tf.train.polynomial_decay(starter_learning_rate, global_step,
                                           x_all.shape[0]*2//(batch_size), end_learning_rate=0.3,cycle=True)
        optim = tf.train.AdamOptimizer(learning_rate=lr)


    return ytrue, ypred, optim, loss

def get_train_infer(X, outputs, ytrue, ypred, minimize):
    """Create the ``train`` and ``infer`` closures bound to the given graph nodes.

    Both closures walk their input in consecutive chunks of the global
    ``batch_size``, pad each chunk with ``pad_sequences`` and run it through the
    global ``sess``. ``outputs`` is accepted but not used.

    Returns:
        ``(train, infer)`` where ``train(dataX, labels, weights_b)`` runs one pass of
        ``minimize`` with ``is_train`` fed as 1, and ``infer(dataX)`` returns the
        concatenated ``ypred`` values for all chunks.
    """
    def batch_windows(total):
        # Consecutive [start, start + batch_size) windows covering range(total).
        return (slice(start, start + batch_size) for start in range(0, total, batch_size))

    def train(dataX, labels, weights_b):
        for window in batch_windows(len(dataX)):
            feed = {
                X: pad_sequences(dataX[window]),
                ytrue: labels[window],
                weights_t: weights_b[window],
                is_train: 1,
            }
            sess.run(minimize, feed_dict=feed)

    def infer(dataX):
        batch_preds = [
            sess.run(ypred, feed_dict={X: pad_sequences(dataX[window])})
            for window in batch_windows(len(dataX))
        ]
        return np.concatenate(batch_preds, axis=0)

    return train, infer
from numba import cuda

In [0]:
from numba import cuda
# Training and testing

def run_infer(space):
    """Fine-tune on the full training set and print accuracy on the test set.

    Args:
        space: dict with the keys ``'lr'`` (learning rate for Adam) and
            ``'frozen_blocks'`` (collection of integer transformer-block indices
            whose variables are excluded from training and which get no dropout).

    Side effects: assigns the globals ``sess``, ``lr`` and ``frozen_blocks``,
    prints the trained variable names, timings and the test accuracy, and closes
    CUDA device 0 through numba at the end.
    """
    import re  # function-scope import: only needed to parse block indices below

    global sess, lr, frozen_blocks
    lr = space['lr']
    frozen_blocks = space['frozen_blocks']
    epochs = 2

    with tf.device('/device:GPU:0'), tf.Session(graph=tf.get_default_graph(),
                                                config=tf.ConfigProto(allow_soft_placement=True)) as sessl:
        sess = sessl
        X, outputs = load_model()
        ytrue, ypred, optim, loss = add_binary_finetune(X, outputs)

        # Transformer block variables are named 'model/h<index>/...'. The index has to be
        # parsed as a whole integer: the old check compared the single character
        # v.name[7] (a str) against integer indices, so it never matched, and it would
        # have read only the first digit of two-digit blocks such as h10 / h11.
        block_name = re.compile(r'model/h(\d+)/')

        def train_cond(v):
            if v.name.split('/')[0] == 'binary_finetune':
                return True
            match = block_name.match(v.name)
            return match is not None and int(match.group(1)) not in frozen_blocks

        train_vars = [v for v in tf.trainable_variables() if train_cond(v)]
        print([v.name for v in train_vars])
        minimize = optim.minimize(loss, var_list = train_vars)

        # Initialise only what the checkpoint restore did not cover (new head variables,
        # optimizer slots, ...), leaving the restored weights untouched.
        need_init = set(s.decode("utf-8") for s in sess.run(tf.report_uninitialized_variables()))
        print(need_init)
        init = tf.variables_initializer([v for v in tf.global_variables() if v.name.split(':')[0] in need_init])
        sess.run(init)

        train, infer = get_train_infer(X, outputs, ytrue, ypred, minimize)

        for epoch in range(epochs):
            start = timer()
            train(x_all, y_all, weights)
            print(f'time elapsed for epoch {epoch}: {timer()-start}')

        pred_test = infer(x_test)
        # A prediction counts as correct when it lies on the same side of 0.5 as its label.
        accuracy = np.count_nonzero(np.sign(y_test - .5) == np.sign(pred_test - .5)) / y_test.shape[0]
        print(f'Testing accuracy: {accuracy}')
        cuda.select_device(0)
        cuda.close()

# Single training/testing run: learning rate 3e-4, blocks 0 .. frozen_layer-1 frozen.
run_infer({'lr': .0003, 'frozen_blocks':list(range(frozen_layer)) + []})




In [0]:
# Training and validation, spawning new processes to clear GPU mem

from sklearn.metrics import roc_auc_score, roc_curve
from tensorflow.contrib.memory_stats.python.ops.memory_stats_ops import MaxBytesInUse
from sklearn.model_selection import KFold as KFold
from itertools import product
import multiprocess as mp
# NOTE(review): this `config` is not passed to any tf.Session in the visible notebook;
# run_infer() and run_fold() each build their own ConfigProto.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True


import pickle
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK

from functools import reduce, partial
import sklearn

def loadTrials(trials_name):
    """Load a pickled trials object and drop the jobs that did not finish.

    Entries of ``_dynamic_trials`` whose ``'state'`` is 1 or 3 are removed
    (presumably hyperopt's "running" and "error" job states - confirm against the
    installed hyperopt version). The remaining trials get consecutive ``'tid'``
    values starting at 0, ``_ids`` is reset and ``refresh()`` is called.

    Args:
        trials_name: path of the pickle file to read.

    Returns:
        The cleaned trials object.
    """
    # SECURITY: pickle.load can execute arbitrary code; only load files you created.
    # The context manager closes the file handle, which the bare open() call leaked.
    with open(trials_name, "rb") as trials_file:
        trials = pickle.load(trials_file)
    # delete all 'running' jobs- jobs stopped in execution
    trials._dynamic_trials = [t for t in trials._dynamic_trials if t['state'] not in (1, 3)]
    for i, t in enumerate(trials._dynamic_trials):
        t['tid'] = i
    trials._ids = set()
    trials.refresh()
    return trials

# Number of training passes per fold; read by run_fold() inside val().
# (run_infer() uses its own local `epochs`.)
epochs = 2
                        
def val(): 
    
    with open('out', 'w') as out:
        printout = lambda w: out.write(w+'\n')
        def objective(space):

            global lr, frozen_blocks
            lr = space['lr']
            frozen_blocks = space['frozen_blocks']
            hparams_string = reduce(lambda x,y:x+y,[f'{k} {v} ' for k, v in space.items()])
            print(hparams_string)
            printout(hparams_string)

            kf = KFold(n_splits=3,random_state=None, shuffle=True)
            scores = []
            for train_index, val_index in islice(kf.split(x_all, y_all), 2):
                x_train, weights_train, _, y_train, x_val, _, idens_val, y_val = [mod_by_batchsize(arr[inds]) for inds, arr in product([train_index, val_index], [x_all,weights,identities,y_all])]

                def run_fold(acc):
                    global sess
                    with tf.device('/device:GPU:0') as dev, tf.Session(graph=tf.get_default_graph(),
                                                                   config=tf.ConfigProto(allow_soft_placement=True)) as sess1:

                        sess = sess1
                        max_bytes_in_use = MaxBytesInUse()

                        X, outputs = load_model()
                        ytrue, ypred, optim, loss = add_binary_finetune(X, outputs)

                        train_vars = tf.trainable_variables()
                        train_cond = lambda v: v.name.split('/')[0] == 'binary_finetune' or (v.name[:7] == 'model/h' and int(v.name[7] not in frozen_blocks))    
                        train_vars = [v for v in train_vars if train_cond(v)]
                        print([v.name for v in train_vars])
                        minimize = optim.minimize(loss, var_list = train_vars)
                        
                        need_init = set(s.decode("utf-8") for s in sess.run(tf.report_uninitialized_variables()))
                        print(need_init)
                        init = tf.variables_initializer([v for v in tf.global_variables() if v.name.split(':')[0] in need_init])
                        sess.run(init)
                        
                        train, infer = get_train_infer(X, outputs, ytrue, ypred, minimize)

                        for epoch in range(epochs):
                            eval_batchsize = 500000
                            eval_batchsize = (eval_batchsize//batch_size) * batch_size
                            start = timer()
                            
                            for i in range(0, x_train.shape[0], eval_batchsize):
                                end = min(x_train.shape[0], i+eval_batchsize)
                                x_train_b = x_train[i:end]
                                y_train_b = y_train[i:end]
                                weights_train_b = weights_train[i:end]
                                train(x_train_b, y_train_b, weights_train_b)

                        pred_val = infer(x_val)
                        accuracy = np.count_nonzero(np.sign(y_val - .5) == 
                                                    np.sign(pred_val - .5)) / y_val.shape[0]

                        gb_used = sess.run(max_bytes_in_use) / 1e9
                        print(f'GPU GB used: {gb_used:.2f}')
                        print(f'elapsed minutes: {(timer() - start)//60}')

          
                        print('Fold Complete')
                        print(f'Score: {accuracy}')
                        printout(f'Score: {accuracy}')
                        acc.value = accuracy
                        cuda.select_device(0)
                        cuda.close()

                acc = mp.Value('f',0.0)
                p = mp.Process(target=run_fold, args=(acc,))

                p.start()
                p.join()
                if 0 and acc.value < .3: # too bad, don't do next
                    return {'loss': -acc.value, 'status': STATUS_OK}
                scores.append(acc.value)

            avg_score = sum(scores) / len(scores)
            print(f'Average score with {hparams_string}: {avg_score}\n\n')
            %cp out backupout
            return {'loss': -(avg_score), 'status': STATUS_OK}



        def random_search():
            best_acc = None
            frozen_blocks = set(list(range(3)))
            start_lr = .0002
            end_lr = .0006
            for i in range(30):
                num_frozen = random.randint(1,3)
                frozen_blocks = frozen_blocks.union(random.sample(range(3, 8), 1))
                lr = np.exp(random.uniform(np.log(start_lr), np.log(end_lr))).astype(np.float32)
                space = {'lr': .0004, 'frozen_blocks':list(range(frozen_layer)) + []}
                results = objective(space)

                if best_acc is None or -results['loss'] > best_acc:
                    best_acc = -results['loss']
            print(f'BEST score: {best_acc}')
        
        
        random_search()
        #space = {'lr':.0001, 'frozen_blocks':list(range(8))}
        #results = objective(space)
    

# Placeholder entry point: nothing runs here, so val() is defined above but not invoked
# from this cell.
if __name__=='__main__':
    pass
    
   