Train the selector model on Google Colab

This notebook makes the same assumptions about your environment as `generator.ipynb`.

In [None]:
# Print the status of the GPU attached to this runtime.
!nvidia-smi

In [None]:
# transformers is pinned to 2.10.0; simpletransformers and scikit-learn are upgraded
# to whatever their latest releases are at install time.
# NOTE(review): the unpinned `--upgrade simpletransformers` may replace the pinned
# transformers version with a different one — confirm the pair is compatible.
!pip install transformers==2.10.0

!pip install --upgrade simpletransformers


!pip install --upgrade scikit-learn

In [None]:
# Colab line magic requesting the TensorFlow 1.x runtime.
%tensorflow_version 1.x

In [None]:
import numpy as np, pandas as pd
import pickle

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Mount Google Drive inside the Colab VM, then make the project folder stored there
# the working directory (all relative paths below are resolved against it).
from google.colab import drive
drive.mount('/content/drive')

%cd "/content/drive/My Drive/gpt-2"

In [None]:
from __future__ import absolute_import, division, print_function

import os
import math
import json
import random
import warnings

from multiprocessing import cpu_count

import torch
import numpy as np

from scipy.stats import pearsonr, mode
from sklearn.metrics import mean_squared_error, matthews_corrcoef, confusion_matrix, label_ranking_average_precision_score
from tensorboardX import SummaryWriter
from tqdm.auto import trange, tqdm

from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import (
    DataLoader,
    RandomSampler,
    SequentialSampler,
    TensorDataset
)

from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule
from transformers import (
    WEIGHTS_NAME,
    BertConfig, BertTokenizer,
    XLNetConfig, XLNetTokenizer,
    XLMConfig, XLMTokenizer,
    RobertaConfig, RobertaTokenizer,
    DistilBertConfig, DistilBertTokenizer,
    AlbertConfig, AlbertTokenizer,
    CamembertConfig, CamembertTokenizer
)

from simpletransformers.classification.classification_utils import (
    InputExample,
    convert_examples_to_features
)

from simpletransformers.classification.transformer_models.bert_model import BertForSequenceClassification
from simpletransformers.classification.transformer_models.roberta_model import RobertaForSequenceClassification
from simpletransformers.classification.transformer_models.xlm_model import XLMForSequenceClassification
from simpletransformers.classification.transformer_models.xlnet_model import XLNetForSequenceClassification
from simpletransformers.classification.transformer_models.distilbert_model import DistilBertForSequenceClassification
from simpletransformers.classification.transformer_models.albert_model import AlbertForSequenceClassification
from simpletransformers.classification.transformer_models.camembert_model import CamembertForSequenceClassification

from simpletransformers.classification import ClassificationModel, MultiLabelClassificationModel

from sklearn.model_selection import train_test_split

# Baseline configuration (roberta-base, single output label).
# Layout: "model_type"/"model_name" pick the checkpoint, "args" is the training-args
# dict and "kwargs" are extra constructor keyword arguments (see `make_bert`).
BERT_CONFIG_DEFAULT = dict(
    model_type="roberta",
    model_name="roberta-base",
    args=dict(
        reprocess_input_data=True,
        fp16=False,
        train_batch_size=16,
        eval_batch_size=16,
        gradient_accumulation_steps=1,
        learning_rate=1e-5,
        max_seq_length=256,
        sliding_window=False,
        num_train_epochs=7,
        warmup_steps=0,
        warmup_ratio=0.1,
        weight_decay=0.1,  # alternative value noted by the author: 0.05
        logging_steps=0,
        max_grad_norm=10000.0,
        adam_epsilon=1e-6,
        silent=False,
        overwrite_output_dir=True,
    ),
    kwargs=dict(use_cuda=True, num_labels=1),
)

In [None]:
# Experimental configuration (roberta-large): batches of 8 with 2 gradient-accumulation
# steps, 4 epochs, and evaluation / early stopping / checkpoint saving all switched off.
BERT_CONFIG_EXP = dict(
    model_type="roberta",
    model_name="roberta-large",
    args=dict(
        reprocess_input_data=True,
        fp16=False,
        train_batch_size=8,
        eval_batch_size=8,
        gradient_accumulation_steps=2,
        learning_rate=1e-5,
        max_seq_length=256,
        sliding_window=False,
        num_train_epochs=4,
        warmup_steps=0,
        warmup_ratio=0.06,
        weight_decay=0.025,
        logging_steps=0,  # alternative value noted by the author: 51
        max_grad_norm=10000.0,
        adam_epsilon=1e-6,
        silent=False,
        overwrite_output_dir=True,
        evaluate_during_training=False,
        use_early_stopping=False,
        save_model_every_epoch=False,
        save_optimizer_and_scheduler=False,
        save_steps=0,
    ),
    kwargs=dict(use_cuda=True, num_labels=1),
)

In [None]:
from copy import deepcopy
from datetime import datetime

def make_bert(output_dir: str, bert_config: dict=BERT_CONFIG_DEFAULT, add_timestamp=False, regression=True, overrides: dict=None):
    """Build a simpletransformers model from a config dict.

    Parameters
    ----------
    output_dir : str
        Directory the model writes to; "<output_dir>/best_model" is used as
        `best_model_dir`.
    bert_config : dict
        Dict with keys "model_type", "model_name", "args" and "kwargs"
        (same layout as `BERT_CONFIG_DEFAULT`). It is deep-copied, never mutated.
    add_timestamp : bool
        When true, "_HH-MM-SS" (current time) is appended to `output_dir`.
    regression : bool
        When true a single-output regression model is built.
    overrides : dict, optional
        Extra entries written into the "args" dict, replacing existing keys
        (including "output_dir" / "best_model_dir" if present).

    Returns
    -------
    The constructed model: `MultiLabelClassificationModel` when not in
    regression mode and args["multi_label"] is truthy, else `ClassificationModel`.
    """
    # Work on a private copy so neither the caller's dict nor the module-level
    # default config is ever modified.
    bert_config_ = deepcopy(bert_config)
    model_args = bert_config_["args"]

    output_dir_ = output_dir
    if add_timestamp:
        output_dir_ += "_" + datetime.now().strftime("%H-%M-%S")
    model_args["output_dir"] = output_dir_
    model_args["best_model_dir"] = output_dir_ + "/best_model"

    # Apply overrides before reading any flag, so that an overridden
    # "multi_label" is taken into account when choosing the model class.
    if overrides is not None:
        model_args.update(overrides)

    multi_label = bool(model_args.get("multi_label"))
    bert_config_["kwargs"]["num_labels"] = 1 if (regression or multi_label) else 2

    if regression:
        # The original code referenced an undefined name (`RegressionMode`), which
        # raised NameError. simpletransformers performs regression through
        # ClassificationModel with num_labels=1 and the `regression` arg enabled.
        model_args["regression"] = True
        constructor = ClassificationModel
    elif multi_label:
        constructor = MultiLabelClassificationModel
        print('using MultiLabelClassificationModel')
    else:
        constructor = ClassificationModel

    return constructor(model_type=bert_config_["model_type"],
                       model_name=bert_config_["model_name"],
                       args=model_args,
                       **bert_config_["kwargs"])

In [None]:
# Load the pickled reward data and keep only its "ids_to_reward_data" entry.
# NOTE(review): the file name ends in ".gz" but it is read with plain open(), not
# gzip — confirm the file is actually stored uncompressed.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
data_path = "reward/reward.pkl.gz"
with open(data_path, "rb") as f:
    ids_to_reward_data = pickle.load(f)["ids_to_reward_data"]

In [None]:
import re
def inverse_format_post_for_api(post):
    """Turn `<p>`/`<br>` markup in `post` back into newline-separated text.

    One leading "<p>" and one trailing "</p>" are removed when present, then
    every "</p><p>" and every "<br>" becomes a single newline.
    """
    opening_tag, closing_tag = "<p>", "</p>"

    # Slice the tags off instead of using lstrip/rstrip: those strip *sets of
    # characters*, not whole tags, and would eat into the text itself.
    text = post[len(opening_tag):] if post.startswith(opening_tag) else post
    if text.endswith(closing_tag):
        text = text[:-len(closing_tag)]

    for markup in ("</p><p>", "<br>"):
        text = text.replace(markup, "\n")
    return text

def make_train_data(ids_to_reward_data, continuation_only=True):
    """Build a DataFrame with columns "id", "text" and "note_count".

    `ids_to_reward_data` maps an id to a dict holding "continuation",
    "note_count" and (only read when `continuation_only` is false) "prompt".
    With `continuation_only` the text is the continuation alone; otherwise the
    last 64 space-separated pieces of the prompt are prepended to it. The text
    column is finally passed through `inverse_format_post_for_api`.
    """
    def _text_for(entry):
        if continuation_only:
            return entry["continuation"]
        prompt_tail = " ".join(entry["prompt"].split(" ")[-64:])
        return prompt_tail + entry["continuation"]

    rows = [
        [post_id, _text_for(entry), entry["note_count"]]
        for post_id, entry in ids_to_reward_data.items()
    ]

    frame = pd.DataFrame(rows, columns=["id", "text", "note_count"])
    frame.text = frame.text.apply(inverse_format_post_for_api)
    return frame

In [None]:
# Build the training frame from continuations only, then summarise the raw note counts.
train_data = make_train_data(ids_to_reward_data, continuation_only=True)
train_data.note_count.describe()

In [None]:
# Sort rows by id and renumber them 0..n-1; reset_index() without drop=True also keeps
# the previous index as an "index" column.
# NOTE(review): the variable name implies ids increase over time — confirm.
temporally_ordered_train_data = train_data.sort_values(by="id").reset_index()

In [None]:
def non_overlapping_ma(array, width=31):
  """Average `array` over consecutive, non-overlapping chunks of `width` items.

  The last chunk is shorter when len(array) is not a multiple of `width`.
  Returns a pandas Series holding one mean per chunk.
  """
  chunk_means = []
  for start in range(0, len(array), width):
    chunk = array[start:start + width]
    chunk_means.append(np.average(chunk))
  return pd.Series(chunk_means)

# Compare every post's note count with the note counts of its neighbours in id order,
# producing a per-row quantile and a per-row mean difference ("advantage").
window_width = 140
window_halfw = window_width//2

# The last `skip_n_most_recent` rows never get a score.
skip_n_most_recent = 40

# When False, only rows whose whole window fits inside the allowed range are scored.
allow_partial_windows = False
# Fraction of the window placed before the scored row; None centres the window.
window_frac_left = 0.8 # None

rolling_quantiles = {}
rolling_advantages = {}

# Offsets of the window edges relative to the scored row (left offset is negative).
if window_frac_left is not None:
  window_shift_left = -1*int(window_frac_left*window_width)
  window_shift_right = window_width + window_shift_left
else:
  window_shift_left = -window_halfw
  window_shift_right = window_halfw

last_ix_allowed = len(temporally_ordered_train_data) - skip_n_most_recent

if allow_partial_windows:
  ixs = temporally_ordered_train_data.index[:last_ix_allowed]
else:
  ixs = temporally_ordered_train_data.index[(0-window_shift_left):(last_ix_allowed-window_shift_right)]

print(f"using ({ixs.min()} to {ixs.max()}) of (0 to {len(temporally_ordered_train_data)-1})")

for ix in ixs:
  point = temporally_ordered_train_data.loc[ix, 'note_count']
  # .loc slicing is label-based and inclusive at both ends, so the window also
  # contains the scored row itself.
  window = temporally_ordered_train_data.loc[ix+window_shift_left:ix+window_shift_right, 'note_count']
  # Share of window values that the scored row matches or exceeds.
  rolling_quantiles[ix] = (point>=window).mean()
  # Mean difference between the scored row and the window values.
  rolling_advantages[ix] = (point-window).mean()

rolling_quantiles = pd.Series(rolling_quantiles)
rolling_advantages = pd.Series(rolling_advantages)

# Plot the quantiles averaged over consecutive blocks of 21 rows.
non_overlapping_ma(rolling_quantiles, width=21).plot(lw=1, ls='--', marker='.', markersize=5, figsize=(10, 6));

In [None]:
# Turn the note statistics into a binary "target" column and select the rows to train on.
use_mov_avg = True
notes_key = "rolling_quantile" if use_mov_avg else "note_count"

# Always work on an explicit copy: the columns added below must not be written into
# `temporally_ordered_train_data` (or a slice of it, which triggers pandas'
# SettingWithCopyWarning), and re-running this cell must give the same result.
if use_mov_avg:
  train_data_ = temporally_ordered_train_data.loc[rolling_quantiles.index].copy()
  train_data_["rolling_quantile"] = rolling_quantiles
  train_data_["rolling_advantage"] = rolling_advantages
else:
  train_data_ = temporally_ordered_train_data.copy()

regression = False
drop_midrange = True
smaller_midrange_dropped = False

reg_log = False
reg_cutoff = 30

continuation_only = True

if drop_midrange and not use_mov_avg:
  # Raw note counts: positives have >= 4 notes, negatives <= 1; everything between is dropped.
  train_data_["target"] = (train_data_[notes_key]>=4).astype(int)
  train_data_ = train_data_[(train_data_[notes_key] <= 1) | (train_data_[notes_key] >=4)]
  stratify = train_data_["target"]
elif drop_midrange and use_mov_avg:
  # Rolling quantiles: keep only the bottom and top tails, cut at fixed percentiles.
  if smaller_midrange_dropped:
    MIDRANGE_BOTTOM = np.percentile(train_data_[notes_key], 30)
    MIDRANGE_TOP = np.percentile(train_data_[notes_key], 70)
  else:
    MIDRANGE_BOTTOM = np.percentile(train_data_[notes_key], 24)
    MIDRANGE_TOP = np.percentile(train_data_[notes_key], 76)

  train_data_["target"] = (train_data_[notes_key] >= MIDRANGE_TOP).astype(int)
  train_data_ = train_data_[(train_data_[notes_key] <= MIDRANGE_BOTTOM) | (train_data_[notes_key] >= MIDRANGE_TOP)]
  stratify = train_data_["target"]
else:
  # split at middle
  # NOTE(review): the threshold of 2 looks meant for raw note counts; with
  # use_mov_avg=True `notes_key` is a quantile, so confirm this branch is intended then.
  train_data_["target"] = (train_data_[notes_key] > 2).astype(int)
  stratify = train_data_["target"]


model_inputs = train_data_[["text", "target"]]

In [None]:
# Summary statistics of the binary target (its mean is the share of positive rows).
model_inputs.target.describe()

In [None]:
def baserate_loss(target):
  """Log loss of always predicting the mean of a binary `target`.

  This equals the binary entropy (natural log) of the positive rate. When every
  value is 0 or every value is 1 the loss is 0.0; the plain formula would give
  NaN there because it evaluates 0 * log(0).
  """
  baserate = np.mean(target)

  # Degenerate targets: a constant predictor is perfect, so the loss is zero.
  if baserate == 0 or baserate == 1:
    return 0.0

  return -1 * (baserate*np.log(baserate) + (1-baserate)*np.log(1-baserate))

def baserate_loss_regression(target, ref=None,):
  """Mean squared error of predicting one constant for every value in `target`.

  The constant is the mean of `ref` when `ref` is given, otherwise the mean of
  `target` itself. `target` must support element-wise subtraction of a scalar.
  """
  baseline_source = target if ref is None else ref
  baserate = np.mean(baseline_source)

  residuals = target - baserate
  return np.mean(residuals**2)

In [None]:
def hack_to_avoid_masking(text, tokenizer, max_len=768):
  """Append literal pad-token strings to `text`, one per token short of `max_len`.

  The author's own verdict on this helper: this was a bad idea, don't use it.

  `tokenizer` must provide `tokenize(text)` and a `pad_token` attribute. The
  result is `text`, a space, then the space-joined pad tokens; when `text`
  already has `max_len` tokens or more, only the single space is appended.
  """
  n_missing = max_len - len(tokenizer.tokenize(text))
  filler = " ".join(tokenizer.pad_token for _ in range(n_missing))
  return text + " " + filler

In [None]:
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import cross_val_predict, cross_validate
from sklearn.compose import TransformedTargetRegressor

from scipy.special import softmax


class SimpleTransformerClassificationEstimator(BaseEstimator):
  """scikit-learn style wrapper around a simpletransformers classification model.

  Parameters
  ----------
  model_maker : callable
      Zero-argument callable returning a fresh, untrained model. Called once
      per `fit`.
  avoid_masking : bool
      When true, every text is passed through `hack_to_avoid_masking` before
      training and prediction.
  dev_size : float
      Share of the training data held out for evaluation; only used when the
      model's args enable `use_early_stopping`.
  extra_metrics : dict, optional
      Extra keyword arguments forwarded to `train_model`. Defaults to none.

  Attributes
  ----------
  model_ : the fitted model (None before `fit`).
  classes_ : [0, 1] after `fit` (None before).
  """

  # `extra_metrics` defaults to None rather than `{}` so that instances never share
  # one mutable default dict.
  def __init__(self, model_maker, avoid_masking=False, dev_size=0.2, extra_metrics=None):
    self.model_maker = model_maker
    self.avoid_masking = avoid_masking
    self.dev_size = dev_size
    self.extra_metrics = extra_metrics

    self.model_ = None
    self.classes_ = None

  def _prepare_texts(self, X_):
    """Apply the optional pad-token hack to the texts; otherwise return them unchanged."""
    if self.avoid_masking:
      return [hack_to_avoid_masking(t, self.model_.tokenizer,) for t in X_]
    return X_

  def fit(self, X, y):
    """Train a new model on texts `X` with binary labels `y`; returns `self`."""
    train_df = pd.DataFrame({"text": X, "target": y})[["text", "target"]]
    self.model_ = self.model_maker()
    if self.avoid_masking:
      train_df.text = train_df.text.apply(hack_to_avoid_masking, tokenizer=self.model_.tokenizer,)

    use_early_stopping = bool(self.model_.args.get('use_early_stopping'))
    multi_label = bool(self.model_.args.get("multi_label"))

    st_eval_df = None
    if use_early_stopping:
      print(f"using early stopping")
      train_df, st_eval_df = train_test_split(train_df, test_size=self.dev_size, stratify=train_df["target"])
      # Copy both halves so the column assignments below act on independent frames.
      train_df, st_eval_df = train_df.copy(), st_eval_df.copy()
      train_baserate_acc = max(train_df.target.mean(), 1.-train_df.target.mean())
      eval_baserate_acc = max(st_eval_df.target.mean(), 1.-st_eval_df.target.mean())
      print(f"using\n\ttrain_df: {train_df.shape}, baserate {train_baserate_acc:.3f}\n\tst_eval_df: {st_eval_df.shape}, baserate {eval_baserate_acc:.3f}")

    if multi_label:
      # Multi-label training takes a list of labels per row.
      train_df["target"] = [[t] for t in train_df["target"]]
      print(f"using multi_label for train_df")
      display(train_df.head())
      if st_eval_df is not None:
        st_eval_df["target"] = [[t] for t in st_eval_df["target"]]
        print(f"using multi_label for st_eval_df")
        display(st_eval_df.head())

    self.model_.train_model(train_df, eval_df=st_eval_df, **(self.extra_metrics or {}))
    if use_early_stopping:
      # load best model
      best_model_dir = self.model_.args.get('best_model_dir')
      model_type = self.model_.args["model_type"]
      # Reload with the same model class that was trained; the original always used
      # ClassificationModel, which is the wrong class for a multi-label model.
      model_cls = MultiLabelClassificationModel if multi_label else ClassificationModel
      kwargs = {"use_cuda": True, "num_labels": 1 if multi_label else 2}

      # Release the trained model before loading the checkpoint to limit GPU memory use.
      del self.model_
      torch.cuda.empty_cache()
      self.model_ = model_cls(model_type, best_model_dir, **kwargs)
      if multi_label:
        # Keep the flag on the reloaded model: `predict_proba` reads it from args.
        self.model_.args["multi_label"] = True

    self.classes_ = [0, 1]
    return self

  def predict(self, X_):
    """Return the model's predicted labels for texts `X_`."""
    preds, logits = self.model_.predict(self._prepare_texts(X_))
    return preds

  def predict_proba(self, X_):
    """Return per-class scores for texts `X_`.

    Single-label models: softmax over the logits (one column per class).
    Multi-label models: the model's raw outputs are returned unchanged.
    """
    preds, logits = self.model_.predict(self._prepare_texts(X_))
    # `.get` so that a model whose args lack the key behaves like sliding_window=False.
    if self.model_.args.get('sliding_window'):
      # One row of logits per window: average them into one row per text.
      logits = [np.mean(l, axis=0) for l in logits]
    if self.model_.args.get("multi_label"):
      proba = logits
    else:
      proba = softmax(logits, axis=1)
    return proba

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score, mean_gamma_deviance, make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve, log_loss, matthews_corrcoef
from sklearn.metrics import average_precision_score, brier_score_loss, hinge_loss

from scipy.stats import spearmanr

def spearman_score(y_true, y_pred):
  """Return the Spearman rank correlation between `y_true` and `y_pred` (p-value discarded)."""
  correlation, _p_value = spearmanr(y_true, y_pred)
  return correlation

# scikit-learn scorer names passed as `scoring` to cross_validate in the cross-validation cell.
scoring = ["neg_brier_score", "average_precision", "accuracy"]

In [None]:
# Reference scores of a predictor that always outputs the positive rate of the target.
target_baserate = model_inputs.target.mean()
constant_predictions = np.full(len(model_inputs.target), target_baserate)

# Negated so that it is comparable with the "neg_brier_score" scorer.
baserate_brier = -1 * brier_score_loss(model_inputs.target, constant_predictions)
majority_class_acc = max(target_baserate, 1. - target_baserate)

print(f"baserate_loss (all): {baserate_loss(model_inputs.target):.3f}")
print(f"baserate_brier (all): {baserate_brier:.3f}")
print(f"baserate_acc (all): {majority_class_acc:.3f}")
print()

In [None]:
from sklearn.model_selection import TimeSeriesSplit, StratifiedKFold
# Time-ordered splitter with 4 splits; `[3:]` below keeps only the final split.
tss = TimeSeriesSplit(n_splits=4)

_custom_tss = list(tss.split(model_inputs.target.values))[3:]
# Build 4 (train, test) pairs that all share the final split's test indices: each pair's
# training part is the training side of one stratified fold of that split's train indices.
# NOTE(review): StratifiedKFold.split yields positions *within* `outer_train_index`, and
# they are used directly as row positions; this is only right if `outer_train_index`
# is 0..k-1 — presumably true for TimeSeriesSplit, confirm.
# NOTE(review): shuffle=True without random_state, so the folds differ between runs.
custom_tss = [(train_index, _custom_tss[-1][1]) 
              for outer_train_index, _ in _custom_tss
              for train_index, __ in StratifiedKFold(n_splits=4, shuffle=True).split(outer_train_index, model_inputs.target.values[outer_train_index])
              ]

# Report index range, size and positive rate of every train/test pair.
for train_index, test_index in custom_tss:
  print(f"TRAIN {train_index.min()} to {train_index.max()}: \t{model_inputs.target.values[train_index].mean():.2f}")
  print(f"TRAIN\tsize {len(train_index)}")
  print(f"TEST  {test_index.min()} to {test_index.max()}: \t{model_inputs.target.values[test_index].mean():.2f}")
  print(f"TEST\tsize {len(test_index)}")
  print()

print(f"ALL: \t\t\t{model_inputs.target.values.mean():.2f}")

In [None]:
import logging
# Show INFO-level log messages from here on.
logging.basicConfig(level=logging.INFO)

In [None]:
# if you want to cross-validate
# Optional cross-validation of the BERT_CONFIG_EXP classifier; skipped unless do_cv is True.
do_cv = False
# True: use the TimeSeriesSplit `tss`; False: plain 4-fold cross-validation (cv=4).
use_tss_with_cv = False
# True: score with every metric in `scoring`; False: accuracy only.
score_cv_multimetric=True

if do_cv:
  torch.cuda.empty_cache()
  
  # Each fit gets a fresh model whose output directory is "bert_cv_<HH-MM-SS>".
  wrapped = SimpleTransformerClassificationEstimator(model_maker=lambda overrides=None:make_bert("bert_cv", bert_config=BERT_CONFIG_EXP, add_timestamp=True, regression=False,
                                                                                                   overrides=overrides))

  cv_results = cross_validate(wrapped, 
                              X=model_inputs.text.values, 
                              y=model_inputs.target.values,
                              scoring=scoring if score_cv_multimetric else ["accuracy"], 
                              cv=tss if use_tss_with_cv else 4, 
                              return_train_score=True)
  
  print(cv_results)
  print('mean')
  display(pd.DataFrame(cv_results).mean().sort_index())
  print('std')
  display(pd.DataFrame(cv_results).std().sort_index())

  # Delete the per-fold output directories created above.
  !rm -r bert_cv*

In [None]:
# `cv_results` only exists when the cross-validation cell actually ran (do_cv=True);
# without this guard a fresh "Run All" stops here with a NameError.
if do_cv:
  display(pd.DataFrame(cv_results).std().sort_index())

Fit and save

In [None]:
# Directory name for the final model: used as the training output directory and as the
# save location in the last cell. Must be set to a non-empty name before running on.
final_fit_name = ""  # fill in
# Held-out share passed to the estimator as `dev_size` (only used with early stopping).
DEV_SIZE=0.2

In [None]:
# Free cached GPU memory and clear the contents of cache_dir/ before the final fit.
torch.cuda.empty_cache()
! rm -r cache_dir/*

# Train one classifier with BERT_CONFIG_EXP on all of `model_inputs`.
wrapped = SimpleTransformerClassificationEstimator(model_maker=lambda:make_bert(final_fit_name, bert_config=BERT_CONFIG_EXP, 
                                                                                add_timestamp=False, regression=False),
                                                    dev_size=DEV_SIZE)
  
wrapped.fit(model_inputs.text.values, model_inputs.target.values)


In [None]:
# for verifying the model works locally -- try on some examples you have saved in a pickle file
import pickle
from textwrap import wrap

# Load the saved example posts and score them with the fitted model.
# NOTE(review): the ".gz" file is read with plain open(), not gzip — confirm it is
# stored uncompressed. pickle.load can execute arbitrary code; only load trusted files.
with open("reward/textpost_examples.pkl.gz", "rb") as f:
  textpost_examples = pickle.load(f)
# Strip leading "翰" characters from every example.
# NOTE(review): presumably a control/marker character from the generator — confirm.
textpost_examples = [s.lstrip("翰") for s in textpost_examples]

# Keep the second column of the class scores (the score for class 1).
proba_tpe = wrapped.predict_proba(textpost_examples)[:, 1]


def show_note_probas(texts, probas):
  """Print each text with its predicted probability.

  Per (text, probability) pair: the probability as a percentage with one
  decimal, a divider, the text re-wrapped by `textwrap.wrap`, and a divider.
  """
  divider = "\n~_~_~_~_~_\n"
  for text, probability in zip(texts, probas):
    print(f"\tpredicted prob: {probability:.1%}\n")
    print(divider)
    wrapped_text = "\n".join(wrap(text))
    print(wrapped_text)
    print(divider)

# Print every example post together with its predicted probability.
show_note_probas(textpost_examples, proba_tpe)

In [None]:
# Make sure the target directory exists before writing into it. (The original author
# was unsure whether this is still needed.) makedirs with exist_ok=True also creates
# missing parent directories and does not fail when the directory is already there.
os.makedirs(final_fit_name, exist_ok=True)

# If the model object exposes `.module`, save that inner model instead.
# NOTE(review): presumably this unwraps a DataParallel-style wrapper — confirm.
model_to_save = wrapped.model_.model.module if hasattr(wrapped.model_.model, "module") else wrapped.model_.model
model_to_save.save_pretrained(final_fit_name)
wrapped.model_.tokenizer.save_pretrained(final_fit_name)
# Store the training args next to the weights.
torch.save(wrapped.model_.args, os.path.join(final_fit_name, "training_args.bin"))
