In [78]:
from transformers import BertTokenizer
from pathlib import Path
import torch

from box import Box
import pandas as pd
import collections
import os
from tqdm import tqdm, trange
import sys
import random
import numpy as np
#import apex
from sklearn.model_selection import train_test_split
import os
import datetime

from fast_bert.modeling import BertForMultiLabelSequenceClassification
from fast_bert.data_cls import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, convert_examples_to_features
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy_multilabel, accuracy_thresh, fbeta, roc_auc


In [79]:
torch.cuda.empty_cache()

In [80]:
pd.set_option('display.max_colwidth', -1)
run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')

In [82]:
#DATA_PATH = Path('../sample_data/multi_label_toxic_comments/data/')
#LABEL_PATH = Path('../sample_data/multi_label_toxic_comments/label/')

DATA_PATH = Path('.')
LABEL_PATH = Path('.')

#AUG_DATA_PATH = Path('../data/data_augmentation/')

MODEL_PATH=Path('../models/')
LOG_PATH=Path('../logs/')
MODEL_PATH.mkdir(exist_ok=True)

model_state_dict = None

# BERT_PRETRAINED_PATH = Path('../../bert_models/pretrained-weights/cased_L-12_H-768_A-12/')
BERT_PRETRAINED_PATH = None
#BERT_PRETRAINED_PATH = Path('../../bert_models/pretrained-weights/uncased_L-12_H-768_A-12/')
# BERT_PRETRAINED_PATH = Path('../../bert_fastai/pretrained-weights/uncased_L-24_H-1024_A-16/')
# FINETUNED_PATH = Path('../models/finetuned_model.bin')
FINETUNED_PATH = None
# model_state_dict = torch.load(FINETUNED_PATH)

LOG_PATH.mkdir(exist_ok=True)

OUTPUT_PATH = MODEL_PATH/'output'
OUTPUT_PATH.mkdir(exist_ok=True)

In [83]:
args = Box({
    "run_text": "multilabel toxic comments with freezable layers",
    "train_size": -1,
    "val_size": -1,
    "log_path": LOG_PATH,
    "full_data_dir": DATA_PATH,
    "data_dir": DATA_PATH,
    "task_name": "toxic_classification_lib",
    "no_cuda": False,
    "bert_model": BERT_PRETRAINED_PATH,
    "output_dir": OUTPUT_PATH,
    "max_seq_length": 512,
    "do_train": True,
    "do_eval": True,
    "do_lower_case": True,
    "train_batch_size": 32,
    "eval_batch_size": 16,
    "learning_rate": 5e-5,
    "num_train_epochs": 1,
    "warmup_proportion": 0.0,
    "no_cuda": True,
    "local_rank": -1,
    "seed": 42,
    "gradient_accumulation_steps": 1,
    "optimize_on_cpu": True,
    "fp16": False,
    "fp16_opt_level": "O1",
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "max_grad_norm": 1.0,
    "max_steps": -1,
    "warmup_steps": 500,
    "logging_steps": 50,
    "eval_all_checkpoints": True,
    "overwrite_output_dir": True,
    "overwrite_cache": False,
    "seed": 42,
    "loss_scale": 128,
    "task_name": 'intent',
    "model_name": 'xlnet-base-cased',
    "model_type": 'xlnet'
})

In [84]:
import logging

logfile = str(LOG_PATH/'log-{}-{}.txt'.format(run_start_time, args["run_text"]))

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    handlers=[
        logging.FileHandler(logfile),
        logging.StreamHandler(sys.stdout)
    ])

logger = logging.getLogger()

In [85]:
logger.info(args)

I1211 00:38:02.011733 4420552128 <ipython-input-85-a9af49a5ea87>:1] {'run_text': 'multilabel toxic comments with freezable layers', 'train_size': -1, 'val_size': -1, 'log_path': PosixPath('../logs'), 'full_data_dir': PosixPath('.'), 'data_dir': PosixPath('.'), 'task_name': 'intent', 'no_cuda': True, 'bert_model': None, 'output_dir': PosixPath('../models/output'), 'max_seq_length': 512, 'do_train': True, 'do_eval': True, 'do_lower_case': True, 'train_batch_size': 32, 'eval_batch_size': 16, 'learning_rate': 5e-05, 'num_train_epochs': 1, 'warmup_proportion': 0.0, 'local_rank': -1, 'seed': 42, 'gradient_accumulation_steps': 1, 'optimize_on_cpu': True, 'fp16': False, 'fp16_opt_level': 'O1', 'weight_decay': 0.0, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'max_steps': -1, 'warmup_steps': 500, 'logging_steps': 50, 'eval_all_checkpoints': True, 'overwrite_output_dir': True, 'overwrite_cache': False, 'loss_scale': 128, 'model_name': 'xlnet-base-cased', 'model_type': 'xlnet'}


In [86]:
# tokenizer = BertTokenizer.from_pretrained(BERT_PRETRAINED_PATH, do_lower_case=args['do_lower_case'])

In [87]:
#device = torch.device('cuda')
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
if torch.cuda.device_count() > 1:
    args.multi_gpu = True
else:
    args.multi_gpu = False

In [91]:
#label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
label_cols = ["functionality", "range_anxiety", "availability", "cost", "ui", "location", "service_time", "dealership"]

In [90]:
databunch = BertDataBunch(args['data_dir'], LABEL_PATH, args.model_name, train_file='train_sample.csv', val_file='val_sample.csv',
                          test_data='test.csv',
                          text_col="comment_text", label_col=label_cols,
                          batch_size_per_gpu=args['train_batch_size'], max_seq_length=args['max_seq_length'], 
                          multi_gpu=args.multi_gpu, multi_label=True, model_type=args.model_type)

I1211 00:40:30.686882 4420552128 tokenization_utils.py:379] loading file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model from cache at /Users/sameerdharur/.cache/torch/transformers/dad589d582573df0293448af5109cb6981ca77239ed314e15ca63b7b8a318ddd.8b10bd978b5d01c21303cc761fc9ecd464419b3bf921864a355ba807cfbfafa8


FileNotFoundError: [Errno 2] File b'./train_sample.csv' does not exist: b'./train_sample.csv'

In [93]:
databunch = BertDataBunch(args['data_dir'], LABEL_PATH, args.model_name, train_file='train.csv', val_file='valid.csv',
                          test_data='test.csv',
                          text_col="review", label_col=label_cols,
                          batch_size_per_gpu=args['train_batch_size'], max_seq_length=args['max_seq_length'], 
                          multi_gpu=args.multi_gpu, multi_label=True, model_type=args.model_type)

I1211 00:41:17.047151 4420552128 tokenization_utils.py:379] loading file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model from cache at /Users/sameerdharur/.cache/torch/transformers/dad589d582573df0293448af5109cb6981ca77239ed314e15ca63b7b8a318ddd.8b10bd978b5d01c21303cc761fc9ecd464419b3bf921864a355ba807cfbfafa8
I1211 00:41:17.113462 4420552128 data_cls.py:450] Loading features from cached file cache/cached_xlnet_train_multi_label_512_train.csv
I1211 00:41:21.323727 4420552128 data_cls.py:109] Writing example 0 of 851
I1211 00:41:21.543081 4420552128 data_cls.py:474] Saving features into cached file cache/cached_xlnet_dev_multi_label_512_valid.csv
I1211 00:41:22.064291 4420552128 data_cls.py:109] Writing example 0 of 8
I1211 00:41:22.065834 4420552128 data_cls.py:474] Saving features into cached file cache/cached_xlnet_test_multi_label_512_test


In [94]:
databunch.train_dl.dataset[0][3]

tensor([0., 0., 0., 0., 0., 1., 0., 0.])

In [64]:
# train_df.head(20)
# databunch = BertDataBunch.load(args['data_dir'])

In [95]:
num_labels = len(databunch.labels)
num_labels

8

In [66]:
# databunch.train_dl.dataset[10]

In [67]:
# torch.distributed.init_process_group(backend="nccl", 
#                                      init_method = "tcp://localhost:23459", 
#                                      rank=0, world_size=1)

In [96]:
metrics = []
metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh})
metrics.append({'name': 'roc_auc', 'function': roc_auc})
metrics.append({'name': 'fbeta', 'function': fbeta})

In [97]:
print(device)

cpu


In [98]:
learner = BertLearner.from_pretrained_model(databunch, args.model_name, metrics=metrics, 
                                            device=device, logger=logger, output_dir=args.output_dir, 
                                            finetuned_wgts_path=FINETUNED_PATH, warmup_steps=args.warmup_steps,
                                            multi_gpu=args.multi_gpu, is_fp16=args.fp16, 
                                            multi_label=True, logging_steps=0)



I1211 00:41:41.266175 4420552128 configuration_utils.py:157] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json from cache at /Users/sameerdharur/.cache/torch/transformers/c9cc6e53904f7f3679a31ec4af244f4419e25ebc8e71ebf8c558a31cbcf07fc8.ef1824921bc0786e97dc88d55eb17aabf18aac90f24bd34c0650529e7ba27d6f
I1211 00:41:41.271522 4420552128 configuration_utils.py:174] Model config {
  "attn_type": "bi",
  "bi_data": false,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_top": 5,
  "ff_activation": "gelu",
  "finetuning_task": null,
  "initializer_range": 0.02,
  "is_decoder": false,
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "n_head": 12,
  "n_layer": 12,
  "n_token": 32000,
  "num_labels": 8,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activati

xlnet-base-cased
<class 'str'>


I1211 00:41:44.308390 4420552128 modeling_utils.py:465] Weights of XLNetForMultiLabelSequenceClassification not initialized from pretrained model: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
I1211 00:41:44.309154 4420552128 modeling_utils.py:468] Weights from pretrained model not used in XLNetForMultiLabelSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']


In [99]:
learner.fit(args.num_train_epochs, args.learning_rate, validate=True)

I1211 00:41:49.226248 4420552128 learner_cls.py:336] ***** Running training *****
I1211 00:41:49.227175 4420552128 learner_cls.py:337]   Num examples = 6811
I1211 00:41:49.227977 4420552128 learner_cls.py:338]   Num Epochs = 1
I1211 00:41:49.229157 4420552128 learner_cls.py:341]   Total train batch size (w. parallel, distributed & accumulation) = 32
I1211 00:41:49.230348 4420552128 learner_cls.py:344]   Gradient Accumulation steps = 1
I1211 00:41:49.233310 4420552128 learner_cls.py:346]   Total optimization steps = 213


../models/output/tensorboard


KeyboardInterrupt: 

In [None]:
learner.validate()

In [None]:
learner.save_model()

In [None]:
learner.predict_batch(list(pd.read_csv('../data/test.csv')['comment_text'].values))

08/01/2019 21:21:10 - INFO - root -   Loading features from cached file ../data/cache/cached_train_multi_label_512


153164