<a href="https://colab.research.google.com/github/thinkingmachines/nlp-tutorials/blob/master/nlp_classification_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT for text classification

## Fine-tuning


In [0]:
# Install the third-party packages this notebook needs on a fresh Colab runtime.
# NOTE(review): neither package is version-pinned; the logs captured further down are
# from 2019-07-17, so a current install may behave differently - consider pinning.
!pip install tensorboardX
!pip install pytorch-transformers



In [0]:
# Forked pytorch-transformers to make a slight edit to one of the DataProcessor classes
!git clone https://github.com/thinkingmachines/pytorch-transformers.git

fatal: destination path 'pytorch-transformers' already exists and is not an empty directory.


In [0]:
# Authenticate to GCS.
# NOTE(review): presumably this is what lets the `build('storage', 'v1')` call in
# dl_gcs (defined below) run without explicit credentials - confirm.
# `google.colab` is imported here, so this cell is expected to run inside Colab.
from google.colab import auth
auth.authenticate_user()

In [0]:
def dl_gcs(object_fp, dest_fp, bucket_name='nlp-experiment-datasets', project_id='bert-experiments'):
    """Download one object from a Google Cloud Storage bucket to a local file.

    Args:
        object_fp: Name (path) of the object inside the bucket.
        dest_fp: Local path to write to. It is opened in 'wb' mode, so an
            existing file at that path is overwritten.
        bucket_name: Bucket to read the object from.
        project_id: Not used by this function; kept in the signature so
            existing callers that pass it keep working.

    Returns:
        None. Prints 'Download complete' once every chunk has been written.
    """
    # Imported inside the function, as in the original cell. `googleapiclient.http`
    # is used instead of `apiclient.http`, which is only a legacy alias of it.
    from googleapiclient.discovery import build
    from googleapiclient.http import MediaIoBaseDownload

    # Build the client and request before touching the destination, so a failure
    # here does not leave an empty/truncated file behind.
    gcs_service = build('storage', 'v1')
    request = gcs_service.objects().get_media(bucket=bucket_name,
                                              object=str(object_fp))

    with open(dest_fp, 'wb') as f:
        media = MediaIoBaseDownload(f, request)

        done = False
        while not done:
            # _ is a placeholder for a progress object that we ignore.
            # (Our file is small, so we skip reporting progress.)
            _, done = media.next_chunk()
    print('Download complete')

In [0]:
mkdir cola

In [0]:
# Fetch the 10k-row training sample into the data directory.
dl_gcs(object_fp='bert_toxic_class/train_10k.csv', dest_fp='cola/train.csv')

Download complete


In [0]:
# Fetch the 10k-row test sample into the data directory.
dl_gcs(object_fp='bert_toxic_class/test_10k.csv', dest_fp='cola/test.csv')

Download complete


In [0]:
# Fetch the 1k-row dev sample into the data directory.
dl_gcs(object_fp='bert_toxic_class/dev_1k.csv', dest_fp='cola/dev.csv')

Download complete


In [0]:
# Sanity check: list the working directory to confirm that cola/ and the
# pytorch-transformers clone are present.
ls

adc.json  [0m[01;34mcola[0m/  [01;34mpytorch-transformers[0m/  [01;34msample_data[0m/


In [0]:
import pandas as pd
import numpy as np

In [0]:
# Load the sampled train/test/dev CSVs downloaded into cola/.
# (The files are read in full; no chunksize is set.)
train = pd.read_csv('cola/train.csv', engine='python')
test = pd.read_csv('cola/test.csv', engine='python')
dev = pd.read_csv('cola/dev.csv', engine='python')
print(train.shape)


def _clean_text(text):
    """Strip leading/trailing whitespace from a Series of comments and turn newlines and tabs into spaces."""
    return (
        text.replace({r'\s+$': '', r'^\s+': ''}, regex=True)
        .replace(r'\n', ' ', regex=True)
        .replace(r'\t', ' ', regex=True)
    )


def _to_cola_format(df):
    """Return a new 4-column frame laid out as (dummy_1, target, dummy_2, comment_text).

    The target ends up on column index 1 and the text on column index 3.
    `target` is binarised: 1 where the original value is >= 0.5, otherwise 0.
    """
    # .copy() gives an independent frame, so assigning `target` below does not
    # raise SettingWithCopyWarning the way assigning on a column slice does.
    cola = df.assign(dummy_1='meh', dummy_2='*')[
        ['dummy_1', 'target', 'dummy_2', 'comment_text']
    ].copy()
    cola['target'] = np.where(cola['target'] >= 0.5, 1, 0)
    return cola


# Remove new lines etc. from the comment text of every split.
train['comment_text'] = _clean_text(train['comment_text'])
test['comment_text'] = _clean_text(test['comment_text'])
dev['comment_text'] = _clean_text(dev['comment_text'])

# Force train and dev into CoLA format; test is fine as it is.
train = _to_cola_format(train)
dev = _to_cola_format(dev)

# Export as tab-separated files. train/dev are written without a header row,
# test keeps its header.
print(train.shape)
train.to_csv('cola/train.tsv', sep='\t', index=False, header=False)
test.to_csv('cola/test.tsv', sep='\t', index=False, header=True)
dev.to_csv('cola/dev.tsv', sep='\t', index=False, header=False)

(10000, 45)
(10000, 4)


In [0]:
# Batch size originally at 8
# take out --overwrite_output_dir if you don't want to oerwrite
# add --do_eval to evaluate after training
# We can set model_type to either bert or xlnet
!python pytorch-transformers/examples/run_glue.py \
    --model_type bert \
    --model_name_or_path bert-base-uncased \
    --task_name CoLA \
    --do_train \
    --model_type bert \
    --do_lower_case \
    --data_dir cola \
    --max_seq_length 128 \
    --per_gpu_eval_batch_size=32   \
    --per_gpu_train_batch_size=32   \
    --learning_rate 2e-5 \
    --num_train_epochs 1.0 \
    --output_dir /tmp/CoLA/ \
    --overwrite_output_dir

07/17/2019 16:26:11 - INFO - pytorch_transformers.modeling_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /root/.cache/torch/pytorch_transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
07/17/2019 16:26:11 - INFO - pytorch_transformers.modeling_utils -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": "cola",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

07/17/2019 16:26:12 - INFO - pytorch_transformers.tokenization_utils -   loading

## Evaluation

In [0]:
# Evaluation-only run: same script and data directory as the training cell, with
# --do_eval instead of --do_train, pointed at the same --output_dir (/tmp/CoLA/).
# Batch size originally at 8
# Reported out-of-sample score is 44%, which is not so good, but this is expected given
# we only used 10k training examples out of 2M.
# NOTE(review): the original note calls this "accuracy"; the CoLA task may report
# Matthews correlation instead - confirm which metric the 44% refers to.
!python pytorch-transformers/examples/run_glue.py \
    --model_type bert \
    --model_name_or_path bert-base-uncased \
    --task_name CoLA \
    --do_eval \
    --do_lower_case \
    --data_dir cola \
    --max_seq_length 128 \
    --per_gpu_eval_batch_size=32   \
    --per_gpu_train_batch_size=32   \
    --learning_rate 2e-5 \
    --num_train_epochs 1.0 \
    --output_dir /tmp/CoLA/

07/17/2019 16:18:44 - INFO - pytorch_transformers.modeling_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /root/.cache/torch/pytorch_transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
07/17/2019 16:18:44 - INFO - pytorch_transformers.modeling_utils -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": "cola",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

07/17/2019 16:18:44 - INFO - pytorch_transformers.tokenization_utils -   loading