<a href="https://colab.research.google.com/github/areias/bert_covid_sentiment/blob/main/bert_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [2]:
from transformers import (
   AutoConfig,
   AutoTokenizer,
   TFAutoModelForSequenceClassification,
   AdamW,
   glue_convert_examples_to_features
)
import tensorflow as tf
import tensorflow_datasets as tfds
import json

In [2]:
import pandas as pd

In [3]:
# Read the labelled tweets from train.csv (expected in the working directory).
df=pd.read_csv("train.csv")

In [4]:
# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,id,text,label,created_at
0,564984221203431424,Disneyland is spreading measles 121 cases in 1...,1,2015-02-10 03:08:29+00:00
1,564984308096438272,"California measles outbreak: 123 cases, latest...",0,2015-02-10 03:08:50+00:00
2,564985199700611072,Why isn't the news talking about how measles w...,1,2015-02-10 03:12:23+00:00
3,564985637321703425,"No link to autism from vaccines, says expert: ...",1,2015-02-10 03:14:07+00:00
4,564986465822580736,We need a @VaccinatorX assembly at my kids' sc...,1,2015-02-10 03:17:25+00:00


In [5]:
# Count rows per label value to see how balanced the classes are.
df.label.value_counts()

 1    4037
 0    4029
-1     934
Name: label, dtype: int64

In [6]:
# Keep only the id, label and text columns, in that order.
df = df[["id", "label", "text"]]

In [7]:
import numpy as np

In [8]:
"""
    60% - train set,
    20% - dev/validation set,
    20% - test set"""

# Shuffle once with a fixed seed, then cut the shuffled frame at the 60% and
# 80% marks. Positional slicing replaces np.split(<DataFrame>, ...), which goes
# through the deprecated DataFrame.swapaxes and warns on recent pandas.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(.6 * len(df))
dev_end = int(.8 * len(df))

train = shuffled.iloc[:train_end]
dev = shuffled.iloc[train_end:dev_end]
test = shuffled.iloc[dev_end:]

In [9]:
# Inspect the dev split: row count, columns, dtypes and non-null counts.
dev.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1800 entries, 5743 to 7921
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      1800 non-null   int64 
 1   label   1800 non-null   int64 
 2   text    1800 non-null   object
dtypes: int64(2), object(1)
memory usage: 56.2+ KB


In [15]:
import os

# exist_ok=True keeps this cell re-runnable: without it a second run raises
# FileExistsError because the directory is already there.
os.makedirs("data/finetune/originals/crowdbreaks", exist_ok=True)

In [18]:
# Persist the three splits as tab-separated files without the index column.
for _split_frame, _split_path in (
    (train, "data/finetune/originals/crowdbreaks/train.tsv"),
    (dev, "data/finetune/originals/crowdbreaks/dev.tsv"),
    (test, "data/finetune/originals/crowdbreaks/test.tsv"),
):
    _split_frame.to_csv(_split_path, sep="\t", index=False)


In [19]:
# The preprocessing code in the following cells is adapted from
# https://github.com/digitalepidemiologylab/covid-twitter-bert/blob/master/preprocess/create_finetune_data.py

In [160]:
# Stand-in for the command-line arguments of create_finetune_data.py.
# The referenced script is invoked like this:
#
#   cd preprocess
#   python create_finetune_data.py \
#     --run_prefix test_run \
#     --finetune_datasets <dataset_name> \
#     --model_class bert_large_uncased_wwm \
#     --max_seq_length 96 \
#     --asciify_emojis \
#     --username_filler twitteruser \
#     --url_filler twitterurl \
#     --replace_multiple_usernames \
#     --replace_multiple_urls \
#     --remove_unicode_symbols
#
# In this notebook the same options are carried in a namedtuple so that the
# functions below can read them as `args.<option>`.
import collections
from collections import namedtuple

arguments = namedtuple(
    'args',
    [
        'run_prefix',
        'finetune_datasets',
        'model_class',
        'max_seq_length',
        'asciify_emojis',
        'username_filler',
        'replace_urls',
        'url_filler',
        'replace_usernames',
        'replace_multiple_usernames',
        'replace_multiple_urls',
        'remove_unicode_symbols',
        'standardize_punctuation',
        'remove_accented_characters',
    ],
)


In [161]:
# Options for this run, passed by keyword so each value is tied to its field.
args = arguments(
    run_prefix='test_run',
    finetune_datasets=["crowdbreaks"],
    model_class="covid-twitter-bert-2",
    max_seq_length=96,
    asciify_emojis=True,
    username_filler="twitteruser",
    replace_urls=True,
    url_filler="twitterurl",
    replace_usernames=True,
    replace_multiple_usernames=True,
    replace_multiple_urls=True,
    remove_unicode_symbols=True,
    standardize_punctuation=True,
    remove_accented_characters=True,
)

In [128]:
# Show the options tuple to verify each field's value.
args

args(run_prefix='test_run', finetune_datasets=['crowdbreaks'], model_class='covid-twitter-bert-2', max_seq_length=96, asciify_emojis=True, username_filler='twitteruser', replace_urls=True, url_filler='twitterurl', replace_usernames=True, replace_multiple_usernames=True, replace_multiple_urls=True, remove_unicode_symbols=True)

In [95]:
# Columns read from every input TSV and written to the preprocessed files.
REQUIRED_COLUMNS = ['id', 'label', 'text']
# Root folders, relative to the notebook's working directory.
DATA_DIR = 'data'
VOCAB_PATH = 'vocabs'


In [96]:
import datetime


def get_run_name(args):
    """Build a unique run name from the current timestamp.

    Args:
        args: options object; only ``args.run_prefix`` is read.

    Returns:
        ``run_<timestamp>_<run_prefix>`` when the prefix is truthy, otherwise
        ``run_<timestamp>``. The timestamp is formatted as
        ``%Y-%m-%d_%H-%M-%S_%f`` (microsecond resolution).
    """
    stamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S_%f')
    # the prefix, when present, is appended after the timestamp
    suffix = f'_{args.run_prefix}' if args.run_prefix else ''
    return f'run_{stamp}{suffix}'

In [97]:
# Name this run; the name embeds the current timestamp, so re-running the cell
# yields a different name.
run_name = get_run_name(args)
run_name

'run_2021-11-17_18-22-36_216853_test_run'

In [98]:
# Outputs of this run are written under <DATA_DIR>/finetune/<run_name>.
run_dir = os.path.join(DATA_DIR, 'finetune', run_name)
run_dir

'data/finetune/run_2021-11-17_18-22-36_216853_test_run'

In [99]:
# Create the run directory (and any missing parents) unless it already exists.
os.makedirs(run_dir, exist_ok=True)

In [100]:
# find input data: the original (not yet preprocessed) datasets are looked up
# under <DATA_DIR>/finetune/originals, one sub-folder per dataset
originals_dir = os.path.join(DATA_DIR, 'finetune', 'originals')
originals_dir   

'data/finetune/originals'

In [101]:
# A missing or empty selection means "every entry found in originals_dir".
finetune_datasets = args.finetune_datasets or os.listdir(originals_dir)
finetune_datasets

['crowdbreaks']

In [102]:
# Pretrained models configuration, keyed by the value of args.model_class.
# To support another model, add an entry with the same fields here.
PRETRAINED_MODELS = {
    # BERT large, uncased
    'bert_large_uncased': {
        'bucket_location': 'pretrained_models/bert/keras_bert/uncased_L-24_H-1024_A-16',
        'hub_url': 'tensorflow/bert_en_uncased_L-24_H-1024_A-16/2',
        'config': 'bert_config_large_uncased.json',
        'is_tfhub_model': True,
        'vocab_file': 'bert-large-uncased-vocab.txt',
        'lower_case': True,
        'do_whole_word_masking': False,
    },
    # BERT multilingual, cased (the only entry that keeps the original casing)
    'bert_multi_cased': {
        'bucket_location': 'pretrained_models/bert/keras_bert/multi_cased_L-12_H-768_A-12',
        'hub_url': 'tensorflow/bert_multi_cased_L-12_H-768_A-12/2',
        'config': 'bert_config_multi_cased.json',
        'is_tfhub_model': True,
        'vocab_file': 'bert-multi-cased-vocab.txt',
        'lower_case': False,
        'do_whole_word_masking': False,
    },
    # BERT large, uncased, whole-word masking
    'bert_large_uncased_wwm': {
        'bucket_location': 'pretrained_models/bert/keras_bert/wwm_uncased_L-24_H-1024_A-16',
        'hub_url': 'tensorflow/bert_en_wwm_uncased_L-24_H-1024_A-16/2',
        'config': 'bert_config_large_uncased_wwm.json',
        'is_tfhub_model': True,
        'vocab_file': 'bert-large-uncased-whole-word-masking-vocab.txt',
        'lower_case': True,
        'do_whole_word_masking': True,
    },
    # COVID-Twitter-BERT v1 (no bucket_location entry)
    'covid-twitter-bert': {
        'hub_url': 'digitalepidemiologylab/covid-twitter-bert/1',
        'is_tfhub_model': True,
        'config': 'bert_config_covid_twitter_bert.json',
        'vocab_file': 'bert-large-uncased-whole-word-masking-vocab.txt',
        'lower_case': True,
        'do_whole_word_masking': True,
    },
    # COVID-Twitter-BERT v2 (no bucket_location entry)
    'covid-twitter-bert-2': {
        'hub_url': 'digitalepidemiologylab/covid-twitter-bert/2',
        'is_tfhub_model': True,
        'config': 'bert_config_covid_twitter_bert.json',
        'vocab_file': 'bert-large-uncased-whole-word-masking-vocab.txt',
        'lower_case': True,
        'do_whole_word_masking': True,
    },
}

In [103]:
# https://tfhub.dev/digitalepidemiologylab/covid-twitter-bert/2




In [104]:
# Whether text should be lower-cased, taken from the selected model's entry
# in PRETRAINED_MODELS.
do_lower_case = PRETRAINED_MODELS[args.model_class]['lower_case']
do_lower_case


True

In [105]:
import logging

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)-5.5s] [%(name)-12.12s]: %(message)s')

# Make sure every selected dataset has a <run_dir>/<dataset>/preprocessed folder.
# NOTE: `dataset` and `preprocessed_folder` stay bound to the last dataset
# once the loop has finished.
for dataset in finetune_datasets:
    logger.info(f'Processing dataset {dataset}...')
    preprocessed_folder = os.path.join(run_dir, dataset, 'preprocessed')
    os.makedirs(preprocessed_folder, exist_ok=True)


2021-11-17 18:22:45,359 [INFO ] [__main__    ]: Processing dataset crowdbreaks...


In [106]:
# compile regexes
import re
# HTMLParser was used below without ever being imported, which raises a
# NameError on a fresh kernel.
from html.parser import HTMLParser

# An @handle of 1-15 word characters, preceded by start-of-text or by a
# character that is neither '@' nor a word character. That preceding character
# is part of the match, so substitutions replace it as well.
username_regex = re.compile(r'(^|[^@\w])@(\w{1,15})\b')
# "www." or "http(s)://" followed by everything up to the next whitespace.
# NOTE: the last alternative ('http?://') also matches 'htt://'; plain
# 'http://' is already covered by 'https?://'.
url_regex = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))')
# runs of carriage returns, newlines and tabs
control_char_regex = re.compile(r'[\r\n\t]+')
# translate table for punctuation: curly quotes, acute accent and en dash are
# mapped to their plain ASCII counterparts
transl_table = {ord(src): ord(dst) for src, dst in zip(u"‘’´“”–-",  u"'''\"\"--")}
# HTML parser
html_parser = HTMLParser()

In [107]:
import html
import unicodedata


def standardize_text(text):
    """Normalize raw text before any further preprocessing.

    Steps, in order:
    1) Unescape HTML entities (e.g. ``&amp;`` becomes ``&``)
    2) Replace some non-standard punctuation with standard versions
    3) Replace runs of carriage returns, newlines and tabs with a space
    4) Remove every remaining character whose Unicode category starts with
       'C' (control characters, including the NULL byte, among others)
    5) Collapse duplicate white space and trim both ends

    Args:
        text: input string.

    Returns:
        The standardized string.
    """
    # unescape HTML entities; html.unescape replaces HTMLParser().unescape,
    # which no longer exists on Python 3.9+
    text = html.unescape(text)
    # standardize punctuation
    text = text.translate(transl_table)
    text = text.replace('…', '...')
    # replace \t, \n and \r characters by a whitespace
    text = re.sub(control_char_regex, ' ', text)
    # remove all remaining control characters
    text = ''.join(ch for ch in text if unicodedata.category(ch)[0] != 'C')
    # replace multiple spaces with single space; split()/join also trims the ends
    return ' '.join(text.split())

In [114]:

def replace_usernames(text, filler='user'):
    """Substitute user handles with ``filler``.

    The internal marker ``@<user>`` and every match of ``username_regex`` are
    replaced by ``filler``. Each filler is then surrounded by spaces and runs
    of white space are collapsed to a single space.

    Args:
        text: input string.
        filler: replacement token for user handles.

    Returns:
        The text with handles replaced and white space normalized.
    """
    # the internal '@<user>' marker goes first, then real handles
    without_marker = text.replace('@<user>', f'{filler}')
    substituted = username_regex.sub(filler, without_marker)
    # isolate every filler with spaces, then squeeze the extra white space
    padded = substituted.replace(filler, f' {filler} ')
    return ' '.join(padded.split())


In [118]:
def replace_urls(text, filler='url'):
    """Substitute URLs with ``filler``.

    The internal marker ``<url>`` and every match of ``url_regex`` are
    replaced by ``filler``. Each filler is then surrounded by spaces and runs
    of white space are collapsed to a single space.

    Args:
        text: input string.
        filler: replacement token for URLs.

    Returns:
        The text with URLs replaced and white space normalized.
    """
    # the internal '<url>' marker goes first, then real URLs
    without_marker = text.replace('<url>', filler)
    substituted = url_regex.sub(filler, without_marker)
    # isolate every filler with spaces, then squeeze the extra white space
    padded = substituted.replace(filler, f' {filler} ')
    return ' '.join(padded.split())

In [135]:
!pip install emoji
import emoji
def asciify_emojis(text):
    """Turn emojis into their text aliases via ``emoji.demojize``.

    For example 👍 becomes :thumbs_up:. A list of text aliases is available
    at https://www.webfx.com/tools/emoji-cheat-sheet/
    """
    return emoji.demojize(text)

Collecting emoji
  Downloading emoji-1.6.1.tar.gz (170 kB)
[?25l[K     |██                              | 10 kB 22.5 MB/s eta 0:00:01[K     |███▉                            | 20 kB 25.1 MB/s eta 0:00:01[K     |█████▉                          | 30 kB 16.8 MB/s eta 0:00:01[K     |███████▊                        | 40 kB 14.6 MB/s eta 0:00:01[K     |█████████▋                      | 51 kB 5.7 MB/s eta 0:00:01[K     |███████████▋                    | 61 kB 6.0 MB/s eta 0:00:01[K     |█████████████▌                  | 71 kB 5.4 MB/s eta 0:00:01[K     |███████████████▍                | 81 kB 6.1 MB/s eta 0:00:01[K     |█████████████████▍              | 92 kB 6.4 MB/s eta 0:00:01[K     |███████████████████▎            | 102 kB 5.4 MB/s eta 0:00:01[K     |█████████████████████▏          | 112 kB 5.4 MB/s eta 0:00:01[K     |███████████████████████▏        | 122 kB 5.4 MB/s eta 0:00:01[K     |█████████████████████████       | 133 kB 5.4 MB/s eta 0:00:01[K     |███████

In [147]:
!pip install unidecode
import unidecode
def standardize_punctuation(text):
    """Pass each punctuation character through ``unidecode``.

    A character counts as punctuation when its Unicode category starts with
    'P'; every other character is kept exactly as it is.
    """
    pieces = []
    for char in text:
        if unicodedata.category(char)[0] == 'P':
            pieces.append(unidecode.unidecode(char))
        else:
            pieces.append(char)
    return ''.join(pieces)


Collecting unidecode
  Downloading Unidecode-1.3.2-py3-none-any.whl (235 kB)
[?25l[K     |█▍                              | 10 kB 22.4 MB/s eta 0:00:01[K     |██▉                             | 20 kB 20.9 MB/s eta 0:00:01[K     |████▏                           | 30 kB 10.9 MB/s eta 0:00:01[K     |█████▋                          | 40 kB 8.8 MB/s eta 0:00:01[K     |███████                         | 51 kB 4.9 MB/s eta 0:00:01[K     |████████▍                       | 61 kB 5.3 MB/s eta 0:00:01[K     |█████████▊                      | 71 kB 5.7 MB/s eta 0:00:01[K     |███████████▏                    | 81 kB 6.4 MB/s eta 0:00:01[K     |████████████▌                   | 92 kB 6.6 MB/s eta 0:00:01[K     |██████████████                  | 102 kB 5.2 MB/s eta 0:00:01[K     |███████████████▎                | 112 kB 5.2 MB/s eta 0:00:01[K     |████████████████▊               | 122 kB 5.2 MB/s eta 0:00:01[K     |██████████████████              | 133 kB 5.2 MB/s eta 0:00:01

In [151]:
def replace_multi_occurrences(text, filler):
    """Collapse runs of consecutive fillers into ``<count> <filler>``.

    With filler ``url``, ``'a url url url b'`` becomes ``'a 3 url b'``.
    Fillers separated by other words are left alone. Text that contains the
    filler at most once is returned unchanged (white space is not normalized
    in that case).

    Args:
        text: input string.
        filler: placeholder token to look for; it is matched as literal text.

    Returns:
        The text with every run of two or more space-separated fillers
        replaced by the run length followed by a single filler.
    """
    # an empty filler cannot be counted meaningfully
    if not filler:
        return text
    # only run if we have multiple occurrences of filler
    if text.count(filler) <= 1:
        return text
    # pad fillers with whitespace
    text = text.replace(f'{filler}', f' {filler} ')
    # remove introduced duplicate whitespaces
    text = ' '.join(text.split())
    # find indices of occurrences; the filler is escaped so that regex
    # metacharacters in it (e.g. '[url]') are matched literally
    indices = [m.start() for m in re.finditer(re.escape(filler), text)]
    # collect merge list; each item is
    # [start of first filler in run, start of last filler in run, run length]
    merge_list = []
    old_index = None
    for i, index in enumerate(indices):
        # two fillers are consecutive when exactly one space separates them
        if i > 0 and index - old_index == len(filler) + 1:
            # found two consecutive fillers
            if len(merge_list) > 0 and merge_list[-1][1] == old_index:
                # extend previous item
                merge_list[-1][1] = index
                merge_list[-1][2] += 1
            else:
                # create new item
                merge_list.append([old_index, index, 2])
        old_index = index
    # merge occurrences
    if len(merge_list) > 0:
        new_text = ''
        pos = 0
        for (start, end, count) in merge_list:
            new_text += text[pos:start]
            new_text += f'{count} {filler}'
            # continue right after the last filler of the run
            pos = end + len(filler)
        new_text += text[pos:]
        text = new_text
    return text

In [155]:
def remove_unicode_symbols(text):
    """Remove every character whose Unicode category is 'So' (Symbol, other).

    Args:
        text: input string.

    Returns:
        The text without 'So' characters; all other characters are kept.
    """
    # Compare the full two-letter category. The previous check compared only
    # its first letter with 'So', which can never be equal, so no character
    # was ever removed.
    return ''.join(ch for ch in text if unicodedata.category(ch) != 'So')

In [162]:
def remove_accented_characters(text):
    """Return ``text`` after passing the whole string through ``unidecode.unidecode``."""
    return unidecode.unidecode(text)


In [163]:
def preprocess_bert(text, args, do_lower_case=True):
    """Preprocesses tweet for BERT

    The text is always standardized first. Every later step is optional and
    runs only when its flag is set; the steps are applied in the order they
    appear below.

    Args:
        text: input text.
        args: options object. Flags read: ``replace_usernames``,
            ``replace_urls``, ``asciify_emojis``, ``standardize_punctuation``,
            ``replace_multiple_usernames``, ``replace_multiple_urls``,
            ``remove_unicode_symbols``, ``remove_accented_characters``.
            ``username_filler`` and ``url_filler`` supply the replacement tokens.
        do_lower_case: lower-case the text when true.

    Returns:
        The preprocessed text.
    """
    # standardize
    text = standardize_text(text)
    # replace usernames/urls
    if args.replace_usernames:
        text = replace_usernames(text, filler=args.username_filler)
    if args.replace_urls:
        text = replace_urls(text, filler=args.url_filler)
    if args.asciify_emojis:
        text = asciify_emojis(text)
    if args.standardize_punctuation:
        text = standardize_punctuation(text)
    # lower-casing happens before the multi-occurrence merge, which searches
    # for the fillers exactly as given in args
    if do_lower_case:
        text = text.lower()
    if args.replace_multiple_usernames:
        text = replace_multi_occurrences(text, args.username_filler)
    if args.replace_multiple_urls:
        text = replace_multi_occurrences(text, args.url_filler)
    if args.remove_unicode_symbols:
        text = remove_unicode_symbols(text)
    if args.remove_accented_characters:
        text = remove_accented_characters(text)
    return text

In [1]:
import tensorflow as tf

In [2]:
#https://github.com/tensorflow/models/tree/93490036e00f37ecbe6693b9ff4ae488bb8e9270/official
!git clone -b master https://github.com/tensorflow/models.git --depth=1


Cloning into 'models'...
remote: Enumerating objects: 3089, done.[K
remote: Counting objects: 100% (3089/3089), done.[K
remote: Compressing objects: 100% (2614/2614), done.[K
remote: Total 3089 (delta 775), reused 1383 (delta 434), pack-reused 0[K
Receiving objects: 100% (3089/3089), 33.34 MiB | 20.37 MiB/s, done.
Resolving deltas: 100% (775/775), done.


In [3]:
import sys
# Put the local 'models' folder (the tensorflow/models checkout cloned above)
# first on the import path so that `official.*` imports resolve to it.
sys.path.insert(0,'models')

In [5]:
!pip install sentencepiece
from official.nlp.bert import tokenization
from official.nlp.data.classifier_data_lib import DataProcessor, generate_tf_record_from_data_file, InputExample

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 3.1 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


In [6]:
def generate_tfrecords(args, dataset_dir, labels):
    """Generates tfrecords from generated tsv files.

    Hands ``<dataset_dir>/preprocessed`` to ``generate_tf_record_from_data_file``
    with output paths ``<dataset_dir>/tfrecords/train.tfrecords`` and
    ``<dataset_dir>/tfrecords/dev.tfrecords``, then writes the metadata it
    returns to ``<dataset_dir>/meta.json``.

    NOTE(review): ``TextClassificationProcessor`` and ``get_tokenizer`` are
    not defined in the cells above; they must be defined before this function
    is called.

    Args:
        args: options object; ``model_class`` and ``max_seq_length`` are read.
        dataset_dir: folder of one dataset inside the run directory.
        labels: label values handed to ``TextClassificationProcessor``.
    """
    processor = TextClassificationProcessor(labels)
    # save label mapping
    processor.save_label_mapping(dataset_dir)
    # get tokenizer
    tokenizer = get_tokenizer(args.model_class)
    # generate tfrecords
    input_dir = os.path.join(dataset_dir, 'preprocessed')
    output_dir = os.path.join(dataset_dir, 'tfrecords')
    # exist_ok avoids the separate check-then-create step
    os.makedirs(output_dir, exist_ok=True)
    input_meta_data = generate_tf_record_from_data_file(
        processor,
        input_dir,
        tokenizer,
        train_data_output_path=os.path.join(output_dir, 'train.tfrecords'),
        eval_data_output_path=os.path.join(output_dir, 'dev.tfrecords'),
        max_seq_length=args.max_seq_length)
    with tf.io.gfile.GFile(os.path.join(dataset_dir, 'meta.json'), 'w') as writer:
        writer.write(json.dumps(input_meta_data, indent=4) + '\n')
    logger.info(f'Sucessfully wrote tfrecord files to {output_dir}')


In [7]:
# Preprocess the train/dev TSVs of every selected dataset, then build tfrecords.
#
# Fixes compared with the previous version of this cell:
# - the read/preprocess/write statements sat after `continue` inside the
#   "file missing" branch and therefore never ran;
# - tfrecord generation ran inside the per-split loop and re-bound `labels`
#   to a list, so the next split would have called `.update` on a list;
# - the split data was stored in `df`, overwriting the raw data frame;
# - the config was saved via vars(args), which fails for a namedtuple, and via
#   save_to_json, which is not defined in this notebook.
for dataset in finetune_datasets:
    preprocessed_folder = os.path.join(run_dir, dataset, 'preprocessed')
    os.makedirs(preprocessed_folder, exist_ok=True)
    labels = set()
    for _type in ['train', 'dev']:
        f_name = f'{_type}.tsv'
        logger.info(f'Reading data for for type {_type}...')
        f_path = os.path.join(originals_dir, dataset, f_name)
        if not os.path.isfile(f_path):
            logger.info(f'Could not find file {f_path}. Skipping.')
            continue
        split_df = pd.read_csv(f_path, usecols=REQUIRED_COLUMNS, sep='\t')
        logger.info('Creating preprocessed files...')
        split_df['text'] = split_df.text.apply(preprocess_bert, args=(args, do_lower_case))
        split_df.to_csv(os.path.join(preprocessed_folder, f_name), columns=REQUIRED_COLUMNS, header=False, index=False, sep='\t')
        # add labels
        labels.update(split_df.label.unique().tolist())
    logger.info('Creating tfrecords files...')
    # we sort the labels in order to maintain consistent label ids
    labels = sorted(labels)
    dataset_dir = os.path.join(run_dir, dataset)
    generate_tfrecords(args, dataset_dir, labels)
# saving config
f_path_config = os.path.join(run_dir, 'create_finetune_config.json')
logger.info(f'Saving config to {f_path_config}')
with open(f_path_config, 'w') as f_config:
    # a namedtuple has no __dict__; _asdict() gives the field/value mapping
    json.dump(args._asdict(), f_config, indent=4)

NameError: ignored

In [165]:
# Run the preprocessing on the text column of df; the cell displays the
# resulting Series.
df.text.apply(preprocess_bert, args=(args, do_lower_case))

  del sys.path[0]


0       disneyland is spreading measles 121 cases in 1...
1       california measles outbreak: 123 cases, latest...
2       why isn't the news talking about how measles w...
3       no link to autism from vaccines, says expert: ...
4       we need a twitteruser assembly at my kids' sch...
                              ...                        
8995    ((elb)) combo vaccine causes autism: my son el...
8996    ((silvito el libre)) combo vaccine causes auti...
8997                  vaccinate your kids for prevention!
8998    twitteruser yeah. i think they're leaning towa...
8999    guns don't kill people. vaccinations for deadl...
Name: text, Length: 9000, dtype: object