<a href="https://colab.research.google.com/github/areias/bert_covid_sentiment/blob/main/bert_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
!pip install transformers



In [67]:
from transformers import (
   AutoConfig,
   AutoTokenizer,
   TFAutoModelForSequenceClassification,
   AdamW,
   glue_convert_examples_to_features
)
import tensorflow as tf
import tensorflow_datasets as tfds
import json

In [68]:
import pandas as pd

In [69]:
df=pd.read_csv("train-4.csv")

In [70]:
df.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,text,label
0,0,5.608318e+17,@GMA @GStephanopoulos #PalmDesert high school ...,0
1,1,7.72043e+17,SB121 [Passed] Meningococcal disease-pupils to...,0
2,2,5.262292e+17,@cabosetv @EvilGeniuses @EGiNcontroL @Razer th...,0
3,3,4.410094e+17,“@MizzTwerksum: All natural 😋🙌 squats not shot...,0
4,4,4.542856e+17,#travel #jobs Travel Immunization Nurse Specia...,0


In [71]:
df.label.value_counts()

 1    1134
-1    1134
 0    1134
Name: label, dtype: int64

In [72]:
# id,label,text
df=df.loc[:,["tweet_id","label", "text"]]
df.columns=['id','label', 'text']

In [73]:
import numpy as np

In [74]:
"""
    60% - train set,
    20% - dev/validation set,
    20% - test set"""

train, dev, test = np.split(df.sample(frac=1, random_state=42), 
                       [int(.6*len(df)), int(.8*len(df))])

In [75]:
dev.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 680 entries, 2111 to 1140
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      680 non-null    float64
 1   label   680 non-null    int64  
 2   text    680 non-null    object 
dtypes: float64(1), int64(1), object(1)
memory usage: 21.2+ KB


In [76]:
import os 
#os.makedirs("data/finetune/originals/crowdbreaks")
#os.makedirs("vocabs")


In [77]:
train.to_csv("data/finetune/originals/crowdbreaks/train.tsv", sep="\t",index=False)
dev.to_csv("data/finetune/originals/crowdbreaks/dev.tsv", sep="\t",index=False)
test.to_csv("data/finetune/originals/crowdbreaks/test.tsv", sep="\t",index=False)


In [78]:
# from https://github.com/digitalepidemiologylab/covid-twitter-bert/blob/master/preprocess/create_finetune_data.py

In [79]:
# args
"""
cd preprocess
python create_finetune_data.py \
  --run_prefix test_run \
  --finetune_datasets <dataset_name> \
  --model_class bert_large_uncased_wwm \
  --max_seq_length 96 \
  --asciify_emojis \
  --username_filler twitteruser \
  --url_filler twitterurl \
  --replace_multiple_usernames \
  --replace_multiple_urls \
  --remove_unicode_symbols"
"""

args = {'run_prefix': "test_run",
'finetune_datasets' : ["crowdbreaks"],
'model_class' : "covid-twitter-bert-2",
'max_seq_length' : 96,
'asciify_emojis' : True,
'username_filler' : "twitteruser",
'url_filler' : "twitterurl", 
'replace_multiple_usernames' : True,
'replace_multiple_urls' : True,
'remove_unicode_symbols' : True}


In [164]:
from collections import namedtuple
arguments = namedtuple('arguments', ['run_prefix','finetune_datasets','model_class',
                                     'max_seq_length', 'asciify_emojis','username_filler',
                                    'url_filler', 'replace_multiple_usernames','replace_multiple_urls',
                                      'remove_unicode_symbols','replace_usernames','replace_urls',
                                     'standardize_punctuation','remove_accented_characters'])

args = arguments("test_run",["crowdbreaks"],"covid-twitter-bert-2",
                 96, True, "twitteruser", 
                 "twitterurl", True,True,
                 True, True, True,
                 True, True)
args

arguments(run_prefix='test_run', finetune_datasets=['crowdbreaks'], model_class='covid-twitter-bert-2', max_seq_length=96, asciify_emojis=True, username_filler='twitteruser', url_filler='twitterurl', replace_multiple_usernames=True, replace_multiple_urls=True, remove_unicode_symbols=True, replace_usernames=True, replace_urls=True, standardize_punctuation=True, remove_accented_characters=True)

In [165]:
REQUIRED_COLUMNS = ['id', 'label', 'text']
DATA_DIR = os.path.join('data')
VOCAB_PATH = os.path.join('vocabs')


In [166]:
import datetime
def get_run_name(args):
    # Use timestamp to generate a unique run name
    ts = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S_%f')
    if args.run_prefix:
        run_name = f'run_{ts}_{args.run_prefix}'
    else:
        run_name = f'run_{ts}'
    return run_name

In [167]:
run_name = get_run_name(args)
run_name

'run_2021-11-17_22-44-57_771405_test_run'

In [168]:
run_dir = os.path.join(DATA_DIR, 'finetune', run_name)
run_dir

'data/finetune/run_2021-11-17_22-44-57_771405_test_run'

In [169]:
if not os.path.isdir(run_dir):
  os.makedirs(run_dir)

In [170]:
# find input data
originals_dir = os.path.join(DATA_DIR, 'finetune', 'originals')
originals_dir   

'data/finetune/originals'

In [171]:
if args.finetune_datasets is None or len(args.finetune_datasets) == 0:
    finetune_datasets = os.listdir(originals_dir)
else:
    finetune_datasets = args.finetune_datasets
finetune_datasets

['crowdbreaks']

In [172]:
# Pretrained models configuration, add model configuration here

PRETRAINED_MODELS = {
        'bert_large_uncased': {
            'bucket_location': 'pretrained_models/bert/keras_bert/uncased_L-24_H-1024_A-16',
            'hub_url': 'tensorflow/bert_en_uncased_L-24_H-1024_A-16/2',
            'config': 'bert_config_large_uncased.json',
            'is_tfhub_model': True,
            'vocab_file': 'bert-large-uncased-vocab.txt',
            'lower_case': True,
            'do_whole_word_masking': False
            },
        'bert_multi_cased': {
            'bucket_location': 'pretrained_models/bert/keras_bert/multi_cased_L-12_H-768_A-12',
            'hub_url': 'tensorflow/bert_multi_cased_L-12_H-768_A-12/2',
            'config': 'bert_config_multi_cased.json',
            'is_tfhub_model': True,
            'vocab_file': 'bert-multi-cased-vocab.txt',
            'lower_case': False,
            'do_whole_word_masking': False
            },
        'bert_large_uncased_wwm': {
            'bucket_location': 'pretrained_models/bert/keras_bert/wwm_uncased_L-24_H-1024_A-16',
            'hub_url': 'tensorflow/bert_en_wwm_uncased_L-24_H-1024_A-16/2',
            'config': 'bert_config_large_uncased_wwm.json',
            'is_tfhub_model': True,
            'vocab_file': 'bert-large-uncased-whole-word-masking-vocab.txt',
            'lower_case': True,
            'do_whole_word_masking': True
            },
        'covid-twitter-bert': {
            'hub_url': 'digitalepidemiologylab/covid-twitter-bert/1',
            'is_tfhub_model': True,
            'config': 'bert_config_covid_twitter_bert.json',
            'vocab_file': 'bert-large-uncased-whole-word-masking-vocab.txt',
            'lower_case': True,
            'do_whole_word_masking': True
            },
        'covid-twitter-bert-2': {
            'hub_url': 'digitalepidemiologylab/covid-twitter-bert/2',
            'is_tfhub_model': True,
            'config': 'bert_config_covid_twitter_bert.json',
            'vocab_file': 'bert-large-uncased-whole-word-masking-vocab.txt',
            'lower_case': True,
            'do_whole_word_masking': True
            }
        }

In [173]:
# https://tfhub.dev/digitalepidemiologylab/covid-twitter-bert/2




In [174]:
do_lower_case = PRETRAINED_MODELS[args.model_class]['lower_case']
do_lower_case


True

In [175]:
import logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)-5.5s] [%(name)-12.12s]: %(message)s')

for dataset in finetune_datasets:
      logger.info(f'Processing dataset {dataset}...')
      preprocessed_folder = os.path.join(run_dir, dataset, 'preprocessed')
      if not os.path.isdir(preprocessed_folder):
          os.makedirs(preprocessed_folder)


2021-11-17 22:44:57,907 [INFO ] [__main__    ]: Processing dataset crowdbreaks...


In [176]:
# compile regexes
import re
from html.parser import HTMLParser
import unicodedata

username_regex = re.compile(r'(^|[^@\w])@(\w{1,15})\b')
url_regex = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))')
control_char_regex = re.compile(r'[\r\n\t]+')
# translate table for punctuation
transl_table = dict([(ord(x), ord(y)) for x, y in zip(u"‘’´“”–-",  u"'''\"\"--")])
# HTML parser
html_parser = HTMLParser()

In [177]:
def standardize_text(text):
    """
    1) Escape HTML
    2) Replaces some non-standard punctuation with standard versions. 
    3) Replace \r, \n and \t with white spaces
    4) Removes all other control characters and the NULL byte
    5) Removes duplicate white spaces
    """
    html_parser = HTMLParser()

    # escape HTML symbols
    text = html_parser.unescape(text)
    # standardize punctuation
    text = text.translate(transl_table)
    text = text.replace('…', '...')
    # replace \t, \n and \r characters by a whitespace
    text = re.sub(control_char_regex, ' ', text)
    # remove all remaining control characters
    text = ''.join(ch for ch in text if unicodedata.category(ch)[0] != 'C')
    # replace multiple spaces with single space
    text = ' '.join(text.split())
    return text.strip()

In [178]:
def replace_usernames(text, filler='user'):
    # @<user> is a marker used internally. use filler instead
    text = text.replace('@<user>', f'{filler}')
    # replace other user handles by filler
    text = re.sub(username_regex, filler, text)
    # add spaces between, and remove double spaces again
    text = text.replace(filler, f' {filler} ')
    text = ' '.join(text.split())
    return text

In [179]:
def replace_urls(text, filler='url'):
    # <url> is a marker used internally. use filler instead
    text = text.replace('<url>', filler)
    # replace other urls by filler
    text = re.sub(url_regex, filler, text)
    # add spaces between, and remove double spaces again
    text = text.replace(filler, f' {filler} ')
    text = ' '.join(text.split())
    return text

In [180]:
!pip install emoji
import emoji
def asciify_emojis(text):
    """
    Converts emojis into text aliases. E.g. 👍 becomes :thumbs_up:
    For a full list of text aliases see: https://www.webfx.com/tools/emoji-cheat-sheet/
    """
    text = emoji.demojize(text)
    return text



In [181]:
!pip install unidecode 
import unidecode
def standardize_punctuation(text):
    return ''.join([unidecode.unidecode(t) if unicodedata.category(t)[0] == 'P' else t for t in text])




In [182]:

def replace_multi_occurrences(text, filler):
    """Replaces multiple occurrences of filler with n filler"""
    # only run if we have multiple occurrences of filler
    if text.count(filler) <= 1:
        return text
    # pad fillers with whitespace
    text = text.replace(f'{filler}', f' {filler} ')
    # remove introduced duplicate whitespaces
    text = ' '.join(text.split())
    # find indices of occurrences
    indices = []
    for m in re.finditer(r'{}'.format(filler), text):
        index = m.start()
        indices.append(index)
    # collect merge list
    merge_list = []
    for i, index in enumerate(indices):
        if i > 0 and index - old_index == len(filler) + 1:
            # found two consecutive fillers
            if len(merge_list) > 0 and merge_list[-1][1] == old_index:
                # extend previous item
                merge_list[-1][1] = index
                merge_list[-1][2] += 1
            else:
                # create new item
                merge_list.append([old_index, index, 2])
        old_index = index
    # merge occurrences
    if len(merge_list) > 0:
        new_text = ''
        pos = 0
        for (start, end, count) in merge_list:
            new_text += text[pos:start]
            new_text += f'{count} {filler}'
            pos = end + len(filler)
        new_text += text[pos:]
        text = new_text
    return text

In [183]:
def remove_unicode_symbols(text):
    text = ''.join(ch for ch in text if unicodedata.category(ch)[0] != 'So')
    return text


In [187]:
def remove_accented_characters(text):
    text = unidecode.unidecode(text)
    return text

In [188]:
def preprocess_bert(text, args, do_lower_case=True):
    """Preprocesses tweet for BERT"""
    # standardize
    text = standardize_text(text)
    # replace usernames/urls
    if args.replace_usernames:
        text = replace_usernames(text, filler=args.username_filler)
    if args.replace_urls:
        text = replace_urls(text, filler=args.url_filler)
    if args.asciify_emojis:
        text = asciify_emojis(text)
    if args.standardize_punctuation:
        text = standardize_punctuation(text)
    if do_lower_case:
        text = text.lower()
    if args.replace_multiple_usernames:
        text = replace_multi_occurrences(text, args.username_filler)
    if args.replace_multiple_urls:
        text = replace_multi_occurrences(text, args.url_filler)
    if args.remove_unicode_symbols:
        text = remove_unicode_symbols(text)
    if args.remove_accented_characters:
        text = remove_accented_characters(text)
    return text

In [189]:
labels = set()
for _type in ['train', 'dev']:
    f_name = f'{_type}.tsv'
    logger.info(f'Reading data for for type {_type}...')
    f_path = os.path.join(originals_dir, dataset, f_name)
    if not os.path.isfile(f_path):
        logger.info(f'Could not find file {f_path}. Skipping.')
        continue
        df = pd.read_csv(f_path, usecols=REQUIRED_COLUMNS, sep='\t')
        logger.info('Creating preprocessed files...')
        df.loc[:, 'text'] = df.text.apply(preprocess_bert, args=(args, do_lower_case))
        df.to_csv(os.path.join(preprocessed_folder, f_name), columns=REQUIRED_COLUMNS, header=False, index=False, sep='\t')
        # add labels
        labels.update(df.label.unique().tolist())
    #logger.info('Creating tfrecords files...')
    

2021-11-17 22:45:19,958 [INFO ] [__main__    ]: Reading data for for type train...
2021-11-17 22:45:19,960 [INFO ] [__main__    ]: Reading data for for type dev...


In [190]:

df.loc[:, 'text'] = df.text.apply(preprocess_bert, args=(args, do_lower_case))


  if sys.path[0] == '':


In [191]:
df.head()

Unnamed: 0,id,label,text
0,5.608318e+17,0,2 twitteruser #palmdesert high school in #rive...
1,7.72043e+17,0,sb121 [passed] meningococcal disease-pupils to...
2,5.262292e+17,0,4 twitteruser the keyboard is sweet. i have th...
3,4.410094e+17,0,twitteruser : all natural :face_savoring_food:...
4,4.542856e+17,0,#travel #jobs travel immunization nurse specia...
