In [1]:
import os
from pathlib import Path
os.sys.path.append(str(Path('../')))

In [2]:
import pandas as pd
import numpy as np
from textblob import TextBlob, Word
from tqdm.notebook import tqdm
from tqdm.keras import TqdmCallback
from ftfy import fix_text
import re
import warnings
import random
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation

import tensorflow as tf
from tensorflow import keras
from utils import get_tokenizer
from models.roberta import get_roberta_for_skep
from custom_callbacks.warmup_cosine_decay import WarmUpCosineDecayScheduler
from config import Config

In [3]:
pd.options.display.max_colwidth = None
warnings.filterwarnings('ignore')
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

In [4]:
EMAIL_REGEX = re.compile(
    r"(?:^|(?<=[^\w@.)]))([\w+-](\.(?!\.))?)*?[\w+-]@(?:\w-?)*?\w+(\.([a-z]{2,})){1,3}(?:$|(?=\b))",
    flags=re.IGNORECASE | re.UNICODE,
)

PHONE_REGEX = re.compile(
    r"(?:^|(?<=[^\w)]))(\+?1[ .-]?)?(\(?\d{3}\)?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W))"
)

MULTI_WHITESPACE_TO_ONE_REGEX = re.compile(r"\s+")

URL_REGEX = re.compile(
    r"(?:^|(?<![\w\/\.]))"
    # protocol identifier
    # r"(?:(?:https?|ftp)://)"  <-- alt?
    r"(?:(?:https?:\/\/|ftp:\/\/|www\d{0,3}\.))"
    # user:pass authentication
    r"(?:\S+(?::\S*)?@)?" r"(?:"
    # IP address exclusion
    # private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets
    # excludes loopback network 0.0.0.0
    # excludes reserved space >= 224.0.0.0
    # excludes network & broadcast addresses
    # (first & last IP address of each class)
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    r"|"
    # host name
    r"(?:(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)"
    # domain name
    r"(?:\.(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)*"
    # TLD identifier
    r"(?:\.(?:[a-z\\u00a1-\\uffff]{2,}))" r")"
    # port number
    r"(?::\d{2,5})?"
    # resource path
    r"(?:\/[^\)\]\}\s]*)?",
    # r"(?:$|(?![\w?!+&\/\)]))",
    # @jfilter: I removed the line above from the regex because I don't understand what it is used for, maybe it was useful?
    # But I made sure that it does not include ), ] and } in the URL.
    flags=re.UNICODE | re.IGNORECASE,
)

strange_double_quotes = [
    "«",
    "‹",
    "»",
    "›",
    "„",
    "“",
    "‟",
    "”",
    "❝",
    "❞",
    "❮",
    "❯",
    "〝",
    "〞",
    "〟",
    "＂",
]
strange_single_quotes = ["‘", "‛", "’", "❛", "❜", "`", "´", "‘", "’"]

DOUBLE_QUOTE_REGEX = re.compile("|".join(strange_double_quotes))
SINGLE_QUOTE_REGEX = re.compile("|".join(strange_single_quotes))
HASHTAG_REGEX = re.compile("(?:^|\s)[＃#]{1}(\w+)", re.UNICODE)
MENTION_REGEX = re.compile("(?:^|\s)[＠ @]{1}([^\s#<>[\]|{}]+)", re.UNICODE)

In [5]:
def fix_strange_quotes(text):
    text = SINGLE_QUOTE_REGEX.sub("'", text)
    text = DOUBLE_QUOTE_REGEX.sub('"', text)
    return text

def normalize_whitespace(text):
    text = MULTI_WHITESPACE_TO_ONE_REGEX.sub(" ", text)
    return text.strip()

def replace_urls(text, replace_with="<URL>"):
    return URL_REGEX.sub(replace_with, text)

def replace_emails(text, replace_with="<EMAIL>"):
    return EMAIL_REGEX.sub(replace_with, text)

def replace_phone_numbers(text, replace_with="<PHONE>"):
    return PHONE_REGEX.sub(replace_with, text)

def replace_hashtag(text, replace_with=''):
    return HASHTAG_REGEX.sub(replace_with, text)

def replace_mentions(text, replace_with=''):
    return MENTION_REGEX.sub(replace_with, text)

def clean_text_for_G(text):
    text = str(text)
    text = fix_text(text)
    text = fix_strange_quotes(text)
    text = replace_urls(text, replace_with='')
    text = replace_emails(text, replace_with='')
    text = replace_phone_numbers(text, replace_with='')
    text = replace_hashtag(text)
    text = replace_mentions(text)
    text = remove_stopwords(text)
    text = normalize_whitespace(text)
    text = strip_punctuation(text)
    return text.lower()

def clean_text(text):
    text = str(text)
    text = fix_text(text)
    text = fix_strange_quotes(text)
    text = replace_hashtag(text)
    text = replace_mentions(text)
    text = normalize_whitespace(text)
    return text.lower()

In [6]:
tweet_data_paths = [
    Path('../../full-corpus.csv'), 
    Path('../../TextEmotion.csv'), 
    Path('../data/test.csv'),
    Path('../data/train.csv'),
    Path('../data/validation.csv')
]
data_columns = [
    'TweetText',
    'content',
    'text',
    'text',
    'text'
]

In [7]:
processed_tweets = []
for i, data_path in enumerate(tqdm(tweet_data_paths)):
    df: pd.DataFrame = pd.read_csv(data_path)
    df.dropna(inplace=True)
    tqdm.pandas(desc=f'Cleaning {data_path.name}')
    df[data_columns[i]] = df[data_columns[i]].progress_apply(lambda x: clean_text_for_G(x))
    processed_tweets += df[data_columns[i]].tolist()

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Cleaning full-corpus.csv', max=5113.0, style=ProgressStyl…




HBox(children=(FloatProgress(value=0.0, description='Cleaning TextEmotion.csv', max=40000.0, style=ProgressSty…




HBox(children=(FloatProgress(value=0.0, description='Cleaning test.csv', max=3534.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Cleaning train.csv', max=21983.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Cleaning validation.csv', max=5497.0, style=ProgressStyle…





In [8]:
print(processed_tweets[:5])

['now swype iphone crack  iphone', 'adding carrier support iphone 4s  just announced ', 'hilarious video   guy duet  s siri  pretty sums love affair ', 'easy switch iphone  see ya ', 'i realized reason i got twitter ios5 thanks']


In [9]:
vocab = []
vocab = list(set([t for pt in processed_tweets for t in pt.split()]))
vocab[:10], len(vocab)

(['beds',
  'carnisada',
  'rejection',
  'yrold',
  'marijuana',
  'beat',
  'okasan',
  'cuz',
  'retailer',
  'wut'],
 35726)

In [10]:
G = {}
for tweet in tqdm(processed_tweets):
    blob = TextBlob(tweet)
    assessments = blob.sentiment_assessments.assessments
    for assessment in assessments:
        p = assessment[1]
        if p > 0:
            p = 'positive'
        elif p < 0:
            p = 'negative'
        else:
            p = 'neutral'
        w = Word(assessment[0][0])
        w = w.lemmatize()
        G[w] = p

HBox(children=(FloatProgress(value=0.0, max=76127.0), HTML(value='')))




In [11]:
processed_tweets = []
for i, data_path in enumerate(tqdm(tweet_data_paths)):
    df: pd.DataFrame = pd.read_csv(data_path)
    df.dropna(inplace=True)
    tqdm.pandas(desc=f'Cleaning {data_path.name}')
    df[data_columns[i]] = df[data_columns[i]].progress_apply(lambda x: clean_text(x))
    processed_tweets += df[data_columns[i]].tolist()

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Cleaning full-corpus.csv', max=5113.0, style=ProgressStyl…




HBox(children=(FloatProgress(value=0.0, description='Cleaning TextEmotion.csv', max=40000.0, style=ProgressSty…




HBox(children=(FloatProgress(value=0.0, description='Cleaning test.csv', max=3534.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Cleaning train.csv', max=21983.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Cleaning validation.csv', max=5497.0, style=ProgressStyle…





In [12]:
texts = []
sentiment_words = []
sentiment_polarities = []
for tweet in tqdm(processed_tweets):
    words = tweet.split()
    if len(words) > 2: 
        for i, word in enumerate(words):
            try:
                sentiment = G[Word(word).lemmatize()]
                words[i] = '<mask>'
                texts.append(' '.join(words))
                sentiment_words.append(word)
                sentiment_polarities.append(sentiment)
                words[i] = word
            except KeyError:
                continue
skep_data = pd.DataFrame({
    'text': texts,
    'word': sentiment_words,
    'polarity': sentiment_polarities
})
skep_data = skep_data.sample(frac=1).reset_index(drop=True)
skep_data

HBox(children=(FloatProgress(value=0.0, max=76127.0), HTML(value='')))




Unnamed: 0,text,word,polarity
0,i'm so getting the <mask>,cold,negative
1,<mask> finished my marketing project only took me 7 hours..now just hanging out and relaxing,finally,negative
2,meaning to email you for months. your pre-reunion party must be coming up soon. <mask> wish i could be there. >140 char soon,really,positive
3,"an iphone app came out a <mask> months back called zemote, bumped my domain zemote.com out of the spot",few,negative
4,im an <mask> fan of **** magazine and i love your magazines,avid,positive
...,...,...,...
91614,<mask> it's still not the same going to have a look though.,awww,positive
91615,wow.. tomorrow and then it's over. i'll never see some of those people again. it's <mask> of sad.,kind,positive
91616,"winding down, <mask> having a low key day.",love,positive
91617,hey and this is <mask> dumb. http://t.co/kubkpo0t,pretty,positive


In [13]:
vocab = list(set(skep_data.word.tolist() + random.sample(vocab, 500)))
vocab_size = len(vocab)
vocab_size

1711

In [14]:
word_2_idx = {v: i for i, v in enumerate(vocab)}
sentiment_2_idx = {'neutral': 0, 'positive': 1, 'negative': 2}

In [15]:
y1 = [word_2_idx[w] for w in skep_data.word.tolist()]
y2 = [sentiment_2_idx[s] for s in skep_data.polarity.tolist()]
y1[:5], y2[:5]

([347, 1631, 1515, 1111, 884], [2, 2, 1, 2, 1])

In [16]:
y1 = keras.utils.to_categorical(y1, num_classes=vocab_size)
y2 = keras.utils.to_categorical(y2, num_classes=3)
y1[:5], y1.shape, y2[:5], y2.shape

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32),
 (91619, 1711),
 array([[0., 0., 1.],
        [0., 0., 1.],
        [0., 1., 0.],
        [0., 0., 1.],
        [0., 1., 0.]], dtype=float32),
 (91619, 3))

In [17]:
tokenizer = get_tokenizer('roberta')
mask_token_id = tokenizer.get_vocab()['<mask>']
MAX_LEN = 96

In [18]:
num_text = skep_data.shape[0]
input_ids = np.ones((num_text, MAX_LEN),dtype='int32')
attention_mask = np.zeros((num_text, MAX_LEN),dtype='int32')
token_type_ids = np.zeros((num_text, MAX_LEN),dtype='int32')
for i, text in enumerate(tqdm(skep_data.text.tolist())):
    text = ' ' + ' '.join(text.split())
    texts = text.split('<mask>')
    enc = [0] + tokenizer.encode(texts[0]).ids + [mask_token_id] + tokenizer.encode(texts[1]).ids + [2]
    input_ids[i, :len(enc)] = enc
    attention_mask[i, :len(enc)] = 1
input_ids[:3], input_ids.shape

HBox(children=(FloatProgress(value=0.0, max=91619.0), HTML(value='')))




(array([[    0,   939,   437,    98,   562,     5,  1437, 50264,  1437,
             2,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1],
        [    0,  1437, 50264,  1550,   127,  2474,   695,   129,   362,
           162,   262,   722,  7586,  8310,    95,  7209,    66,     8,
         19448,     2,     1,     1,     1,     1,     1,     1,     1,
            

In [19]:
model = get_roberta_for_skep(vocab_size, 5e-5)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
ids (InputLayer)                [(None, 96)]         0                                            
__________________________________________________________________________________________________
att (InputLayer)                [(None, 96)]         0                                            
__________________________________________________________________________________________________
tti (InputLayer)                [(None, 96)]         0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode ((None, 96, 768), (N 124645632   ids[0][0]                        
______________________________________________________________________________________________

In [20]:
model.load_weights(str(Config.Train.checkpoint_dir / f'skep/weights.h5'))

In [21]:
cbs = [
    # keras.callbacks.ReduceLROnPlateau(patience=2, verbose=1, factor=0.3),
#     WarmUpCosineDecayScheduler(6e-5, 1500, warmup_steps=300, hold_base_rate_steps=200, verbose=0),
    keras.callbacks.EarlyStopping(patience=2, verbose=1, restore_best_weights=True, baseline=2.72400),
    keras.callbacks.ModelCheckpoint(
        str(Config.Train.checkpoint_dir / f'skep/weights.h5'),
        verbose=1, save_best_only=True, save_weights_only=True),
    TqdmCallback()
]
model.fit([input_ids, attention_mask, token_type_ids], [y1, y2], epochs=50, verbose=0, 
          validation_split=0.2, callbacks=cbs, initial_epoch=10)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Epoch 00011: val_loss improved from inf to 2.70547, saving model to ..\checkpoints\skep\weights.h5

Epoch 00012: val_loss improved from 2.70547 to 2.69304, saving model to ..\checkpoints\skep\weights.h5

Epoch 00013: val_loss did not improve from 2.69304
Restoring model weights from the end of the best epoch.

Epoch 00014: val_loss did not improve from 2.69304
Epoch 00014: early stopping



<tensorflow.python.keras.callbacks.History at 0x1f76c452a48>