### Install Library

In [None]:
!pip install -q -U watermark

In [None]:
!pip install transformers==2.11.0

In [None]:
!pip install columnize

In [None]:
!pip install Keras

In [None]:
!pip install tqdm

In [None]:
!pip install tensorflow

In [None]:
%reload_ext watermark
%watermark -v -p numpy,pandas,torch,transformers,tensorflow,keras,columnize,tqdm

### Import Library

In [None]:
import ast
from ast import literal_eval
import columnize
from collections import defaultdict
from textwrap import wrap
from tqdm import tqdm, trange
import tensorflow as tf
import emoji 

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

import transformers
from transformers import *

import torch
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score

### Common Configuration Set-Up

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'
# Plot styling: seaborn whitegrid theme, then a custom six-colour palette
# that replaces the 'muted' palette requested in sns.set().
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
# Default figure size (width, height) for matplotlib figures.
rcParams['figure.figsize'] = 12, 10
# Seed NumPy and PyTorch with the same fixed value.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Remove pandas display limits (rows, columns, width, column width).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

### Exploratory Data Analysis

In [None]:
# Load the training CSV (path is relative to the working directory).
# The dev file below is an alternative input left disabled.
# df = pd.read_csv("data/tweet_emoji_dev_1000.csv")
df = pd.read_csv("data/tweet_emoji_train_40000.csv")
df.head()

In [None]:
# Drop the original 'id' column. `columns=` is used because passing the axis
# positionally (df.drop('id', 1)) is rejected by current pandas releases.
df = df.drop(columns='id')
# keep=False removes *every* row whose tweet text occurs more than once,
# not just the later copies.
df = df.drop_duplicates(subset=['tweet'], keep=False)
# Remove rows with any missing value.
df = df.dropna(how='any', axis=0)
# Renumber rows 0..n-1 and discard the old index, then name the new index 'id'.
# Later cells rely on index labels matching list positions.
df = df.reset_index(drop=True)
df.index.rename('id', inplace=True)

In [None]:
# Sanity checks on the cleaned frame: tweet uniqueness and absence of nulls.
tweets_are_unique = df.shape[0] == df.tweet.nunique()
has_null_values = df.isnull().values.any()
print('Contains only unique tweets: ', tweets_are_unique)
print('Contains Null values: ', has_null_values)

In [None]:
# Parse each 'emoji' cell (a Python literal stored as text) and keep its
# distinct entries as a set.
df['emoji_targets'] = df['emoji'].map(lambda raw: set(literal_eval(raw)))

In [None]:
df.head()

In [None]:
# Fit a multi-label binarizer on the emoji sets and append one sparse
# indicator column per emoji class to the frame.
mlb = MultiLabelBinarizer(sparse_output=True)

emoji_indicator_matrix = mlb.fit_transform(df['emoji_targets'])
emoji_indicator_frame = pd.DataFrame.sparse.from_spmatrix(
    emoji_indicator_matrix,
    index=df.index,
    columns=mlb.classes_,
)
df = df.join(emoji_indicator_frame)

In [None]:
cols = df.columns
# The label columns are exactly the classes learned by the binarizer, which
# were used as column names when the indicator frame was joined onto df.
# Taking them from mlb (instead of slicing cols at a fixed offset) stays
# correct if extra columns are present or if this cell is re-run after
# 'one_hot_labels' has been added.
label_cols = list(mlb.classes_)
num_labels = len(label_cols)
print('Label columns: ', label_cols)
# Label counts
print('Count of 1 per label: \n', df[label_cols].sum(), '\n')

In [None]:
# Store each row of the indicator columns as a single array in one column,
# so every tweet carries its full multi-label target vector.
df['one_hot_labels'] = list(df[label_cols].values)

In [None]:
df.head()

In [None]:
# helper functions to get the distinct emoji in the dataset
def get_distinct_emoji(emoji_column:pd.core.series.Series)->set:
    """Return the set of all distinct items found across the collections in *emoji_column*.

    Each element of *emoji_column* is itself iterated, and every item it
    yields is added to the result. An empty input yields an empty set.
    """
    return {name for row in emoji_column for name in row}

def print_distinct_emoji_in_dataset(emoji_set:set):
    """Print how many distinct emoji names there are, then the emoji in columns.

    Each name is wrapped in colons and passed through ``emoji.emojize``;
    the results are laid out by ``columnize`` with a display width of 20.
    """
    total = len(emoji_set)
    print('Total distinct count of Emojis in dataset is : {}'.format(total))

    rendered = []
    for name in emoji_set:
        rendered.append(emoji.emojize(':' + name + ':'))
    print(columnize.columnize(rendered, displaywidth=20))
        
# Collect every distinct emoji name present in the dataset and display them.
emoji_class = get_distinct_emoji(df.emoji_targets)
print_distinct_emoji_in_dataset(emoji_class)

In [None]:
# Plain Python lists of targets and texts, aligned row-for-row with df.
labels = list(df['one_hot_labels'].to_numpy())
tweets = list(df['tweet'].to_numpy())

### Data Pre-processing

* Add special tokens to separate sentences and do classification 
* Pass sequences of constant length using padding 
* Create array of 0s and 1s called attention mask 

In [None]:
# Name of the pretrained checkpoint passed to the tokenizer's from_pretrained().
PRE_TRAINED_MODEL_NAME = 'bert-base-cased' # case sensitive

In [None]:
# Load the tokenizer that belongs to the chosen pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
# Example text used by the next cell to demonstrate tokenization.
sample_txt = 'ranks 6th in January Idol Group Brand Reputation 1Keep using 2Search GOT 7 on Naver htt'

In [None]:
# Tokenize the sample text and map the tokens to vocabulary ids.
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Show the three stages side by side with aligned captions.
for caption, value in (
    (' Sentence', sample_txt),
    ('   Tokens', tokens),
    ('Token IDs', token_ids),
):
    print(f'{caption}: {value}')

In [None]:
tokenizer.sep_token, tokenizer.sep_token_id # special token '[SEP]' is the marker for ending of a sentence 

In [None]:
tokenizer.cls_token, tokenizer.cls_token_id # special token '[CLS]' is the marker for the start of each sentence 

In [None]:
tokenizer.pad_token, tokenizer.pad_token_id # special token '[PAD]' is the marker for padding

In [None]:
tokenizer.unk_token, tokenizer.unk_token_id # special token '[UNK]' for vocabulary that is not in the training set

In [None]:
# Encoded length (in token ids) of every tweet, with each encoding limited
# to max_length=200.
token_lens = [
    len(tokenizer.encode(tweet_text, max_length=200))
    for tweet_text in df.tweet
]

In [None]:
# Plot the distribution of per-tweet token counts (presumably to choose MAX_LEN).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm
# the installed version still provides it.
sns.distplot(token_lens)
plt.xlabel('Token count');

In [None]:
# Sequence length passed as max_length when the tweets are batch-encoded.
MAX_LEN = 60

In [None]:
# Encode all tweets in one call, padding every sequence to MAX_LEN.
encodings = tokenizer.batch_encode_plus(
    tweets,
    max_length=MAX_LEN,
    pad_to_max_length=True,
)
print('tokenizer outputs: ', encodings.keys())

In [None]:
# Pull the three per-tweet sequences out of the tokenizer output:
# encoded token ids, token type ids, and attention masks.
input_ids, token_type_ids, attention_masks = (
    encodings[field]
    for field in ('input_ids', 'token_type_ids', 'attention_mask')
)

In [None]:
# Find rows whose exact label combination occurs only once. A stratified
# split cannot place such a combination in both partitions, so these rows
# are handled separately. Vectors are compared via their string form.
label_keys = df.one_hot_labels.astype(str)
label_counts = label_keys.value_counts()
one_freq = label_counts[label_counts == 1].index
singleton_rows = df.loc[label_keys.isin(one_freq)]
# Descending order, so the indices can later be popped from lists safely.
one_freq_idxs = sorted(singleton_rows.index, reverse=True)
print('df label indices with only one instance: ', one_freq_idxs)

In [None]:
# Gathering single instance inputs to force into the training set after stratified split.
# Each pop() removes the item from the source list in place. Because
# one_freq_idxs is sorted in descending order, removing an item never shifts
# the position of an index that is still to be popped.
one_freq_input_ids = [input_ids.pop(i) for i in one_freq_idxs]
one_freq_token_types = [token_type_ids.pop(i) for i in one_freq_idxs]
one_freq_attention_masks = [attention_masks.pop(i) for i in one_freq_idxs]
one_freq_labels = [labels.pop(i) for i in one_freq_idxs]

In [None]:
# Stratified 90/10 split of ids, labels, token types and masks, all sharing
# the same row permutation.
split_parts = train_test_split(
    input_ids,
    labels,
    token_type_ids,
    attention_masks,
    random_state=2020,
    test_size=0.10,
    stratify=labels,
)
(
    train_inputs, validation_inputs,
    train_labels, validation_labels,
    train_token_types, validation_token_types,
    train_masks, validation_masks,
) = split_parts

# Rows with a unique label combination were held out of the split;
# append them to the training partition only.
train_inputs.extend(one_freq_input_ids)
train_labels.extend(one_freq_labels)
train_masks.extend(one_freq_attention_masks)
train_token_types.extend(one_freq_token_types)

# Convert every partition to torch tensors, the datatype the model consumes.
train_inputs, train_labels, train_masks, train_token_types = (
    torch.tensor(train_inputs),
    torch.tensor(train_labels),
    torch.tensor(train_masks),
    torch.tensor(train_token_types),
)

validation_inputs, validation_labels, validation_masks, validation_token_types = (
    torch.tensor(validation_inputs),
    torch.tensor(validation_labels),
    torch.tensor(validation_masks),
    torch.tensor(validation_token_types),
)

In [None]:
# Mini-batch size used by both loaders below. The model fine-tuned in this notebook is BERT (the BERT authors suggest 16 or 32 for fine-tuning).
batch_size = 32

# Wrap the tensors in TensorDatasets and serve them in mini-batches with torch DataLoaders.
# Training batches are drawn in random order; validation batches in sequential order.

train_data = TensorDataset(train_inputs, train_masks, train_labels, train_token_types)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels, validation_token_types)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [None]:
# Serialize both DataLoader objects with torch.save to files in the
# current working directory.
torch.save(validation_dataloader,'validation_data_loader')
torch.save(train_dataloader,'train_data_loader')

### Load Model & Set Params

In [None]:
# Load pretrained BERT with a classification head that has one output per
# emoji label. The checkpoint name is shared with the tokenizer so the
# vocabulary and the weights always match.
model = BertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, num_labels=num_labels)
# The training loop moves every batch to `device`; the model must live on
# the same device or the forward pass fails on CUDA machines.
model = model.to(device)

In [None]:
# Split parameters into two groups: biases and LayerNorm weights are
# excluded from weight decay, everything else is decayed with 0.01.
param_optimizer = list(model.named_parameters())
# Name fragments as they appear in the PyTorch BERT implementation
# (LayerNorm parameters are called 'LayerNorm.weight' / 'LayerNorm.bias').
no_decay = ['bias', 'LayerNorm.weight']
# AdamW reads the per-group key 'weight_decay'; any other key is ignored
# and the optimizer falls back to its default for that group.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]

In [None]:
# AdamW over the two parameter groups with learning rate 2e-5;
# correct_bias=True is passed explicitly.
optimizer = AdamW(optimizer_grouped_parameters,lr=2e-5,correct_bias=True)

### Train Model

In [None]:
# Per-step training losses, kept so the loss curve can be plotted afterwards
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# Batches are moved to `device` below, so the model has to be there as well.
# This is a no-op when the model is already on that device.
model.to(device)

# The loss module holds no state, so build it once instead of once per step.
# Sigmoid + binary cross-entropy treats every label as an independent
# yes/no decision, which is what multi-label classification needs.
loss_func = BCEWithLogitsLoss()

# Probability above which a label counts as predicted
threshold = 0.50

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

    # ---- Training pass ----
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels, b_token_types = batch
        optimizer.zero_grad()

        # Forward pass; the first output holds the raw logits. No labels are
        # passed to the model because the loss is computed here instead.
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0]
        # Labels are cast to the logits' dtype because the loss needs floats
        loss = loss_func(logits.view(-1, num_labels),
                         b_labels.type_as(logits).view(-1, num_labels))
        step_loss = loss.item()
        train_loss_set.append(step_loss)

        # Backward pass, then update parameters from the computed gradients
        loss.backward()
        optimizer.step()

        # Update tracking variables
        tr_loss += step_loss
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))

    # ---- Validation pass ----
    model.eval()
    logit_preds, true_labels, pred_labels, tokenized_texts = [], [], [], []

    for i, batch in enumerate(validation_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels, b_token_types = batch
        with torch.no_grad():
            outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            b_logit_pred = outs[0]
            pred_label = torch.sigmoid(b_logit_pred)

        # Keep only CPU copies so accumulated results do not pin GPU memory
        tokenized_texts.append(b_input_ids.to('cpu'))
        logit_preds.append(b_logit_pred.detach().cpu().numpy())
        true_labels.append(b_labels.to('cpu').numpy())
        pred_labels.append(pred_label.to('cpu').numpy())

    # Stack the per-batch arrays into one (examples x labels) array each
    pred_labels = np.concatenate(pred_labels, axis=0)
    true_labels = np.concatenate(true_labels, axis=0)

    pred_bools = pred_labels > threshold
    true_bools = true_labels == 1
    # Micro-averaged F1 pools all label decisions; accuracy_score on
    # multi-label input requires every label of an example to match.
    val_f1_score = f1_score(true_bools, pred_bools, average='micro')
    val_accuracy_score = accuracy_score(true_bools, pred_bools)
    print('F1 Score: ', val_f1_score)
    print('Accuracy Score: ', val_accuracy_score)

In [None]:
# Persist only the model's state_dict (not the model object itself) to a
# file in the current working directory.
torch.save(model.state_dict(), 'bert_model_emoji_classifier')