In [None]:
"""%pip install transformers -q
%pip install sklearn -q
%pip install tensorflow_addons -q"""


import tensorflow as tf
from transformers import TFBertForSequenceClassification, TFBertModel, AutoTokenizer
from datetime import datetime
import numpy as np
import os
from sklearn.model_selection import train_test_split
import tensorflow_addons as tfa
import requests


# Ask TensorFlow to grow its GPU memory allocation on demand (allow_growth=True)
# rather than using the default allocation behaviour.
gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
session_config = tf.compat.v1.ConfigProto(gpu_options=gpu_options)
# The session object is kept in `session` so the configuration stays alive for the notebook.
session = tf.compat.v1.InteractiveSession(config=session_config)


# resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
# tf.config.experimental_connect_to_cluster(resolver)
# tf.tpu.experimental.initialize_tpu_system(resolver)
# strategy = tf.distribute.TPUStrategy(resolver)

In [None]:
# Download the three training corpora (one sentence per line). The position of a file in
# `files` is used as the integer class label: 0 = negative, 1 = neutral, 2 = positive.
sentences_per_class = []
texts, labels, files = [], [], ['combinedneg.txt', 'combinedneut2.txt', 'combinedpos.txt']
for i, file in enumerate(files):
    r = requests.get(f"https://raw.githubusercontent.com/aaposyvanen/emodim/master/data/tr/{file}", timeout=60)
    # Fail loudly on an HTTP error; otherwise the error page would be ingested as sentences.
    r.raise_for_status()
    # Split once and skip blank lines (e.g. the empty string left after a trailing newline),
    # which would otherwise become labelled, empty training samples and inflate the counts.
    class_sentences = [line for line in r.text.split('\n') if line.strip()]
    texts.extend(class_sentences)
    labels.extend([i] * len(class_sentences))
    sentences_per_class.append(len(class_sentences))
print(f"Negative sentences:\t{sentences_per_class[0]},\nNeutral sentences:\t{sentences_per_class[1]},\nPositive sentences:\t{sentences_per_class[2]}")

In [None]:
tokenizer = AutoTokenizer.from_pretrained("TurkuNLP/bert-base-finnish-uncased-v1")
MAX_LEN = 75
SPLIT_SEED = 42  # fixed so the train/validation/test partition is reproducible across runs


def tokenize(sentences, max_len=MAX_LEN):
    """Encode sentences into the ``[input_ids, attention_mask]`` pair fed to the models.

    token_type_ids are omitted because every sample is a single sentence.

    Args:
        sentences: List of raw sentence strings.
        max_len: Exact token length of every encoded row. Shorter sentences are padded
            and longer ones are truncated to this length.

    Returns:
        A two-element list of integer arrays of shape ``(len(sentences), max_len)``:
        the input ids followed by the attention mask.
    """
    encoded = tokenizer.batch_encode_plus(
        sentences,
        add_special_tokens=True,
        return_attention_mask=True,
        return_token_type_ids=False,
        max_length=max_len,
        padding='max_length',
        # Padding alone does not shorten anything: without explicit truncation a sentence
        # longer than max_len yields a longer row, and the ragged batch cannot be turned
        # into the fixed-width array the model input expects.
        truncation=True,
    )
    return [np.array(encoded["input_ids"]), np.array(encoded["attention_mask"])]


train_texts, x, train_labels, y = train_test_split(texts, labels, test_size=.2, random_state=SPLIT_SEED) # use 80 % to train
val_texts, test_texts, val_labels, test_labels = train_test_split(x, y, test_size=.5, random_state=SPLIT_SEED) # use 10 % to validate and 10 % to test

# tokenize data to be in the form BERT understands
train = tokenize(train_texts)
validate = tokenize(val_texts)
test = tokenize(test_texts)

In [None]:
def create_model(bert_model, max_len=MAX_LEN):
    """Build a 3-class classifier: a BERT encoder followed by a stacked BiLSTM head.

    Args:
        bert_model: Object exposing a ``.bert`` layer that is called with
            ``[input_ids, attention_mask]``; element 0 of its output is used as the
            feature sequence (the notebook passes a ``TFBertModel``).
        max_len: Fixed token length of both integer inputs.

    Returns:
        An uncompiled Keras model that takes ``[input_ids, attention_masks]`` and
        outputs softmax probabilities over 3 classes.
    """
    token_ids = tf.keras.Input(shape=(max_len,), dtype='int32')
    token_mask = tf.keras.Input(shape=(max_len,), dtype='int32')

    # Element 0 of the encoder output feeds the recurrent head.
    features = bert_model.bert([token_ids, token_mask])[0]

    layers = tf.keras.layers
    # The head, in application order; the first BiLSTM keeps the full sequence so the
    # second one can consume it, and the second collapses it to a single vector.
    head = [
        layers.Dropout(0.2),
        layers.Bidirectional(layers.LSTM(128, dropout=0.2, return_sequences=True)),
        layers.Bidirectional(layers.LSTM(64, dropout=0.2)),
        layers.Dropout(0.2),
        layers.Dense(64, activation='gelu'),
        layers.Dropout(0.2),
        layers.Dense(3, activation="softmax"),
    ]
    for layer in head:
        features = layer(features)

    return tf.keras.models.Model(inputs=[token_ids, token_mask], outputs=features)

In [None]:
# with strategy.scope():
bert_model = TFBertModel.from_pretrained("TurkuNLP/bert-base-finnish-uncased-v1")
model = create_model(bert_model, MAX_LEN)

# Labels are integer class ids and the model's last layer already applies softmax,
# hence the sparse loss with from_logits=False.
model.compile(
    optimizer=tfa.optimizers.AdamW(learning_rate=5e-5, weight_decay=1e-8),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=tf.keras.metrics.SparseCategoricalAccuracy(),
)

# Stop once validation loss has not improved for one epoch and restore the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
history = model.fit(
    train,
    np.array(train_labels),
    validation_data=(validate, np.array(val_labels)),
    callbacks=early_stopping,
    epochs=6,
    batch_size=32,
)
print(history.history)

In [None]:
# Baseline for comparison: the stock Hugging Face sequence-classification head on the same
# pretrained encoder. The loss is configured with from_logits=True for this model.
model2 = TFBertForSequenceClassification.from_pretrained("TurkuNLP/bert-base-finnish-uncased-v1", num_labels=3)
model2.compile(optimizer=tfa.optimizers.AdamW(learning_rate=5e-5, weight_decay=1e-8),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
              metrics=tf.keras.metrics.SparseCategoricalAccuracy())
history2 = model2.fit(train, np.array(train_labels),
                        validation_data=(validate, np.array(val_labels)),
                        callbacks = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True),
                        epochs=6, batch_size=32)

print(history2.history)

# Disabled evaluation sketch for model2. It is kept as comments rather than a bare string
# literal, which Jupyter would echo as this cell's output. It uses the `test` inputs built
# in the tokenisation cell and needs confusion_matrix / roc_auc_score from sklearn.metrics.
#
# y_pred = model2.predict(test)
# y_pred = tf.nn.softmax(y_pred['logits'], axis=1).numpy()  # axis=1 normalises over the classes
# y_pred_proba = [np.max(y_pred, axis=1)]
# y_pred_label = np.array([tf.argmax(y_pred, axis=1).numpy()])
#
# print("Confusion Matrix : ")
# print(confusion_matrix(test_labels, y_pred_label[0]))
# print("ROC AUC score : ", roc_auc_score(np.array(test_labels), y_pred, multi_class='ovo'))

In [None]:
# Class probabilities predicted for the held-out test split.
y_pred = model.predict(test)
# Both summaries keep their outer wrapper (a one-element list / a leading axis of size 1),
# so consumers continue to read the labels as y_pred_label[0].
y_pred_proba = [y_pred.max(axis=1)]
y_pred_label = np.array([np.argmax(y_pred, axis=1)])

In [None]:
# Evaluate the model
from sklearn.metrics import confusion_matrix, roc_auc_score

# Confusion matrix of true test labels against the predicted labels.
test_confusion = confusion_matrix(test_labels, y_pred_label[0])
print(f"Confusion Matrix:\n{test_confusion}")

# One-vs-one multiclass ROC AUC from the full probability matrix, rounded to 3 decimals.
test_auc = roc_auc_score(np.array(test_labels), y_pred, multi_class='ovo')
print(f"ROC AUC score: {round(test_auc, 3)}")

In [None]:
# Unlabelled corpus used for a qualitative check; only the first tab-separated field of
# each line is kept as the sentence.
testset = "https://raw.githubusercontent.com/aaposyvanen/emodim/master/data/txts/s24_2017_sentences_shuffled_slice_clean.txt"
r = requests.get(testset, timeout=60)
# Fail loudly on an HTTP error instead of classifying the error page.
r.raise_for_status()
# Skip blank lines so that no empty "sentence" is sent to the model.
stripped_testset = [line.split('\t')[0] for line in r.text.split('\n') if line.strip()]
# Encode to fixed-length id/mask arrays. truncation=True caps sentences longer than
# MAX_LEN tokens so that every row has the same length.
encoded_testset = tokenizer.batch_encode_plus(
    stripped_testset,
    add_special_tokens=True,
    return_attention_mask=True,
    return_token_type_ids=False,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
)
t = [np.array(encoded_testset["input_ids"]), np.array(encoded_testset["attention_mask"])]

In [None]:
# Run the fine-tuned model over the encoded corpus held in `t`.
predictions = model.predict(x=t)

In [None]:
# Spot-check the first 100 predictions next to their source sentences.
for i, p in enumerate(predictions[:100]):
    # A plain int prints as the class index; tf.argmax would print as a tensor repr.
    predicted_class = int(np.argmax(p))
    print(f'Probs:\t{p}, {predicted_class}\t {stripped_testset[i]}')

In [None]:
# Timestamp the export directory so each run is written to its own location.
time = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
# The base directory can be overridden with the EMODIM_MODEL_DIR environment variable;
# the default is the location this notebook has always used.
model_root = os.environ.get("EMODIM_MODEL_DIR", 'E:\\Emodim\\data\\others')
# NOTE(review): the trailing "1" looks like a model-version subdirectory — confirm.
modelpath = os.path.join(model_root, f'fine_tuned_finBERT_tf_{time}', '1')
model.save(modelpath, overwrite=True)

In [None]:
# Load the model back from the directory it was exported to.
import_model = tf.keras.models.load_model(modelpath)
# import_model.summary()

In [None]:
# test sentences from the nethate study
s = ["Ehdotontahan tuo desantti takamies kaipaisi. Sellaista vanhan ajan kuritushuonetta.",
    "Painuhan nyt trolli vittuun.kovat on jutu kun putinin kulli kutittelee persreikää... Ainut siedettävä ryssä on kuollut ryssä.paska kansa ja paskat johtajat jo usean sadan vuoden perinteellä.",
    "Saatanan lypsylehmä, jonka utareista ei maito heru. Teuraaksi kuuluisi. Ongelmana vain, että kukaan ei sitä jätettä söisi. Ei minkitkään.",
    "Hyi vittu, Jokerimuija tuli vastaan Tinderissä",
    "Kuvitteleeko nää oikeasti olevansa sex bätäng? No kai joku noitakin nussii..."]
# truncation=True guarantees every row is exactly MAX_LEN tokens, even for a long sentence;
# padding alone would leave longer rows untouched and the batch ragged.
sent = tokenizer.batch_encode_plus(
    s,
    add_special_tokens=True,
    return_attention_mask=True,
    return_token_type_ids=False,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
)
sentences = [np.array(sent["input_ids"]), np.array(sent["attention_mask"])]
pred = import_model.predict(sentences)

for i, p in enumerate(pred):
    # A plain int prints as the class index; tf.argmax would print as a tensor repr.
    predicted_class = int(np.argmax(p))
    print(f'Probs:\t{p}, {predicted_class}\t {s[i]}')