In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_hub as hub
import os
import time
import json
from pandas.core.frame import DataFrame
import re
import tensorflow_text as text
import emoji
from sklearn.utils import class_weight
from keras.callbacks import EarlyStopping
from transformers import AutoTokenizer
from tensorflow.keras.optimizers import Adam

In [4]:
def get_train_Data():
    train_ids = open("data/train.data.txt", "r")
    train_labels = open("data/train.label.txt", "r")
    train_data = []
    train_label = []
    for train_ids_str, label in zip(train_ids.readlines(), train_labels.readlines()):
        train_ids_list = train_ids_str.strip().split(",")
        temp_json_list = []
        if not os.path.exists("data/train_object/" + train_ids_list[0] + ".json"):
            continue
        for train_id in train_ids_list:
            train_path = "data/train_object/" + train_id + ".json"
            if os.path.exists(train_path):
                temp_json_list.append(json.load(open(train_path, "r")))
        # sort according to time
        temp_json_list = sorted(temp_json_list, key=lambda x: time.mktime(time.strptime(x["created_at"], "%a %b %d %H:%M:%S +0000 %Y")))
        train_data.append(temp_json_list)
        train_label.append(0 if label.strip() == "nonrumour" else 1)
        temp_map = {"data": train_data, "label": train_label}
    return DataFrame(temp_map)

In [5]:
def get_dev_Data():
    dev_ids = open("data/dev.data.txt", "r")
    dev_labels = open("data/dev.label.txt", "r")
    dev_data = []
    dev_label = []
    for dev_ids_str, label in zip(dev_ids.readlines(), dev_labels.readlines()):
        dev_ids_list = dev_ids_str.strip().split(",")
        temp_json_list = []
        if not os.path.exists("data/dev_object/" + dev_ids_list[0] + ".json"):
            continue
        for dev_id in dev_ids_list:
            dev_path = "data/dev_object/" + dev_id + ".json"
            if os.path.exists(dev_path):
                temp_json_list.append(json.load(open(dev_path, "r")))
        # sort according to time
        temp_json_list = sorted(temp_json_list, key=lambda x: time.mktime(time.strptime(x["created_at"], "%a %b %d %H:%M:%S +0000 %Y")))
        dev_data.append(temp_json_list)
        dev_label.append(0 if label.strip() == "nonrumour" else 1)
        temp_map = {"data": dev_data, "label": dev_label}
    return DataFrame(temp_map)

In [6]:
def get_test_Data():
    test_ids = open("data/test.data.txt", "r")
    test_data = []
    for test_ids_str in test_ids.readlines():
        test_ids_list = test_ids_str.strip().split(",")
        temp_json_list = []
        if not os.path.exists("data/tweet-objects/" + test_ids_list[0] + ".json"):
            continue
        for test_id in test_ids_list:
            test_path = "data/tweet-objects/" + test_id + ".json"
            if os.path.exists(test_path):
                temp_json_list.append(json.load(open(test_path, "r")))
        # sort according to time
        temp_json_list = sorted(temp_json_list, key=lambda x: time.mktime(time.strptime(x["created_at"], "%a %b %d %H:%M:%S +0000 %Y")))
        test_data.append(temp_json_list)
        temp_map = {"data": test_data}
    return DataFrame(temp_map)

In [7]:
def get_covid_Data():
    test_ids = open("data/covid.data.txt", "r")
    test_data = []
    for test_ids_str in test_ids.readlines():
        test_ids_list = test_ids_str.strip().split(",")
        temp_json_list = []
        if not os.path.exists("data/covid_object/" + test_ids_list[0] + ".json"):
            continue
        for test_id in test_ids_list:
            test_path = "data/covid_object/" + test_id + ".json"
            if os.path.exists(test_path):
                temp_json_list.append(json.load(open(test_path, "r")))
        # sort according to time
        temp_json_list = sorted(temp_json_list, key=lambda x: time.mktime(time.strptime(x["created_at"], "%a %b %d %H:%M:%S +0000 %Y")))
        test_data.append(temp_json_list)
        temp_map = {"data": test_data}
    return DataFrame(temp_map)

In [8]:
preprocessor = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4", trainable=True)

In [9]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [10]:
def joinText(data):
    input_text = []
    labels = []
    for x in range(len(data["data"])):
        x_text = []
        for y in range(len(data["data"][x])):
            x_text.append(preprocess(data["data"][x][y]["text"]))
        input_text.append(tokenizer.sep_token.join(x_text))
        labels.append(data["label"][x])
        temp_map = {"text": input_text, "label": labels}
    return DataFrame(temp_map)

In [11]:
def join_test_text(data):
    input_text = []
    for x in range(len(data["data"])):
        x_text = []
        for y in range(len(data["data"][x])):
            x_text.append(preprocess(data["data"][x][y]["text"]))
        input_text.append(tokenizer.sep_token.join(x_text))
        temp_map = {"text": input_text}
    return DataFrame(temp_map)

In [12]:
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        if t.startswith('@') and len(t) > 1:
            t = "@user"
        if t.startswith("http"):
            t = "http"
#         t = emoji.replace_emoji(t, replace="")
        t = re.sub(emoji.get_emoji_regexp(), r"", t)
        t = t.lower()
        new_text.append(t)
    return " ".join(new_text)

In [13]:
train_data = get_train_Data()
train_data = joinText(train_data)
train_text = train_data["text"].values.tolist()
train_label = train_data["label"].values.tolist()

  t = re.sub(emoji.get_emoji_regexp(), r"", t)


In [14]:
dev_data = get_dev_Data()
dev_data = joinText(dev_data)
dev_text = dev_data["text"].values.tolist()
dev_label = dev_data["label"].values.tolist()

  t = re.sub(emoji.get_emoji_regexp(), r"", t)


In [15]:
test_data = get_test_Data()
test_data = join_test_text(test_data)
test_text = test_data["text"].values.tolist()

  t = re.sub(emoji.get_emoji_regexp(), r"", t)


In [16]:
covid_data = get_covid_Data()
covid_data = join_test_text(covid_data)
covid_text = covid_data["text"].values.tolist()

  t = re.sub(emoji.get_emoji_regexp(), r"", t)


In [17]:
class_weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(train_data['label']), y = train_data['label'])
print(class_weights)

[0.62760835 2.4591195 ]


In [18]:
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name="text")
preprocessed_text = preprocessor(text_input)
outputs = encoder(preprocessed_text)

l = tf.keras.layers.Dropout(0.6, name="dropout")(outputs["pooled_output"])
l = tf.keras.layers.Dense(1, activation="sigmoid", name="output")(outputs["pooled_output"])

model = tf.keras.Model(inputs=[text_input], outputs=[l])

In [19]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_type_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

In [20]:
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name="accuracy"),
    tf.keras.metrics.Precision(name="precision"),
    tf.keras.metrics.Recall(name="recall"),
#     tfa.metrics.F1Score(name="F1",num_classes=2)
]

In [21]:
model.compile(optimizer=tf.keras.optimizers.Adam(1e-5),
              loss="binary_crossentropy",
              metrics=METRICS)

In [22]:
model.fit(train_text, train_label, batch_size=2, epochs=4, 
          validation_data=(dev_text, dev_label),
          callbacks = EarlyStopping(monitor='val_loss', patience=3, verbose=2),)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x25d4b0c5ac0>

In [23]:
model.evaluate(dev_text, dev_label)



[0.14833712577819824,
 0.9477611780166626,
 0.9223300814628601,
 0.8260869383811951]

In [24]:
test_label = model.predict(test_text)

In [25]:
test_predicted = np.where(test_label > 0.5, 1, 0)

In [26]:
test_prediction = []
for i in range(len(test_predicted)):
    test_prediction.append(test_predicted[i][0])

In [27]:
index = range(len(test_prediction))
res_map = {"Id":index, "Predicted":test_prediction}
df = DataFrame(res_map)
df.to_csv("bert_predict12.csv", index=False)

In [28]:
covid_label = model.predict(covid_text)
covid_predicted = np.where(covid_label > 0.5, 1, 0)
covid_prediction = []
for i in range(len(covid_prediction)):
    covid_prediction.append(covid_predicted[i][0])

In [29]:
index = range(len(covid_prediction))
res_map = {"Id":index, "Predicted":covid_prediction}
df = DataFrame(res_map)
df.to_csv("covid_prediction.csv", index=False)