In [None]:
## This classification code has been adapted from this tutorial
## https://www.section.io/engineering-education/classification-model-using-bert-and-tensorflow/

In [None]:
!pip install chitchat_dataset
import pandas as pd

In [None]:
import chitchat_dataset as ccc

In [None]:
# Mount Google Drive so the /content/drive/MyDrive/... paths used in later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the chitchat dataset.
# NOTE(review): `dataset` is not referenced again in the cells visible here — confirm it is still needed.
dataset = ccc.Dataset()
# Materialise the message dataset into a list; it becomes the "Text" column of cc_df in the next cell.
messages = list(ccc.MessageDataset())

In [None]:
# Wrap the chitchat messages in a single-column frame and tag every row with its source corpus.
cc_df = pd.DataFrame(messages, columns=["Text"]).assign(tag="chitchat")

In [None]:
# Read the Reddit dataset from Google Drive.
# NOTE(review): unpickling can execute arbitrary code embedded in the file — only load pickles from a trusted source.
r_df = pd.read_pickle('/content/drive/MyDrive/irp4/full_data_new_1.pkl')

In [None]:
# Flatten every text field of the Reddit frame into one list:
#   - all titles;
#   - `body` once for rows where selftext and body are identical;
#   - both `selftext` and `body` for rows where they differ.
r_d_list1 = (
    list(r_df['title'])
    + list(r_df.loc[r_df.selftext == r_df.body, 'body'])
    + list(r_df.loc[r_df.selftext != r_df.body, 'selftext'])
    + list(r_df.loc[r_df.selftext != r_df.body, 'body'])
)

rr_df = pd.DataFrame(r_d_list1, columns=["Text"])
# Drop empty strings. The explicit .copy() makes the filtered frame independent of the
# unfiltered one, so the column assignment below does not trigger SettingWithCopyWarning.
rr_df = rr_df[rr_df.Text != ''].copy()
rr_df['tag'] = "reddit" #tag the dataset
rr_df

In [None]:
# Keep the first occurrence of each distinct text, then renumber the rows from 0.
rr_df = rr_df.drop_duplicates('Text').reset_index(drop=True)
rr_df.shape

In [None]:
# Concatenate the two tagged datasets (chitchat rows first, then Reddit) under a fresh 0..n-1 index.
t_data = pd.concat([cc_df, rr_df], axis=0, ignore_index=True)
# Integer class id per tag; factorize numbers values in order of first appearance.
t_data['label'] = pd.factorize(t_data['tag'])[0]
t_data

In [None]:
# Number of words: total count of single-space-separated tokens over all texts
t_data['Text'].map(lambda doc: len(doc.split(' '))).sum()

In [None]:
import matplotlib.pyplot as plt
# Bar chart of the number of rows per source tag.
tags = ['chitchat', 'reddit']
plt.figure(figsize=(10, 4))
t_data['tag'].value_counts().plot.bar();

In [None]:
# Pre-process text
from html.parser import HTMLParser
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import nltk
# Fetch the NLTK corpora: WordNet plus the Open Multilingual WordNet data, and the
# stop-word lists (these back the WordNetLemmatizer / stopwords imports above).
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

In [None]:
def preprocess_text(text):
    """Normalise a raw message into lowercase, single-space-separated tokens.

    Steps, in order:
      1. Parse ``text`` as HTML with the lxml parser and keep only its text content.
      2. Lowercase, then replace each of ``/ ( ) { } [ ] | @ , ;`` and every
         whitespace character with a space.
      3. Delete every remaining character other than ``0-9``, ``a-z``, space,
         ``#``, ``+`` and ``_``.
      4. Strip both ends and collapse runs of spaces into one.

    Stop-word removal and lemmatisation are not applied.

    Args:
        text: Raw input string; may contain HTML markup.

    Returns:
        The cleaned string. It can be empty when nothing survives the filtering.
    """
    processed_text = BeautifulSoup(text, "lxml").text
    # \s is part of the replace-by-space class on purpose: the next substitution
    # deletes anything that is not a literal space, so newlines/tabs left in place
    # here would be removed outright and glue the neighbouring words together.
    processed_text = re.sub(r"[/(){}\[\]\|@,;\s]", " ", processed_text.lower())
    processed_text = re.sub(r"[^0-9a-z #+_]", "", processed_text)
    processed_text = re.sub(' +', ' ', processed_text.strip())
    return processed_text

In [None]:
# Store the cleaned version of every text alongside the raw one.
# NOTE(review): none of the later cells visible here read 'Text_p' — the train/test split
# uses the raw 'Text' column. Confirm whether the model was meant to train on the cleaned text.
t_data['Text_p'] = t_data['Text'].apply(preprocess_text)

In [None]:
# Show the rows whose raw text is the empty string.
# NOTE(review): this inspects 'Text', not the freshly computed 'Text_p' — confirm which column was intended.
t_data[t_data.Text == ""]

In [None]:
# Keep an unfiltered snapshot, drop rows whose raw text is empty, and report the
# row counts (after, before).
# NOTE(review): the filter tests 'Text' rather than 'Text_p', so rows that only became
# empty after preprocessing are kept — confirm this is intended.
t_data_copy = t_data.copy()
t_data = t_data.loc[t_data['Text'] != '']
len(t_data), len(t_data_copy)

In [None]:
# Balance the classes by downsampling Reddit to the chitchat row count.
df_cc = t_data[t_data['tag']=='chitchat']
df_rd = t_data[t_data['tag']=='reddit']
# random_state pins the draw so a re-run of the notebook selects the same Reddit rows
# (and therefore reproduces the same training data). Sampling is without replacement,
# so this raises ValueError if there are fewer Reddit rows than chitchat rows.
df_rd_downsampled = df_rd.sample(n=df_cc.shape[0], random_state=42)
df_balanced = pd.concat([df_rd_downsampled, df_cc], axis=0, ignore_index=True)
# Per-label row counts of the balanced frame.
df_balanced['label'].value_counts()

In [None]:
# Persist the balanced frame to Google Drive.
# NOTE(review): this is written to the MyDrive root, whereas the other artifacts in this
# notebook live under MyDrive/irp4/ — confirm the location is intended.
df_balanced.to_pickle('/content/drive/MyDrive/df_balanced.pkl')

In [None]:
!pip install tensorflow_text

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.model_selection import train_test_split

In [None]:
# Stratified split of the raw text and its label, so both splits keep the same class ratio.
# No test_size is given, so scikit-learn's default hold-out fraction applies.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['Text'],
    df_balanced['label'],
    stratify=df_balanced['label'],
    random_state=42,  # fixed seed: the same rows land in train/test on every run
)

In [None]:
# TF Hub layers: the uncased-English BERT preprocessing model (fed the raw string input
# in the next cell) and the L-12 / H-768 / A-12 BERT encoder that consumes its output.
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [None]:
# Functional-API graph: scalar string input -> BERT preprocessing -> BERT encoder.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
# `outputs` is indexed by key in the next cell ('pooled_output').
outputs = bert_encoder(preprocessed_text)

In [None]:
# Classification head: dropout (rate 0.1) on the encoder's 'pooled_output',
# followed by a single sigmoid unit that emits one score in [0, 1] per input text.
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)

In [None]:
# Assemble the end-to-end Keras model: raw text in, sigmoid score out.
model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# Metrics reported during training: binary accuracy, precision and recall.
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]

# Adam optimiser with binary cross-entropy, matching the single sigmoid output unit.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=METRICS,
)

In [None]:
# Train for 3 epochs on the training split. No validation data is passed, so only
# training-set metrics are reported while fitting.
model.fit(X_train, y_train, epochs=3)

In [None]:
# serialize model to JSON (the weights are written separately below)
model_json = model.to_json()
with open("/content/drive/MyDrive/irp4/model_new.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("/content/drive/MyDrive/irp4/model_new.h5")
print("Saved model to disk")

In [None]:
# load json and create model
# The context manager closes the file even if read() raises, which the manual
# open()/close() pair did not guarantee.
with open('/content/drive/MyDrive/irp4/model_new.json', 'r') as json_file:
    loaded_model_json = json_file.read()


In [None]:
from tensorflow.keras.models import model_from_json
# Rebuild the architecture from the JSON string. hub.KerasLayer is supplied through
# custom_objects so the deserializer can resolve the 'KerasLayer' class name.
loaded_model = model_from_json(loaded_model_json,
       custom_objects={'KerasLayer':hub.KerasLayer})
# load weights into new model
loaded_model.load_weights("/content/drive/MyDrive/irp4/model_new.h5")
print("Loaded model from disk")

In [None]:
# Spot-check the in-memory trained model on a handful of sample sentences;
# the output is one sigmoid score per sentence.
model.predict(["Tell me about covid cases",
               "Hello How are you doing?",
               "What is global warming?",
               "Who are democrats?",
               "what is the meaning of life?",
               "what is an educational reform?",
               "He is spamming the chat and it is really hard to do anything",
               "But the rules say no competition talk so... i learned that you get a heart attack from a build up of cholesterol breaking and the blood clotting on a coronary article",
               "My girlfriend came to where I was working which was the library the media center. I was doing camera check outs"])

In [None]:
# Run the same sample sentences through the model reloaded from disk.
# NOTE(review): presumably a check that the save/load round trip reproduces the scores
# of the in-memory model — the two outputs are only compared by eye.
loaded_model.predict(["Tell me about covid cases",
               "Hello How are you doing?",
               "What is global warming?",
               "Who are democrats?",
               "what is the meaning of life?",
               "what is an educational reform?",
               "He is spamming the chat and it is really hard to do anything",
               "But the rules say no competition talk so... i learned that you get a heart attack from a build up of cholesterol breaking and the blood clotting on a coronary article",
               "My girlfriend came to where I was working which was the library the media center. I was doing camera check outs"])

In [None]:
# Score the held-out texts and flatten the model output to a 1-D array of sigmoid scores.
y_predicted = model.predict(X_test).flatten()

In [None]:
import numpy as np

# Turn the scores into hard 0/1 labels (strictly above 0.5 -> 1),
# then count how many test rows were predicted correctly.
y_predicted = (y_predicted > 0.5).astype(int)
sum(y_predicted == y_test)