In [None]:
import tensorflow as tf

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# !pip install tensorflow_text
# !pip install transformers
# !pip install emoji

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.layers import LSTM
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from transformers import TFBertForSequenceClassification
from tensorflow import keras
import os
import shutil
from sklearn.model_selection import train_test_split
from transformers import TFAutoModel, AutoTokenizer
import matplotlib.pyplot as plt
# Only let TensorFlow log records of severity ERROR through.
tf.get_logger().setLevel(level='ERROR')
#from google.colab import drive

In [None]:
#drive.mount('/content/drive')

# Data Preprocessing

In [None]:
# The CSV is read without a header row, so columns get integer labels;
# later cells read column 0 (label) and column 5 (tweet text).
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    header=None,
    encoding='latin-1',
)

In [None]:
# Binarise the label column: any value above 0 (the raw positive label is 4)
# becomes 1, everything else becomes 0.
df[0] = (df[0] > 0).astype('int64')

In [None]:
# Shuffle the rows with a fixed seed so the train/test slices taken from the
# shuffled frame are identical on every Restart & Run All.
df = shuffle(df, random_state=42)

In [None]:
# Small development subset, sliced by row position: rows 500-999 are held out
# as `test`, rows 0-499 stay in `df` for training.
# The trailing comments keep the bounds for the full 1.6M-row run.
test = df.iloc[500:1000]  # df[1280000:1600000]
df = df.iloc[:500]  # df[:1280000]

In [None]:
# Tokenizer for the "vinai/bertweet-base" checkpoint (the same checkpoint the
# encoder is loaded from further down).
tokenizer = AutoTokenizer.from_pretrained(
    "vinai/bertweet-base"
)

In [None]:
# Column 0 holds the binarised sentiment label for every training row.
labels = df.loc[:, 0]

In [None]:
# Fixed sequence length every tweet is padded / truncated to.
SEQ_LEN = 65


def tokenize(sentence):
    """Encode one sentence with the module-level ``tokenizer``.

    The sentence is encoded with special tokens added, truncated to
    ``SEQ_LEN`` and padded up to ``SEQ_LEN``; results are requested as
    TensorFlow tensors and token-type ids are not requested.

    Args:
        sentence: The text to encode.

    Returns:
        A ``(input_ids, attention_mask)`` tuple taken from the encoder output.
    """
    encoded = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=SEQ_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='tf',
    )
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']
    return input_ids, attention_mask

In [None]:
# One row per training tweet, SEQ_LEN columns each.
# Token ids and attention-mask flags are integers and the model's Input layers
# are declared int32, so allocate int32 buffers instead of np.zeros' float64
# default (this also halves the memory needed on the full dataset).
Xids = np.zeros((len(df), SEQ_LEN), dtype=np.int32)
Xmask = np.zeros((len(df), SEQ_LEN), dtype=np.int32)

In [None]:
# Column 5 holds the tweet text; encode each tweet into its own buffer row.
for row, text in enumerate(df[5]):
    ids, attention = tokenize(text)
    Xids[row, :] = ids
    Xmask[row, :] = attention

In [None]:
# Persist the encoded inputs and the labels as .npy files so they can be
# reloaded without re-tokenizing.
np.save('xids.npy', Xids)
np.save('xmask.npy', Xmask)
np.save('labels.npy', labels)

# Drop the in-memory copies; the next cell reads them back from disk.
del df, Xids, Xmask, labels

In [None]:
# Read the token ids, attention masks and labels back from the .npy files
# written by the previous cell.
Xids = np.load('xids.npy')
Xmask = np.load('xmask.npy')
labels = np.load('labels.npy')



In [None]:
# Slice the three arrays along their first axis: each dataset element is one
# (ids, mask, label) triple.
dataset = tf.data.Dataset.from_tensor_slices(
    (Xids, Xmask, labels)
)


In [None]:
def map_func(input_ids, masks, labels):
    """Regroup one dataset element into a ``(features, labels)`` pair.

    The feature dict is keyed by 'input_ids' and 'attention_mask', the same
    names given to the model's two Input layers.
    """
    features = {
        'input_ids': input_ids,
        'attention_mask': masks,
    }
    return features, labels

# turn every (ids, mask, label) element into a (features dict, label) pair
dataset = dataset.map(map_func=map_func)

In [None]:
# Shuffle once and keep that order fixed. With the default
# reshuffle_each_iteration=True the data would be reshuffled on every pass, so
# take()/skip() below would select different batches each time and validation
# samples would leak into training.
dataset = dataset.shuffle(10000, reshuffle_each_iteration=False).batch(32)

# number of batches, read from the dataset's cardinality instead of
# materialising every batch in memory just to count them
DS_LEN = int(dataset.cardinality().numpy())

# create 90-10 split
SPLIT = 0.9

# the first 90% of the batches train the model, the remainder validates it
n_train_batches = round(DS_LEN * SPLIT)
train = dataset.take(n_train_batches)
val = dataset.skip(n_train_batches)

# Build model

In [None]:
# Symbolic model inputs. The names match the keys of the feature dict produced
# by map_func, and each sample is SEQ_LEN int32 values.
input_ids = tf.keras.Input(
    shape=(SEQ_LEN,),
    dtype='int32',
    name='input_ids',
)
mask = tf.keras.Input(
    shape=(SEQ_LEN,),
    dtype='int32',
    name='attention_mask',
)


In [None]:
# Pretrained BERTweet weights; the `roberta` sub-layer of this object is used
# as the encoder of the classifier.
bertweet = TFAutoModel.from_pretrained(
    "vinai/bertweet-base"
)

In [None]:
# Run the encoder and keep only element 0 of its output (the last hidden state).
embeddings = bertweet.roberta(input_ids, attention_mask=mask)[0]

# Classification head: dropout -> bidirectional LSTM -> single sigmoid unit.
dropout = tf.keras.layers.Dropout(0.1)
bilstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(768))
classifier = tf.keras.layers.Dense(1, activation='sigmoid', name='outputs')

X = bilstm(dropout(embeddings))
y = classifier(X)


In [None]:
# Tie the two inputs and the sigmoid output together into one Keras model.
model = tf.keras.Model(
    inputs=[input_ids, mask],
    outputs=y,
)

In [None]:
# Freeze the BERTweet encoder so only the LSTM/Dense head is trained.
# The encoder layer is referenced directly rather than through
# model.layers[2], which silently freezes the wrong layer if the layer order
# of the model ever changes.
bertweet.roberta.trainable = False

In [None]:
# Training configuration: Adam with learning rate 0.01, binary cross-entropy
# on the sigmoid output, and a binary-accuracy metric reported as 'accuracy'.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
loss = tf.keras.losses.BinaryCrossentropy()
acc = tf.keras.metrics.BinaryAccuracy(name='accuracy')

In [None]:
# Attach the optimizer, loss and metric defined above.
model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=[acc],
)

In [None]:
# Train for a single epoch, evaluating on the validation batches afterwards.
history = model.fit(
    x=train,
    epochs=1,
    validation_data=val,
)

# Save and Load Model

In [None]:
#tf.keras.models.save_model(model, 'drive/MyDrive/twitter-sentiment-model-2', overwrite=True, include_optimizer=True, save_format="tf")


In [None]:
# The export call in this notebook is commented out, so on a fresh run the
# model directory may not exist and load_model would fail. Export the model
# built above in that case; an already exported model is left untouched and
# loaded exactly as before.
MODEL_DIR = 'twitter-sentiment-model-2'
if not os.path.isdir(MODEL_DIR):
    tf.keras.models.save_model(model, MODEL_DIR, overwrite=True,
                               include_optimizer=True, save_format="tf")

loaded_model = tf.keras.models.load_model(MODEL_DIR)

# Using the model on test data

In [None]:
# One row per held-out tweet, SEQ_LEN columns each.
# Token ids and attention-mask flags are integers, so allocate int32 buffers
# instead of np.zeros' float64 default.
single_test_ids = np.zeros((len(test), SEQ_LEN), dtype=np.int32)
single_test_mask = np.zeros((len(test), SEQ_LEN), dtype=np.int32)

In [None]:
# Encode every held-out tweet (column 5) into its row of the id / mask buffers.
# Tweets are deliberately not echoed with print(): one line per test row would
# flood the cell output.
for i, sentence in enumerate(test[5]):
    single_test_ids[i, :], single_test_mask[i, :] = tokenize(sentence)




In [None]:
# Score the held-out tweets; the inputs are passed in the same order as the
# model's inputs (ids first, then attention mask).
predictions = loaded_model.predict(
    [single_test_ids, single_test_mask]
)