In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
from collections import  Counter
import re
from nltk.tokenize import word_tokenize
import gensim
import string
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from transformers import BertTokenizer, TFBertModel, BertConfig, TFBertForSequenceClassification

# Step 1: Loading and Preparing Training data
We obtained the labelled training data from a previous research project: https://github.com/ellamguest/online-misogyny-eacl2021/tree/main/data

In [None]:
# Loading training data
data = pd.read_csv('/content/drive/My Drive/Colab Notebooks/data/final_labels.csv')
# Apply both filters to the same frame: first drop rows whose body is missing,
# then keep only bodies shorter than 200 characters. Each step starts from the
# previous result so neither filter is silently discarded.
training_data = data[data['body'].notna()]
training_data = training_data[training_data['body'].str.len() < 200]

In [None]:
# Features are the raw text bodies; targets are the `level_1` labels
# mapped to integer codes by a fitted LabelEncoder.
X = training_data['body']
label = LabelEncoder().fit(training_data['level_1'])
y = label.transform(training_data['level_1'])

In [None]:
# Two-stage split with a fixed seed:
#   1) 80% train / 20% held out
#   2) the held-out part is halved into validation and test
X_train, X_dev, y_train, y_dev = train_test_split(
    X, y,
    train_size = 0.8,
    random_state = 42,
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_dev, y_dev,
    train_size = 0.5,
    random_state = 42,
)

# Step 2: Preparing BERT Model 
Converting data to tensors, setting up BERT model, and fitting training data to the model

In [None]:
# Pretrained uncased BERT word-piece tokenizer; input text is lower-cased.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True,
)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def inputs(tweets, max_len = 200):
    """
    Converts tweets (str) into tensor inputs for the BERT model.

    Every tweet is encoded with the module-level `tokenizer`, with special
    tokens added, and is padded or truncated to exactly `max_len` tokens so
    that all rows have the same length.

    Inputs:
      tweets: iterable of str, the texts to encode
      max_len: (int) number of tokens each encoded tweet is padded/truncated to

    Outputs:
      input_ids: tensor of token ids, one row of length max_len per tweet
      attention_masks: tensor of the tokenizer's attention masks, same layout
    """
    input_ids = []
    attention_masks = []
    for tweet in tweets:
        encoded_dict = tokenizer.encode_plus(
            tweet,
            add_special_tokens = True,
            max_length = max_len,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
            padding = 'max_length',
            # Truncate explicitly instead of relying on the implicit default
            # that the tokenizer warns about when only `max_length` is given.
            truncation = True,
            return_attention_mask = True
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = tf.convert_to_tensor(input_ids)
    attention_masks = tf.convert_to_tensor(attention_masks)

    return input_ids, attention_masks

In [None]:
# Encode the train and validation texts into (token ids, attention mask) tensors
# and turn the matching integer labels into tensors as well.
train_label = tf.convert_to_tensor(y_train)
val_label = tf.convert_to_tensor(y_valid)
train_inp, train_mask = inputs(X_train)
val_inp, val_mask = inputs(X_valid)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Pretrained uncased BERT with a freshly initialised 2-class classification head
bert = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
)

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#Initializing BERT parameters
# Named `log_dir` rather than `dir` so the builtin dir() is not shadowed.
log_dir = '/content/drive/My Drive/Colab Notebooks/data/CS122/bert_final'
model_save = '/content/drive/My Drive/Colab Notebooks/data/CS122/bert_final/cs122bert_final.h5'

# Both callbacks come from tf.keras, the same Keras implementation the model
# is compiled with below, instead of mixing in the standalone `keras` package.
callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=model_save,
                                                save_weights_only = True,
                                                monitor = 'val_loss',
                                                mode = 'min',
                                                save_best_only = True),
             tf.keras.callbacks.TensorBoard(log_dir = log_dir)]

# summary() prints the table itself and returns None, so it is called on its
# own line rather than passed to print() (which would print "None").
print('\nBert Model')
bert.summary()

# The model outputs raw logits, hence from_logits=True; labels are integer codes.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon = 1e-08)

bert.compile(loss=loss, optimizer=optimizer, metrics = [metric])

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________

Bert Model None


In [None]:
# Fine-tune BERT on the training tensors, evaluating on the validation tensors
# after every epoch; the returned training history is kept in `misogynist`.
misogynist = bert.fit(
    x = [train_inp, train_mask],
    y = train_label,
    validation_data = ([val_inp, val_mask], val_label),
    epochs = 4,
    batch_size = 32,
    callbacks = callbacks,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


# Step 3: Predicting labels for test data

In [None]:
# Converting testing data to tensors
# Keep the attention mask as well: the model was trained with it, so it is
# needed to predict on the padded test sequences in the same way.
test_input, test_mask = inputs(X_test)



In [None]:
# Predicting test labels
# Training fed the model [input_ids, attention_mask]; predicting on the ids
# alone would make the model attend to the padding positions. The mask is
# rebuilt from the ids: 1 for real tokens, 0 where the id is the tokenizer's
# padding id (assumes the text itself never encodes to the padding token).
test_mask = tf.cast(tf.not_equal(test_input, tokenizer.pad_token_id), test_input.dtype)
test_pred = bert.predict([test_input, test_mask])

In [None]:
# Logits -> class probabilities -> index of the most probable class per row,
# returned as a NumPy array.
test_prediction = tf.argmax(tf.nn.softmax(test_pred.logits), axis=1).numpy()
test_prediction

In [None]:
# Wrap the predicted test labels in a DataFrame for display
predictions = pd.DataFrame(data=test_prediction.transpose())
predictions

# Step 4: Predicting labels for scraped Twitter data 
We predict the labels for the actual data and save the predictions into a new dataframe

In [None]:
# Read the two CSV exports of scraped tweets that the model will label
actua_data_big = pd.read_csv('/content/drive/My Drive/Colab Notebooks/data/final_data_1.csv')
actual_data = pd.read_csv('/content/drive/My Drive/Colab Notebooks/data/final_data.csv')

In [None]:
# Stack both exports, keep one row per (candidate, tweet) pair, then renumber
# the rows. reset_index() is called without drop=True, so the previous row
# labels are kept in an `index` column.
data_actual = (
    pd.concat([actua_data_big, actual_data])
    .drop_duplicates(subset = ['candidate_user_name', 'tweet'])
    .reset_index()
)

In [None]:
# Encode the scraped tweets with the same helper used for the training data
# (`inputs`, defined in Step 2) and keep the attention mask for prediction.
predict_input, predict_mask = inputs(data_actual['tweet'])

In [None]:
# Run the fine-tuned model on the scraped tweets. These predictions must come
# from `predict_input`, not from the held-out test set's `test_pred`.
# The attention mask is rebuilt from the ids: 1 for real tokens, 0 where the
# id is the tokenizer's padding id (assumes the text itself never encodes to
# the padding token).
predict_mask = tf.cast(tf.not_equal(predict_input, tokenizer.pad_token_id), predict_input.dtype)
actual_pred = bert.predict([predict_input, predict_mask])

# Logits -> class probabilities -> most probable class per tweet
miso_prediction = tf.nn.softmax(actual_pred.logits)
miso_prediction = tf.argmax(miso_prediction, axis=1).numpy()
miso_prediction

In [None]:
# Attach the predicted label to each scraped tweet. Fail loudly if the
# predictions were not computed on exactly these rows.
if len(miso_prediction) != len(data_actual):
    raise ValueError(
        f'Got {len(miso_prediction)} predictions for {len(data_actual)} tweets; '
        'predict on the encoded scraped tweets before assigning labels.'
    )
data_actual['Predicted'] = miso_prediction

In [None]:
# Persist the labelled tweets as CSV in the working directory
# (default to_csv settings, so the row index is written as the first column).
data_actual.to_csv(path_or_buf='./predicted_tweets')