In [1]:
import torch
from transformers import AutoModel, AutoTokenizer, AutoConfig
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
bertweet = AutoModel.from_pretrained("vinai/bertweet-base")

# For transformers v4.x+: 
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False, normalization=True)

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
tweets = pd.read_csv('../bertweet_embeddings/pro_china_full_nort.csv')
tweets = tweets.dropna().reset_index() # some rows come in as blank so they need to be dropped - also need to reset index so they can match embeddings later

train, test = train_test_split(tweets, test_size=0.2, random_state = 0)

In [4]:
# this cell tests for invalid or blank text entries and prints their index if one comes up

# for idx, tweet in enumerate(tweets['text']):
#     try: 
#         tokenizer(tweet, padding='max_length', max_length=130, return_tensors="pt")
#     except:
#         print(idx, tweet)

In [5]:
# reading in tensors from file

embeddings = torch.Tensor()

for i in range(39):
    filename = "../bertweet_embeddings/embeddings/pro_china_embedding_" + str(i*1000) + ".pt"
    embeddings = torch.cat((embeddings, torch.load(filename)))
    
print(embeddings.shape, tweets.shape) # these should be in agreement

torch.Size([38734, 768]) (38734, 4)


In [6]:
X_train = embeddings[train.index].detach().numpy()
y_train = train['is_ccp']

print(X_train.shape, y_train.shape)

(30987, 768) (30987,)


In [7]:
# model = keras.Sequential()

# model.add(keras.layers.Dense(512, 
#                              activation = 'relu',
#                              kernel_regularizer=keras.regularizers.l1_l2(l1=1e-5, l2=1e-4),
#                              bias_regularizer=keras.regularizers.l2(1e-4),
#                              activity_regularizer=keras.regularizers.l2(1e-5)))
# model.add(keras.layers.Dropout(0.1))
# model.add(keras.layers.Dense(1, activation = 'sigmoid'))

# # adam optimizer is a fancier version of gradient descent.  You can read more about it here: https://arxiv.org/pdf/1412.6980.pdf
# optimizer = keras.optimizers.Adam(clipvalue=1)

# model.compile(optimizer=optimizer,
#               loss='binary_crossentropy',  # From information theory notebooks.
#               metrics=['accuracy'])        # What metric to output as we train.

In [8]:
model = keras.Sequential()

#model.add(keras.layers.Normalization())

model.add(keras.layers.Dense(16, 
                             activation = 'relu'))

model.add(keras.layers.Dense(1, activation = 'sigmoid'))

#optimizer = keras.optimizers.Adam(clipvalue=1, learning_rate=0.0001)

model.compile(optimizer='adam',
              loss='binary_crossentropy', 
              metrics=['accuracy']) 

In [9]:
model.fit(X_train, y_train, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f9e47789340>

# Testing

In [10]:
X_test = embeddings[test.index].detach().numpy()
y_test = test['is_ccp']

In [11]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test)
y_pred_bool = np.where(y_pred >= 0.5, 1, 0).ravel() #DIY function to round outputs to 0 or 1

print(classification_report(y_test, y_pred_bool))
print(confusion_matrix(y_test, y_pred_bool))

              precision    recall  f1-score   support

         0.0       0.95      0.94      0.95      3857
         1.0       0.94      0.95      0.95      3890

    accuracy                           0.95      7747
   macro avg       0.95      0.95      0.95      7747
weighted avg       0.95      0.95      0.95      7747

[[3639  218]
 [ 201 3689]]


In [12]:
model.evaluate(X_test, y_test)    



[0.13970305025577545, 0.9459145665168762]

In [13]:
# one line tester

line = "xinjiang is a slavery concentration camp"
line_token = tokenizer(line, padding='max_length', max_length=130, return_tensors="pt")
line_embed = bertweet(**line_token)

model.predict(line_embed.pooler_output.detach().numpy())

array([[0.8776716]], dtype=float32)

# Test on All China set

In [20]:
tweets_all = pd.read_csv('../bertweet_embeddings/all_china_full_nort.csv')
tweets_all = tweets_all.dropna() # some rows come in as blank so they need to be dropped
xxx, test_all = train_test_split(tweets_all, test_size=0.5, random_state=0)

In [21]:
embeddings_all = torch.Tensor()

for i in range(39):
    filename = "../bertweet_embeddings/embeddings/all_china_embedding_" + str(i*1000) + ".pt"
    embeddings_all = torch.cat((embeddings_all, torch.load(filename)))
    
print(embeddings_all.shape, tweets.shape)

torch.Size([38733, 768]) (38734, 4)


In [22]:
X_test_all = embeddings_all[test_all.index].detach().numpy()
y_test_all = test_all['is_ccp']

from sklearn.metrics import classification_report, confusion_matrix

y_pred_all = model.predict(X_test_all)
y_pred_all_bool = np.where(y_pred_all >= 0.5, 1, 0).ravel() #DIY function to round outputs to 0 or 1

print(classification_report(y_test_all, y_pred_all_bool))
print(confusion_matrix(y_test_all, y_pred_all_bool))

model.evaluate(X_test_all, y_test_all)   

              precision    recall  f1-score   support

         0.0       0.94      0.76      0.84      9726
         1.0       0.80      0.95      0.87      9641

    accuracy                           0.86     19367
   macro avg       0.87      0.86      0.85     19367
weighted avg       0.87      0.86      0.85     19367

[[7405 2321]
 [ 484 9157]]


[0.39753246307373047, 0.8551660180091858]