In [1]:
import torch
from transformers import AutoModel, AutoTokenizer
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# Load the pretrained BERTweet encoder. AutoModel resolves this checkpoint to a
# bare RobertaModel, which is why the lm_head.* weights are reported as unused
# in the load-time message.
bertweet = AutoModel.from_pretrained("vinai/bertweet-base")

# For transformers v4.x+:
# NOTE(review): use_fast=False / normalization=True appear to follow the
# upstream BERTweet usage example; normalization presumably pre-processes raw
# tweet text before tokenizing -- confirm against the model card.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False, normalization=True)

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Load the labelled tweets and drop incomplete rows (some rows come in blank).
# Reading and cleaning in one expression keeps the cell idempotent on re-run.
tweets = pd.read_csv('all_china_full.csv').dropna()

# Fix the shuffle seed so the 80/20 split -- and therefore every metric computed
# further down -- is the same after Restart Kernel -> Run All.
train, test = train_test_split(tweets, test_size=0.2, random_state=42)

In [5]:
# Sanity check: try to tokenize every tweet and print the positional index and
# text of any entry the tokenizer rejects (e.g. invalid or blank values).
# Iterates the 'text' column of `tweets` directly so the cell does not depend on
# a variable left over from a deleted cell.
for idx, tweet in enumerate(tweets['text']):
    try:
        tokenizer(tweet, padding='max_length', max_length=150, return_tensors="pt")
    except Exception:
        # Keep scanning after a bad row, but let KeyboardInterrupt / SystemExit
        # propagate so the (long) loop can still be interrupted.
        print(idx, tweet)

In [29]:
def batch_tokenize_and_embed(tweets, batch_size = 5, max_length = 130):
    """Tokenize `tweets` in batches and return one pooled embedding per tweet.

    Uses the module-level `tokenizer` and `bertweet` objects.

    Args:
        tweets: Sliceable sequence of tweet strings (e.g. a list).
        batch_size: Number of tweets sent through the model per forward pass.
        max_length: Token length every tweet is padded *and truncated* to.
            NOTE(review): 130 is kept as the default for backward
            compatibility; confirm it does not exceed the checkpoint's
            supported sequence length.

    Returns:
        A torch.Tensor holding `outputs.pooler_output` of every batch,
        concatenated along the first dimension in input order. An empty
        `torch.Tensor()` is returned when `tweets` is empty.
    """
    # Collect per-batch outputs and concatenate once at the end; calling
    # torch.cat inside the loop re-copies everything accumulated so far.
    pooled_batches = []

    for i in range(0, len(tweets), batch_size):
        batch = tweets[i : i + batch_size]  # slicing already clamps at the end
        print("Processing chunk " + str(i) + " to " + str(i + len(batch)))

        # truncation=True guarantees every row has exactly `max_length` tokens;
        # without it a single over-long tweet yields ragged rows that cannot be
        # stacked into a tensor (or overruns the model's position embeddings).
        tokens = tokenizer(
            batch,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            outputs = bertweet(**tokens)

        # pooler_output is an embedding for the entire tweet
        pooled_batches.append(outputs.pooler_output)

    if not pooled_batches:
        return torch.Tensor()

    return torch.cat(pooled_batches)

In [30]:
# Embed the first `train_subset_size` tweets of the training split.
train_subset_size = 2000

train_embeddings = batch_tokenize_and_embed(
    train['text'].iloc[:train_subset_size].tolist()
)

Processing chunk 0 to 5
Processing chunk 5 to 10
Processing chunk 10 to 15
Processing chunk 15 to 20
Processing chunk 20 to 25
Processing chunk 25 to 30
Processing chunk 30 to 35
Processing chunk 35 to 40
Processing chunk 40 to 45
Processing chunk 45 to 50
Processing chunk 50 to 55
Processing chunk 55 to 60
Processing chunk 60 to 65
Processing chunk 65 to 70
Processing chunk 70 to 75
Processing chunk 75 to 80
Processing chunk 80 to 85
Processing chunk 85 to 90
Processing chunk 90 to 95
Processing chunk 95 to 100
Processing chunk 100 to 105
Processing chunk 105 to 110
Processing chunk 110 to 115
Processing chunk 115 to 120
Processing chunk 120 to 125
Processing chunk 125 to 130
Processing chunk 130 to 135
Processing chunk 135 to 140
Processing chunk 140 to 145
Processing chunk 145 to 150
Processing chunk 150 to 155
Processing chunk 155 to 160
Processing chunk 160 to 165
Processing chunk 165 to 170
Processing chunk 170 to 175
Processing chunk 175 to 180
Processing chunk 180 to 185
Proces

Processing chunk 1440 to 1445
Processing chunk 1445 to 1450
Processing chunk 1450 to 1455
Processing chunk 1455 to 1460
Processing chunk 1460 to 1465
Processing chunk 1465 to 1470
Processing chunk 1470 to 1475
Processing chunk 1475 to 1480
Processing chunk 1480 to 1485
Processing chunk 1485 to 1490
Processing chunk 1490 to 1495
Processing chunk 1495 to 1500
Processing chunk 1500 to 1505
Processing chunk 1505 to 1510
Processing chunk 1510 to 1515
Processing chunk 1515 to 1520
Processing chunk 1520 to 1525
Processing chunk 1525 to 1530
Processing chunk 1530 to 1535
Processing chunk 1535 to 1540
Processing chunk 1540 to 1545
Processing chunk 1545 to 1550
Processing chunk 1550 to 1555
Processing chunk 1555 to 1560
Processing chunk 1560 to 1565
Processing chunk 1565 to 1570
Processing chunk 1570 to 1575
Processing chunk 1575 to 1580
Processing chunk 1580 to 1585
Processing chunk 1585 to 1590
Processing chunk 1590 to 1595
Processing chunk 1595 to 1600
Processing chunk 1600 to 1605
Processing

In [31]:
# Labels for the same leading rows of `train` that were embedded, plus the
# embeddings converted from a torch tensor to a NumPy array for Keras.
y_train = train['is_ccp'].iloc[:train_subset_size]
X_train = train_embeddings.detach().numpy()

In [32]:
# Binary classifier on top of the tweet embeddings. Each Dense layer is an
# affine map (xW + b) followed by an element-wise nonlinearity; the final
# single-unit sigmoid emits a score in [0, 1].
linear_model = keras.Sequential()
linear_model.add(keras.layers.Dense(256, activation='relu'))
linear_model.add(keras.layers.Dense(4, activation='relu'))
linear_model.add(keras.layers.Dense(1, activation='sigmoid'))

# Adam is a fancier version of gradient descent (https://arxiv.org/pdf/1412.6980.pdf).
# Binary cross-entropy is the training loss; accuracy is the metric reported
# while training.
linear_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [33]:
# Train the classifier on the embedded training subset for 50 epochs.
linear_model.fit(x=X_train, y=y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fd442b855e0>

# Testing

In [34]:
# Embed the first `test_subset_size` tweets of the held-out split.
test_subset_size = 500

test_embeddings = batch_tokenize_and_embed(
    test['text'].iloc[:test_subset_size].tolist()
)

Processing chunk 0 to 5
Processing chunk 5 to 10
Processing chunk 10 to 15
Processing chunk 15 to 20
Processing chunk 20 to 25
Processing chunk 25 to 30
Processing chunk 30 to 35
Processing chunk 35 to 40
Processing chunk 40 to 45
Processing chunk 45 to 50
Processing chunk 50 to 55
Processing chunk 55 to 60
Processing chunk 60 to 65
Processing chunk 65 to 70
Processing chunk 70 to 75
Processing chunk 75 to 80
Processing chunk 80 to 85
Processing chunk 85 to 90
Processing chunk 90 to 95
Processing chunk 95 to 100
Processing chunk 100 to 105
Processing chunk 105 to 110
Processing chunk 110 to 115
Processing chunk 115 to 120
Processing chunk 120 to 125
Processing chunk 125 to 130
Processing chunk 130 to 135
Processing chunk 135 to 140
Processing chunk 140 to 145
Processing chunk 145 to 150
Processing chunk 150 to 155
Processing chunk 155 to 160
Processing chunk 160 to 165
Processing chunk 165 to 170
Processing chunk 170 to 175
Processing chunk 175 to 180
Processing chunk 180 to 185
Proces

In [35]:
# Labels for the same leading rows of `test` that were embedded, plus the
# embeddings converted from a torch tensor to a NumPy array for Keras.
y_test = test['is_ccp'].iloc[:test_subset_size]
X_test = test_embeddings.detach().numpy()

In [52]:
# Score the held-out subset. classification_report / confusion_matrix are
# imported in the notebook's top import cell rather than mid-notebook.
y_pred = linear_model.predict(X_test)

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels, flattened to 1-D.
y_pred_bool = np.where(y_pred >= 0.5, 1, 0).ravel()

print(classification_report(y_test, y_pred_bool))
print(confusion_matrix(y_test, y_pred_bool))

              precision    recall  f1-score   support

         0.0       0.85      0.98      0.91       208
         1.0       0.98      0.88      0.93       292

    accuracy                           0.92       500
   macro avg       0.92      0.93      0.92       500
weighted avg       0.93      0.92      0.92       500

[[203   5]
 [ 35 257]]


In [38]:
# Returns [loss, accuracy] on the held-out subset (binary cross-entropy, then
# the 'accuracy' metric the model was compiled with).
linear_model.evaluate(x=X_test, y=y_test)



[0.23062360286712646, 0.9200000166893005]

In [51]:
# Hard 0/1 predictions: scores of at least 0.5 map to 1, everything else to 0.
(y_pred >= 0.5).astype(int).ravel()

array([1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,

In [45]:
# Display the true `is_ccp` labels of the evaluated test subset.
y_test

33969    1.0
1117     1.0
17702    1.0
9627     1.0
45866    0.0
        ... 
36334    1.0
37655    1.0
34140    1.0
13758    1.0
55876    0.0
Name: is_ccp, Length: 500, dtype: float64