In [37]:
import torch
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
import time

In [38]:
dataset = 'all'

In [39]:
tweets = pd.read_csv('../bertweet_embeddings/' + dataset + '_china_full_nort.csv')
tweets = tweets.dropna().reset_index() # some rows come in as blank so they need to be dropped - also need to reset index so they can match embeddings later

train, test = train_test_split(tweets, test_size=0.2, random_state = 0)

In [40]:
# reading in tensors from file

embeddings = torch.Tensor()

for i in range(39):
    filename = "../bertweet_embeddings/embeddings/" + dataset + "_china_embedding_" + str(i*1000) + ".pt"
    embeddings = torch.cat((embeddings, torch.load(filename)))
    
print(embeddings.shape, tweets.shape) # these should be in agreement

torch.Size([38733, 768]) (38733, 4)


In [41]:
X_train = embeddings[train.index].detach().numpy()
y_train = train['is_ccp']

print(X_train.shape, y_train.shape)

(30986, 768) (30986,)


In [42]:
X_test = embeddings[test.index].detach().numpy()
y_test = test['is_ccp']

print(X_test.shape, y_test.shape)

(7747, 768) (7747,)


In [7]:
model_adagrad = keras.Sequential()
model_adagrad.add(keras.layers.Dense(16, 
                             activation = 'relu'))

model_adagrad.add(keras.layers.Dense(1, activation = 'sigmoid'))
model_adagrad.compile(optimizer='adagrad',
              loss='binary_crossentropy', 
              metrics=['accuracy'])

In [8]:
def create_dense_model(num_dense_layers, num_units):
    model = keras.Sequential()
    for _ in range(num_dense_layers):
        model.add(keras.layers.Dense(num_units, 
                                     activation = 'relu'))

    model.add(keras.layers.Dense(1, activation = 'sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy', 
                  metrics=['accuracy'])
    
    return model

In [9]:
def create_simple_dropout_model(d):
    model = keras.Sequential()
    model.add(keras.layers.Dense(16, 
                                 activation = 'relu'))
    model.add(keras.layers.Dropout(0.2))

    model.add(keras.layers.Dense(1, activation = 'sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy', 
                  metrics=['accuracy'])
    
    return model

In [10]:
def create_simple_conv_model(filters, kernel_size, strides):
    model = keras.Sequential()
    model.add(keras.layers.Reshape((1, 768)))
    model.add(keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, strides=strides, input_shape=(1, 768), padding='same'))
    model.add(keras.layers.Dense(16, 
                                 activation = 'relu'))

    model.add(keras.layers.Dense(1, activation = 'sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy', 
                  metrics=['accuracy'])
    
    return model

In [11]:
def create_simple_avgpool_model(pool_size, strides):
    model = keras.Sequential()
    model.add(keras.layers.Reshape((1, 768)))
    model.add(keras.layers.AveragePooling1D(pool_size=pool_size, strides=strides, input_shape=(1, 768), padding='same'))
    model.add(keras.layers.Dense(16, 
                                 activation = 'relu'))

    model.add(keras.layers.Dense(1, activation = 'sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy', 
                  metrics=['accuracy'])
    
    return model

In [43]:
def fit_and_score_model(model, model_name, epochs=20, i=0):
    
    start = time.time()
    model.fit(X_train, y_train, epochs=epochs)
    end = time.time()
    
    y_pred = model.predict(X_test)
    y_pred_bool = np.where(y_pred >= 0.5, 1, 0).ravel() #DIY function to round outputs to 0 or 1

    cr = classification_report(y_test, y_pred_bool, output_dict=True, digits=4)
    #print(confusion_matrix(y_test, y_pred_bool))
    
    scores = pd.DataFrame({'training_time': [end - start],
                           'precision': cr['weighted avg']['precision'],
                           'recall' : cr['weighted avg']['recall'],
                           'f1' : cr['weighted avg']['f1-score']}, index = [model_name])
    
    return scores

In [44]:
def bulk_evaluate_models(model_dict, epochs=20):
    
    results = pd.DataFrame()
    
    for model_name in model_dict:
        results = pd.concat([results, fit_and_score_model(model_dict[model_name], model_name, epochs)])
        
    return results

In [14]:
model_dict = {'baseline': create_dense_model(1, 16),
             'dense_1_64': create_dense_model(1, 64),
             'dense_1_256': create_dense_model(1, 256),
             'dense_2_64': create_dense_model(2, 64),
             'dense_2_256': create_dense_model(2, 256),
             'dropout_0.1': create_simple_dropout_model(0.1),
             'dropout_0.2': create_simple_dropout_model(0.2),
             'dropout_0.3': create_simple_dropout_model(0.3),
             'avgpool_3_None': create_simple_avgpool_model(3, None),
             'avgpool_5_None': create_simple_avgpool_model(5, None),
             'avgpool_7_None': create_simple_avgpool_model(7, None),
             'avgpool_3_1': create_simple_avgpool_model(3, 1),
             'avgpool_5_2': create_simple_avgpool_model(5, 2),
             'convolution_3_3_1': create_simple_conv_model(3, 3, 1),
             'convolution_5_5_1': create_simple_conv_model(5, 5, 1),
             'convolution_7_5_1': create_simple_conv_model(7, 5, 1),
             'convolution_5_7_1': create_simple_conv_model(5, 7, 1),
             'convolution_3_3_3': create_simple_conv_model(3, 3, 3),
             'optimizer_adagrad': model_adagrad}

model_performance = bulk_evaluate_models(model_dict)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20

Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/

In [15]:
model_performance

Unnamed: 0,training_time,precision,recall,f1
baseline,41.526285,0.943574,0.943075,0.943054
dense_1_64,23.608913,0.960653,0.96063,0.96063
dense_1_256,37.104236,0.956595,0.955596,0.955567
dense_2_64,25.238492,0.960245,0.960243,0.960243
dense_2_256,46.685268,0.965194,0.965148,0.965148
dropout_0.1,41.368037,0.953392,0.953272,0.953271
dropout_0.2,41.406643,0.945831,0.945785,0.945785
dropout_0.3,41.387655,0.95127,0.951078,0.951075
avgpool_3_None,41.492336,0.944509,0.943075,0.943036
avgpool_5_None,41.474525,0.946447,0.946173,0.946167


In [45]:
model_peak = keras.Sequential()

#model_peak.add(keras.layers.Reshape((1, 768)))
#model_peak.add(keras.layers.AveragePooling1D(pool_size=3, strides=None, input_shape=(1, 768), padding='same'))

model_peak.add(keras.layers.Dense(768, activation = 'relu'))
model_peak.add(keras.layers.Dropout(0.05))

# model_peak.add(keras.layers.Dense(768, activation = 'relu'))
# model_peak.add(keras.layers.Dropout(0.2))

model_peak.add(keras.layers.Dense(1, activation = 'sigmoid'))
model_peak.compile(optimizer='adam',
              loss='binary_crossentropy', 
              metrics=['accuracy'])

In [46]:
model_peak_performance = bulk_evaluate_models({'model_peak': model_peak}, epochs=50)
model_peak_performance

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


Unnamed: 0,training_time,precision,recall,f1
model_peak,202.352844,0.965786,0.965664,0.965659


In [47]:
y_pred_peak = model_peak.predict(X_test)
y_pred_peak_bool = np.where(y_pred_peak >= 0.5, 1, 0).ravel()

print(confusion_matrix(y_test, y_pred_peak_bool))

[[3807  101]
 [ 165 3674]]


## What Did the Model Get the Most Wrong

In [48]:
model = model_peak

In [49]:
y_pred = np.array([i for [i] in model.predict(X_test)])
y_pred_deltas = y_pred - y_test.array
y_deltas_argsort = np.argsort(y_pred_deltas)

In [50]:
num_results = 20

false_negatives = pd.DataFrame(columns=['text', 'probability', 'is_ccp'])

for i in range(num_results):
    details = pd.DataFrame({'text': test.iloc[y_deltas_argsort[i]].text, 'probability': y_pred[y_deltas_argsort[i]], 'is_ccp': test.iloc[y_deltas_argsort[i]].is_ccp}, index=[i + 1])
    false_negatives = pd.concat([false_negatives, details])
    
false_positives = pd.DataFrame(columns=['text', 'probability', 'is_ccp'])

for i in range(num_results):
    details = pd.DataFrame({'text': test.iloc[y_deltas_argsort[-(i+1)]].text, 'probability': y_pred[y_deltas_argsort[-(i+1)]], 'is_ccp': test.iloc[y_deltas_argsort[-(i+1)]].is_ccp}, index=[i + 1])
    false_positives = pd.concat([false_positives, details])

In [51]:
false_negatives

Unnamed: 0,text,probability,is_ccp
1,The West Isn’t Ready for the Coming Wave of Ch...,2.191319e-08,1.0
2,@TonyBrunoShow @KingJames @NBA @nikebasketball...,2.764007e-08,1.0
3,RT @AngeloG86207806: 1871 Chinatown massacre\n...,5.285465e-08,1.0
4,"With Message For China, Dalai Lama Says His Su...",4.03117e-07,1.0
5,Bannon created novel coronavirus conspiracy th...,4.847097e-07,1.0
6,"@RNH00757249 @PDChina Well,I see. https://t.co...",6.274943e-07,1.0
7,"The claim of “since 2017, the Chinese governme...",9.810135e-07,1.0
8,#DrLiMengYan、#闫丽梦、#班农 \nRumor Press Conference...,1.118966e-06,1.0
9,#StopXinjiangrumors！\nThis “jade expert” ident...,2.559877e-06,1.0
10,@Terry24492280 @OzraeliAvi Since the rising of...,2.807547e-06,1.0


In [53]:
false_positives

Unnamed: 0,text,probability,is_ccp
1,We pledge: No more tears on our land\nIn wrath...,0.999959,0.0
2,Get Lost in a #Snow-Blanketed #Wonderland this...,0.999473,0.0
3,@VOAChinese The cotton produced in #Xinjiang i...,0.999389,0.0
4,"@CNBC Liars and terrorism supporters, shame on...",0.999,0.0
5,Men's Curling World Championship: #Scotland be...,0.99881,0.0
6,@hoopstingley @Vanilla3087 @TrueNorthCentre @T...,0.99879,0.0
7,"@lukedepulford @RazvenHK Hong Kong is vibrant,...",0.998419,0.0
8,"A look at Langzhong ancient city in Nanchong, ...",0.997336,0.0
9,It should be noted that Hong Kong is facing mu...,0.99662,0.0
10,#CyberAttack on Inter Parliamentary Alliance o...,0.99513,0.0


In [54]:
false_positives.to_csv(dataset + '_china_false_positives.csv')
false_negatives.to_csv(dataset + '_china_false_negatives.csv')

## What was the model least certain about?

In [55]:
y_pred = np.array([i for [i] in model.predict(X_test)])
y_pred_uncertain = np.abs(y_pred - 0.5)
y_uncertain_argsort = np.argsort(y_pred_uncertain)

In [56]:
num_results = 20

uncertains = pd.DataFrame(columns=['text', 'probability', 'is_ccp'])

for i in range(num_results):
    details = pd.DataFrame({'text': test.iloc[y_uncertain_argsort[i]].text, 'probability': y_pred[y_uncertain_argsort[i]], 'is_ccp': test.iloc[y_uncertain_argsort[i]].is_ccp}, index=[i + 1])
    uncertains = pd.concat([uncertains, details])

In [57]:
uncertains.to_csv(dataset + '_china_least_certains.csv')

In [58]:
uncertains

Unnamed: 0,text,probability,is_ccp
1,"11 killed, 19 injured in China truck-bus highw...",0.506093,0.0
2,#ChinaToday in case you missed it: - Xi stress...,0.510257,1.0
3,#ChinaToday in case you missed it:\n- Highligh...,0.510384,1.0
4,Kerry: US 'hopeful' it can work with China to ...,0.512026,0.0
5,@oU1QSfoQi1MfaF5 Guo Wengui has everything for...,0.512959,0.0
6,I wonder what would the world say if this happ...,0.487033,1.0
7,When China grounded its Boeing 737 MAX planes ...,0.51477,1.0
8,@JianluBi Xinjiang people dance whenever they ...,0.484868,1.0
9,"Speaking of #郭文贵 friends, that's really a nove...",0.482069,1.0
10,Minibus driver dies in car explosion in NW Chi...,0.476607,1.0


## Logistic Regression Classifier for Reference

In [59]:
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=0)

In [60]:
y_clf_pred = clf.predict(X_test)
y_clf_pred

array([0., 0., 1., ..., 1., 0., 1.])

In [61]:
print(classification_report(y_test, y_clf_pred,  digits=4))
print(confusion_matrix(y_test, y_clf_pred))

              precision    recall  f1-score   support

         0.0     0.9480    0.9284    0.9381      3908
         1.0     0.9286    0.9482    0.9383      3839

    accuracy                         0.9382      7747
   macro avg     0.9383    0.9383    0.9382      7747
weighted avg     0.9384    0.9382    0.9382      7747

[[3628  280]
 [ 199 3640]]


## Testing on other set

In [62]:
dataset_other = 'all' if dataset == 'pro' else 'pro'

tweets_other = pd.read_csv('../bertweet_embeddings/' + dataset_other + '_china_full_nort.csv')
tweets_other = tweets_other.dropna() # some rows come in as blank so they need to be dropped
xxx, test_other = train_test_split(tweets_other, test_size=0.5, random_state=0)

In [63]:
embeddings_other = torch.Tensor()

for i in range(39):
    filename = "../bertweet_embeddings/embeddings/" + dataset_other + "_china_embedding_" + str(i*1000) + ".pt"
    embeddings_other = torch.cat((embeddings_other, torch.load(filename)))
    
print(embeddings_other.shape, tweets_other.shape)

torch.Size([38734, 768]) (38734, 3)


In [64]:
X_test_other = embeddings_other[test_other.index].detach().numpy()
y_test_other = test_other['is_ccp']

y_pred_other = model.predict(X_test_other)
y_pred_other_bool = np.where(y_pred_other >= 0.5, 1, 0).ravel() #DIY function to round outputs to 0 or 1

report = classification_report(y_test_other, y_pred_other_bool, output_dict=True, digits=4)
df = pd.DataFrame(report).transpose()

print(confusion_matrix(y_test_other, y_pred_other_bool,))

df

[[8896  830]
 [ 203 9438]]


Unnamed: 0,precision,recall,f1-score,support
0.0,0.97769,0.914662,0.945126,9726.0
1.0,0.919166,0.978944,0.948114,9641.0
accuracy,0.946662,0.946662,0.946662,0.946662
macro avg,0.948428,0.946803,0.94662,19367.0
weighted avg,0.948557,0.946662,0.946613,19367.0
