In [108]:
import torch
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import time

In [109]:
dataset = 'all'

In [110]:
tweets = pd.read_csv('../bertweet_embeddings/' + dataset + '_china_full_nort.csv')
tweets = tweets.dropna().reset_index() # some rows come in as blank so they need to be dropped - also need to reset index so they can match embeddings later

train, test = train_test_split(tweets, test_size=0.2, random_state = 0)

In [111]:
# reading in tensors from file

embeddings = torch.Tensor()

for i in range(39):
    filename = "../bertweet_embeddings/embeddings/" + dataset + "_china_embedding_" + str(i*1000) + ".pt"
    embeddings = torch.cat((embeddings, torch.load(filename)))
    
print(embeddings.shape, tweets.shape) # these should be in agreement

torch.Size([38733, 768]) (38733, 4)


In [112]:
X_train = embeddings[train.index].detach().numpy()
y_train = train['is_ccp']

print(X_train.shape, y_train.shape)

(30986, 768) (30986,)


In [113]:
X_test = embeddings[test.index].detach().numpy()
y_test = test['is_ccp']

print(X_test.shape, y_test.shape)

(7747, 768) (7747,)


In [114]:
model_adagrad = keras.Sequential()
model_adagrad.add(keras.layers.Dense(16, 
                             activation = 'relu'))

model_adagrad.add(keras.layers.Dense(1, activation = 'sigmoid'))
model_adagrad.compile(optimizer='adagrad',
              loss='binary_crossentropy', 
              metrics=['accuracy'])

In [115]:
def create_dense_model(num_dense_layers, num_units):
    model = keras.Sequential()
    for _ in range(num_dense_layers):
        model.add(keras.layers.Dense(num_units, 
                                     activation = 'relu'))

    model.add(keras.layers.Dense(1, activation = 'sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy', 
                  metrics=['accuracy'])
    
    return model

In [116]:
def create_simple_dropout_model(d):
    model = keras.Sequential()
    model.add(keras.layers.Dense(16, 
                                 activation = 'relu'))
    model.add(keras.layers.Dropout(0.2))

    model.add(keras.layers.Dense(1, activation = 'sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy', 
                  metrics=['accuracy'])
    
    return model

In [117]:
def create_simple_conv_model(filters, kernel_size, strides):
    model = keras.Sequential()
    model.add(keras.layers.Reshape((1, 768)))
    model.add(keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, strides=strides, input_shape=(1, 768), padding='same'))
    model.add(keras.layers.Dense(16, 
                                 activation = 'relu'))

    model.add(keras.layers.Dense(1, activation = 'sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy', 
                  metrics=['accuracy'])
    
    return model

In [118]:
def create_simple_avgpool_model(pool_size, strides):
    model = keras.Sequential()
    model.add(keras.layers.Reshape((1, 768)))
    model.add(keras.layers.AveragePooling1D(pool_size=pool_size, strides=strides, input_shape=(1, 768), padding='same'))
    model.add(keras.layers.Dense(16, 
                                 activation = 'relu'))

    model.add(keras.layers.Dense(1, activation = 'sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy', 
                  metrics=['accuracy'])
    
    return model

In [119]:
def fit_and_score_model(model, model_name, epochs=20, i=0):
    
    start = time.time()
    model.fit(X_train, y_train, epochs=epochs)
    end = time.time()
    
    y_pred = model.predict(X_test)
    y_pred_bool = np.where(y_pred >= 0.5, 1, 0).ravel() #DIY function to round outputs to 0 or 1

    cr = classification_report(y_test, y_pred_bool, output_dict=True, digits=4)
    #print(confusion_matrix(y_test, y_pred_bool))
    
    scores = pd.DataFrame({'training_time': [end - start],
                           'precision': cr['weighted avg']['precision'],
                           'recall' : cr['weighted avg']['recall'],
                           'f1' : cr['weighted avg']['f1-score']}, index = [model_name])
    
    return scores

In [120]:
def bulk_evaluate_models(model_dict):
    
    results = pd.DataFrame()
    
    for model_name in model_dict:
        results = pd.concat([results, fit_and_score_model(model_dict[model_name], model_name)])
        
    return results

In [121]:
model_dict = {'baseline': create_dense_model(1, 16),
             'dense_1_64': create_dense_model(1, 64),
             'dense_1_256': create_dense_model(1, 256),
             'dense_2_64': create_dense_model(2, 64),
             'dense_2_256': create_dense_model(2, 256),
             'dropout_0.1': create_simple_dropout_model(0.1),
             'dropout_0.2': create_simple_dropout_model(0.2),
             'dropout_0.3': create_simple_dropout_model(0.3),
             'avgpool_3_None': create_simple_avgpool_model(3, None),
             'avgpool_5_None': create_simple_avgpool_model(5, None),
             'avgpool_7_None': create_simple_avgpool_model(7, None),
             'avgpool_3_1': create_simple_avgpool_model(3, 1),
             'avgpool_5_2': create_simple_avgpool_model(5, 2),
             'convolution_3_3_1': create_simple_conv_model(3, 3, 1),
             'convolution_5_5_1': create_simple_conv_model(5, 5, 1),
             'convolution_7_5_1': create_simple_conv_model(7, 5, 1),
             'convolution_5_7_1': create_simple_conv_model(5, 7, 1),
             'convolution_3_3_3': create_simple_conv_model(3, 3, 3),
             'optimizer_adagrad': model_adagrad}

model_performance = bulk_evaluate_models(model_dict)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20


Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20


Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20


Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [122]:
model_performance

Unnamed: 0,training_time,precision,recall,f1
baseline,41.353655,0.94837,0.948367,0.948366
dense_1_64,41.408797,0.959203,0.958823,0.958809
dense_1_256,36.514213,0.963183,0.962695,0.96268
dense_2_64,41.393924,0.958546,0.958306,0.958305
dense_2_256,51.394414,0.960759,0.960759,0.960759
dropout_0.1,41.379856,0.948756,0.948625,0.948617
dropout_0.2,41.408615,0.948629,0.948625,0.948624
dropout_0.3,41.446591,0.949027,0.94798,0.947938
avgpool_3_None,42.020118,0.94461,0.944366,0.944352
avgpool_5_None,41.507965,0.948557,0.948496,0.948497


In [123]:
model_peak = keras.Sequential()

model_peak.add(keras.layers.Reshape((1, 768)))
model_peak.add(keras.layers.AveragePooling1D(pool_size=3, strides=None, input_shape=(1, 768), padding='same'))

model.add(keras.layers.Dense(64, activation = 'relu'))
model_peak.add(keras.layers.Dropout(0.2))

model_peak.add(keras.layers.Dense(64, activation = 'relu'))
model_peak.add(keras.layers.Dropout(0.2))

model_peak.add(keras.layers.Dense(1, activation = 'sigmoid'))
model_peak.compile(optimizer='adam',
              loss='binary_crossentropy', 
              metrics=['accuracy'])

In [124]:
model_peak_performance = bulk_evaluate_models({'model_peak': model_peak})
model_peak_performance

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Unnamed: 0,training_time,precision,recall,f1
model_peak,41.537659,0.951114,0.950949,0.95094


## What Did the Model Get the Most Wrong

In [125]:
model = model_dict['dense_2_64']

In [126]:
y_pred = np.array([i for [i] in model.predict(X_test)])
y_deltas = y_pred - y_test.array
y_deltas_argsort = np.argsort(y_pred_deltas)

In [127]:
num_results = 10

print("Was CCP But Model Thought It Wasn't:")

for i in range(num_results):
    print("#" + str(i + 1))
    print(test.iloc[y_deltas_argsort[i]].text + '\n')
    
print("Was Not CCP But Model Thought It Was:")

for i in range(num_results):
    print("#" + str(i + 1))
    print(test.iloc[y_deltas_argsort[-(i+1)]].text + '\n')

Was CCP But Model Thought It Wasn't:
#1
@CGTNOfficial the APSI has long been receiving funds from the US government and arm dealers, and it deliberately smears, vilifies and demonizes China for the investors’ benefits. https://t.co/znZZytt7KW

#2
China is not the myth,but always reaching out hand to those in need.

#3
The final preview of Disney's "Mulan" debuts in the Super Bowl. The film tells the story of Mulan's march on the battlefield for his father and eventually becoming a legendary Chinese heroine heroine. https://t.co/d3CljPR3pX

#4
China, Russia and other authoritarian regimes are strengthening their grip with a new suite of high-tech products. As those spread, even second-tier tyrannies will benefit. https://t.co/yeb9r1oEuj

#5
@PDChina At least from this video,I don't think there is "genocide"
https://t.co/LKR5k9UOBW

#6
A visit to Hotan Night Market: How do #Xinjiang locals feel? https://t.co/aEs5StB6Ql GJMA

#7
@CryptoUB In 2015 i owned a lot of jordan sneakers and i wou