### Environment Setting

python 3.7
pip install tensorflow==1.15
pip install --upgrade tensorflow-hub
pip install bert-tensorflow==1.0.1

# BERT model builder

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import pickle
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
import random
random.seed(594)
import ssl
ssl._create_default_https_context = ssl._create_unverified_context




In [2]:
def pretty_print(result):
    df = pd.DataFrame([result]).T
    df.columns = ["values"]
    return df

In [3]:
def create_tokenizer_from_hub_module(bert_model_hub):
  """Get the vocab file and casing info from the Hub module."""
  with tf.Graph().as_default():
    bert_module = hub.Module(bert_model_hub)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    with tf.Session() as sess:
      vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                            tokenization_info["do_lower_case"]])

  return bert.tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=do_lower_case)

def make_features(dataset, label_list, MAX_SEQ_LENGTH, tokenizer, DATA_COLUMN, LABEL_COLUMN):
    input_example = dataset.apply(lambda x: bert.run_classifier.InputExample(guid=None,
                                                                   text_a = x[DATA_COLUMN],
                                                                   text_b = None,
                                                                   label = x[LABEL_COLUMN]), axis = 1)
    features = bert.run_classifier.convert_examples_to_features(input_example, label_list, MAX_SEQ_LENGTH, tokenizer)
    return features

def create_model(bert_model_hub, is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
  """Creates a classification model."""

  bert_module = hub.Module(
      bert_model_hub,
      trainable=True)
  bert_inputs = dict(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids)
  bert_outputs = bert_module(
      inputs=bert_inputs,
      signature="tokens",
      as_dict=True)

  # Use "pooled_output" for classification tasks on an entire sentence.
  # Use "sequence_outputs" for token-level output.
  output_layer = bert_outputs["pooled_output"]

  hidden_size = output_layer.shape[-1].value

  # Create our own layer to tune for politeness data.
  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):

    # Dropout helps prevent overfitting
    output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    # Convert labels into one-hot encoding
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))
    # If we're predicting, we want predicted labels and the probabiltiies.
    if is_predicting:
      return (predicted_labels, log_probs)

    # If we're train/eval, compute loss between predicted and actual label
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, predicted_labels, log_probs)

# model_fn_builder actually creates our model function
# using the passed parameters for num_labels, learning_rate, etc.
def model_fn_builder(bert_model_hub, num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
  """Returns `model_fn` closure for TPUEstimator."""
  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)

    # TRAIN and EVAL
    if not is_predicting:

      (loss, predicted_labels, log_probs) = create_model(
        bert_model_hub, is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

      train_op = bert.optimization.create_optimizer(
          loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)

      # Calculate evaluation metrics.
      def metric_fn(label_ids, predicted_labels):
        accuracy = tf.metrics.accuracy(label_ids, predicted_labels)
        # f1_score = tf.contrib.metrics.f1_score(
        #     label_ids,
        #     predicted_labels)
        # auc = tf.metrics.auc(
        #     label_ids,
        #     predicted_labels)
        recall = tf.metrics.recall(
            label_ids,
            predicted_labels)
        precision = tf.metrics.precision(
            label_ids,
            predicted_labels)
        true_pos = tf.metrics.true_positives(
            label_ids,
            predicted_labels)
        true_neg = tf.metrics.true_negatives(
            label_ids,
            predicted_labels)
        false_pos = tf.metrics.false_positives(
            label_ids,
            predicted_labels)
        false_neg = tf.metrics.false_negatives(
            label_ids,
            predicted_labels)
        return {
            "eval_accuracy": accuracy,
            # "f1_score": f1_score,
            # "auc": auc,
            "precision": precision,
            "recall": recall,
            "true_positives": true_pos,
            "true_negatives": true_neg,
            "false_positives": false_pos,
            "false_negatives": false_neg
        }

      eval_metrics = metric_fn(label_ids, predicted_labels)

      if mode == tf.estimator.ModeKeys.TRAIN:
        return tf.estimator.EstimatorSpec(mode=mode,
          loss=loss,
          train_op=train_op)
      else:
          return tf.estimator.EstimatorSpec(mode=mode,
            loss=loss,
            eval_metric_ops=eval_metrics)
    else:
      (predicted_labels, log_probs) = create_model(
        bert_model_hub, is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

      predictions = {
          'probabilities': log_probs,
          'labels': predicted_labels
      }
      return tf.estimator.EstimatorSpec(mode, predictions=predictions)

  # Return the actual model function in the closure
  return model_fn

def estimator_builder(bert_model_hub, OUTPUT_DIR, SAVE_SUMMARY_STEPS, SAVE_CHECKPOINTS_STEPS, label_list, LEARNING_RATE, num_train_steps, num_warmup_steps, LOG_STEP_COUNT_STEPS, BATCH_SIZE):

    # Specify outpit directory and number of checkpoint steps to save
    run_config = tf.estimator.RunConfig(
        model_dir=OUTPUT_DIR,
        save_summary_steps=SAVE_SUMMARY_STEPS,
        log_step_count_steps=LOG_STEP_COUNT_STEPS,
        save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

    model_fn = model_fn_builder(
      bert_model_hub = bert_model_hub,
      num_labels=len(label_list),
      learning_rate=LEARNING_RATE,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps)

    estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      config=run_config,
      params={"batch_size": BATCH_SIZE})
    return estimator, model_fn, run_config

In [4]:
def run_on_dfs(train, test, DATA_COLUMN, LABEL_COLUMN,
               MAX_SEQ_LENGTH = 128,
              BATCH_SIZE = 32,
              LEARNING_RATE = 2e-5,
              NUM_TRAIN_EPOCHS = 3.0,
              WARMUP_PROPORTION = 0.1,
              SAVE_SUMMARY_STEPS = 100,
                SAVE_CHECKPOINTS_STEPS = 10000,
                LOG_STEP_COUNT_STEPS = 100,
              bert_model_hub = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"):

    label_list = train[LABEL_COLUMN].unique().tolist()

    tokenizer = create_tokenizer_from_hub_module(bert_model_hub)

    train_features = make_features(train, label_list, MAX_SEQ_LENGTH, tokenizer, DATA_COLUMN, LABEL_COLUMN)
    test_features = make_features(test, label_list, MAX_SEQ_LENGTH, tokenizer, DATA_COLUMN, LABEL_COLUMN)

    num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
    num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

    estimator, model_fn, run_config = estimator_builder(
                                  bert_model_hub,
                                  OUTPUT_DIR,
                                  SAVE_SUMMARY_STEPS,
                                  SAVE_CHECKPOINTS_STEPS,
                                  label_list,
                                  LEARNING_RATE,
                                  num_train_steps,
                                  num_warmup_steps,
                                  LOG_STEP_COUNT_STEPS,
                                  BATCH_SIZE)

    train_input_fn = bert.run_classifier.input_fn_builder(
        features=train_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=True,
        drop_remainder=False)

    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    test_input_fn = run_classifier.input_fn_builder(
        features=test_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)

    result_dict = estimator.evaluate(input_fn=test_input_fn, steps=None)
    return result_dict, estimator, test_input_fn, label_list

# Load the dataset

In [5]:
# load the dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score, f1_score
features_title = np.load('./TrainData/features_title.npy', allow_pickle=True)
targets_title = np.load('./TrainData/targets_title.npy', allow_pickle=True)
features_body = np.load('./TrainData/features_body.npy', allow_pickle=True)
targets_body = np.load('./TrainData/targets_body.npy', allow_pickle=True)
df_title = pd.DataFrame({'label': targets_title, 'text':features_title})
df_body = pd.DataFrame({'label': targets_body,'text':features_body})

In [6]:
df_title.head()

Unnamed: 0,label,text
0,-1,itnot about the money itabout sendingmessage
1,-1,math professor scott steiner says the numbers ...
2,-1,exit the system
3,-1,new sec filing for gme can someone less retard...
4,-1,not to distract from gme just thought our amc ...


In [7]:
df_body.head()

Unnamed: 0,label,text
0,-1,the ceo of nasdaq pushed to halt trading to gi...
1,-1,hedgefund whales are spreading disinfo saying ...
2,-1,life isnfair my mother always told me that whe...
3,-1,i believe right now is one of those rare oppor...
4,-1,you guys are champs gme who would have thought...


In [9]:
# df_all = pd.DataFrame
# df_all['text'] = df_title['text']+df_body['text']
# df_all['label'] = df_title['label']
# # df_all = df_all.sample(frac=1)
# df_all.head()

TypeError: 'type' object does not support item assignment

In [8]:
myparam = {
        "DATA_COLUMN": "text",
        "LABEL_COLUMN": "label",
        "MAX_SEQ_LENGTH": 30,
        "LEARNING_RATE": 2e-5,
        "SAVE_SUMMARY_STEPS":25,
        "NUM_TRAIN_EPOCHS":3,
        "BATCH_SIZE": 16,
        "LOG_STEP_COUNT_STEPS": 100,
        "bert_model_hub":"https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
#         "bert_model_hub":"https://tfhub.dev/google/bert_uncased_L-24_H-1024_A-16/1"
#         "bert_model_hub":"https://tfhub.dev/digitalepidemiologylab/covid-twitter-bert/2"
    }

# Fine-tuning
### Training: Title dataset

In [9]:
OUTPUT_DIR = 'output_title'
train_title, test_title = train_test_split(df_title, test_size=0.2, random_state=42)
result_title, estimator_title, test_input_fn_title, label_list_title = run_on_dfs(train_title, test_title, **myparam)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Writing example 0 of 28588
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] dog ##ec ##oin [SEP]
INFO:tensorflow:input_ids: 101 3899 8586 28765 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:label: -1 (id = 0)
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] first time better long time lu ##rke ##r turning that co ##vid st ##im ##y into straight rocket fuel [SEP]
INFO:tensorflow:input_ids: 101 2034 2051 2488 2146 2051 11320 25074 2099 3810 2008 2522 17258 2358 5714 2100 2046 3442 7596 4762 102 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0
INFO:ten

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Writing example 0 of 28588
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] dog ##ec ##oin [SEP]
INFO:tensorflow:input_ids: 101 3899 8586 28765 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:label: -1 (id = 0)
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] first time better long time lu ##rke ##r turning that co ##vid st ##im ##y into straight rocket fuel [SEP]
INFO:tensorflow:input_ids: 101 2034 2051 2488 2146 2051 11320 25074 2099 3810 2008 2522 17258 2358 5714 2100 2046 3442 7596 4762 102 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0
INFO:ten

In [10]:
pretty_print(result_title)

Unnamed: 0,values
eval_accuracy,0.752763
false_negatives,1031.0
false_positives,736.0
loss,0.812335
precision,0.396226
recall,0.319022
true_negatives,4897.0
true_positives,483.0
global_step,5360.0


### Training: Body dataset

In [11]:
OUTPUT_DIR = 'output_body'
train_body, test_body = train_test_split(df_body, test_size=0.2, random_state=42)
result_body, estimator_body, test_input_fn_body, label_list_body = run_on_dfs(train_body, test_body, **myparam)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Writing example 0 of 13677
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] don ##let em do it to you r ##h should be shut down ##ne ##ver touched the app ##use real broker ##ages from the start but ##bet your [SEP]
INFO:tensorflow:input_ids: 101 2123 7485 7861 2079 2009 2000 2017 1054 2232 2323 2022 3844 2091 2638 6299 5028 1996 10439 8557 2613 20138 13923 2013 1996 2707 2021 20915 2115 102
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:label: -1 (id = 0)
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] buy gm ##e and amc now is our time rise up and buy [SEP]
INFO:tensorflow:input_ids: 101 4965 13938 2063 1998 21962 2085 2003 2256 2051 4125 2039 1998 4965 102 

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Writing example 0 of 13677
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] don ##let em do it to you r ##h should be shut down ##ne ##ver touched the app ##use real broker ##ages from the start but ##bet your [SEP]
INFO:tensorflow:input_ids: 101 2123 7485 7861 2079 2009 2000 2017 1054 2232 2323 2022 3844 2091 2638 6299 5028 1996 10439 8557 2613 20138 13923 2013 1996 2707 2021 20915 2115 102
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:label: -1 (id = 0)
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] buy gm ##e and amc now is our time rise up and buy [SEP]
INFO:tensorflow:input_ids: 101 4965 13938 2063 1998 21962 2085 2003 2256 2051 4125 2039 1998 4965 102 

In [12]:
pretty_print(result_body)

Unnamed: 0,values
eval_accuracy,0.775731
false_negatives,486.0
false_positives,281.0
loss,0.794941
precision,0.458574
recall,0.328729
true_negatives,2415.0
true_positives,238.0
global_step,2564.0


### Training: All dataset

In [None]:
# OUTPUT_DIR = 'output_all'
# train_all, test_all = train_test_split(df_all, test_size=0.2, random_state=42)
# result_all, estimator_all, test_input_fn_all, label_list_all = run_on_dfs(train_all, test_all, **myparam)

In [None]:
# pretty_print(result_all)

# Evaluation
### Evaluation: Title dataset

In [13]:
def validate(result, estimator, test_input_fn, label_list, train, test):
    Y_pred = []
    Y_test = test['label'].tolist()
    pred_tf = estimator.predict(test_input_fn)
    for (i, prediction) in enumerate(pred_tf):
        probabilities = prediction["probabilities"]
        label = label_list[prediction["labels"]]
        Y_pred.append(label)
    from sklearn.metrics import classification_report, accuracy_score
    print(accuracy_score(np.array(Y_pred), np.array(Y_test)))
    print(classification_report(np.array(Y_pred), np.array(Y_test), digits=4))

In [14]:
validate(result_title, estimator_title, test_input_fn_title, label_list_title, train_title, test_title)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from output_title\model.ckpt-5360
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
0.7529033160766755
              precision    recall  f1-score   support

          -1     0.8695    0.8261    0.8473      5929
           1     0.3190    0.3966    0.3536      1218

    accuracy                         0.7529      7147
   macro avg     0.5943    0.6113    0.6004      7147
weighted avg     0.7757    0.7529    0.7631      7147



INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from output_title\model.ckpt-5360
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


### Evaluation: Body dataset

In [15]:
validate(result_body, estimator_body, test_input_fn_body, label_list_body, train_body, test_body)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from output_body\model.ckpt-2564
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
0.7751461988304094
              precision    recall  f1-score   support

          -1     0.8961    0.8317    0.8627      2905
           1     0.3246    0.4563    0.3793       515

    accuracy                         0.7751      3420
   macro avg     0.6104    0.6440    0.6210      3420
weighted avg     0.8101    0.7751    0.7899      3420



INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from output_body\model.ckpt-2564
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


### Evaluation: All dataset

In [None]:
# validate(result_all, estimator_all, test_input_fn_title, label_list_all, train_all, test_title)

In [None]:
# validate(result_all, estimator_all, test_input_fn_body, label_list_all, train_all, test_body)

In [None]:
# validate(result_all, estimator_all, test_input_fn_all, label_list_all, train_all, test_all)

# Prediction

In [None]:
# load test dataset
title_test_cleanall = np.load('./TestData/features_title.npy', allow_pickle=True)
body_test_cleanall = np.load('./TestData/features_body.npy', allow_pickle=True)
title_dummy = []
for i in range(len(title_test_cleanall)):
    title_dummy.append(random.randint(-1, 1))
body_dummy = []
for i in range(len(body_test_cleanall)):
    body_dummy.append(random.randint(-1, 1))
df_title_test = pd.DataFrame({'label':title_dummy, 'text':title_test_cleanall})
df_body_test = pd.DataFrame({'label':body_dummy, 'text':body_test_cleanall})

In [None]:
# df_title_test.head(30)

In [None]:
def feature_predict_label(test, estimator, MAX_SEQ_LENGTH, label_list_train):
    tokenizer = create_tokenizer_from_hub_module("https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1")
    input_example = test.apply(lambda x: bert.run_classifier.InputExample(guid=None,
                                                                   text_a = x["text"],
                                                                   text_b = None,
                                                                   label = 0), axis = 1)
    label_list = test["label"].unique().tolist()
    input_features = bert.run_classifier.convert_examples_to_features(input_example, label_list, MAX_SEQ_LENGTH, tokenizer)
    predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)
    pred_tf = estimator.predict(predict_input_fn)

    Y_pred=[]
    Y_prob=[]
    for (i, prediction) in enumerate(pred_tf):
        probabilities = prediction["probabilities"]
        labels = label_list_train[prediction["labels"]]
        Y_pred.append(labels)
        Y_prob.append(probabilities)
    return Y_pred, Y_prob

### Predict Title test dataset and save the prediction

In [None]:
# # Predict Obama test dataset with "title" Model
# Y_title_pred = feature_predict_label(df_title_test, estimator_title, 30, label_list_title)
# with open('obama.txt', 'w') as f:
#     f.write("67_76_Charlie_Wang_Chieh-Hsi_Lin\n")
#     for index, pred in enumerate(Y_title_pred[0]):
#         f.write("{};;{}\n".format(index+1, pred))
# f.close()
#
# # Predict Obama test dataset with "All" Model
# Y_title_pred_all = feature_predict_label(df_title_test, estimator_all, 30, label_list_all)
# with open('obama_all.txt', 'w') as f:
#     for index, pred in enumerate(Y_title_pred_all[0]):
#         f.write("{};;{}\n".format(index+1, pred))
# f.close()

In [None]:
len(Y_title_pred[0])

In [None]:
# print(accuracy_score(np.array(Y_title_pred[0]), np.array(df_title.label.values)))

### Predict Body test dataset and save the prediction

In [None]:
# # Predict Body test dataset with "Body" Model
# Y_body_pred = feature_predict_label(df_body_test, estimator_body, 30, label_list_body)
# with open('body.txt', 'w') as f:
#     f.write("67_76_Charlie_Wang_Chieh-Hsi_Lin\n")
#     for index, pred in enumerate(Y_body_pred[0]):
#         f.write("{},,{}\n".format(index+1, pred))
# f.close()
#
# # Predict Romney test dataset with "All" Model
# Y_body_pred_all = feature_predict_label(df_body_test, estimator_all, 30, label_list_all)
# with open('romney_all.txt', 'w') as f:
#     for index, pred in enumerate(Y_body_pred_all[0]):
#         f.write("{},,{}\n".format(index+1, pred))
# f.close()

In [None]:
len(Y_body_pred[0])

In [None]:
print(accuracy_score(np.array(Y_body_pred[0]), np.array(df_body.label.values)))

## utilities

In [None]:
# pip install "tensorflow>=1.15,<2.0"
# pip install --upgrade tensorflow-hub
# pip install bert-tensorflow==1.0.1
print(label_list_title)
print(label_list_body)
# print(label_list_all)

In [None]:
# df_title["label"].unique().tolist()
import os
from tensorflow.python.client import device_lib
os.environ["CUDA_VISIBLE_DEVICES"] = ""
print(device_lib.list_local_devices())
import tensorflow as tf
print(tf.__version__)

In [None]:
# Y_pred = []
# Y_test = test['label'].tolist()
# pred_tf = estimator.predict(test_input_fn)
# label_dict = {"0":"0", "1":"1", "2":"-1"}
# target_names = ['class 0', 'class 1', 'class 2']
# for (i, prediction) in enumerate(pred_tf):
#     probabilities = prediction["probabilities"]
#     label = label_dict[str(prediction["labels"])]
#     Y_pred.append(label)
# from sklearn.metrics import classification_report, accuracy_score
# print(accuracy_score(np.array(Y_pred), np.array(Y_test)))
# print(classification_report(np.array(Y_pred), np.array(Y_test), digits=4))

In [None]:
# train, test = train_test_split(df_title, test_size=0.2, random_state=583)

In [None]:
# tokenizer = create_tokenizer_from_hub_module("https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1")
# test_features = make_features(test, label_list, 30, tokenizer, DATA_COLUMN="text", LABEL_COLUMN="label")
# test_input_fn = run_classifier.input_fn_builder(
#         features=test_features,
#         seq_length=30,
#         is_training=False,
#         drop_remainder=False)

In [None]:
# Y_pred = []
# Y_test = test['label'].tolist()
# pred_tf = estimator.predict(test_input_fn)
# target_names = ['class 0', 'class 1', 'class 2']
# for (i, prediction) in enumerate(pred_tf):
#     probabilities = prediction["probabilities"]
#     label = label_list[prediction["labels"]]
#     Y_pred.append(label)
# from sklearn.metrics import classification_report, accuracy_score
# print(accuracy_score(np.array(Y_pred), np.array(Y_test)))
# print(classification_report(np.array(Y_pred), np.array(Y_test), digits=4))

In [None]:
# label_list
def save_dataset(targets, tar_name):
    print('Saving training dataset...')

    #Create Saving Files
    if not os.path.exists('/TrainData'):
        os.makedirs('/TrainData')
    np.save('C:/Users/sluge/Desktop/CS583/CS583_ResearchProject/TrainData/' + tar_name + '.npy', targets)

    print('Saved parsed dataset')


In [None]:
save_dataset(Y_title_pred[0], "BERT_title_result")
save_dataset(Y_body_pred[0], "BERT_body_result")
save_dataset(Y_title_pred_all[0], "BERT_title_all_result")
save_dataset(Y_body_pred_all[0], "BERT_body_all_result")


In [None]:
body_svc_result = np.load('./TrainData/SVC_body_result.npy', allow_pickle=True)
print(len(body_svc_result))

print(accuracy_score(Y_body_pred[0], body_svc_result))
print(classification_report(Y_body_pred[0], body_svc_result))

In [None]:
title_svc_result = np.load('./TrainData/SVC_title_result.npy', allow_pickle=True)
print(len(title_svc_result))

print(accuracy_score(Y_title_pred[0], title_svc_result))
print(classification_report(Y_title_pred[0], title_svc_result))