In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import swifter
from sklearn.model_selection import train_test_split
import re
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

In [2]:
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
NUM_WORKERS = 8
TEST_SIZE = 0.3
LABEL_COLUMN_NAME = "oh_label"
TEXT_COLUMN_NAME = "Text"
DATASET_PATH = "../Data/ver1.csv"
df= pd.read_csv(DATASET_PATH, index_col=False)
df = df.dropna()
df.drop(["Unnamed: 0", "index"],axis=1, inplace = True)

In [4]:
from sklearn.model_selection import train_test_split
df = df.sample(frac=1).reset_index(drop=True)
X,y = df[TEXT_COLUMN_NAME],df[LABEL_COLUMN_NAME]
x_train,x_test, y_train,y_test = train_test_split(X,y, test_size=0.30, random_state=42)
x_test,x_val,y_test,y_val = train_test_split(x_test,y_test, test_size=0.50, random_state=42)

In [5]:
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN): 
    train_InputExamples = train.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[DATA_COLUMN], 
                                                          text_b = None,
                                                          label = x[LABEL_COLUMN]), axis = 1)

    validation_InputExamples = test.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                              text_a = x[DATA_COLUMN], 
                                                              text_b = None,
                                                              label = x[LABEL_COLUMN]), axis = 1)
  
    return train_InputExamples, validation_InputExamples

  
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = [] # -> will hold InputFeatures to be converted later

    for e in examples:
        # Documentation is really strong for this method, so please take a look at it
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length, # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True, # pads to the right by default # CHECK THIS for pad_to_max_length
            truncation=True
        )

        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],
            input_dict["token_type_ids"], input_dict['attention_mask'])

        features.append(
            InputFeatures(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label
            )
        )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


DATA_COLUMN = 'DATA_COLUMN'
LABEL_COLUMN = 'LABEL_COLUMN'

In [6]:
train_InputExamples, validation_InputExamples = convert_data_to_examples(pd.DataFrame({DATA_COLUMN:x_train[:5000],LABEL_COLUMN:y_train[:5000]}), pd.DataFrame({DATA_COLUMN:x_test,LABEL_COLUMN:y_test}), DATA_COLUMN, LABEL_COLUMN)

train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(100).batch(32).repeat(2)

validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)

In [7]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
              )

model.fit(train_data, epochs=3, validation_data=validation_data,use_multiprocessing=True,workers=8)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1b8a08cddc0>

In [8]:
model.save("bert_best")



INFO:tensorflow:Assets written to: bert_best\assets


INFO:tensorflow:Assets written to: bert_best\assets


3929     InputExample(guid=None, text_a=3929      USE U...
66365    InputExample(guid=None, text_a=3929      USE U...
9267     InputExample(guid=None, text_a=3929      USE U...
23724    InputExample(guid=None, text_a=3929      USE U...
77309    InputExample(guid=None, text_a=3929      USE U...
                               ...                        
92202    InputExample(guid=None, text_a=3929      USE U...
69141    InputExample(guid=None, text_a=3929      USE U...
97106    InputExample(guid=None, text_a=3929      USE U...
478      InputExample(guid=None, text_a=3929      USE U...
33121    InputExample(guid=None, text_a=3929      USE U...
Length: 15000, dtype: object

In [14]:
test = pd.DataFrame({DATA_COLUMN:x_test,LABEL_COLUMN:y_test})
test_InputExamples = test.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = test[DATA_COLUMN], 
                                                          text_b = None,
                                                          label = test[LABEL_COLUMN]), axis = 1)
test_data = convert_examples_to_tf_dataset(list(test_InputExamples), tokenizer)

ValueError: Input 3929      USE US SMALL BRIAN 04ARNOLS STANDS FOR ANOLD ...
66365      == British Army ==  Yes, it's a hard one to ...
9267       ::: What BEHAVIOUR???? DEFENDING FROM YOUR A...
23724    RT @rkp12588: @YesYoureSexist @DaveGreene11 Ju...
77309    `  == stopfake.org ==  I'm not so sure this ca...
                               ...                        
92202     and even corroborated in the case of thew Dec...
69141    `  In the process you deleted 2 references/cit...
97106    ` :*Nominated here Template:Did you know nomin...
478      `  == Recap ==  I decided to withhold further ...
33121    @spanner77 @daniel_kaye there's a multitude of...
Name: DATA_COLUMN, Length: 15000, dtype: object is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.

In [9]:
from sklearn.metrics import classification_report
pred = model.predict(x_test)
print(classification_report(y_test,pred))

TypeError: in user code:

    C:\Users\yosef\AppData\Roaming\Python\Python38\site-packages\keras\engine\training.py:1586 predict_function  *
        return step_function(self, iterator)
    C:\ProgramData\Anaconda3\lib\site-packages\transformers\models\bert\modeling_tf_bert.py:1747 call  *
        outputs = self.bert(
    C:\ProgramData\Anaconda3\lib\site-packages\transformers\models\bert\modeling_tf_bert.py:787 call  *
        embedding_output = self.embeddings(
    C:\ProgramData\Anaconda3\lib\site-packages\transformers\models\bert\modeling_tf_bert.py:190 call  *
        inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
    C:\Users\yosef\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\util\dispatch.py:206 wrapper  **
        return target(*args, **kwargs)
    C:\Users\yosef\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\ops\array_ops.py:5069 gather_v2
        return gather(
    C:\Users\yosef\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\util\deprecation.py:549 new_func
        return func(*args, **kwargs)
    C:\Users\yosef\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\util\dispatch.py:206 wrapper
        return target(*args, **kwargs)
    C:\Users\yosef\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\ops\array_ops.py:5056 gather
        return params.sparse_read(indices, name=name)
    C:\Users\yosef\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\ops\resource_variable_ops.py:713 sparse_read
        value = gen_resource_variable_ops.resource_gather(
    C:\Users\yosef\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\ops\gen_resource_variable_ops.py:565 resource_gather
        _, _, _op, _outputs = _op_def_library._apply_op_helper(
    C:\Users\yosef\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\framework\op_def_library.py:628 _apply_op_helper
        _SatisfiesTypeConstraint(base_type,
    C:\Users\yosef\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\framework\op_def_library.py:59 _SatisfiesTypeConstraint
        raise TypeError(

    TypeError: Value passed to parameter 'indices' has DataType string not in list of allowed values: int32, int64
