In [26]:
# Install the Hugging Face `transformers` package, which the import cell below pulls the
# BERT tokenizer and TF classification model from.
!pip install transformers



In [42]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf
import pandas as pd
import os
import shutil
import numpy as np


In [28]:
!pip install --upgrade --no-cache-dir gdown #Getting my data, and test from my google drive
! gdown --id 1wnkxnvgpn4nrGPtO4HP3dFykDd-c_LLV
! gdown --id 1HjrcuFwEMOTxpNbn_5WN7okElUJ8UVcs

Downloading...
From: https://drive.google.com/uc?id=1wnkxnvgpn4nrGPtO4HP3dFykDd-c_LLV
To: /content/data.npz
100% 30.2M/30.2M [00:00<00:00, 128MB/s]
Downloading...
From: https://drive.google.com/uc?id=1HjrcuFwEMOTxpNbn_5WN7okElUJ8UVcs
To: /content/test.npz
100% 29.5M/29.5M [00:00<00:00, 87.2MB/s]


In [29]:
import numpy as np

# Open both downloaded .npz archives: `data` feeds the train/validation split,
# `test` is kept aside for the final evaluation.
data, test = (np.load(archive_name) for archive_name in ('data.npz', 'test.npz'))


In [30]:
# Show the names of the arrays stored in the training archive.
[*data.keys()]

['a', 'b']

In [31]:
# Pull the raw arrays out of the archive: 'a' holds the texts, 'b' the labels.
data_feat = data['a']
# Labels are cast through float to int (the author's note: the decimals kept showing otherwise).
data_lab = data['b'].astype('float').astype('int')
# Build the frame column-wise. This keeps the label column as a real integer dtype and
# avoids creating a 2 x N object frame only to transpose it.
data = pd.DataFrame({'DATA_COLUMN': data_feat, 'LABEL_COLUMN': data_lab})
data.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,This is one of the better comedies that has ev...,0
1,Despite the overwhelming cult following for th...,1
2,"Kurt Russell, whose career started when he kic...",0
3,What a dog of a movie. Noni Hazelhurst's perfo...,1
4,Steve Carell stars as a person who you can rel...,0


In [32]:
from sklearn.model_selection import train_test_split

# Separate the independent variable (x: the text) from the dependent variable (y: the label).
x = data['DATA_COLUMN']
y = data['LABEL_COLUMN']
# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)
# Put the training texts and labels side by side again. concat(axis=1) keeps the column
# names, the original row index and each column's dtype, with no transpose needed.
train_data = pd.concat([x_train, y_train], axis=1)
train_data.head()




Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
23311,I rented this movie for a few laughs. I had ne...,1
23623,"Besides being boring, the scenes were oppressi...",1
1020,This definitely is NOT the intellectual film w...,0
12645,If you are going to attempt building tension i...,1
1533,"Fox's ""The True Story Of Jesse James"" (1957) i...",1


In [33]:
# Pair the validation texts with their labels in a single frame. concat(axis=1) keeps the
# column names, the original row index and each column's dtype, with no transpose needed.
val_data = pd.concat([x_val, y_val], axis=1)
val_data.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
6868,I am amazed that movies like this can still be...,0
24016,"""Mad Dog Time""...""Trigger Happy"" whatever you ...",1
9668,We tend to forget that the master/slave contex...,0
13640,I read some previous comments stating that thi...,0
14018,"THE RUNNING MAN, along with TOTAL RECALL, is m...",0


In [34]:
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN):
  """Wrap every row of two DataFrames in an ``InputExample``.

  Args:
    train: DataFrame holding the training rows.
    test: DataFrame holding the second split (the notebook passes its validation frame).
    DATA_COLUMN: name of the column that contains the text.
    LABEL_COLUMN: name of the column that contains the label.

  Returns:
    A ``(train_InputExamples, validation_InputExamples)`` pair; each element is the
    result of a row-wise ``DataFrame.apply`` that produces one ``InputExample`` per row.
  """
  def _to_examples(frame):
    # Single-sentence task: only text_a is filled in, guid and text_b stay None.
    return frame.apply(lambda row: InputExample(guid=None,
                                                text_a=row[DATA_COLUMN],
                                                text_b=None,
                                                label=row[LABEL_COLUMN]), axis=1)

  # Use the frames that were passed in, not whatever globals happen to exist in the notebook.
  train_InputExamples = _to_examples(train)
  validation_InputExamples = _to_examples(test)

  return train_InputExamples, validation_InputExamples
  
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize examples and expose them as an unbatched ``tf.data.Dataset``.

    Args:
        examples: iterable of objects with ``text_a`` and ``label`` attributes
            (``text_b`` is not read).
        tokenizer: Hugging Face tokenizer used to encode ``text_a``.
        max_length: every sequence is truncated or padded to exactly this many tokens.

    Returns:
        A dataset whose elements are ``(inputs, label)``, where ``inputs`` is a dict with
        int32 ``input_ids``, ``attention_mask`` and ``token_type_ids`` vectors and
        ``label`` is an int64 scalar.
    """
    features = []  # InputFeatures, tokenized once up front so re-iterating the dataset is cheap

    for e in examples:
        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`;
        # both pad on the right up to `max_length`.
        input_dict = tokenizer(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
        )

        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict["attention_mask"],
                token_type_ids=input_dict["token_type_ids"],
                label=e.label,
            )
        )

    def gen():
        # Yield one (model inputs, label) pair per example, in the original order.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        output_types=(
            {"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32},
            tf.int64,
        ),
        output_shapes=(
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )

In [35]:
# Quick look at what a single hand-built InputExample looks like.
InputExample(guid=None, text_a="Hello, world", text_b=None, label=1)

InputExample(guid=None, text_a='Hello, world', text_b=None, label=1)

In [36]:
# Column names used by the train, validation and test frames.
DATA_COLUMN = 'DATA_COLUMN'
LABEL_COLUMN = 'LABEL_COLUMN'

# Wrap every training and validation row in an InputExample.
(train_InputExamples,
 validation_InputExamples) = convert_data_to_examples(train_data, val_data, DATA_COLUMN, LABEL_COLUMN)

In [37]:
# Checkpoint to fine-tune. Change this one constant to try another BERT size
# (tiny, mini, small, medium or base).
MODEL_NAME = "google/bert_uncased_L-4_H-256_A-4"

# from_pt=True loads the checkpoint's PyTorch weights into the TensorFlow model class.
model = TFBertForSequenceClassification.from_pretrained(MODEL_NAME, from_pt=True)
# Load the tokenizer from the same checkpoint as the model so the vocabulary always
# matches, even when MODEL_NAME is swapped for a different variant.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model.summary()

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  11170560  
                                                                 
 dropout_27 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  514       
                                                                 
Total params: 11,171,074
Trainable params: 11,171,074
Non-trainable params: 0
_________________________________________________________________


In [38]:
# Convert both frames to InputExamples, then to tokenized tf.data pipelines.
train_InputExamples, validation_InputExamples = convert_data_to_examples(
    train_data, val_data, DATA_COLUMN, LABEL_COLUMN
)

# Training stream: shuffle through a 100-element buffer, batch by 32, repeat the data twice.
training_data = (
    convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
    .shuffle(100)
    .batch(32)
    .repeat(2)
)

# Validation stream: batched only, no shuffling or repetition.
validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer).batch(32)



In [39]:
# Fine-tuning setup: Adam with gradient-norm clipping, sparse cross-entropy computed on
# raw logits, and accuracy as the reported metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss_fn, metrics=[accuracy_metric])

# One Keras epoch over the training stream, validating on the held-out split afterwards.
model.fit(training_data, epochs=1, validation_data=validation_data)



<keras.callbacks.History at 0x7fabfd2ee590>

In [40]:
# Pull the raw arrays out of the test archive: 'a' holds the texts, 'b' the labels.
test_feat = test['a']
# Labels are cast through float to int, matching how the training labels are prepared.
test_lab = test['b'].astype('float').astype('int')

# Build the frame column-wise. This keeps the label column as a real integer dtype and
# avoids creating a 2 x N object frame only to transpose it.
test_data = pd.DataFrame({'DATA_COLUMN': test_feat, 'LABEL_COLUMN': test_lab})
test_data.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,"I found this movie really hard to sit through,...",1
1,The movie starts off with Reeve (Ekin) and his...,1
2,I had a VERY hard time sitting through this fi...,1
3,"I'm not a big fan of musicals, but I was alway...",0
4,I honestly fail to understand why people love ...,1


In [44]:
# Predict a class for every test sentence, one fixed-size batch at a time.
PREDICT_BATCH_SIZE = 50

# Keep the sentences in a plain Python list: a NumPy string array would pad every entry
# to the width of the longest sentence.
sentences = test_data['DATA_COLUMN'].tolist()

batch_predictions = []
# Slicing handles any number of sentences, including a final batch that is not full.
for start in range(0, len(sentences), PREDICT_BATCH_SIZE):
  batch = sentences[start:start + PREDICT_BATCH_SIZE]
  tf_batch = tokenizer(batch, max_length=128, padding=True, truncation=True, return_tensors='tf')
  tf_outputs = model(tf_batch)
  # tf_outputs[0] holds the logits; softmax is monotonic, so argmax over the logits
  # picks the same class as argmax over the probabilities.
  batch_predictions.append(tf.argmax(tf_outputs[0], axis=1).numpy())

# Join all batches once at the end instead of re-copying the array on every iteration.
weird_array = np.concatenate(batch_predictions) if batch_predictions else np.array([], dtype=np.int64)


In [None]:
from sklearn.metrics import classification_report

# Ground-truth test labels as a NumPy array, next to the predictions gathered during inference.
actual_labels = np.asarray(test_data['LABEL_COLUMN'].tolist())
predicted_labels = weird_array

In [None]:
import sklearn.metrics as metrics

# Tabulate the actual labels against the predicted ones.
metrics.confusion_matrix(y_true=actual_labels, y_pred=predicted_labels)

In [None]:
# Fraction of test sentences whose predicted label matches the actual one.
metrics.accuracy_score(y_true=actual_labels, y_pred=predicted_labels)

In [None]:

# Precision / recall / F1 summary.
# NOTE(review): labels=[1] limits the report to class 1 only; confirm that leaving
# class 0 out is intended.
print(classification_report(y_true=actual_labels, y_pred=predicted_labels, labels=[1]))
