In [32]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import tensorflow as tf
from datasets import load_dataset
from transformers import TFBertModel,TFBertForSequenceClassification,BertTokenizerFast,BertForSequenceClassification
import keras
from transformers import create_optimizer
import pandas as pd
import string
from keras.layers import TextVectorization
from sklearn.model_selection import train_test_split
from datasets.dataset_dict import DatasetDict
from datasets import Dataset

In [33]:
VOCAB_SIZE=1000
SEQUENCE_LENGTH = 512
BATCH_SIZE=1000

In [34]:
data = pd.read_csv("G:\Ajay\dataset\IMDB Movie dataset\IMDB Dataset.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [35]:
def cleanup_text(text):
    text=text.translate(str.maketrans('', '',string.punctuation))
    return text.lower()

def change_sentiment(sentiment):
    if sentiment == "positive":
       return 1
    else:
        return 0

In [36]:
data["review"]=data["review"].apply(lambda x:cleanup_text(x))
data["sentiment"]=data["sentiment"].apply(lambda x:change_sentiment(x))

In [37]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [38]:
def preprocess_function(examples):
    return tokenizer(examples['text'],padding=True,max_length=512,truncation=True,return_tensors="tf")

In [39]:
X = data["review"]
Y = data["sentiment"]
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=0,stratify=Y)

In [40]:
d = {'train':Dataset.from_dict({'text':x_train.tolist(),'label':y_train.tolist()}),
    'test':Dataset.from_dict({'text':x_test.tolist(),'label':y_test.tolist()})}
dataset=DatasetDict(d)

In [41]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 40000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 10000
    })
})

In [42]:
dataset=dataset.map(preprocess_function,batched=True)

Map: 100%|██████████████████████████████████████████████████████████████| 40000/40000 [00:08<00:00, 4635.88 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████| 10000/10000 [00:02<00:00, 4610.02 examples/s]


In [43]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 40000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10000
    })
})

In [44]:
tf_train_dataset = dataset["train"].to_tf_dataset(
    columns=["input_ids","token_type_ids","attention_mask","label"],
    shuffle=True,
    batch_size=10
)

tf_val_dataset = dataset["test"].to_tf_dataset(
    columns=["input_ids","token_type_ids","attention_mask","label"],
    shuffle=True,
    batch_size=10
)

In [45]:
def swap_positions(dataset):
    return {
        "input_ids":dataset["input_ids"],
        "token_type_ids":dataset["token_type_ids"],
        "attention_mask":dataset["attention_mask"],
        "labels":dataset["label"]
    }

In [46]:
tf_train_dataset = tf_train_dataset.map(swap_positions)
tf_val_dataset = tf_val_dataset.map(swap_positions)

In [47]:
num_epochs = 3
BATCH_SIZE = 4
batch_per_epoch = int(len(dataset["train"])//BATCH_SIZE)
total_train_steps = int(batch_per_epoch*num_epochs)
print("num_epochs :{}".format(num_epochs))
print("batch_per_epoch :{}".format(batch_per_epoch))
print("total_train_steps :{}".format(total_train_steps))
opt,scheduler = create_optimizer(init_lr=2e-5,num_train_steps=total_train_steps,num_warmup_steps=0)

num_epochs :3
batch_per_epoch :10000
total_train_steps :30000


In [48]:
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased",num_labels=2)
model.compile(optimizer=opt,metrics=["accuracy"])

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
model.fit(tf_train_dataset,validation_data=tf_val_dataset,epochs=2,batch_size=32)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x178d1ec1460>

In [50]:
model.save_pretrained("./custom_bert")

In [51]:
# model = BertModel.from_pretrained("./custom_bert")
model = TFBertForSequenceClassification.from_pretrained("./custom_bert",num_labels=2)

Some layers from the model checkpoint at ./custom_bert were not used when initializing TFBertForSequenceClassification: ['dropout_113']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at ./custom_bert.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [54]:
class_labels = ["negetive","positive"]
review_text = ["This movie looks very integresting and a great job","Movie is too length and not at all good"]
inputs =tokenizer(review_text,padding=True,return_tensors="tf")
logits = model(**inputs).logits
result = np.argmax(logits,axis=1)
for i,val in enumerate(result):
    print("review is -> {}".format(class_labels[val]))

review is -> positive
review is -> negetive
