In [12]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import BertTokenizer, TFBertForSequenceClassification, InputExample, InputFeatures


In [13]:
!pip install transformers 

  pid, fd = os.forkpty()




In [14]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/IMDB Dataset.csv


In [15]:
df=pd.read_csv("/kaggle/input/IMDB Dataset.csv")
df.sample()

Unnamed: 0,review,sentiment
33728,We have reached the ceiling of implausibility ...,negative


In [16]:
def cat2num(value):
    if value=='positive': 
        return 1
    else: 
        return 0
    
df['sentiment']  =  df['sentiment'].apply(cat2num)

In [17]:
train_data, temp_data = train_test_split(df, test_size=0.2, random_state=42, stratify=df['sentiment'])
validation_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42, stratify=temp_data['sentiment'])

In [18]:
# Display dataset sizes
print(f"Train size: {len(train_data)}")
print(f"Validation size: {len(validation_data)}")
print(f"Test size: {len(test_data)}")

Train size: 40000
Validation size: 5000
Test size: 5000


In [19]:
# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.summary()

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_75 (Dropout)        multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109483778 (417.65 MB)
Trainable params: 109483778 (417.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [20]:
# Convert DataFrame into InputExamples
def convert_data_to_examples(data, review_col, sentiment_col): 
    return data.apply(lambda x: InputExample(
        guid=None, text_a=x[review_col], label=x[sentiment_col]), axis=1)

# Create InputExamples for train, validation, and test
train_examples = convert_data_to_examples(train_data, 'review', 'sentiment')
validation_examples = convert_data_to_examples(validation_data, 'review', 'sentiment')
test_examples = convert_data_to_examples(test_data, 'review', 'sentiment')

In [21]:
# Convert InputExamples to TensorFlow datasets
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = []

    for e in tqdm(examples):
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True,
            truncation=True
        )

        features.append(InputFeatures(
            input_ids=input_dict["input_ids"], 
            attention_mask=input_dict["attention_mask"], 
            token_type_ids=input_dict["token_type_ids"], 
            label=e.label
        ))

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )
    return tf.data.Dataset.from_generator(
        gen,
        ({ "input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32 }, tf.int64),
        ({
            "input_ids": tf.TensorShape([None]),
            "attention_mask": tf.TensorShape([None]),
            "token_type_ids": tf.TensorShape([None]),
        },
        tf.TensorShape([])),
    )

In [28]:
# Prepare TensorFlow datasets
train_tf_data = convert_examples_to_tf_dataset(list(train_examples), tokenizer)
train_tf_data = train_tf_data.shuffle(100).batch(32).repeat(2)

validation_tf_data = convert_examples_to_tf_dataset(list(validation_examples), tokenizer)
validation_tf_data = validation_tf_data.batch(32)

test_tf_data = convert_examples_to_tf_dataset(list(test_examples), tokenizer)
test_tf_data = test_tf_data.batch(32)

100%|██████████| 40000/40000 [02:58<00:00, 224.55it/s]
100%|██████████| 5000/5000 [00:21<00:00, 227.44it/s]
100%|██████████| 5000/5000 [00:22<00:00, 226.81it/s]


In [29]:
# Compile the model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')]
)

In [30]:
steps_per_epoch = len(train_data) // 32  # Total samples divided by batch size
model.fit(
    train_tf_data,
    validation_data=validation_tf_data,
    epochs=3,
    steps_per_epoch=steps_per_epoch
)
# Evaluate the model on the test dataset
results = model.evaluate(test_tf_data)
print(f"Test Loss: {results[0]}")
print(f"Test Accuracy: {results[1]}")

Epoch 1/3
Epoch 2/3
Epoch 3/3
Test Loss: 0.34519365429878235
Test Accuracy: 0.895799994468689


In [31]:
# Perform inference on custom sentences
pred_sentences = [
    'worst movie of my life, will never watch movies from this series', 
    'Wow, blew my mind, what a movie by Marvel, animation and story is amazing',
    "The movie was fantastic! The characters and plot were very well developed.",
    "What a waste of time. The story was so boring and predictable."
]

# Tokenize input sentences
tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')

# Generate predictions
tf_outputs = model(tf_batch)
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)

# Convert predictions to labels
labels = ['Negative', 'Positive']
predicted_labels = tf.argmax(tf_predictions, axis=1).numpy()

# Display predictions
for i, sentence in enumerate(pred_sentences):
    print(f"'{sentence}' => Sentiment: {labels[predicted_labels[i]]}")

'worst movie of my life, will never watch movies from this series' => Sentiment: Negative
'Wow, blew my mind, what a movie by Marvel, animation and story is amazing' => Sentiment: Positive
'The movie was fantastic! The characters and plot were very well developed.' => Sentiment: Positive
'What a waste of time. The story was so boring and predictable.' => Sentiment: Negative
