## Install dependencies

In [1]:
!pip install -q transformers datasets tensorflow accelerate

## Check GPU

In [2]:
import tensorflow as tf

gpu = tf.config.list_physical_devices('GPU')
if gpu:
    name = tf.config.experimental.get_device_details(gpu[0]).get("device_name","unknown")
    print("CUDA available: TRUE")
    print("GPU name: ",name)
else:
    print("CUDA available: FALSE")
    print("GPU: No GPU")


CUDA available: TRUE
GPU name:  Tesla T4


## 0. Import dependencies

In [3]:
from datasets import load_dataset
from transformers import BertTokenizer, TFAutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
import tensorflow as tf
import numpy as np
import pandas as pd
import re
import string

## 1. Load IMDb dataset

In [4]:
dataset = load_dataset("imdb")
train_ds = dataset["train"]
test_ds = dataset["test"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

### convert to pandas

In [5]:
train_ds_pd = train_ds.to_pandas()
test_ds_pd = test_ds.to_pandas()

print("Train label distribution:\n", train_ds_pd['label'].value_counts())
print("Test label distribution:\n", test_ds_pd['label'].value_counts())

Train label distribution:
 label
0    12500
1    12500
Name: count, dtype: int64
Test label distribution:
 label
0    12500
1    12500
Name: count, dtype: int64


### preprocessing

In [6]:
def clean_text(text):
    if not isinstance(text, str):
        return ""
    return text.strip()
# bert model can understand url,puntuations
"""
def clean_text(text):
    text = text.lower().strip()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(f"[{re.escape(string.punctuation)}]", '', text)
    text = re.sub(r'\d+', '', text)
    return re.sub(r'\s+', ' ', text)


df['cleaned_text'] = parallel_process(
    lambda chunk: chunk.apply(clean_text).values,
    df['Comment'],
    n_jobs=-1
)
"""

def preprocess_dataset(dataset):
    dataset = dataset.map(lambda x: {"text": clean_text(x["text"])})
    return dataset

train_ds = preprocess_dataset(train_ds)
test_ds = preprocess_dataset(test_ds)


  text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

## 2. Tokenizer & Model

In [7]:
MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

### Tokenization function

In [8]:
MAX_LENGTH = 256
BATCH_SIZE = 16

In [9]:
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH
    )

## 3.  Convert to TensorFlow dataset

In [10]:
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")

def to_tf_dataset(dataset, shuffle=False):
    tokenized = dataset.map(
        tokenize,
        batched=True,
        remove_columns=["text"]
    )
    return tokenized.to_tf_dataset(
        columns=["input_ids", "attention_mask", "token_type_ids"],
        label_cols=["label"],
        shuffle=shuffle,
        batch_size=BATCH_SIZE,
        collate_fn=data_collator
    )


In [11]:
tf_train = to_tf_dataset(train_ds, shuffle=True)
tf_test = to_tf_dataset(test_ds)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

## 4. Build BERT model for classification

In [12]:
LEARNING_RATE = 2e-5

In [13]:
# Load pretrained BERT with a classification head
model = TFAutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    from_pt=True
)

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Compile the model

In [14]:
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

## 5. Train the model

In [15]:
EPOCHS = 3  # 2-3 epochs are enough for IMDb
history = model.fit(
    tf_train,
    validation_data=tf_test,
    epochs=EPOCHS
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


## 6. Evaluate the model

In [16]:
results = model.evaluate(tf_test)
print("Test loss:", results[0])
print("Test accuracy:", results[1])

Test loss: 0.2598564326763153
Test accuracy: 0.9164800047874451


In [17]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:

save_dir = "/content/drive/MyDrive/movie_senti"

# Save model (TensorFlow format)
model.save_pretrained(save_dir)

# Save tokenizer (same for both frameworks)
tokenizer.save_pretrained(save_dir)

# Verify files exist
!ls -lh {save_dir}

# priint if save or not
print("done and dusted")

total 419M
-rw------- 1 root root  610 Jan 22 17:38 config.json
-rw------- 1 root root  125 Jan 22 17:39 special_tokens_map.json
-rw------- 1 root root 418M Jan 22 17:39 tf_model.h5
-rw------- 1 root root 1.3K Jan 22 17:39 tokenizer_config.json
-rw------- 1 root root 227K Jan 22 17:39 vocab.txt
done and dusted
