In [19]:
# Step 1: Import Libraries
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split


In [20]:
# Step 2: Load Custom Dataset CSV
# Read the labelled reviews from disk, then show the first rows and the row
# count as a quick sanity check of what was loaded.
df = pd.read_csv("custom_sentiment_dataset.csv")
print("Dataset preview:", df.head(), sep="\n")
print(f"\nTotal samples: {len(df)}")


Dataset preview:
                                                text  label
0                      I loved the new Batman movie!      1
1               The plot was boring and predictable.      0
2  A masterpiece. The acting, direction—everythin...      1
3         I wouldn’t recommend this movie to anyone.      0
4  Decent movie with a strong performance by the ...      1

Total samples: 15


In [21]:
# Step 3: Split Data into Training and Validation
# Hold out 20% of the rows for validation. The split is stratified on the
# label column so both partitions keep the dataset's class ratio: with so few
# rows, an unstratified hold-out can easily contain a single class, which
# would make the validation accuracy meaningless. `random_state` keeps the
# partition reproducible across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"].tolist(),
    df["label"].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)
print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")


Training samples: 12
Validation samples: 3


In [22]:
# Step 4: Load BERT tokenizer and tokenize data
# The tokenizer must match the checkpoint the model is loaded from.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Each split is encoded in one call, with truncation and padding enabled.
print("\nTokenizing training data...")
train_encodings = tokenizer(text=train_texts, padding=True, truncation=True)
print("Tokenizing validation data...")
val_encodings = tokenizer(text=val_texts, padding=True, truncation=True)





Tokenizing training data...
Tokenizing validation data...


In [23]:
# Step 5: Convert tokenized data to TensorFlow Dataset
# Each dataset element is a (features, label) pair, where the features are the
# tokenizer outputs converted to a plain dict.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(train_encodings), train_labels)
)
val_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(val_encodings), val_labels)
)
print("\nDatasets ready for training.")



Datasets ready for training.


In [24]:
# Step 6: Load pretrained BERT model for sequence classification
# The encoder weights come from the checkpoint; the classification head is
# newly initialised and is trained in the following steps.
# `num_labels` is stated explicitly so the head width is visibly tied to the
# binary 0/1 labels of the dataset rather than to the library default.
print("Loading BERT model...")
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)


Loading BERT model...


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# Step 7: Compile the model
# The loss is configured with from_logits=True, i.e. it is given the model's
# raw (un-normalised) outputs together with integer class labels.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    metrics=["accuracy"],
)
print("Model compiled.")


Model compiled.


In [26]:
# Step 8: Train the model
# Batching is done on the tf.data pipelines themselves (`.batch(8)`), so
# `batch_size` is deliberately NOT passed to fit(): Keras documents that it
# must not be specified for dataset inputs, because the dataset already
# yields batches.
print("\nStarting training...")
history = model.fit(
    train_dataset.shuffle(100).batch(8),  # shuffle before batching, each epoch
    epochs=10,
    validation_data=val_dataset.batch(8),
)
print("Training complete.")



Starting training...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training complete.


In [27]:
# Step 9: Sentiment Analysis on New Reviews

# Unseen example reviews to classify
new_reviews = [
    "A truly positive story told beautifully.",
    "This was the worst film I've seen in years. Completely boring and predictable.",
    "It was not great but not terrible either.",
]

# Encode the reviews with the same tokenizer, returning TensorFlow tensors
new_encodings = tokenizer(
    new_reviews,
    truncation=True,
    padding=True,
    return_tensors="tf",
)

# Forward pass; the output object exposes the raw logits
outputs = model(new_encodings)

# Softmax over the last axis turns the logits into probabilities
probs = tf.nn.softmax(outputs.logits, axis=-1)

# Index of the most probable class for each review (0 or 1)
predicted_classes = tf.argmax(probs, axis=1).numpy()

# Report every review with its predicted label and the confidence in it
for review, pred, prob in zip(new_reviews, predicted_classes, probs.numpy()):
    if pred == 1:
        sentiment = "Positive"
    else:
        sentiment = "Negative"
    # Probability assigned to the predicted class, as a percentage
    confidence = prob[pred] * 100
    print(
        f"Review: {review}",
        f"Predicted sentiment: {sentiment} ({confidence:.2f}%)\n",
        sep="\n",
    )


Review: A truly positive story told beautifully.
Predicted sentiment: Positive (60.57%)

Review: This was the worst film I've seen in years. Completely boring and predictable.
Predicted sentiment: Negative (74.54%)

Review: It was not great but not terrible either.
Predicted sentiment: Negative (75.68%)



ValueError: All arrays must be of the same length