## Setup and Data Download

In [None]:
# Upload kaggle.json from the local machine into the Colab working directory
# (presumably the Kaggle API credentials read by the `kaggle` CLI in the next cell).
from google.colab import files
files.upload()  # Select the kaggle.json file
# Copy the uploaded file into ~/.kaggle; the cp below expects it to be named
# kaggle.json and to sit in the current working directory.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Mode 600: readable and writable by the owner only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!pip install kaggle
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
!unzip imdb-dataset-of-50k-movie-reviews.zip

## Install Libraries

In [None]:
# Hugging Face Transformers and Datasets
!pip install transformers datasets
# Other common libraries
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm import tqdm

## Load and Prepare Data

In [None]:
# Read the reviews and replace the textual sentiment with an integer label
# (positive -> 1, negative -> 0).
df = (
    pd.read_csv('IMDB Dataset.csv')
    .assign(sentiment=lambda frame: frame['sentiment'].map({'positive': 1, 'negative': 0}))
)

# Training and test sets: hold out 20% of the rows, with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

## Initialize Tokenizer

In [None]:
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Example: encode one sentence, padded/truncated to exactly 128 positions and
# returned as PyTorch tensors.
sample = "I don't hate this movie"
tokens = tokenizer(
    sample,
    max_length=128,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
)
print(tokens)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

{'input_ids': tensor([[ 101, 1045, 2123, 1005, 1056, 5223, 2023, 3185,  102,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0,

## Create Custom Dataset

In [None]:
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one review per access.

    Args:
        texts: Positionally indexable collection of reviews; accessed through
            ``.iloc``, so a pandas Series is expected.
        labels: Labels aligned with ``texts``; also accessed through ``.iloc``.
        tokenizer: Callable that accepts ``padding``, ``truncation``,
            ``max_length`` and ``return_tensors`` keyword arguments and returns
            a mapping of tensors with a leading batch dimension of size 1.
        max_len: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized review at position ``idx`` together with its label."""
        review = str(self.texts.iloc[idx])
        target = self.labels.iloc[idx]
        encoded = self.tokenizer(
            review,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt',
        )
        # The tokenizer returns tensors with a leading batch dimension of one;
        # drop it so the DataLoader can stack samples itself.
        sample = {}
        for name, tensor in encoded.items():
            sample[name] = tensor.squeeze(0)
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# Wrap both splits; max_len is left at the class default (128).
train_dataset = SentimentDataset(texts=X_train, labels=y_train, tokenizer=tokenizer)
test_dataset = SentimentDataset(texts=X_test, labels=y_test, tokenizer=tokenizer)

## Create DataLoaders

In [None]:
# Batches of 16; only the training split is shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)


## Initialize Model and Optimizer

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained BERT with a 2-way classification head (one logit per sentiment class).
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)
model.to(device)

optimizer = AdamW(params=model.parameters(), lr=2e-5)


## Train Model

In [None]:
epochs = 3
num_training_steps = epochs * len(train_loader)

# Linearly decay the learning rate from its initial value to zero over the
# whole run. `get_scheduler` is imported at the top of the notebook but was
# never wired into the loop, so the learning rate previously stayed constant.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# The context manager closes the progress bar even if training is interrupted.
with tqdm(total=num_training_steps) as progress_bar:
    for epoch in range(epochs):
        model.train()
        for batch in train_loader:
            # Move every tensor of the batch to the device the model lives on.
            batch = {k: v.to(device) for k, v in batch.items()}
            # The batch contains 'labels', so the model output carries the loss.
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

100%|██████████| 7500/7500 [41:42<00:00,  3.02it/s]

## Evaluate Model

In [None]:
from sklearn.metrics import classification_report

# Switch to inference behaviour and collect predictions / ground truth for the
# whole test split.
model.eval()
preds, labels = [], []

with torch.no_grad():
    for batch in test_loader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        logits = model(**batch).logits
        # The predicted class is the index of the largest logit.
        preds.extend(logits.argmax(dim=-1).cpu().numpy())
        labels.extend(batch['labels'].cpu().numpy())

print(classification_report(labels, preds))


              precision    recall  f1-score   support

           0       0.90      0.87      0.89      4961
           1       0.88      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



## Create Prediction Function

In [None]:
def predict_bert(text, max_len=128):
    """Classify ``text`` as "positive" or "negative" with the fine-tuned model.

    Relies on the notebook-level ``model``, ``tokenizer`` and ``device``.

    Args:
        text: Review text to classify.
        max_len: Length the input is padded/truncated to. Defaults to 128, the
            sequence length the training dataset uses, so inference no longer
            truncates reviews at half the training length.

    Returns:
        "positive" if the highest logit is class 1, otherwise "negative".
    """
    model.eval()
    encoding = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors='pt',
    )
    # Inputs must live on the same device as the model.
    encoding = {k: v.to(device) for k, v in encoding.items()}
    with torch.no_grad():
        outputs = model(**encoding)
        pred = torch.argmax(outputs.logits, dim=-1).item()
    return "positive" if pred == 1 else "negative"

# Test examples; the trailing comment on each line is the label shown in the
# saved output of this cell.
for review in (
    "The movie wasn’t so bad, actually I enjoyed it a lot",  # positive
    "I don't hate it",  # positive
    "I hated this movie",  # negative
    "I dont like it or ı dont hate it",  # negative
):
    print(predict_bert(review))

positive
positive
negative
negative


## Save Model to Drive

In [None]:
import os

from google.colab import drive

# Mount Google Drive BEFORE writing. Without the mount, the path below is just
# a local directory on the Colab VM, so the checkpoint would never reach Drive
# (and a later drive.mount may be refused because the mountpoint already
# contains files -- NOTE(review): confirm against the Colab version in use).
drive.mount('/content/drive')

model_save_path = '/content/drive/My Drive/bert_sentiment_model.pth'

# exist_ok=True replaces the separate "exists?" check and is safe on re-runs.
save_dir = os.path.dirname(model_save_path)
os.makedirs(save_dir, exist_ok=True)

# Save the model's state dictionary
torch.save(model.state_dict(), model_save_path)

print(f"Model successfully saved: {model_save_path}")

## Load Model and Predict with Loaded Model

In [None]:
# Mount Google Drive at /content/drive so that the '/content/drive/My Drive/...'
# checkpoint path used in the next cell is reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch
from transformers import BertForSequenceClassification
import os

model_load_path = '/content/drive/My Drive/bert_sentiment_model.pth'

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Define the model architecture (must be the same as used for training)
# num_labels=2, because we have 2 classes for sentiment analysis (positive/negative)
loaded_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Load the saved state dictionary.
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while being loaded.
# map_location remaps the stored tensors onto the device selected above.
state_dict = torch.load(model_load_path, map_location=device, weights_only=True)
loaded_model.load_state_dict(state_dict)

loaded_model.to(device)

# Set the model to evaluation mode
loaded_model.eval()

print(f"Model successfully loaded: {model_load_path}")

In [None]:
from transformers import BertTokenizer

# Must be the same tokenizer used for training ('bert-base-uncased').
# Presumably reloaded here so this loading/prediction section can run without
# the training cells having been executed in the current session.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def predict_sentiment(text, model, tokenizer, device, max_len=128):
    """Return "positive" or "negative" for ``text`` using the given model.

    Args:
        text: Review text to classify.
        model: Sequence-classification model; called with the tokenizer output
            as keyword arguments and expected to expose ``.logits``.
        tokenizer: Callable producing a mapping of tensors for ``text``.
        device: Device the input tensors are moved to before the forward pass.
        max_len: Length the input is padded/truncated to.

    Returns:
        "positive" when the largest logit is at index 1, otherwise "negative".
    """
    model.eval()
    inputs = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors='pt',
    )
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    # No gradients are needed for inference.
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class = torch.argmax(logits, dim=-1).item()
    if predicted_class == 1:
        return "positive"
    return "negative"

# Test with new examples.
# The printed label is built from the exact string passed to the model, so the
# text shown can never drift from the text that was actually classified.
sample_texts = [
    "I dont like it!",
    "I absolutely hated it.",
    "This movie was awesome.",
    "The acting was great, but the story was weak.",
]
for text in sample_texts:
    prediction = predict_sentiment(text, loaded_model, tokenizer, device)
    print(f"Text: '{text}' -> Prediction: {prediction}")

Text: 'ı don't like it!' -> Prediction: negative
Text: 'I absolutely hated it.' -> Prediction: negative
Text: 'This movie was awesome.' -> Prediction: positive
Text: 'The acting was great, but the story was weak.' -> Prediction: negative
