In [None]:
pip install transformers torch


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import re

import nltk
import pandas as pd
import torch
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification

nltk.download('stopwords')

# ---------------------------------------------------------------------------
# Configuration: everything a reader might want to tune lives here.
# ---------------------------------------------------------------------------
SEED = 0              # also used as the train/test split random_state
TEST_SIZE = 0.20
MAX_LENGTH = 512      # truncation limit passed to the tokenizer
BATCH_SIZE = 16
NUM_EPOCHS = 3
LEARNING_RATE = 1e-5

# Seed torch so the classifier-head initialisation and the DataLoader shuffle
# are repeatable across "Restart & Run All".
torch.manual_seed(SEED)

# Step 1: Load the dataset
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)

# Step 2: Data Preprocessing for BERT
# Build the stop-word set ONCE (it does not depend on the review). Words that
# carry negation/intensity are kept in the text because they are removed from
# the stop-word set here.
important_words = ['not', 'no', 'never', 'very', 'too', 'don', 'isn', 'wasn', 'weren', 'aren', 'wouldn', 'couldn', 'shouldn']
all_stopwords = set(stopwords.words('english')) - set(important_words)

# Iterate over every row instead of a hard-coded 1000 so the corpus always
# has the same length as the label vector built from the same frame.
corpus = []
for raw_review in dataset['Review']:
    words = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    corpus.append(' '.join(word for word in words if word not in all_stopwords))

# Step 3: Prepare Labels (Make sure your labels are encoded if they're text-based)
# If your dataset labels are text like 'positive' and 'negative', use LabelEncoder
le = LabelEncoder()
y = le.fit_transform(dataset.iloc[:, -1].values)  # Assuming last column is the label
assert len(corpus) == len(y), "every review must have exactly one label"

# Step 4: Tokenizing the data using BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenizing each review in the corpus (padded to the longest review)
inputs = tokenizer(corpus, padding=True, truncation=True, return_tensors="pt", max_length=MAX_LENGTH)

# Step 5: Split into Train and Test
# The attention mask is split together with the ids and labels so that all
# three stay row-aligned; the mask tells BERT which positions are padding.
(X_train, X_test,
 mask_train, mask_test,
 y_train, y_test) = train_test_split(
    inputs['input_ids'], inputs['attention_mask'], y,
    test_size=TEST_SIZE, random_state=SEED,
)

# Convert input data into PyTorch tensors.
# torch.as_tensor reuses an existing tensor as-is (no copy-construct warning)
# and converts a NumPy array when that is what the split returned.
train_inputs = torch.as_tensor(X_train)
test_inputs = torch.as_tensor(X_test)
train_masks = torch.as_tensor(mask_train)
test_masks = torch.as_tensor(mask_test)
train_labels = torch.as_tensor(y_train, dtype=torch.long)
test_labels = torch.as_tensor(y_test, dtype=torch.long)

# Step 6: Create DataLoader for batching
train_data = TensorDataset(train_inputs, train_masks, train_labels)
test_data = TensorDataset(test_inputs, test_masks, test_labels)

train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# Step 7: Load Pre-trained BERT Model for Sequence Classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # For binary classification

# Step 8: Fine-tuning BERT
# Define optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Train the model
model.train()
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    for b_input_ids, b_attention_mask, b_labels in train_dataloader:
        optimizer.zero_grad()

        # Forward pass (mask passed so padded positions are ignored)
        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
        loss = outputs.loss

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean over all batches; the last batch alone is a noisy signal.
    mean_loss = running_loss / max(len(train_dataloader), 1)
    print(f"Epoch {epoch + 1} completed with average loss {mean_loss:.4f}")

# Step 9: Evaluate the Model
model.eval()
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for b_input_ids, b_attention_mask, b_labels in test_dataloader:
        # Labels are not passed: only the logits are needed for accuracy.
        logits = model(b_input_ids, attention_mask=b_attention_mask).logits
        predictions = torch.argmax(logits, dim=-1)

        correct_predictions += (predictions == b_labels).sum().item()
        total_predictions += b_labels.size(0)

accuracy = correct_predictions / total_predictions
print(f"Accuracy: {accuracy * 100:.2f}%")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

  train_inputs = torch.tensor(X_train)
  test_inputs = torch.tensor(X_test)
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch 1 completed with loss 0.5203778743743896
Epoch 2 completed with loss 0.22489041090011597
Epoch 3 completed with loss 0.19865086674690247
Accuracy: 90.00%


In [None]:
import re
import torch
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

class BERTSentimentClassifier:
    """Single-review inference wrapper around a sequence-classification model.

    The wrapper applies the same text cleaning as the training pipeline
    (letters only, lower-cased, stop words removed except a small list of
    negation/intensity words), tokenizes the cleaned text and returns the
    index of the highest-scoring class.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``. It is switched to eval mode.
        tokenizer: Callable tokenizer returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
    """

    # Truncation limit handed to the tokenizer.
    MAX_LENGTH = 512

    # Stop words that are deliberately KEPT in the text.
    IMPORTANT_WORDS = ('not', 'no', 'never', 'very', 'too', 'don', 'isn', 'wasn',
                       'weren', 'aren', 'wouldn', 'couldn', 'shouldn')

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.model.eval()

        # Set of stopwords excluding important ones
        self.stop_words = set(stopwords.words('english')) - set(self.IMPORTANT_WORDS)

    def preprocess(self, text):
        """Return ``text`` reduced to lower-case letter-only words.

        Every non-letter character becomes a space, the result is lower-cased
        and split on whitespace, and words found in ``self.stop_words`` are
        dropped. The remaining words are joined with single spaces.
        """
        words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
        return ' '.join(word for word in words if word not in self.stop_words)

    def _model_device(self):
        """Return the device of the model's first parameter (CPU if it has none)."""
        first_param = next(iter(self.model.parameters()), None)
        return first_param.device if first_param is not None else torch.device('cpu')

    def predict(self, text):
        """Classify one review and return the predicted class index as an int.

        With the two-label model used in this notebook the result is 0 or 1.
        """
        cleaned_text = self.preprocess(text)

        # A single sequence needs no padding; padding it to MAX_LENGTH would
        # only add masked positions and make the forward pass slower.
        encoded = self.tokenizer(
            cleaned_text,
            add_special_tokens=True,
            max_length=self.MAX_LENGTH,
            truncation=True,
            return_tensors='pt'
        )

        # Inputs must live on the same device as the model's weights.
        device = self._model_device()
        input_ids = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)

        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predicted_class = torch.argmax(logits, dim=1).item()

        return predicted_class  # 0 or 1


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Wrap the fine-tuned model and its tokenizer (both created by the training cell)
bert_classifier = BERTSentimentClassifier(model, tokenizer)

# Smoke-test on two unseen reviews; one predicted label is printed per review
sample_reviews = (
    "The food was amazing and the service was top-notch!",  # → 1
    "Worst experience ever. Not coming back.",              # → 0
)
for sample_review in sample_reviews:
    print(bert_classifier.predict(sample_review))


1
0
