In [1]:
# Import necessary libraries
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from torch.optim import AdamW
from tqdm import tqdm


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Load the IMDB dataset or any dataset of choice
#df = pd.read_csv('/content/drive/MyDrive/senitment_for_NIC/IMDB Dataset.csv')  # Assuming a CSV with 'review' and 'sentiment' columns
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/senitment_for_NIC/IMDB Dataset.csv')
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})




In [12]:
# Separate positive and negative samples
positive_samples = df[df['label'] == 1]
negative_samples = df[df['label'] == 0]

# Take 100 samples from each class
positive_sample = positive_samples.sample(n=40, random_state=42)
negative_sample = negative_samples.sample(n=40, random_state=42)

# Concatenate both samples to create a balanced sample
df2 = pd.concat([positive_sample, negative_sample])

# Shuffle the dataset to mix the classes
df2 = df2.sample(frac=1, random_state=42).reset_index(drop=True)

# Display the new dataset
df2.head()

Unnamed: 0,review,sentiment,label
0,Du Rififi Chez Les Hommes/Rififi(1955) can on ...,positive,1
1,I don't know how or why this film has a meager...,positive,1
2,The theme is controversial and the depiction o...,positive,1
3,This is one of my favourite movies EVER... I h...,positive,1
4,I recently bought this movie on DVD at a disco...,positive,1


In [13]:
# Split data into train and test
train_texts, test_texts, train_labels, test_labels = train_test_split(df2['review'], df2['label'], test_size=0.2)

In [14]:
# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenizing the dataset
def tokenize_data(texts, labels):
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,  # Add [CLS] and [SEP]
            max_length=64,  # Max length of the sequence
            pad_to_max_length=True,  # Pad if shorter than max_length
            return_attention_mask=True,  # Generate attention mask
            return_tensors='pt'  # Return pytorch tensors
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0), torch.tensor(labels)





In [15]:
train_inputs, train_masks, train_labels = tokenize_data(train_texts, train_labels)
test_inputs, test_masks, test_labels = tokenize_data(test_texts, test_labels)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


ValueError: could not determine the shape of object type 'Series'

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Create DataLoader for training and testing
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)


In [None]:
# Load the pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


In [None]:
optimizer = AdamW(model.parameters(), lr=1e-5)

# Loss function is implicitly handled within the model (CrossEntropyLoss for classification tasks)


In [None]:
epochs = 3

for epoch in range(epochs):
    model.train()
    total_loss = 0

    # Training loop
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch_input_ids, batch_masks, batch_labels = tuple(t.to(device) for t in batch)

        # Zero the gradients
        model.zero_grad()

        # Forward pass
        outputs = model(batch_input_ids, attention_mask=batch_masks, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        loss.backward()

        # Update parameters
        optimizer.step()

    print(f'Epoch {epoch + 1}, Loss: {total_loss/len(train_dataloader)}')


In [None]:
# Set model to evaluation mode
model.eval()

# Tracking variables
predictions, true_labels = [], []

for batch in test_dataloader:
    batch_input_ids, batch_masks, batch_labels = tuple(t.to(device) for t in batch)

    with torch.no_grad():
        outputs = model(batch_input_ids, attention_mask=batch_masks)

    logits = outputs.logits
    predictions.append(logits.detach().cpu().numpy())
    true_labels.append(batch_labels.cpu().numpy())

# Flatten outputs
predictions = np.argmax(np.vstack(predictions), axis=1)
true_labels = np.hstack(true_labels)

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f'Test Accuracy: {accuracy}')
