In [None]:
# Step 1: Import necessary libraries
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

In [None]:
# Step 2: Load and prepare the dataset
data = pd.read_csv("updating_multiclass_dataset.csv")  # Replace with your actual file path

# Display the first few rows to check the data structure
print(data.head())

# Map sentiments to integers (adjust these mappings according to your dataset)
sentiment_mapping = {'positive': 2, 'negative': 0, 'neutral': 1}
data['Label'] = data['Sentiment'].map(sentiment_mapping)

# Series.map yields NaN for every sentiment that is not a key of the mapping.
# A single NaN also turns the whole column into floats, and those rows would
# only surface as a confusing error during training, so stop here instead.
unmapped_rows = data[data['Label'].isnull()]
if not unmapped_rows.empty:
    print("NaN values found in labels. Please check your mapping.")
    print(unmapped_rows)  # Print rows with NaN labels
    raise ValueError(
        "Unmapped values in 'Sentiment': "
        f"{unmapped_rows['Sentiment'].unique().tolist()}"
    )
print("All labels are mapped correctly.")

# Create a Hugging Face Dataset
dataset = Dataset.from_pandas(data)

In [None]:
# Step 3: Load the pretrained tokenizer used to encode the 'Text' column
tokenizer_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

In [None]:
# Step 4: Define the tokenization function
def tokenize_function(examples):
    """Tokenize a batch of examples and attach their labels.

    Args:
        examples: Mapping that provides the raw text under 'Text' and the
            integer-encoded sentiment under 'Label'.

    Returns:
        The tokenizer output for examples['Text'] (truncated, padded with the
        "max_length" strategy) with an extra 'labels' entry copied from
        examples['Label'].
    """
    encoded_batch = tokenizer(
        examples['Text'],
        truncation=True,
        padding="max_length",
    )
    # The Trainer looks for the targets under the key 'labels'.
    encoded_batch['labels'] = examples['Label']
    return encoded_batch

In [None]:
# Step 5: Apply the tokenization function to the whole dataset, batch by batch
tokenized_datasets = dataset.map(
    function=tokenize_function,
    batched=True,
)

In [None]:
# Step 6: Inspect the first tokenized record to confirm the labels were attached
first_tokenized_example = tokenized_datasets[0]
print("Sample tokenized data:", first_tokenized_example)

In [None]:
# Step 7: Split the dataset into training and validation sets
# A fixed seed keeps the 80/20 split identical across kernel restarts.
train_testvalid = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
train_dataset = train_testvalid['train']
valid_dataset = train_testvalid['test']

# Smaller shuffled subsets for quick experiments. The sizes are clamped to the
# split length so that a dataset with fewer rows than the cap does not raise
# an out-of-range index error in select().
small_train_dataset = train_dataset.shuffle(seed=42).select(
    range(min(1000, len(train_dataset)))  # Use at most 1000 samples
)
small_valid_dataset = valid_dataset.shuffle(seed=42).select(
    range(min(200, len(valid_dataset)))  # Use at most 200 samples
)

In [None]:
# Step 8: Build the BERT sequence classifier
# One output class per entry of the sentiment mapping.
num_labels = len(sentiment_mapping)
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
)

In [None]:
# Step 9: Configure the training run
training_args = TrainingArguments(
    # Relative path so the notebook runs on any machine, not only the author's.
    output_dir="./BERT-model",
    # NOTE(review): newer transformers releases appear to rename this argument
    # to `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",  # Evaluate only at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,  # Log every 100 steps for efficiency
    gradient_accumulation_steps=2,  # Accumulate gradients to simulate a larger batch
    # Mixed precision is only enabled when a CUDA device is present;
    # requesting it unconditionally fails on CPU-only machines.
    fp16=torch.cuda.is_available(),
)


In [None]:
# Step 10: Define Trainer with model, arguments, datasets, and metrics
def compute_metrics(eval_pred):
    """Return classification accuracy for one evaluation pass.

    Args:
        eval_pred: Pair of (logits, labels) as handed over by the Trainer.

    Returns:
        Dict with a single "accuracy" entry; the Trainer reports it with its
        metric prefix (e.g. "eval_accuracy").
    """
    logits, labels = eval_pred
    predicted_labels = np.argmax(logits, axis=-1)
    return {"accuracy": float((predicted_labels == labels).mean())}


# Trains and evaluates on the full splits. NOTE(review): the smaller
# `small_train_dataset` / `small_valid_dataset` subsets from Step 7 are not
# used here - swap them in if a quick smoke run is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Step 11: Start training
try:
    trainer.train()
except Exception as e:
    print("Training failed with error:", str(e))
    # Re-raise so the full traceback is shown and the following cells do not
    # silently evaluate and save a model that was never trained.
    raise

In [None]:
# Evaluate on the Trainer's configured eval_dataset and show the metrics dict
evaluation_results = trainer.evaluate()
metrics = evaluation_results
print(metrics)

In [None]:
# Directory that receives the fine-tuned model and tokenizer files
save_directory = "fine_tuned_model"  # Replace with your desired save path

# Persist the model first, then the tokenizer, into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")

In [None]:
# Round-trip check: reload both artifacts from disk to verify the save worked.
# The two loads are independent of each other.
loaded_tokenizer = BertTokenizer.from_pretrained(save_directory)
loaded_model = BertForSequenceClassification.from_pretrained(save_directory)

# Confirm that both loads completed
print("Model and tokenizer reloaded successfully from", save_directory)

In [None]:
# Final evaluation on held-out data
# Use the held-out split from Step 7 (already tokenized). Evaluating on the
# full `data` frame would score the model on rows it was trained on and
# inflate the result. If you have a separate test DataFrame, build this with
# Dataset.from_pandas(<your frame>).map(tokenize_function, batched=True).
test_dataset = valid_dataset
tokenized_test_dataset = test_dataset

# Evaluate the model
metrics = trainer.evaluate(eval_dataset=tokenized_test_dataset)
print("Evaluation metrics:", metrics)

# Extract accuracy. The Trainer only reports "eval_accuracy" when it was built
# with a compute_metrics function, so fall back to deriving it from the
# predictions instead of printing a placeholder.
accuracy = metrics.get("eval_accuracy")
if accuracy is None:
    prediction_output = trainer.predict(tokenized_test_dataset)
    # assumes predictions is a single (num_examples, num_labels) logits array - TODO confirm
    predicted_labels = prediction_output.predictions.argmax(axis=-1)
    accuracy = float((predicted_labels == prediction_output.label_ids).mean())
print("Model accuracy on test set:", accuracy)