In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [None]:
# Load the comments CSV into a DataFrame.
# NOTE(review): the absolute path presumably needs Google Drive mounted at
# /content/drive first; no mount cell is visible in this notebook - confirm.
df = pd.read_csv('/content/drive/MyDrive/bangla_comments_tokenized.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Encode the string class names in the 'label' column as integer ids.
# Values that are not keys of the mapping are left untouched by replace().
label_to_id = {
    'Political': 0,
    'religious': 1,
    'sexual': 2,
    'not bully': 3,
}
df['label'] = df['label'].replace(label_to_id)

In [None]:
# Preview the frame again after the label encoding step.
df.head()

In [None]:
# Install the Hugging Face libraries imported below (versions are not pinned).
!pip install datasets transformers

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import pandas as pd

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split


# Split the data into train (80%), validation (10%), and test (10%) sets.
# The 20% hold-out is divided evenly: splitting it with test_size=0.1 would
# leave only ~2% of all rows for the test set, too few for a stable report.
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Convert DataFrames to Datasets.
# preserve_index=False stops the shuffled pandas index from being stored as
# an extra '__index_level_0__' column in every split.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Create a DatasetDict keyed by split name
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

# Print the DatasetDict (split names, columns, and row counts)
print(dataset_dict)

In [None]:
# Alias the DatasetDict under the name the tokenization cells below use.
raw_datasets=dataset_dict

In [None]:
from transformers import AutoModelForPreTraining, AutoTokenizer

In [None]:
# Load the tokenizer for the Bangla BERT model.
# The same checkpoint name is reused later when the classification model is loaded.
checkpoint = "csebuetnlp/banglabert"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Sequence-length constant - presumably the token limit intended for tokenization.
max_length = 128

In [None]:
# Display the first 5 examples in the 'clean_text' column
# NOTE(review): tokenize_function below reads the 'text' column, not
# 'clean_text' - confirm which column is meant to be fed to the model.
print(raw_datasets['train']['clean_text'][:5])  # If you have 'train' split

In [None]:
def tokenize_function(examples):
    """Tokenize the "text" field of one example or of a batch of examples.

    Args:
        examples: Mapping with a "text" entry. It is a list of values when
            called through ``Dataset.map(..., batched=True)`` and a single
            value otherwise.

    Returns:
        The tokenizer output, padded and truncated to the notebook-level
        ``max_length``.

    Every value is passed through ``str()`` before tokenization. The incoming
    mapping is not modified.
    """
    raw = examples["text"]
    if isinstance(raw, list):
        texts = [str(text) for text in raw]
    else:
        texts = str(raw)
    # return_tensors is deliberately not requested: the map step only needs
    # plain lists, and asking for tensors in the single-example branch would
    # add a spurious leading batch dimension to every feature.
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

In [None]:
# Run tokenize_function over every split, one batch of rows at a time.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

In [None]:
# Collator handed to the Trainer below; it pads batches using this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import TrainingArguments
# TrainingArguments with only the output directory set.
# NOTE(review): training_args is reassigned in a later cell before the Trainer
# is constructed, so this instance appears to be unused.
training_args = TrainingArguments("test-trainer")

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint with a sequence-classification head; num_labels=4
# matches the four integer label ids (0-3) produced by the encoding cell.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=4)

In [None]:
import torch

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Move the model to the selected device (as the last expression in the cell,
# the returned module's repr is also displayed).
model.to(device)

In [None]:
# Install torch-summary, which provides the `torchsummary` module imported below.
!pip install torch-summary


In [None]:
from torchsummary import summary

In [None]:
# Print the model summary
# NOTE(review): dtypes is given here as the string "torch.long", whereas the
# torchinfo call further down passes the torch.long dtype object - confirm
# that torch-summary accepts the string form and the input_size keyword.
summary(model, input_size=(1, 128), dtypes=["torch.long"])  # Adjust input size based on your tokenizer's max length

In [None]:
# Install torchinfo, the second model-summary package tried in this notebook.
!pip install torchinfo


In [None]:
from torchinfo import summary

In [None]:
# `summary` now refers to torchinfo's function (its import above rebinds the
# name previously imported from torchsummary). Integer dtype is requested
# because the model is fed token ids.
summary(model, input_size=(1, 128), dtypes=[torch.long])  # (batch_size, sequence_length)

In [None]:
# Install graph-rendering helpers (presumably needed by the torchviz cell below).
!pip install pydot graphviz


In [None]:
# Install torchviz, which provides make_dot used in the next cell.
!pip install torchviz

In [None]:
from torchviz import make_dot
import torch

# Draw the autograd graph of one forward pass through `model`.
# Take the first tokenized training example as the sample input.
inputs = next(iter(tokenized_datasets['train']))


def _as_single_batch(values):
    """Wrap one example's feature list in a tensor with a leading batch axis on `device`."""
    return torch.tensor(values).unsqueeze(0).to(device)


input_ids = _as_single_batch(inputs['input_ids'])
attention_mask = _as_single_batch(inputs['attention_mask'])

# Forward pass; the logits carry the graph that make_dot walks.
outputs = model(input_ids=input_ids, attention_mask=attention_mask)

# Label graph nodes with the model's parameter names.
dot = make_dot(
    outputs.logits,
    params=dict(model.named_parameters()),
)

# Write the graph to disk as a PNG image.
dot.render("model_structure", format="png")
# Alternatively, show it inline in the notebook:
# display(dot)

In [None]:
import inspect

# Fine-tuning configuration: evaluate once per epoch, 3 epochs, batch size 16
# for both training and evaluation, learning rate 2e-5 with weight decay 0.01.
#
# The per-epoch evaluation keyword was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. transformers is installed
# unpinned above, so use whichever name the installed version accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    **{_eval_strategy_key: "epoch"},
)

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass for the Trainer.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each row
            is the argmax of its logits over the last axis.

    Returns:
        dict with 'accuracy' plus macro-averaged 'f1', 'precision' and 'recall'.
    """
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

    raw_scores, gold = eval_pred
    predicted = raw_scores.argmax(axis=-1)

    # The fourth return value (support) is not reported.
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        gold, predicted, average='macro'
    )

    return {
        'accuracy': accuracy_score(gold, predicted),
        'f1': macro_f1,
        'precision': macro_precision,
        'recall': macro_recall,
    }

In [None]:
# Assemble the Trainer: fine-tunes `model` on the tokenized train split,
# evaluates on the validation split, and scores each evaluation with
# compute_metrics. Batches are built by data_collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# Run fine-tuning with the arguments configured above.
trainer.train()

In [None]:
# Evaluate on the Trainer's eval_dataset (the validation split passed at
# construction) and print the resulting metrics.
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

In [None]:
from sklearn.metrics import classification_report

# Make predictions on the test set
test_results = trainer.predict(tokenized_datasets['test'])

# Predicted class = argmax over the logits; label_ids are the true classes
predictions = test_results.predictions.argmax(axis=-1)
true_labels = test_results.label_ids

# Display names listed in label-id order (0..3), matching the encoding cell
target_names = ['Political', 'Religious', 'Sexual', 'Not Bully']

# Pass the label ids explicitly. Without `labels`, the report only covers
# classes that actually occur in the test split, and it raises when that
# count differs from len(target_names) (e.g. a class missing from a small
# test set).
report = classification_report(
    true_labels,
    predictions,
    labels=list(range(len(target_names))),
    target_names=target_names,
)
print(report)


In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Class names in label-id order (0..3)
class_names = ['Political', 'Religious', 'Sexual', 'Not Bully']

# Generate confusion matrix over ALL label ids. Without `labels`, a class
# absent from both the true and predicted labels is dropped, the matrix
# shrinks, and the tick labels below no longer line up with its rows/columns.
cm = confusion_matrix(
    true_labels,
    predictions,
    labels=list(range(len(class_names))),
)

# Plot confusion matrix (rows = true class, columns = predicted class)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Extract the logs from the trainer
history = trainer.state.log_history


def _epoch_axis(entries):
    """Return the logged 'epoch' of each entry (falling back to its 1-based position)."""
    return [entry.get('epoch', position) for position, entry in enumerate(entries, start=1)]


# Training loss is logged every N optimisation steps while evaluation metrics
# are logged once per evaluation, so the two series have different lengths
# and spacing. Plot each against the epoch value stored with its own log
# entry so they share one meaningful x-axis.
train_logs = [entry for entry in history if 'loss' in entry]
eval_logs = [entry for entry in history if 'eval_loss' in entry]
accuracy_logs = [entry for entry in history if 'eval_accuracy' in entry]

train_losses = [entry['loss'] for entry in train_logs]
eval_losses = [entry['eval_loss'] for entry in eval_logs]
eval_accuracies = [entry['eval_accuracy'] for entry in accuracy_logs]

train_epochs = _epoch_axis(train_logs)
eval_epochs = _epoch_axis(eval_logs)
accuracy_epochs = _epoch_axis(accuracy_logs)

# Plot Training and Validation Loss
plt.figure(figsize=(10, 6))
plt.plot(train_epochs, train_losses, label='Training Loss', marker='o')
plt.plot(eval_epochs, eval_losses, label='Validation Loss', marker='o')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.grid()
plt.show()

# Plot Validation Accuracy
plt.figure(figsize=(10, 6))
plt.plot(accuracy_epochs, eval_accuracies, label='Validation Accuracy', marker='o', color='green')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Validation Accuracy Curve')
plt.legend()
plt.grid()
plt.show()
