In [None]:
import csv
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import matplotlib.pyplot as plt


In [None]:
# Load the labelled text dataset from a local CSV file into a DataFrame.
# Later cells read its 'text' and 'label' columns.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# cannot be re-run on another machine without editing this line.
file_path = r"C:\Users\shuvo\TEXT FILE\AUGMENTED_2_texts_with_labels.csv"
data = pd.read_csv(file_path)
# The three lines below are disabled and do not run. They would, respectively,
# binarise a 'class' column ('suicide' -> 1, anything else -> 0), drop the first
# column, and keep only the first 500 rows.
# NOTE(review): the 'class'/'suicide' mapping does not match the 'label' column
# used by the rest of this notebook - presumably left over from another dataset.
#data['class'] = data['class'].apply(lambda x: 1 if x == 'suicide' else 0)
#data = data.drop(data.columns[0], axis=1)
#data = data.head(500)


In [None]:
# Preview the loaded frame. The shape is printed separately because head()
# does not report the total number of rows; ending the cell on a bare
# expression lets the notebook render the preview as a rich table instead of
# the plain-text dump produced by print().
print(data.shape)
data.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical across re-runs.
# NOTE(review): the CSV file name suggests the data is augmented. If augmented
# variants of one source text can land on both sides of a random split, the
# held-out scores will be optimistic - TODO confirm how the file was built.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def _encode_texts(texts, max_length=128):
    """Tokenise a pandas Series of strings into fixed-length PyTorch tensors.

    Every sequence is padded to exactly ``max_length`` tokens and longer ones
    are truncated, so the returned 'input_ids' and 'attention_mask' tensors
    have one row per text and ``max_length`` columns.

    ``padding='max_length'`` replaces the deprecated ``pad_to_max_length=True``,
    and ``truncation=True`` makes explicit the truncation the tokenizer was
    previously applying by default (with a warning).
    """
    return tokenizer.batch_encode_plus(
        texts.tolist(),
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )


# Both splits go through the same helper so their encoding settings cannot drift.
train_encoded = _encode_texts(X_train)
test_encoded = _encode_texts(X_test)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# --- Training split: token ids, attention masks and labels bundled into a
# dataset that is reshuffled on every pass. ---
train_inputs, train_masks = train_encoded['input_ids'], train_encoded['attention_mask']
train_labels = torch.tensor(y_train.values)
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=32, shuffle=True)

# --- Held-out split: same layout, iterated in a fixed order. ---
test_inputs, test_masks = test_encoded['input_ids'], test_encoded['attention_mask']
test_labels = torch.tensor(y_test.values)
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_dataloader = DataLoader(test_data, batch_size=32, shuffle=False)


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pre-trained BERT encoder with a freshly initialised 2-class classification head.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to(device)

# The AdamW class shipped with transformers is deprecated in favour of the
# PyTorch implementation. eps and weight_decay are set explicitly to the
# defaults of the transformers class this call replaces, because the PyTorch
# defaults differ (eps=1e-8, weight_decay=0.01).
# NOTE(review): the import cell still imports AdamW from transformers; that
# name can be dropped from the import once nothing else uses it.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def evaluate(model, dataloader, device=None):
    """Score ``model`` on every batch of ``dataloader`` without gradient tracking.

    Args:
        model: Module that, called with ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments, returns an object exposing ``.loss``
            and ``.logits``.
        dataloader: Iterable of batches whose first three items are the
            input-id, attention-mask and label tensors, in that order.
        device: Device the batch tensors are moved to. When omitted, the device
            of the model's first parameter is used (CPU if the model has no
            parameters), so the function does not depend on a notebook global.

    Returns:
        ``(avg_loss, accuracy)``: the mean of the per-batch losses and the
        fraction of examples whose arg-max logit equals the label.

    Raises:
        ValueError: If ``dataloader`` yields no examples (previously this
            surfaced as an unexplained ZeroDivisionError).

    The model is switched to eval mode and left in it; callers that continue
    training must call ``model.train()`` again.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    total_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    num_batches = 0  # counted by hand so iterables without __len__ also work

    with torch.no_grad():
        for batch in dataloader:
            batch = tuple(t.to(device) for t in batch)
            labels = batch[2]

            outputs = model(input_ids=batch[0], attention_mask=batch[1], labels=labels)
            total_loss += outputs.loss.item()

            predicted_labels = outputs.logits.argmax(dim=1)
            correct_predictions += (predicted_labels == labels).sum().item()
            total_predictions += labels.size(0)
            num_batches += 1

    if num_batches == 0 or total_predictions == 0:
        raise ValueError("evaluate() received a dataloader with no examples")

    avg_loss = total_loss / num_batches
    accuracy = correct_predictions / total_predictions
    return avg_loss, accuracy

# Fine-tuning loop: each epoch makes one optimisation pass over the training
# loader, then scores the model on the held-out loader. Per-epoch metrics are
# collected in four lists for later plotting.
# NOTE(review): the "validation" figures are computed on test_dataloader, the
# same split the later reporting cells use - there is no separate validation set.
num_epochs = 30
train_losses, train_accuracies = [], []
val_losses, val_accuracies = [], []

for epoch in range(num_epochs):
    model.train()  # evaluate() leaves the model in eval mode, so re-enable training each epoch
    total_loss = 0
    correct_train_predictions = 0
    total_train_predictions = 0

    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)
    for batch in progress_bar:
        optimizer.zero_grad()

        # Move the three tensors of the batch to the training device.
        input_ids, attention_mask, labels = (t.to(device) for t in batch)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Running totals for this epoch's mean loss and accuracy.
        total_loss += loss.item()
        predicted_labels = outputs.logits.argmax(dim=1)
        correct_train_predictions += (predicted_labels == labels).sum().item()
        total_train_predictions += labels.size(0)

    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_train_predictions / total_train_predictions
    train_losses.append(avg_train_loss)
    train_accuracies.append(train_accuracy)

    # Score the held-out split after every epoch.
    val_loss, val_accuracy = evaluate(model, test_dataloader)
    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
    print(f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")


In [None]:
import matplotlib.pyplot as plt

# Two side-by-side panels: per-epoch loss on the left, per-epoch accuracy on
# the right, each comparing the training and held-out curves.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
loss_ax, acc_ax = axes

# Left panel: losses
loss_ax.plot(train_losses, label='Training Loss', color='blue')
loss_ax.plot(val_losses, label='Validation Loss', color='orange')
loss_ax.set(xlabel='Epoch', ylabel='Loss', title='Training and Validation Losses')
loss_ax.legend()

# Right panel: accuracies
acc_ax.plot(train_accuracies, label='Training Accuracy', color='blue')
acc_ax.plot(val_accuracies, label='Validation Accuracy', color='orange')
acc_ax.set(xlabel='Epoch', ylabel='Accuracy', title='Training and Validation Accuracies')
acc_ax.legend()

# Tighten spacing between the panels, then render.
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Collect the true and predicted class index of every held-out example.
true_labels = []
predicted_labels = []

model.eval()
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids, attention_mask, labels = (t.to(device) for t in batch)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predicted = outputs.logits.argmax(dim=1)
        true_labels.extend(labels.cpu().numpy())
        predicted_labels.extend(predicted.cpu().numpy())

# Pin the class order explicitly. Without `labels`, scikit-learn builds the
# matrix from whichever classes happen to occur, so a split missing one class
# would give a 1x1 matrix that no longer lines up with the two tick labels.
class_names = ['Original', 'Fake']  # tick label for class index 0, then 1
conf_matrix = confusion_matrix(true_labels, predicted_labels, labels=[0, 1])

# Heat-map with raw counts in each cell; rows are true classes, columns predicted.
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()


In [None]:
from sklearn.metrics import classification_report

# Evaluate model on test data
def predict(model, dataloader, device=None):
    """Run ``model`` over ``dataloader`` and collect labels and predictions.

    Args:
        model: Module that, called with ``input_ids`` and ``attention_mask``
            keyword arguments, returns an object exposing ``.logits``.
        dataloader: Iterable of batches whose first three items are the
            input-id, attention-mask and label tensors, in that order.
        device: Device the batch tensors are moved to. When omitted, the device
            of the model's first parameter is used (CPU if the model has no
            parameters), so the function does not depend on a notebook global.

    Returns:
        ``(all_labels, all_predictions)``: two lists of equal length holding,
        per example, the true label and the arg-max class index as NumPy
        scalars. Both are empty when ``dataloader`` yields nothing.

    The model is switched to eval mode and left in it.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for batch in dataloader:
            batch = tuple(t.to(device) for t in batch)
            labels = batch[2]

            # Labels are deliberately not passed to the model: no loss is needed here.
            outputs = model(input_ids=batch[0], attention_mask=batch[1])
            predicted_labels = outputs.logits.argmax(dim=1)

            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted_labels.cpu().numpy())

    return all_labels, all_predictions

# Predict every held-out example.
true_labels, predicted_labels = predict(model, test_dataloader)

# Per-class precision / recall / F1. `labels` pins the class order so that
# target_names[0] always describes class index 0 and target_names[1] class
# index 1, even if one class is missing from the labels or predictions.
# The first display name previously contained a typo ('Orginal_text').
report = classification_report(
    true_labels,
    predicted_labels,
    labels=[0, 1],
    target_names=['Original_text', 'Fake_text'],
)
print(report)




In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Per-example true labels and predicted probability of class index 1.
all_val_labels = []
all_val_predictions = []

# Put the model in eval mode here rather than relying on an earlier cell having
# done so: with dropout still active the probabilities (and therefore the ROC
# curve) would change from run to run.
model.eval()
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = batch

        # Labels are not passed to the model; only the logits are needed, so
        # there is no reason to compute a loss.
        outputs = model(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
        predicted_probs = torch.softmax(outputs.logits, dim=1)

        all_val_labels.extend(labels.tolist())
        # Column 1 is the probability assigned to class index 1, which
        # roc_curve treats as the positive class for 0/1 labels.
        all_val_predictions.extend(predicted_probs[:, 1].tolist())

# ROC curve and the area under it.
fpr, tpr, _ = roc_curve(all_val_labels, all_val_predictions)
roc_auc = auc(fpr, tpr)

# Plot the curve against the chance diagonal.
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


In [None]:
# Write the model's state_dict (parameter tensors only, not the architecture)
# to disk with torch.save.
# NOTE(review): the ".h5" suffix conventionally denotes HDF5, which torch.save
# does not produce; the file can only be read back with torch.load. The next
# cell saves the same state_dict as "model.pth", so this copy looks redundant -
# confirm whether anything depends on this file name before removing it.
h5_path = r"C:\Users\shuvo\TEXT FILE\model.h5"
torch.save(model.state_dict(), h5_path)

In [None]:
# Destination for the fine-tuned weights.
# NOTE(review): absolute, machine-specific Windows path; the target directory
# must already exist on the machine running the notebook.
save_path = r"C:\Users\shuvo\TEXT FILE\model.pth"

# Persist only the state_dict (parameter tensors). To restore it, the same
# model class must be instantiated first and the weights loaded with
# load_state_dict.
torch.save(model.state_dict(), save_path)