<a href="https://colab.research.google.com/github/abhisheksingh1234/DataScience/blob/master/DiTil_Bert_Binary_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import os
import torch.nn as nn

# NOTE: this cell calls load_samples_from_folder() and reads from
# /content/drive, so the cells that define that helper and mount Google Drive
# must be executed before this one.

# Each variable now points at the folder its name describes (the two paths were
# swapped). load_samples_from_folder() derives the label from the "pos"/"neg"
# marker in the path, so the folder names must match their contents.
pos_folder = "/content/drive/MyDrive/DistilBERT/test/pos/"  # Replace with your positive samples folder path
neg_folder = "/content/drive/MyDrive/DistilBERT/test/neg/"  # Replace with your negative samples folder path

# Load positive and negative samples
pos_texts, pos_labels = load_samples_from_folder(pos_folder)
neg_texts, neg_labels = load_samples_from_folder(neg_folder)

# Fail early with a readable message instead of a cryptic tokenizer/split error.
if not pos_texts or not neg_texts:
    raise ValueError(
        f"Expected samples in both folders, found {len(pos_texts)} positive "
        f"and {len(neg_texts)} negative"
    )

# Concatenate positive and negative samples
texts = pos_texts + neg_texts
labels = pos_labels + neg_labels
if len(texts) != len(labels):
    raise ValueError(
        f"Text/label count mismatch: {len(texts)} texts vs {len(labels)} labels"
    )

# Split data into training and validation sets (80/20, fixed seed)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Load DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize input texts; each split is padded to its own longest sequence and
# over-long texts are truncated.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt')
val_encodings = tokenizer(val_texts, truncation=True, padding=True, return_tensors='pt')

# Convert labels to tensors
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

# Create TensorDatasets of (input_ids, attention_mask, label) triples
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], val_labels)

# Define batch size and create DataLoaders (only the training set is shuffled)
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Seed torch so the newly initialised classification head and the DataLoader
# shuffling are reproducible across runs.
torch.manual_seed(42)

# Dropout applied before the classification layer. It has to be set through the
# model config: assigning a Dropout module onto `model.classifier` (an
# nn.Linear) only attaches an attribute that the forward pass never calls.
dropout_prob = 0.1

# Initialize DistilBERT model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,  # 2 classes for binary classification
    seq_classif_dropout=dropout_prob,
)

# Define optimizer. transformers.AdamW is deprecated in favour of the PyTorch
# implementation; weight_decay=0.0 keeps the previous default regularisation.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

# Fine-tuning loop
epochs = 10

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        # `batch_labels` (not `labels`) so the module-level `labels` list
        # built during data loading is not overwritten.
        input_ids, attention_mask, batch_labels = (t.to(device) for t in batch)
        optimizer.zero_grad()

        # Passing labels makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Mean of the per-batch losses for this epoch
    average_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs} - Average Loss: {average_loss}")

# Evaluation on validation set
model.eval()
threshold = 0.5  # Minimum positive-class probability to predict label 1; adjust as needed
correct_predictions = 0

# No gradients are needed for evaluation.
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, batch_labels = (t.to(device) for t in batch)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # one column per class (num_labels=2)

        # Softmax over the class dimension gives per-class probabilities.
        # Thresholding column 1 (label 1 = positive) yields one prediction per
        # sample, which lines up with `batch_labels`. An element-wise sigmoid
        # on both logits gives a (batch, 2) tensor that cannot be compared
        # with the (batch,) labels.
        probabilities = torch.softmax(logits, dim=-1)
        preds = (probabilities[:, 1] > threshold).long()

        correct_predictions += (preds == batch_labels).sum().item()

val_accuracy = correct_predictions / len(val_dataset)
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def load_samples_from_folder(folder):
    """Read every file in ``folder`` and label it from the folder's name.

    The label is 1 when the name contains "pos" and 0 when it contains "neg".
    The final path component is checked first so that an unrelated ancestor
    directory (e.g. ".../repos/neg/") cannot decide the label; the full path is
    consulted only when the final component carries neither marker.

    Args:
        folder: Path (str or os.PathLike) of a directory of UTF-8 text files.

    Returns:
        A ``(texts, labels)`` tuple of two equal-length lists: the file
        contents, in sorted filename order, and one integer label per file.

    Raises:
        ValueError: If neither "pos" nor "neg" appears anywhere in the path,
            so no label can be assigned.
    """
    folder = os.fspath(folder)
    # normpath drops a trailing separator so basename returns the last component.
    leaf = os.path.basename(os.path.normpath(folder))

    # Resolve the label once; it is the same for every file in the folder.
    label = None
    for candidate in (leaf, folder):
        if "pos" in candidate:
            label = 1  # Assign label 1 for positive samples
            break
        if "neg" in candidate:
            label = 0  # Assign label 0 for negative samples
            break
    if label is None:
        raise ValueError(
            f"Cannot infer label: neither 'pos' nor 'neg' found in folder path {folder!r}"
        )

    texts = []
    # Sorted for a deterministic sample order; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(folder)):
        path = os.path.join(folder, filename)
        # Skip sub-directories (e.g. .ipynb_checkpoints), which cannot be opened as files.
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as file:
            texts.append(file.read())

    return texts, [label] * len(texts)

In [3]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem. This has to run before any
# cell that reads files from under /content/drive.
mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive
