In [1]:
pip install transformers



In [3]:
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

# Original dataset
texts = [
    "This is an educational article about history.",
    "The cultural festival celebrates diversity and tradition.",
    "A new exhibit at the museum explores ancient civilizations.",
    "Traditional dance performances showcase our cultural heritage.",
    "Learn about different cultures through literature and art."
]
labels = [
    "educational",
    "cultural",
    "educational",
    "cultural",
    "educational"
]

# Additional data
additional_texts = [
    "Mathematics is a fundamental subject in education.",
    "Artifacts from various cultures are on display at the museum.",
    "Cultural exchange programs promote understanding between nations.",
    "The history of civilization dates back thousands of years.",
    "Educational institutions play a vital role in society.",
    "Cultural traditions are passed down from generation to generation.",
    "Learning about different cultures broadens our perspectives.",
    "The cultural significance of music varies across societies.",
    "Education is the key to unlocking opportunities.",
    "Cultural diversity enriches our communities.",
    "Reading books opens new worlds of knowledge.",
    "Exploring different cuisines exposes us to diverse cultures.",
    "Understanding historical events helps us learn from the past.",
    "Traditional clothing reflects cultural identities.",
    "Studying foreign languages enhances cross-cultural communication.",
    "Participating in cultural events fosters a sense of belonging.",
    "Critical thinking skills are essential for educational success.",
    "Preserving cultural heritage is important for future generations.",
    "The arts provide insights into different cultural perspectives.",
    "Educational policies impact access to quality education.",
    "Cultural sensitivity is important in a globalized world.",
    # Additional sample texts
    "Science education fosters innovation and discovery.",
    "Cultural festivals showcase local traditions and customs.",
    "History lessons teach us about the past and its impact on the present.",
    "Art education encourages creativity and self-expression.",
    "Cultural diversity promotes tolerance and understanding.",
    "Geography classes explore the diverse landscapes of our planet.",
    "Music education enhances cognitive skills and emotional development.",
    "Cultural awareness fosters empathy and respect for others.",
    "Physical education promotes health and well-being.",
    "Literature exposes us to different worldviews and perspectives."
]

additional_labels = [
    "educational",
    "cultural",
    "educational",
    "cultural",
    "educational",
    "cultural",
    "educational",
    "cultural",
    "educational",
    "cultural",
    "educational",
    "cultural",
    "educational",
    "cultural",
    "educational",
    "cultural",
    "educational",
    "cultural",
    "educational",
    "cultural",
    # Additional sample labels
    "educational",
    "cultural",
    "educational",
    "educational",
    "cultural",
    "educational",
    "educational",
    "cultural",
    "educational",
    "educational",
    "educational"
]

# Combine original and additional data
texts += additional_texts
labels += additional_labels
# Tokenize texts
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded_texts = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

# Convert labels to numeric form
label_dict = {"educational": 0, "cultural": 1}
numeric_labels = [label_dict[label] for label in labels]

# Split data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(encoded_texts['input_ids'], numeric_labels, test_size=0.5, random_state=42)

# Convert data into PyTorch tensors
train_dataset = TensorDataset(train_texts, torch.tensor(train_labels))
test_dataset = TensorDataset(test_texts, torch.tensor(test_labels))

# Define the model
model = BertModel.from_pretrained('bert-base-uncased')
classifier = nn.Linear(768, 2)  # 2 output classes for binary classification
model.classifier = classifier  # Replace the existing classifier

# Define loss function, optimizer, and dataloaders
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Training loop
def train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs=3):
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        total_correct = 0
        total_samples = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids, labels = batch
            outputs = model(input_ids)
            logits = outputs.last_hidden_state.mean(dim=1)  # Pooling strategy, you may need to adjust this based on your task
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            _, predicted = torch.max(logits, 1)
            total_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)
        print(f'Epoch {epoch+1}/{num_epochs}, Training loss: {total_loss/len(train_loader)}, Training accuracy: {total_correct/total_samples * 100:.2f}%')
        evaluate_model(model, test_loader, criterion)

def evaluate_model(model, test_loader, criterion):
    model.eval()
    total_loss = 0
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids, labels = batch
            outputs = model(input_ids)
            logits = outputs.last_hidden_state.mean(dim=1)  # Pooling strategy
            loss = criterion(logits, labels)
            total_loss += loss.item()
            _, predicted = torch.max(logits, 1)
            total_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)
    print(f'Test loss: {total_loss/len(test_loader)}, Test accuracy: {total_correct/total_samples * 100:.2f}%')

# Train the model
train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs=10)


Epoch 1/10, Training loss: 6.406407594680786, Training accuracy: 0.00%
Test loss: 5.635807037353516, Test accuracy: 33.33%
Epoch 2/10, Training loss: 5.104282855987549, Training accuracy: 44.44%
Test loss: 4.556516647338867, Test accuracy: 27.78%
Epoch 3/10, Training loss: 4.04052996635437, Training accuracy: 61.11%
Test loss: 3.953909158706665, Test accuracy: 27.78%
Epoch 4/10, Training loss: 3.479784607887268, Training accuracy: 55.56%
Test loss: 3.3945900201797485, Test accuracy: 27.78%
Epoch 5/10, Training loss: 2.6843369007110596, Training accuracy: 55.56%
Test loss: 2.7973155975341797, Test accuracy: 27.78%
Epoch 6/10, Training loss: 1.875372052192688, Training accuracy: 55.56%


KeyboardInterrupt: 