<a href="https://colab.research.google.com/github/adarshblock/bert-Automl-optuna-/blob/main/AutoML_for_Multi_Label_Text_Classification_using_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
from transformers import BertTokenizer
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Read the CSV into a DataFrame (adjust the path to wherever birds.csv lives)
data = pd.read_csv('/birds.csv')

# 'Title' supplies the input texts and 'Journal/Conference' the targets;
# both columns are cast to str so every entry has a uniform type
texts, labels = (
    data[column].astype(str).tolist()
    for column in ('Title', 'Journal/Conference')
)

# Turn each distinct label string into an integer class id
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

def preprocess_data(texts, labels, max_length=128, test_size=0.2, random_state=42,
                    model_name='bert-base-uncased'):
    """Tokenize ``texts`` with a BERT tokenizer and build train/validation datasets.

    Args:
        texts: Sequence of strings to tokenize.
        labels: Sequence of integer class ids, one per entry of ``texts``.
        max_length: Maximum token length; longer texts are truncated.
        test_size: Fraction (or absolute count) of samples held out for
            validation, forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible.
        model_name: Name of the pretrained tokenizer checkpoint to load.

    Returns:
        ``(train_data, val_data)``: two ``TensorDataset`` objects whose samples
        are ``(input_ids, label)`` pairs. Only ``input_ids`` are kept; the
        attention mask produced by the tokenizer is not part of the datasets,
        so padded positions hold the tokenizer's pad id.

    Raises:
        ValueError: If ``texts`` is empty or ``texts`` and ``labels`` differ in
            length. Checked up front so the failure happens before the
            tokenizer is downloaded.
    """
    texts = list(texts)
    if len(texts) == 0:
        raise ValueError("texts is empty; nothing to preprocess")
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
        )

    # Load the BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained(model_name)

    # Tokenize the texts; padding=True pads every text to the longest one in the batch
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length, return_tensors='pt')

    # Split token ids and labels into training and validation sets
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        encodings['input_ids'], labels, test_size=test_size, random_state=random_state
    )

    # Class ids must be int64 for the cross-entropy loss used by sequence classifiers
    train_labels = torch.as_tensor(train_labels, dtype=torch.long)
    val_labels = torch.as_tensor(val_labels, dtype=torch.long)

    # Create TensorDatasets of (input_ids, label) pairs
    train_data = torch.utils.data.TensorDataset(train_texts, train_labels)
    val_data = torch.utils.data.TensorDataset(val_texts, val_labels)

    return train_data, val_data

# Tokenize the texts and build the training / validation datasets
train_data, val_data = preprocess_data(texts, encoded_labels)

# Report how many examples ended up in each split
train_size = len(train_data)
val_size = len(val_data)
print("Training data size:", train_size)
print("Validation data size:", val_size)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



Training data size: 28
Validation data size: 8


In [None]:
import torch
from transformers import BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader

def _infer_num_labels(*datasets):
    """Return the number of classes implied by the largest label id in ``datasets``."""
    largest = -1
    for dataset in datasets:
        if dataset is None:
            continue
        for sample in dataset:
            # The label is the last element of every (input_ids, label) sample
            largest = max(largest, int(torch.as_tensor(sample[-1]).max()))
    # Never go below 2: Hugging Face treats num_labels == 1 as a regression head
    return max(largest + 1, 2)


def _padding_mask(input_ids, pad_token_id):
    """Build an attention mask that is 0 on padding positions and 1 elsewhere."""
    if pad_token_id is None:
        return None
    return (input_ids != pad_token_id).long()


def train_model(train_data, val_data, epochs=3, batch_size=16, learning_rate=5e-5, num_labels=None):
    """Fine-tune ``bert-base-uncased`` for single-label text classification.

    Args:
        train_data: Dataset yielding ``(input_ids, label)`` pairs for training.
        val_data: Dataset yielding ``(input_ids, label)`` pairs; evaluated after
            every epoch. May be empty or ``None`` to skip validation.
        epochs: Number of passes over ``train_data``.
        batch_size: Batch size for both loaders.
        learning_rate: Learning rate for AdamW.
        num_labels: Size of the classification head. When ``None`` it is
            inferred as ``max label id + 1`` over both datasets, so the head
            always covers every class id present in the data.

    Returns:
        The fine-tuned ``BertForSequenceClassification`` model. The model is also
        written to ``./model``.

    Raises:
        ValueError: If ``train_data`` is empty.
    """
    if len(train_data) == 0:
        raise ValueError("train_data is empty; nothing to train on")

    if num_labels is None:
        num_labels = _infer_num_labels(train_data, val_data)

    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
    # torch's AdamW replaces the deprecated transformers.AdamW; eps and
    # weight_decay are set to the values the transformers optimizer defaulted to
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)

    # The datasets carry only input_ids, so the attention mask is rebuilt from
    # the pad id; without it the model would attend to padding tokens
    pad_token_id = model.config.pad_token_id

    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    has_validation = val_data is not None and len(val_data) > 0
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False) if has_validation else None

    for epoch in range(epochs):
        model.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs, attention_mask=_padding_mask(inputs, pad_token_id), labels=labels)
            outputs.loss.backward()
            optimizer.step()

        if not has_validation:
            continue

        # Evaluate on the held-out split so each epoch's progress is visible
        model.eval()
        loss_sum = 0.0
        correct = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs, attention_mask=_padding_mask(inputs, pad_token_id), labels=labels)
                # Weight by batch size so the last (smaller) batch is averaged correctly
                loss_sum += outputs.loss.item() * labels.size(0)
                correct += (outputs.logits.argmax(dim=-1) == labels).sum().item()
        sample_count = len(val_data)
        print(
            f"Epoch {epoch + 1}/{epochs} - "
            f"val loss: {loss_sum / sample_count:.4f}, val accuracy: {correct / sample_count:.4f}"
        )

    # Save model after training
    model.save_pretrained('./model')
    return model

In [None]:

# Import necessary libraries
import pandas as pd
from transformers import BertTokenizer
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import optunahub
import optuna

# Read the CSV into a DataFrame (adjust the path to wherever birds.csv lives)
data = pd.read_csv('/birds.csv')

# 'Title' supplies the input texts and 'Year' the targets; both columns are
# cast to str so every entry has a uniform type
texts, labels = (
    data[column].astype(str).tolist()
    for column in ('Title', 'Year')
)

# Turn each distinct label string into an integer class id
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

def preprocess_data(texts, labels, max_length=128):
    """Tokenize ``texts`` with the BERT tokenizer and split ids/labels 80/20.

    Args:
        texts: List of strings to tokenize.
        labels: Class ids aligned with ``texts``.
        max_length: Maximum token length; longer texts are truncated.

    Returns:
        ``(train_ids, val_ids, train_labels, val_labels)`` where the id arrays
        are NumPy arrays of token ids and the labels are torch tensors.
    """
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Padded / truncated token ids, converted to NumPy for scikit-learn
    encoded = bert_tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt',
    )
    token_ids = encoded['input_ids'].numpy()

    # Fixed seed keeps the train / validation split reproducible
    train_ids, val_ids, train_targets, val_targets = train_test_split(
        token_ids, labels, test_size=0.2, random_state=42
    )

    train_targets = torch.tensor(train_targets)
    val_targets = torch.tensor(val_targets)
    return train_ids, val_ids, train_targets, val_targets

# Tokenize the titles and split them (with their encoded labels) into train / validation parts
train_texts, val_texts, train_labels, val_labels = preprocess_data(
    texts=texts,
    labels=encoded_labels,
)

# Objective function for Optuna optimization
def objective(trial):
    """Optuna objective: negated validation accuracy of a logistic regression.

    Samples the inverse regularization strength ``C`` uniformly (``log=False``)
    from [1e-4, 1e2], fits on the module-level training split and scores on the
    validation split. The accuracy is negated because the study minimizes.
    """
    reg_strength = trial.suggest_float('C', 1e-4, 1e2, log=False)

    # Fit the classifier on the training token ids
    classifier = LogisticRegression(C=reg_strength, max_iter=200).fit(train_texts, train_labels)

    # Score on the held-out split
    val_accuracy = accuracy_score(val_labels, classifier.predict(val_texts))

    return -val_accuracy

# Load the simulated-annealing sampler published on OptunaHub
mod = optunahub.load_module("samplers/simulated_annealing")
sampler = mod.SimulatedAnnealingSampler()

# Create the study and run the search; the objective returns -accuracy, hence 'minimize'
study = optuna.create_study(sampler=sampler, direction='minimize')
study.optimize(objective, n_trials=200)

# Display the best hyperparameters
print("Best hyperparameters:", study.best_params)

# Report the accuracy the study actually achieved rather than a hard-coded
# number: best_value is the minimized -accuracy, so negate it back
best_accuracy = -study.best_value
print("Best accuracy:", best_accuracy)


[I 2024-09-09 17:26:52,136] A new study created in memory with name: no-name-ccfd5b3a-c90b-40fb-bf7c-113084bb34ed
[I 2024-09-09 17:26:52,179] Trial 0 finished with value: -0.0 and parameters: {'C': 81.85912527003936}. Best is trial 0 with value: -0.0.
[I 2024-09-09 17:26:52,253] Trial 1 finished with value: -0.0 and parameters: {'C': 87.3832678062322}. Best is trial 0 with value: -0.0.
[I 2024-09-09 17:26:52,317] Trial 2 finished with value: -0.0 and parameters: {'C': 77.8635757793717}. Best is trial 0 with value: -0.0.
[I 2024-09-09 17:26:52,361] Trial 3 finished with value: -0.0 and parameters: {'C': 83.0054169757185}. Best is trial 0 with value: -0.0.
[I 2024-09-09 17:26:52,448] Trial 4 finished with value: -0.0 and parameters: {'C': 85.97171389506885}. Best is trial 0 with value: -0.0.
[I 2024-09-09 17:26:52,500] Trial 5 finished with value: -0.0 and parameters: {'C': 79.903335117611}. Best is trial 0 with value: -0.0.
[I 2024-09-09 17:26:52,569] Trial 6 finished with value: -0.0 a

Best hyperparameters: {'C': 81.85912527003936}
Best accuracy: 0.87
