<a href="https://colab.research.google.com/github/adarshblock/Automl-project/blob/main/Automl_optuna%2Bbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import shutil

import numpy as np
import optuna
import pandas as pd
import sklearn.datasets
import sklearn.linear_model
import sklearn.model_selection
import torch
from optuna.samplers import TPESampler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
from transformers import TrainingArguments, Trainer

# Load the FinBERT tokenizer and a two-label sequence-classification model
# from the same pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-pretrain')
model = BertForSequenceClassification.from_pretrained(
    'yiyanghkust/finbert-pretrain',
    num_labels=2,
)

# Pull the raw texts and their labels out of the training frame.
# NOTE(review): df_training is not defined in this part of the file; it is
# presumably a pandas DataFrame loaded earlier - confirm.
X = list(df_training['DESCRIPTION'])
y = list(df_training['LABEL'])

# Hold out 20% of the examples for validation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=2018,
)

# Tokenize the training split first, then the validation split, with the
# same padding/truncation settings (at most 512 tokens per example).
X_train_tokenized, X_val_tokenized = (
    tokenizer(texts, padding=True, truncation=True, max_length=512)
    for texts in (X_train, X_val)
)


# Create Torch Dataset

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence.
            It must contain an ``"input_ids"`` entry, which defines the
            dataset length.
        labels: Optional per-example labels indexed in parallel with the
            encodings. Pass ``None`` (the default) for unlabeled data.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        A ``"labels"`` entry is included only when labels were supplied.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Compare against None explicitly: a truthiness test raises
        # ValueError for multi-element NumPy arrays (and pandas Series).
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (length of ``"input_ids"``)."""
        return len(self.encodings["input_ids"])

# Pair each tokenized split with its labels so the Trainer can index them.
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train)
val_dataset = Dataset(encodings=X_val_tokenized, labels=y_val)

# Define the metrics the Trainer reports during evaluation
def compute_metrics(p):
    """Compute classification metrics for one evaluation pass.

    Args:
        p: A pair that unpacks into ``(predictions, labels)``. The
            predictions are reduced with ``argmax`` over axis 1, so they
            are presumably per-class scores of shape
            ``(n_examples, n_classes)`` - confirm against the caller.

    Returns:
        A dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"``
        and ``"f1"``.
    """
    scores, labels = p
    predicted_classes = np.argmax(scores, axis=1)

    # Every scorer shares the (y_true, y_pred) keyword signature, so they
    # can be applied uniformly.
    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {
        name: scorer(y_true=labels, y_pred=predicted_classes)
        for name, scorer in scorers.items()
    }

def objective(trial: optuna.Trial):
    """Fine-tune FinBERT with Optuna-sampled hyperparameters and score it.

    A fresh model is loaded from the pretrained checkpoint on every call,
    trained on ``train_dataset`` and scored on ``val_dataset``.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The Matthews correlation coefficient between ``y_val`` and the
        predicted classes on the validation set (higher is better).
    """
    # Load a new model per trial so no weights carry over between trials.
    model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-pretrain', num_labels=2)

    training_args = TrainingArguments(
        output_dir='ade-test',
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform and samples the same log-uniform distribution.
        learning_rate=trial.suggest_float('learning_rate', 1e-6, 1e-4, log=True),
        weight_decay=trial.suggest_float('weight_decay', 1e-6, 0.01, log=True),
        num_train_epochs=trial.suggest_int('num_train_epochs', low=2, high=5),
        per_device_train_batch_size=trial.suggest_categorical("per_device_train_batch_size", [4, 8, 16]),
        # NOTE(review): the evaluation batch size presumably has no effect
        # on the returned score; consider fixing it instead of searching it.
        per_device_eval_batch_size=trial.suggest_categorical("per_device_eval_batch_size", [4, 8, 16]),
        # Register the parameter under the same name as the
        # TrainingArguments field (it was misspelled "warmup_ration").
        warmup_ratio=trial.suggest_float("warmup_ratio", 0.01, 0.3),
        # The range spans four orders of magnitude; sampling it linearly
        # would put about 90% of the draws in [1e-9, 1e-8].
        adam_epsilon=trial.suggest_float("adam_epsilon", 1e-12, 1e-8, log=True),
        disable_tqdm=True,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    # Predict with the trainer that just trained the model rather than
    # building a second Trainer with default arguments.
    raw_pred = trainer.predict(val_dataset).predictions

    # Reduce the per-class scores to predicted class ids.
    y_pred = np.argmax(raw_pred, axis=1)

    return matthews_corrcoef(y_val, y_pred)

# We want to maximize the MCC!
sampler = TPESampler(seed=42)
study = optuna.create_study(study_name='hyper-parameter-search', direction='maximize', sampler=sampler)
# Optimize the objective using 15 different trials
study.optimize(func=objective, n_trials=15, gc_after_trial=True)
# Best (highest) validation MCC found by the search
print(study.best_value)
# Hyperparameter values that produced the best MCC
print(study.best_params)
# Info about the best trial such as start and end datetime, hyperparameters
print(study.best_trial)

trial = study.best_trial
print('MCC:{}'.format(trial.value))
print('Best hyperparameters:{}'.format(trial.params))

# The models trained inside objective() are local to each call, and the
# module-level `model` is only the pretrained checkpoint (it is never passed
# to a Trainer in the visible code), so saving it would not persist a
# fine-tuned model. Re-train once with the best hyperparameters instead.
# NOTE(review): the re-trained model's score may differ somewhat from the
# best trial's score - confirm this is acceptable.
best_params = trial.params
# Accept both the correctly spelled key and the "warmup_ration" spelling
# under which this search space may have registered the parameter.
if 'warmup_ratio' in best_params:
    best_warmup_ratio = best_params['warmup_ratio']
else:
    best_warmup_ratio = best_params['warmup_ration']

best_model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-pretrain', num_labels=2)
best_training_args = TrainingArguments(
    output_dir='ade-test',
    learning_rate=best_params['learning_rate'],
    weight_decay=best_params['weight_decay'],
    num_train_epochs=best_params['num_train_epochs'],
    per_device_train_batch_size=best_params['per_device_train_batch_size'],
    per_device_eval_batch_size=best_params['per_device_eval_batch_size'],
    warmup_ratio=best_warmup_ratio,
    adam_epsilon=best_params['adam_epsilon'],
    disable_tqdm=True,
)
best_trainer = Trainer(
    model=best_model,
    args=best_training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
best_trainer.train()

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
output_dir = './model_save/'
# Unwrap the model if it is wrapped for distributed/parallel training.
best_model_to_save = best_model.module if hasattr(best_model, 'module') else best_model
best_model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Copy the model files to a directory in your Google Drive.
# copytree with an explicit target always lands the files in the same place,
# whether or not the parent folder already exists (unlike `cp -r`).
# NOTE(review): assumes Google Drive is already mounted at /content/drive.
drive_model_dir = '/content/drive/MyDrive/Best_Fine_tuned_model/model_save'
shutil.copytree(output_dir, drive_model_dir, dirs_exist_ok=True)

# Load the model back from the copied directory
loaded_model = BertForSequenceClassification.from_pretrained(drive_model_dir)

# Define test trainer
test_trainer = Trainer(loaded_model)

# Make prediction
raw_pred, _, _ = test_trainer.predict(val_dataset)

# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1).flatten()

true_label_testing = y_val

predictions_test_flat = y_pred

cr = classification_report(true_label_testing, predictions_test_flat)
print(cr)

mcc = matthews_corrcoef(true_label_testing, predictions_test_flat)
print('MCC:{}'.format(mcc))