## Installing Modules

In [None]:
# Notebook shell command: install the Hugging Face libraries imported by the
# cells below (transformers, datasets, evaluate).
!pip install transformers datasets evaluate

### Hugging Face Hub Login

In [None]:
import os

from huggingface_hub import login

# SECURITY: an access token must never be committed in a notebook. Any token
# that was previously hard-coded here should be treated as leaked and revoked.
# The token is read from the HF_TOKEN environment variable instead; when the
# variable is unset, `token=None` makes `login()` prompt for it interactively.
login(token=os.environ.get("HF_TOKEN"))

## Loading Dataset

In [None]:
from datasets import load_dataset

# Whole file, loaded with the default split layout (exposes a "train" split).
dataset = load_dataset("json", data_files="dataset7.json")
# print(dataset['train'])
# print(f"sample data{dataset['train'][0]}")

# Carve the single "train" split into 80% / 10% / 10% slices for training,
# evaluation and testing, in that order.
train_dataset, eval_dataset, test_dataset = (
    load_dataset("json", data_files="dataset7.json", split=split_spec)
    for split_spec in ("train[:80%]", "train[80%:90%]", "train[90%:]")
)

In [None]:
# Show a summary of each split, one per line.
print(
    f"Training dataset:{train_dataset}",
    f"Evaluation dataset:{eval_dataset}",
    f"Testing dataset:{test_dataset}",
    sep="\n",
)

## Preprocessing Dataset

In [None]:
from transformers import AutoTokenizer

# Tokenizer of the "distilbert-base-uncased" checkpoint; used by
# preprocess_function to turn each example's text into model inputs.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples with truncation enabled.

    Relies on the module-level ``tokenizer`` and returns whatever it produces.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

# Apply the tokenizer to every split in batched mode (train, eval, test order).
tokenized_train_dataset, tokenized_eval_dataset, tokenized_test_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, eval_dataset, test_dataset)
)

In [None]:
from transformers import DataCollatorWithPadding

# Collator built around the tokenizer; it is handed to the Trainer so that
# examples (tokenized above without padding) are padded when batched.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import evaluate

# Accuracy metric loaded through the `evaluate` library; consumed by
# compute_metrics during training-time evaluation.
accuracy = evaluate.load("accuracy")

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    """Compute accuracy for a ``(predictions, labels)`` evaluation pair.

    The predicted class of each row is the index of its largest score along
    axis 1; the result is whatever the module-level ``accuracy`` metric returns.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

### Defining Labels

In [None]:
# Class index -> human-readable label, and the inverse mapping derived from it.
id2label = {
    0: "Benign",
    1: "Malicious",
}
label2id = {label: index for index, label in id2label.items()}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Sequence-classification model on top of the DistilBERT checkpoint, configured
# for the two labels defined in id2label / label2id.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    label2id=label2id,
    id2label=id2label,
)

In [None]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, reload the
# best checkpoint when training finishes, and publish the result to the Hub.
training_args = TrainingArguments(
    output_dir="nosql-identifier-distilbert",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
    # Target Hub repository. It has to be configured here: the first positional
    # argument of Trainer.push_to_hub() is the commit message, not a repo id.
    hub_model_id="ankush-003/nosql-identifier-distilbert",
    logging_dir="./logs",  # Directory for storing logs
    logging_steps=500,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# Upload the trained model to `hub_model_id` with the default commit message,
# then upload the tokenizer to the same repository.
trainer.push_to_hub()
tokenizer.push_to_hub(training_args.hub_model_id)

In [None]:
from transformers import Trainer, TrainingArguments, BertTokenizer, BertForSequenceClassification
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
# Load the test dataset
# test_dataset = Dataset.load_from_disk("path_to_test_dataset")

# Reload the fine-tuned DistilBERT tokenizer and classifier from the training
# output directory.
model_path = "nosql-identifier-distilbert"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
    label2id=label2id,
    id2label=id2label,
)

# Metrics reported when evaluating on the held-out test split
def compute_metrics(pred):
    """Return accuracy and weighted precision/recall/F1 for a prediction object.

    ``pred.label_ids`` holds the reference labels and ``pred.predictions`` the
    per-class scores; the predicted class is the argmax over the last axis.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average="weighted"
    )
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Evaluation-only setup: only the output directory and eval batch size are set.
training_args = TrainingArguments(
    output_dir="test_distilbert",  # Change this path accordingly
    per_device_eval_batch_size=8,
)

# Trainer used purely to run evaluation, so no train/eval datasets are attached.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
import csv
# Column names expected in the comparison CSV (kept for reference; the row
# written below takes its columns from the evaluation result itself).
fields = ["model_name","eval_loss","eval_accuracy","eval_precision","eval_recall","eval_f1","eval_runtime","eval_samples_per_second","eval_steps_per_second"]

# Evaluate the fine-tuned model on the test dataset
evaluation_result = trainer.evaluate(tokenized_test_dataset)

print("distilBERT model evaluation result:", evaluation_result)
csv_file = "model_comparisions.csv"

# Load earlier results when present; a missing or zero-byte file simply means
# there is nothing to append to yet.
try:
    df = pd.read_csv(csv_file)
except (FileNotFoundError, pd.errors.EmptyDataError):
    df = None

# Tag the metrics with the model name and add them as one new row.
# DataFrame.append() was removed in pandas 2.0, so the row is built as a
# one-row frame and concatenated instead.
model_name = "distil-bert"  # Replace this with the actual model name
evaluation_result["model_name"] = model_name
new_row = pd.DataFrame([evaluation_result])
df = new_row if df is None else pd.concat([df, new_row], ignore_index=True)

# Save the updated DataFrame back to the CSV file
df.to_csv(csv_file, index=False)