<a href="https://colab.research.google.com/github/canstralian/HF-Spaces/blob/main/Transformers_and_Datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [None]:
!pip install numpy pandas gradio tensorflow torch matplotlib seaborn transformers datasets

In [None]:
# Import Libraries
import getpass

from IPython import get_ipython
from IPython.display import display
import numpy as np
import pandas as pd
import tensorflow as tf
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoModel
from datasets import load_dataset
from huggingface_hub import notebook_login, HfFolder
from google.colab import userdata, auth, drive
import gspread
from google.auth import default
import gradio as gr
import ipywidgets as widgets

In [None]:
# Load the exploit-analyzer checkpoint together with its tokenizer.
# The prediction cells read `outputs.logits`, so the model needs a
# sequence-classification head; the bare `AutoModel` encoder output has none.
# NOTE(review): assumes the checkpoint ships tokenizer files and classification
# weights — confirm against the model repository.
tokenizer = AutoTokenizer.from_pretrained("Canstralian/CySec_Known_Exploit_Analyzer")
model = AutoModelForSequenceClassification.from_pretrained("Canstralian/CySec_Known_Exploit_Analyzer")

In [None]:
def predict_function(input_text):
    """Classify ``input_text`` and return the index of the top-scoring class.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        input_text: Text entered in the Gradio textbox.

    Returns:
        int: Position of the largest logit in the model output.
    """
    # Truncate so inputs longer than the model's maximum length do not fail,
    # and place the tensors on the same device as the model.
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to(model.device)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)
    # Pick the most likely class.
    return outputs.logits.argmax(-1).item()


# Wrap the predictor in a simple text-in / text-out web UI.
iface = gr.Interface(
    fn=predict_function,
    inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
    outputs="text",
    title="Cybersecurity Exploit Analyzer",
    description="Enter text to analyze for potential exploits."
)

iface.launch()

In [None]:
text_input = widgets.Text(placeholder="Enter text here...")
output_label = widgets.Label()


def predict(change):
    """Observer callback: classify the current textbox value and show the result.

    Args:
        change: Change notification passed by ``observe``; unused because the
            current value is read directly from ``text_input``.
    """
    input_text = text_input.value
    # Same preprocess / predict / postprocess steps as the Gradio example.
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = outputs.logits.argmax(-1).item()
    output_label.value = f"Prediction: {prediction}"


# Re-run the prediction whenever the text value changes.
text_input.observe(predict, names='value')

display(text_input, output_label)

In [None]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

# Define metric for evaluation
def compute_metrics(pred):
    """Return accuracy for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores) and
            ``label_ids`` (reference labels).

    Returns:
        dict: ``{"accuracy": value}`` where value is the fraction of rows
        whose highest-scoring class equals the reference label.
    """
    predicted_classes = pred.predictions.argmax(-1)
    is_correct = predicted_classes == pred.label_ids
    return {"accuracy": is_correct.mean()}

# Training configuration with best-model selection.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Evaluate every epoch
    save_strategy="epoch",  # Must match the evaluation strategy for load_best_model_at_end
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    metric_for_best_model="accuracy",  # Use accuracy to select the best model
    load_best_model_at_end=True,  # Automatically load the best model after training
)

# Build the Trainer: model and arguments first, then the metric hook and the
# early-stopping callback, then the train/eval splits.
# NOTE(review): with num_train_epochs=3 and one evaluation per epoch, a
# patience of 3 looks unreachable — confirm the intended values.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,  # custom accuracy metric
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # stop after 3 evaluations without improvement
    train_dataset=tokenized_dataset["train"],  # training split
    eval_dataset=tokenized_dataset["test"],  # evaluation split
)

# Run training
trainer.train()

# Load Datasets

In [None]:
# Fetch both custom datasets from the Hub
ds_wordlists = load_dataset("Canstralian/Wordlists")
ds_pentesting = load_dataset("Canstralian/pentesting_dataset")

named_datasets = {"Wordlists": ds_wordlists, "Pentesting": ds_pentesting}

# Print the structure of each dataset
for label, dataset in named_datasets.items():
    print(f"{label} dataset structure:", dataset)

# Print the first sample of each dataset (adjust the split name if different)
for label, dataset in named_datasets.items():
    print(f"First example from {label} dataset:", dataset["train"][0])

In [None]:
# Column-oriented construction: each key becomes a column.
data = dict(col1=[1, 2, 3], col2=['a', 'b', 'c'])
df = pd.DataFrame(data)

In [None]:
# Row-oriented construction: one inner list per row, column names given separately.
data = [list(row) for row in zip((1, 2, 3), ('a', 'b', 'c'))]
df = pd.DataFrame(data, columns=['col1', 'col2'])

In [None]:
# dtype=object keeps the integers as integers; without it NumPy coerces the
# mixed int/str rows to a single string dtype and col1 becomes '1', '2', '3'.
data = np.array([[1, 'a'], [2, 'b'], [3, 'c']], dtype=object)
df = pd.DataFrame(data, columns=['col1', 'col2'])

# Loading the Tokenizer and Model

In [None]:
from huggingface_hub import login
from google.colab import userdata

# Read the Hugging Face token from Colab's secret store. `userdata.get` raises
# when the secret is missing or notebook access has not been granted, so both
# cases are treated as "no stored token".
try:
    HF_TOKEN = userdata.get('HF_TOKEN')
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    HF_TOKEN = None

# Fall back to an interactive prompt; getpass keeps the token out of the cell output.
if not HF_TOKEN:
    HF_TOKEN = getpass.getpass("Enter your Hugging Face token: ")

# Log in with the token already in hand instead of prompting a second time.
login(token=HF_TOKEN)

# Load the tokenizer and model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
# Authenticate the Colab user and fetch default Google credentials;
# gspread needs them to open the spreadsheet.
auth.authenticate_user()
creds, _project_id = default()
gc = gspread.authorize(creds)

worksheet = gc.open('Your spreadsheet name').sheet1

# get_all_values gives a list of rows.
rows = worksheet.get_all_values()
print(rows)

# Convert to a DataFrame and render.
import pandas as pd
pd.DataFrame.from_records(rows)

In [None]:
# Write the model and then the tokenizer into the same output directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("my_fine_tuned_model")


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Restore the tokenizer and the classifier from the directory written by the save step
tokenizer = AutoTokenizer.from_pretrained("my_fine_tuned_model")
model = AutoModelForSequenceClassification.from_pretrained("my_fine_tuned_model")


# Mount Google Drive

In [None]:
from google.colab import drive

# Attach Google Drive under /content/drive
drive.mount(mountpoint='/content/drive')

In [None]:
# Define a function to tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the notebook-level tokenizer.

    Args:
        examples: Mapping that provides a ``"text"`` entry.

    Returns:
        The tokenizer output, padded to the maximum length and truncated.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)

# Tokenize every split in batches.
# Swap `ds_wordlists` for `ds_pentesting` (or another loaded dataset) to
# tokenize that one instead.
tokenized_dataset = ds_wordlists.map(function=tokenize_function, batched=True)

In [None]:
from transformers import Trainer, TrainingArguments

# Baseline training configuration (no best-model selection)
training_args = TrainingArguments(
    output_dir="./results",  # where checkpoints and logs are written
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # evaluate once per epoch
)

# Wire the model, the training configuration and the tokenized splits together
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_dataset["train"],  # training split
    eval_dataset=tokenized_dataset["test"],  # evaluation split
)

# Run training
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111364677777197, max=1.0)…

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
