In [1]:
import tensorflow
# Display the installed TensorFlow version as the cell output.
# NOTE(review): no other visible cell references tensorflow, and a later
# tokenizer call requests PyTorch tensors (return_tensors="pt") — confirm this
# import is still needed.
tensorflow.__version__

'2.18.0'

In [1]:
from datasets import load_dataset

# Read the CSV file and request its "train" split directly instead of
# indexing the returned split mapping afterwards.
dataset = load_dataset("csv", data_files="data_set.csv", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

In [2]:
# Drop the columns that are not used for training. Only columns that are still
# present are requested, so re-running this cell after `dataset` has already
# been trimmed is a no-op instead of raising on the missing column names.
columns_to_drop = [
    name for name in ("title", "ai_generated") if name in dataset.column_names
]
if columns_to_drop:
    dataset = dataset.remove_columns(columns_to_drop)

In [3]:
dataset

Dataset({
    features: ['abstract', 'label'],
    num_rows: 4053
})

In [4]:
# Shuffle the rows with a fixed seed before the train/test rows are selected.
# NOTE(review): this rebinds `dataset`, so re-running the cell shuffles the
# already-shuffled rows again — restart and run all for a reproducible split.
dataset = dataset.shuffle(seed=42)

In [5]:
# Work on the first 1200 rows of the shuffled dataset only:
#   rows 0-999     -> training examples
#   rows 1000-1199 -> held-out examples
subset = dataset.select(range(0, 1200))
test_dataset = subset.select(range(1000, 1200))
train_dataset = subset.select(range(0, 1000))

In [6]:
from transformers import DistilBertTokenizerFast

# Load the fast tokenizer that belongs to the uncased DistilBERT base checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    "distilbert-base-uncased",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
def tokenize(examples):
    """Tokenize the "abstract" field of a batch of examples.

    Args:
        examples: Mapping with an "abstract" entry holding the text(s) to
            encode (called with batches by ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with
        truncation enabled and padding set to "max_length".
    """
    abstracts = examples["abstract"]
    return tokenizer(abstracts, truncation=True, padding="max_length")

# Apply `tokenize` batch-wise to both splits (training split first, then the
# held-out split) and rebind each name to its tokenized version.
train_dataset, test_dataset = (
    split.map(tokenize, batched=True) for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [8]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # --- Output locations ---
    output_dir="results",                     # checkpoints are written here
    logging_dir="logs",                       # TensorBoard event files
    # --- Per-epoch cadence: evaluate, checkpoint and log once per epoch ---
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # --- Optimisation ---
    num_train_epochs=3,                       # passes over the training split
    learning_rate=2e-5,
    weight_decay=0.01,                        # regularisation against overfitting
    per_device_train_batch_size=8,            # lower this if the GPU runs out of memory
    per_device_eval_batch_size=16,            # evaluation uses a larger batch
    # --- Checkpoint selection ---
    load_best_model_at_end=True,              # reload the best checkpoint when training ends
    metric_for_best_model="accuracy",         # "best" = highest reported accuracy
)


In [9]:
from transformers import DistilBertForSequenceClassification

# Load the pretrained DistilBERT weights under a two-label sequence
# classification head; the head's weights are newly initialised and get
# trained below.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` evaluation pair.

    Args:
        eval_pred: Two-item iterable that unpacks into the model logits and
            the reference labels.

    Returns:
        dict with a single "accuracy" entry: the score of the arg-max class
        (taken over the last axis of the logits) against the labels.
    """
    logits, labels = eval_pred
    predicted_labels = logits.argmax(axis=-1)
    return {"accuracy": accuracy_score(labels, predicted_labels)}


In [11]:
from transformers import Trainer

# Wire the model, hyper-parameters, data splits and metric function together,
# then run fine-tuning.
# NOTE(review): `test_dataset` is used for the per-epoch evaluation that also
# selects the best checkpoint — consider a separate validation split so the
# final test score is not used for model selection.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,      # supplies the "accuracy" metric
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mameyparashar-ap[0m ([33mameyparashar-ap-delhi-technological-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1636,0.009875,1.0
2,0.0216,0.066202,0.98
3,0.0093,0.001889,1.0


TrainOutput(global_step=375, training_loss=0.06482601197560628, metrics={'train_runtime': 214.8335, 'train_samples_per_second': 13.964, 'train_steps_per_second': 1.746, 'total_flos': 397402195968000.0, 'train_loss': 0.06482601197560628, 'epoch': 3.0})

In [12]:
# Encode one sample sentence as PyTorch tensors.
# NOTE(review): `inputs` is not referenced by any later visible cell — confirm
# whether this cell is still needed.
inputs = tokenizer(
    "This is a sample input text",
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [13]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime at /content/drive.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
from datasets import Dataset


In [37]:
# One-element list holding the abstract to classify with the fine-tuned model.
text_list = ['''
   In this paper, the performance of signaling strategies with high
peak-to-average power ratio is analyzed in both coherent and noncoherent fading
channels. Two recently proposed modulation schemes, namely on-off binary
phase-shift keying and on-off quaternary phase-shift keying, are considered.
For these modulation formats, the optimal decision rules used at the detector
are identified and analytical expressions for the error probabilities are
obtained. Numerical techniques are employed to compute the error probabilities.
It is concluded that increasing the peakedness of the signals results in
reduced error rates for a given power level and hence improve the energy
efficiency.

''']
# Tokenize with the same `tokenize` helper that prepared the training and
# evaluation splits, so padding and truncation at prediction time match what
# the model was fine-tuned on. A separate max_length=128 here would truncate
# long abstracts more aggressively than they were truncated during training.
tokenized = tokenize({"abstract": text_list})

# Wrap the encoded fields in a Hugging Face Dataset for Trainer.predict.
input_dataset = Dataset.from_dict(tokenized)

In [38]:
import numpy as np

# Run the fine-tuned model over the prepared inputs, then pick for every row
# the index of its largest score as the predicted class.
predictions = trainer.predict(input_dataset)
scores = predictions.predictions
predicted_classes = np.argmax(scores, axis=1)
print(predicted_classes)

[0]


In [21]:
# Score the model currently held by the trainer on the held-out split; the
# returned metrics dict is shown as the cell output.
trainer.evaluate(test_dataset)


{'eval_loss': 0.009874851442873478,
 'eval_accuracy': 1.0,
 'eval_runtime': 3.7123,
 'eval_samples_per_second': 53.875,
 'eval_steps_per_second': 3.502,
 'epoch': 3.0}