# Loading the data

In [1]:
!pip install datasets evaluate wandb

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.

## Transforming the DataFrames into DataLoaders

In [2]:
import transformers
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv("news_processed_spacy.csv")

In [4]:
data.columns

Index(['text', 'polarity', 'n_tokens', 'total_n_ents', 'n_org_ents',
       'n_per_ents', 'n_gpe_ents', 'n_norp_ents', 'n_date_ents', 'entities',
       'org_ents', 'per_ents', 'gpe_ents', 'norp_ents', 'date_ents'],
      dtype='object')

In [5]:
data = data[["text", "polarity"]]
data.rename({"polarity": "labels"}, axis=1, inplace=True)

In [6]:
data.replace({"True": 1, "Fake":0}, inplace=True)

  data.replace({"True": 1, "Fake":0}, inplace=True)


In [7]:
import json

with open("data_split_indexes.json", "r") as f:
    data_split_indexes = json.load(f)

In [8]:
training_portion = data[data.index.isin(data_split_indexes["training"])]
training_portion.shape

(4434, 2)

In [9]:
testing_portion = data[data.index.isin(data_split_indexes["testing"])]
testing_portion.shape

(950, 2)

In [10]:
validation_portion = data[data.index.isin(data_split_indexes["validation"])]
validation_portion.shape

(950, 2)

In [11]:
label2id = {"True": 1, "Fake":0}
id2label = {value: key for key, value in label2id.items()}

print(id2label)
print(label2id)

{1: 'True', 0: 'Fake'}
{'True': 1, 'Fake': 0}


In [12]:
from datasets import Dataset

train_dataloader = Dataset.from_pandas(training_portion).remove_columns("__index_level_0__")
validation_dataloader = Dataset.from_pandas(validation_portion).remove_columns("__index_level_0__")
test_dataloader = Dataset.from_pandas(testing_portion).remove_columns("__index_level_0__")


In [13]:
train_dataloader

Dataset({
    features: ['text', 'labels'],
    num_rows: 4434
})

In [14]:
train_dataloader[0]

{'text': 'Kerry to go to Paris in gesture of sympathyU.S. Secretary of State John F. Kerry said Monday that he will stop in Paris later this week, amid criticism that no top American officials attended Sunday’s unity march against terrorism.\n\nKerry said he expects to arrive in Paris Thursday evening, as he heads home after a week abroad. He said he will fly to France at the conclusion of a series of meetings scheduled for Thursday in Sofia, Bulgaria. He plans to meet the next day with Foreign Minister Laurent Fabius and President Francois Hollande, then return to Washington.\n\nThe visit by Kerry, who has family and childhood ties to the country and speaks fluent French, could address some of the criticism that the United States snubbed France in its darkest hour in many years.\n\nThe French press on Monday was filled with questions about why neither President Obama nor Kerry attended Sunday’s march, as about 40 leaders of other nations did. Obama was said to have stayed away because

# Transformer Model

In [15]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding

model = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelForSequenceClassification.from_pretrained(model, num_labels=2, id2label=id2label, label2id=label2id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
def preprocess_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
    )

batch_size = 32

tokenized_train_dataloader = train_dataloader.map(preprocess_function, batched=True, batch_size=batch_size)
tokenized_validation_dataloader = validation_dataloader.map(preprocess_function, batched=True, batch_size=batch_size)
tokenized_test_dataloader = test_dataloader.map(preprocess_function, batched=True, batch_size=batch_size)

Map:   0%|          | 0/4434 [00:00<?, ? examples/s]

Map:   0%|          | 0/950 [00:00<?, ? examples/s]

Map:   0%|          | 0/950 [00:00<?, ? examples/s]

In [17]:
tokenized_train_dataloader

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 4434
})

In [18]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [19]:
import evaluate

f1 = evaluate.load("f1")
# precision = evaluate.load("precision")
# recall = evaluate.load("recall")

def evaluate(prediction_array):
    logits, labels = prediction_array
    predictions = np.argmax(logits, axis=1)

    f1_score = f1.compute(predictions=predictions, references=labels)
    # precision_score = precision.compute(predictions=predictions, references=labels)
    # recall_score = recall.compute(predictions=predictions, references=labels)

    return f1_score

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [20]:
import torch
torch.device("cuda")

device(type='cuda')

In [21]:
torch.cuda.is_available()

True

In [22]:
import wandb
import os

# set the wandb project where this run will be logged
os.environ["WANDB_PROJECT"]="make-believe16_transformer_model"

# save your trained model checkpoint to wandb
os.environ["WANDB_LOG_MODEL"]="checkpoint"

# turn off watch to log faster
os.environ["WANDB_WATCH"]="false"



In [23]:
num_epochs = 20
learning_rate = 0.00002

training_args = TrainingArguments(
    output_dir="Transformer Model",
    report_to="wandb",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
    )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataloader,
    eval_dataset=tokenized_validation_dataloader,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=evaluate,

)

In [24]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,F1
1,0.2336,0.113108,0.96208
2,0.0744,0.081173,0.974529
3,0.0326,0.14323,0.968109
4,0.0145,0.112671,0.972912
5,0.0109,0.090912,0.983352
6,0.0111,0.158758,0.973684
7,0.0042,0.102293,0.982222
8,0.0001,0.126711,0.980132
9,0.0002,0.131169,0.983389


[34m[1mwandb[0m: Adding directory to artifact (./Transformer Model/checkpoint-185)... Done. 5.4s
[34m[1mwandb[0m: Adding directory to artifact (./Transformer Model/checkpoint-370)... Done. 6.5s
[34m[1mwandb[0m: Adding directory to artifact (./Transformer Model/checkpoint-555)... Done. 4.0s
[34m[1mwandb[0m: Adding directory to artifact (./Transformer Model/checkpoint-740)... Done. 52.2s
[34m[1mwandb[0m: Adding directory to artifact (./Transformer Model/checkpoint-925)... Done. 19.4s
[34m[1mwandb[0m: Adding directory to artifact (./Transformer Model/checkpoint-1110)... Done. 60.3s
[34m[1mwandb[0m: Adding directory to artifact (./Transformer Model/checkpoint-1295)... Done. 13.6s
[34m[1mwandb[0m: Adding directory to artifact (./Transformer Model/checkpoint-1480)... Done. 4.1s
[34m[1mwandb[0m: Adding directory to artifact (./Transformer Model/checkpoint-1665)... Done. 52.7s


Epoch,Training Loss,Validation Loss,F1
1,0.2336,0.113108,0.96208
2,0.0744,0.081173,0.974529
3,0.0326,0.14323,0.968109
4,0.0145,0.112671,0.972912
5,0.0109,0.090912,0.983352
6,0.0111,0.158758,0.973684
7,0.0042,0.102293,0.982222
8,0.0001,0.126711,0.980132
9,0.0002,0.131169,0.983389
10,0.0001,0.137801,0.981215


[34m[1mwandb[0m: Adding directory to artifact (./Transformer Model/checkpoint-1850)... Done. 4.9s
[34m[1mwandb[0m: Adding directory to artifact (./Transformer Model/checkpoint-2035)... Done. 49.5s
[34m[1mwandb[0m: Adding directory to artifact (./Transformer Model/checkpoint-2220)... Done. 4.1s
[34m[1mwandb[0m: Adding directory to artifact (./Transformer Model/checkpoint-2405)... Done. 56.6s
[34m[1mwandb[0m: Adding directory to artifact (./Transformer Model/checkpoint-2590)... Done. 58.9s
[34m[1mwandb[0m: Adding directory to artifact (./Transformer Model/checkpoint-2775)... Done. 5.7s
[34m[1mwandb[0m: Adding directory to artifact (./Transformer Model/checkpoint-2960)... Done. 59.0s
[34m[1mwandb[0m: Adding directory to artifact (./Transformer Model/checkpoint-3145)... Done. 57.1s
[34m[1mwandb[0m: Adding directory to artifact (./Transformer Model/checkpoint-3330)... Done. 19.3s
[34m[1mwandb[0m: Adding directory to artifact (./Transformer Model/checkpoint-3515

TrainOutput(global_step=3700, training_loss=0.019223348349647444, metrics={'train_runtime': 5334.6348, 'train_samples_per_second': 16.623, 'train_steps_per_second': 0.694, 'total_flos': 1.174720891281408e+16, 'train_loss': 0.019223348349647444, 'epoch': 20.0})

In [25]:
evaluation = trainer.evaluate(tokenized_test_dataloader)

In [None]:
evaluation

In [27]:
wandb.finish()

0,1
eval/f1,▁▅▃▅█▅█▇█▇█▃▇▇████▇▇▂
eval/loss,▂▁▄▂▁▄▂▃▃▃▃█▃▄▄▃▄▃▄▄▃
eval/runtime,▁▁▃▂▂▃▃▄▃▃▃▃▄▃▄▄▄▄▃█▆
eval/samples_per_second,██▆▇▇▆▆▅▅▆▆▆▅▆▅▅▅▅▆▁▃
eval/steps_per_second,██▆▇▇▆▆▅▅▆▅▆▅▆▅▅▅▅▆▁▃
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇█████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇█████
train/grad_norm,▂▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,██▇▇▇▆▆▅▅▅▄▄▄▃▃▂▂▂▁▁
train/loss,█▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/f1,0.96373
eval/loss,0.12848
eval/runtime,16.0838
eval/samples_per_second,59.066
eval/steps_per_second,2.487
total_flos,1.174720891281408e+16
train/epoch,20.0
train/global_step,3700.0
train/grad_norm,0.00034
train/learning_rate,0.0


In [28]:
trainer.save_model("distilbert-make-believe16")