In [None]:
# !pip install carbontracker

In [None]:
import os
import copy
import pandas as pd
import numpy as np
from tqdm.autonotebook import tqdm

import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split, KFold

import transformers
from transformers import get_linear_schedule_with_warmup

import re

  from tqdm.autonotebook import tqdm


`Dataset`: Building a pytorch dataset that can be fed into the pretrained model

In [None]:
df = pd.read_csv('/content/drive/MyDrive/fake_or_real_news.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,title,Text,Polarity
0,8476,You Can Smell Hillary‚Äôs Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"‚Äî Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [None]:
df = df.drop_duplicates()
# df = df.drop(['title', 'Unnamed: 0'], axis=1)
df.head()

Unnamed: 0.1,Unnamed: 0,title,Text,Polarity
0,8476,You Can Smell Hillary‚Äôs Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"‚Äî Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


Clean data

In [None]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
def clean_tweet(data):
    if type(data) == float:
        return ""
    temp = data.lower()
    temp = re.sub("'", "", temp)  # to avoid removing contractions in english
    temp = re.sub("@[A-Za-z0-9_]+", "", temp)  # remove mentions
    temp = re.sub("#[A-Za-z0-9_]+", "", temp)  # remove hashtags
    temp = re.sub(r'http\S+', '', temp)  # remove urls
    temp = re.sub('[()!?]', ' ', temp)  # remove special characters
    temp = re.sub('\[.*?\]', ' ', temp)
    temp = re.sub("[^a-z0-9]", " ", temp)  # convert all to lower case
    temp = temp.split()
    temp = [w for w in temp if not w in stopwords.words('english')]
    # temp = [stemmer.stem(word) for word in tweet]
    temp = " ".join(word for word in temp)
    return temp
df['Text'] = df['Text'].apply(clean_tweet)

In [None]:
id2label = {str(i): label for i, label in enumerate(df["Polarity"].unique().tolist())}
label2id = {v: k for k, v in id2label.items()}

print(label2id)

{'FAKE': '0', 'REAL': '1'}


In [None]:
# Create a "labels" column from the label2id mapping
df = (df.assign(labels=df["Polarity"].map(label2id)) # Create a labels column (for expected DistilBERT input)
             )
df.tail()

Unnamed: 0.1,Unnamed: 0,title,Text,Polarity,labels
6330,4490,State Department says it can't find emails fro...,state department told republican national comm...,REAL,1
6331,8062,The ‚ÄòP‚Äô in PBS Should Stand for ‚ÄòPlutocratic‚Äô ...,p pbs stand plutocratic pentagon posted oct 27...,FAKE,0
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,anti trump protesters tools oligarchy reform a...,FAKE,0
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...",addis ababa ethiopia president obama convened ...,REAL,1
6334,4330,Jeb Bush Is Suddenly Attacking Trump. Here's W...,jeb bush suddenly attacking trump heres matter...,REAL,1


Create a pytorch dataset

In [None]:
!pip install datasets
!pip install transformers[sentencepiece]

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m507.1/507.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m115.3/115.3 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m134.8/134.8 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Installing colle

In [None]:
from datasets import Dataset

dataset = Dataset.from_pandas(df).train_test_split(train_size=0.8, seed=123)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'title', 'Text', 'Polarity', 'labels', '__index_level_0__'],
        num_rows: 5068
    })
    test: Dataset({
        features: ['Unnamed: 0', 'title', 'Text', 'Polarity', 'labels', '__index_level_0__'],
        num_rows: 1267
    })
})


In [None]:
dataset = dataset.class_encode_column("labels")

Flattening the indices:   0%|          | 0/5068 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/5068 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/1267 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/1267 [00:00<?, ? examples/s]

Tokenization

In [None]:
cols_to_remove = [col for col in dataset["train"].column_names if col != "labels"]
print(cols_to_remove)

['Unnamed: 0', 'title', 'Text', 'Polarity', '__index_level_0__']


In [None]:
from transformers import AutoTokenizer

# Load Distilbert tokenizer and tokenize the texts
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# tokenise and encode the dataset
def tokenise(batch):
  tokenised_batch = tokenizer(batch['Text'], padding=True, truncation=True, max_length=128)
  return tokenised_batch

dataset_enc = dataset.map(tokenise, batched=True, remove_columns=cols_to_remove, num_proc=4)

# Set dataset format for pytorch
dataset_enc.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

print(dataset_enc['train'].column_names)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map (num_proc=4):   0%|          | 0/5068 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1267 [00:00<?, ? examples/s]

['labels', 'input_ids', 'attention_mask']


In [None]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

# instantiate data collator with dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Create data loaders to reshape data for PyTorch model
train_dataLoader = DataLoader(dataset_enc['train'],
                           shuffle=True,
                           batch_size=8,
                           collate_fn=data_collator)

eval_dataLoader = DataLoader(dataset_enc['test'], batch_size=8, collate_fn=data_collator)

In [None]:
from transformers import AutoModelForSequenceClassification

# Dynamically set number of class labels based on dataset
num_labels = dataset['train'].features['labels'].num_classes
print(f"Number of labels: {num_labels}")

print(f"Number of labels: {num_labels}")

# Load model from checkpoint
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

Number of labels: 2
Number of labels: 2


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Define hyperparameters, optimizer and learning rate scheduler

In [None]:
from transformers import AdamW
from transformers import get_scheduler

# Model parameters
learning_rate = 5e-5
num_epochs = 5

# Create optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Learning rate scheduler
num_training_batches = len(train_dataLoader)
num_training_steps = num_epochs * num_training_batches
lr_scheduler = get_scheduler("linear",
                             optimizer=optimizer,
                             num_warmup_steps=0,
                             num_training_steps=num_training_steps)

# Set the device automatically (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Move model to device
model.to(device)



cuda


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:

from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

# Train the model with PyTorch training loop
model.train()
for epoch in range(num_epochs):
    for batch in train_dataLoader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/3170 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Save model
# model.save_pretrained(f"{root_dir}/fakenews")

NameError: name 'root_dir' is not defined

In [None]:
from datasets import load_metric

# Load metric
metric = load_metric("glue", "mrpc")

# Iteratively evaluate the model and compute metrics
model.eval()
for batch in eval_dataLoader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

# Get model accuracy and F1 score
metric.compute()

  metric = load_metric("glue", "mrpc")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

{'accuracy': 0.9534333070244673, 'f1': 0.9557389347336834}

In [None]:
# Inferencing the model
data = ["Trump is dead"]

# Tokenize inputs
inputs = tokenizer(data, padding=True, truncation=True, return_tensors="pt").to(device)

# Inference model and get logits
outputs = model(**inputs)
print(outputs)

SequenceClassifierOutput(loss=None, logits=tensor([[ 2.6704, -2.9458]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [None]:
# Convert logits to class probabilities
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[0.9964, 0.0036]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
