In [None]:
import pandas as pd

# Read the workbook, using its first column as the row index, and display it.
df = pd.read_excel(io="data", index_col=0)
df

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Keep only fully populated rows (the raw frame is left untouched),
# then confirm that no missing values remain.
df1 = df.copy().dropna()
df1.isna().sum()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Bar chart of how many rows carry each label, rarest label first.
label_counts = df1['label'].value_counts(ascending=True)
label_counts.plot.bar(color=['green', 'blue'])

In [None]:
import numpy as np

# Rough token estimate: whitespace-separated words scaled by an assumed
# average of 1.5 tokens per word.
title_word_counts = df1['title'].str.split().apply(len)
np.mean(title_word_counts * 1.5)

In [None]:
# Add an estimated token count (words * 1.5) for both text columns.
for column in ('title', 'text'):
    df1[f'{column}_tokens'] = df1[column].str.split().apply(len) * 1.5

df1

In [None]:
# Side-by-side histograms of the estimated token counts for titles and texts.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))

ax[0].hist(df1['title_tokens'], bins=50, color='green')
ax[0].set(title='Estimated tokens per title', xlabel='estimated tokens', ylabel='rows')

ax[1].hist(df1['text_tokens'], bins=50, color='blue')
ax[1].set(title='Estimated tokens per text', xlabel='estimated tokens', ylabel='rows')

# Render the figure explicitly so the cell does not echo the raw return value of hist().
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split, and every metric computed from it, is reproducible
# across kernel restarts.
SPLIT_SEED = 42

# 70% train; the held-out 30% is then divided 2:1 into test (20%) and
# validation (10%). Both splits are stratified on the label column.
train, test = train_test_split(
    df1, test_size=0.3, stratify=df1['label'], random_state=SPLIT_SEED
)
test, validation = train_test_split(
    test, test_size=1/3, stratify=test['label'], random_state=SPLIT_SEED
)

train.shape, test.shape, validation.shape, df1.shape


In [None]:
from datasets import Dataset, DatasetDict


# Wrap each pandas split in a Dataset (dropping the pandas index) and bundle
# them under the usual split names.
split_frames = {"train": train, "test": test, "validation": validation}

dataset = DatasetDict(
    {
        split_name: Dataset.from_pandas(frame, preserve_index=False)
        for split_name, frame in split_frames.items()
    }
)

dataset

In [None]:
import os

# The flag is interpreted as a boolean: the string 'False' leaves the symlink
# warning enabled, so a truthy value is needed to actually silence it.
# NOTE(review): huggingface_hub appears to read this flag when it is first
# imported — confirm, and if so move this cell above the `datasets` /
# `transformers` imports so it takes effect.
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

In [None]:
from transformers import AutoTokenizer

# Load one tokenizer per candidate checkpoint (DistilBERT, MobileBERT, TinyBERT order).
tokenizers = []
for model_ckpt in ("data", "data", "data"):
    tokenizers.append(AutoTokenizer.from_pretrained(model_ckpt))

distilbert_tokenizer, mobilebert_tokenizer, tinybert_tokenizer = tokenizers

In [None]:
# Compare how the three tokenizers encode the same short sentence.
text = "Hello World!"

tuple(
    encode(text)
    for encode in (distilbert_tokenizer, mobilebert_tokenizer, tinybert_tokenizer)
)

In [None]:
def tokenize(batch):
    """Encode the ``title`` entries of ``batch`` with ``distilbert_tokenizer``.

    Sequences are truncated and padded (``truncation=True``, ``padding=True``);
    the tokenizer's output is returned unchanged.
    """
    return distilbert_tokenizer(batch['title'], truncation=True, padding=True)

In [None]:
# Tokenize every split in batched mode; batch_size=None hands each split to
# `tokenize` in one piece.
encoded_dataset = dataset.map(function=tokenize, batched=True, batch_size=None)
encoded_dataset

In [None]:
from transformers import AutoModelForSequenceClassification, AutoConfig
import torch

# Class-name <-> class-id mappings handed to the model config.
label2id = {"Real": 0, "Fake": 1}
id2label = {class_id: class_name for class_name, class_id in label2id.items()}

# Checkpoint to fine-tune; the commented-out lines are presumably alternative
# checkpoints to swap in.
model_ckpt = "data"
# model_ckpt = "data"
# model_ckpt = "data"

In [None]:
# Number of target classes, taken from the label mapping.
num_labels = len(label2id)

# Use the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the classification model from the checkpoint with the label mappings
# attached, then move it to the selected device.
config = AutoConfig.from_pretrained(model_ckpt, id2label=id2label, label2id=label2id)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config)
model = model.to(device)

In [None]:
import evaluate
import numpy as np

# Accuracy metric loaded through the `evaluate` library.
accuracy = evaluate.load("accuracy")


def compute_metrics_evaluate(eval_pred):
    """Compute accuracy for a ``(logits, labels)`` pair.

    The predicted class is the argmax of the logits over the last axis; the
    result is whatever ``accuracy.compute`` returns for those predictions.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    return accuracy.compute(references=labels, predictions=predicted_ids)

In [None]:
from transformers import TrainingArguments

batch_size = 32
training_dir = "train_dir"

# Fine-tuning hyper-parameters; evaluation runs once per epoch and the same
# batch size is used for training and evaluation.
training_args = TrainingArguments(
    output_dir=training_dir,
    overwrite_output_dir=True,
    evaluation_strategy='epoch',
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
)

In [None]:
from transformers import Trainer

# Pass the TrainingArguments built earlier. Without `args` the Trainer falls
# back to a default TrainingArguments instance, silently ignoring the chosen
# epochs, learning rate, batch sizes, weight decay and per-epoch evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics_evaluate,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation'],
    # Supplying the tokenizer lets the Trainer pad batches and save it with the model.
    tokenizer=distilbert_tokenizer
)

In [None]:
# Run fine-tuning with the module-level Trainer defined in the previous cell.
trainer.train()

In [None]:
# Score the fine-tuned model on the held-out test split and show its metrics.
preds_output = trainer.predict(test_dataset=encoded_dataset['test'])
preds_output.metrics

In [None]:
# Predicted class = argmax over the class axis of the model outputs.
y_pred = preds_output.predictions.argmax(axis=1)

# Reference labels taken from the encoded test split.
test_rows = encoded_dataset['test'][:]
y_true = test_rows['label']

In [None]:
from sklearn.metrics import classification_report
# Class names ordered by class id so they line up with the report rows.
class_names = [id2label[class_id] for class_id in sorted(id2label)]
print(classification_report(y_true, y_pred, target_names=class_names))

In [None]:
# Metrics callback built on scikit-learn: accuracy plus weighted F1.
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Return accuracy and weighted F1 for a prediction object.

    ``pred`` must expose ``label_ids`` (reference labels) and ``predictions``;
    the predicted class is the argmax over the last axis of ``predictions``.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)

    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1": f1_score(y_true, y_hat, average="weighted"),
    }

In [None]:
# Maps a model name to the checkpoint that train_model() passes to from_pretrained().
# NOTE(review): currently empty, so the benchmarking loop in this cell trains
# nothing — fill in name -> checkpoint entries.
model_dict = {
}

def train_model(model_name):
    """Fine-tune one checkpoint from ``model_dict`` on titles and score it on the test split.

    Args:
        model_name: Key into the module-level ``model_dict``; the mapped value
            is used as the checkpoint for the tokenizer, config and model.

    Returns:
        The ``metrics`` of ``trainer.predict`` on the encoded test split.

    Raises:
        KeyError: If ``model_name`` is not a key of ``model_dict``.
    """
    model_ckpt = model_dict[model_name]
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
    config = AutoConfig.from_pretrained(model_ckpt, label2id=label2id, id2label=id2label)
    model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config).to(device)

    def local_tokenizer(batch):
        """Tokenize the ``title`` entries of a batch with this model's own tokenizer."""
        return tokenizer(batch['title'], padding=True, truncation=True)

    # Re-encode the dataset per model: each checkpoint has its own tokenizer.
    encoded_dataset = dataset.map(local_tokenizer, batched=True, batch_size=None)

    trainer = Trainer(
        model=model,
        # Use the shared TrainingArguments; without `args` the Trainer falls back
        # to default arguments and ignores the configured hyper-parameters.
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=encoded_dataset['train'],
        eval_dataset=encoded_dataset['validation'],
        tokenizer=tokenizer
    )

    trainer.train()

    preds = trainer.predict(encoded_dataset['test'])

    return preds.metrics
    

import time
# Train every model listed in model_dict and record its test metrics together
# with the wall-clock time the run took.
model_performance = {}
for model_name in model_dict:
    print("\n\n")
    print("Training Model: ", model_name)

    started_at = time.time()
    metrics = train_model(model_name)
    elapsed = time.time() - started_at

    model_performance[model_name] = {model_name: metrics, "time taken": elapsed}

In [None]:
# Display the per-model metrics and timings collected by the benchmarking loop.
model_performance

In [None]:
# Save the model held by the module-level `trainer` under "fake_news".
# NOTE(review): the trainers created inside train_model() are local to that
# function, so this saves the model from the single-model run above, not any
# model trained in the benchmarking loop.
trainer.save_model("fake_news")

In [None]:
from transformers import pipeline

# Load the saved "fake_news" model into a text-classification pipeline.
classifier = pipeline(task='text-classification', model='fake_news')