In [None]:
!pip install datasets



In [None]:
!pip install umap-learn



In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import re
import nltk
from typing import Dict, List, Tuple
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from umap import UMAP
from datasets import Dataset, DatasetDict
from tqdm import tqdm
from nltk.corpus import stopwords as nltk_stopwords
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    set_seed
)
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    f1_score
)

# Fix the random seed through transformers' set_seed helper (27 is reused
# for every seeded component in this notebook).
set_seed(27)
# Register tqdm's pandas integration so `progress_apply` is available.
tqdm.pandas()
# Download the NLTK stopword corpus read by Preprocessor.remove_stopwords.
nltk.download('stopwords')
# NOTE(review): this silences *all* warnings, deprecation notices included —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
plt.style.use('ggplot')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Read both CSV files from the current working directory.
train_ds = pd.read_csv("train.csv")
test_ds = pd.read_csv("test.csv")

# Keep the ids aside now: test_ds is later reduced to the text column only.
# Used later for creating the submission file
test_ds_ids = test_ds['id']


In [None]:
def create_new_text(row):
    """Build the combined text for one record.

    Joins the row's "author", "title" and "text" values, in that order,
    with single spaces. All three values must already be strings (missing
    values have to be filled beforehand); otherwise a TypeError is raised.
    """
    fields = (row["author"], row["title"], row["text"])
    return " ".join(fields)


# Apply the same preparation to both frames: blank out missing values in the
# three source columns (create_new_text needs strings), then derive "new_text".
for frame in (train_ds, test_ds):
    for column in ("author", "title", "text"):
        frame[column] = frame[column].fillna("")
    frame["new_text"] = frame.apply(create_new_text, axis=1)

In [None]:
def get_token_counts(dataset: pd.DataFrame) -> Dict:
    """Count the space-separated tokens in every row's "new_text".

    Args:
        dataset: Frame with a "new_text" string column and, optionally, a
            "label" column.

    Returns:
        A dict of token-count lists. When "label" is present the counts are
        grouped under each label value (keys appear in order of first
        occurrence); otherwise they are all stored under "test". The "test"
        key is always present and stays an empty list for labelled data.
    """
    token_counts = {"test": []}
    # Splitting on a literal space (not on arbitrary whitespace) keeps the
    # empty strings produced by consecutive spaces, so "" counts as one token.
    counts = [len(text.split(" ")) for text in dataset["new_text"]]

    # Whether the frame is labelled does not change from row to row, so the
    # check is done once instead of rebuilding the column list per row.
    if "label" in dataset.columns:
        for label, count in zip(dataset["label"], counts):
            token_counts.setdefault(label, []).append(count)
    else:
        token_counts["test"].extend(counts)
    return token_counts


# get_token_counts only reads its argument, so no defensive copy is needed.
train_counts = get_token_counts(train_ds)
test_counts = get_token_counts(test_ds)

In [None]:
def get_tfidf_vectors(corpus: np.ndarray, stop_words: str, max_features: int, n: int) -> np.ndarray:
    """Fit a TF-IDF vectorizer on ``corpus`` using n-grams of exactly length ``n``.

    Args:
        corpus: Documents to vectorise.
        stop_words: Forwarded unchanged to ``TfidfVectorizer(stop_words=...)``;
            the calls in this notebook pass either 'english' or None.
        max_features: Forwarded unchanged to ``TfidfVectorizer``.
        n: Used for both bounds of ``ngram_range``.

    Returns:
        The result of ``fit_transform`` on the corpus.
        NOTE(review): the annotation says np.ndarray, but plot_tfidf_vectors
        calls ``.toarray()`` on these results, so this is presumably a SciPy
        sparse matrix — confirm and adjust the annotation.
    """
    tfidf_options = {
        "stop_words": stop_words,
        "max_features": max_features,
        "ngram_range": (n, n),
    }
    return TfidfVectorizer(**tfidf_options).fit_transform(corpus)


corpus = train_ds['new_text'].values
labels = train_ds['label'].values
max_features = 300

# In TfidfVectorizer, stop_words='english' *removes* the built-in English stop
# words, while stop_words=None keeps every token. The "with_stopwords"
# variables are therefore built with None and the "without_stopwords" ones with
# 'english' (the arguments were previously the other way round).
unigram_vectors_with_stopwords = get_tfidf_vectors(corpus, None, max_features, 1)
bigram_vectors_with_stopwords = get_tfidf_vectors(corpus, None, max_features, 2)
unigram_vectors_without_stopwords = get_tfidf_vectors(corpus, 'english', max_features, 1)
bigram_vectors_without_stopwords = get_tfidf_vectors(corpus, 'english', max_features, 2)

In [None]:
def plot_tfidf_vectors(X: np.ndarray, labels: np.ndarray, title: str) -> None:
    """Project feature vectors to 2-D with PCA, UMAP and t-SNE and plot them side by side.

    Args:
        X: Feature matrix exposing ``.toarray()``. It is densified for PCA and
            UMAP and handed to t-SNE as-is.
        labels: One value per row of ``X``; used to colour the points.
        title: Text for the figure's overall title.
    """
    # Densify once; PCA and UMAP both consume the dense form.
    dense = X.toarray()

    pca = PCA(n_components=2, random_state=27).fit_transform(dense)
    # The iteration count is left at the library default of 1000, the value
    # previously passed explicitly. The keyword was renamed from `n_iter` to
    # `max_iter` in newer scikit-learn, so naming it would tie this call to one
    # version range.
    tsne = TSNE(n_components=2, init="random", random_state=27, perplexity=30, n_jobs=-1).fit_transform(X)
    umap = UMAP(n_components=2, random_state=27, n_neighbors=5, min_dist=0.8, n_jobs=-1).fit_transform(dense)

    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

    # Panel order: PCA, UMAP, t-SNE.
    axes[0].scatter(pca[:,0], pca[:,1], c=labels, alpha=0.05, cmap='coolwarm')
    axes[0].set_title('PCA', fontsize=10)
    axes[1].scatter(umap[:,0], umap[:,1], c=labels, alpha=0.05, cmap='coolwarm')
    axes[1].set_title('UMAP', fontsize=10)
    axes[2].scatter(tsne[:,0], tsne[:,1], c=labels, alpha=0.05, cmap='coolwarm')
    axes[2].set_title('t-SNE', fontsize=10)

    plt.suptitle(title, fontsize=16)
    plt.tight_layout()
    plt.show()

In [None]:
class Preprocessor:
    """Text-cleaning steps applied to the "new_text" column of a DataFrame.

    Every step overwrites "new_text" on the frame it receives and returns that
    same frame, so pass a copy if the original must stay untouched.
    """

    @staticmethod
    def convert_to_lowercase(dataframe: pd.DataFrame) -> pd.DataFrame:
        """Lower-case every "new_text" entry."""
        tqdm.pandas(desc="Converting to lowercase")
        dataframe.loc[:, "new_text"] = dataframe["new_text"].progress_apply(
            lambda x: x.lower()
        )
        return dataframe

    @staticmethod
    def remove_punctuation(dataframe: pd.DataFrame) -> pd.DataFrame:
        """Replace every run of characters outside a-z with a single space.

        Expects lower-cased input: upper-case letters are outside a-z and
        would be stripped along with digits and punctuation.
        """
        def clean(text):
            # Each maximal run of non-letters (newlines and repeated
            # whitespace included) collapses to exactly one space, so no
            # separate newline or whitespace-normalising pass is needed.
            return re.sub(r'[^a-z]+', ' ', text)

        tqdm.pandas(desc="Removing punctuation")
        dataframe.loc[:, "new_text"] = dataframe["new_text"].progress_apply(clean)

        return dataframe

    def remove_stopwords(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Drop NLTK English stop words from every "new_text" entry.

        Entries are split on whitespace and the surviving words are re-joined
        with single spaces.
        """
        # A set gives constant-time membership tests; the list returned by
        # NLTK would be scanned linearly for every single word.
        stopwords = frozenset(nltk_stopwords.words('english'))

        tqdm.pandas(desc="Removing stopwords")
        dataframe.loc[:, "new_text"] = dataframe["new_text"].progress_apply(
            lambda x: " ".join(
                [word for word in x.split() if word not in stopwords]
            )
        )
        return dataframe

    def sanitize(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Run all cleaning steps and return the cleaned columns.

        The order matters: lower-casing must precede punctuation removal,
        which only keeps a-z.

        Returns:
            A frame with "new_text" and, when the input has one, "label".
            Frames without a "label" column are therefore accepted too.
        """
        dataframe = self.convert_to_lowercase(dataframe)
        dataframe = self.remove_punctuation(dataframe)
        dataframe = self.remove_stopwords(dataframe)

        columns = ["new_text"]
        if "label" in dataframe.columns:
            columns.append("label")
        return dataframe[columns]

In [None]:
def parse_ngrams(corpus: np.ndarray, n_gram: Tuple[int, int]) -> List[Tuple[str, int]]:
    """Rank the n-grams of a corpus by their total number of occurrences.

    Args:
        corpus: Documents to count n-grams in.
        n_gram: ``(min_n, max_n)`` passed to ``CountVectorizer(ngram_range=...)``.

    Returns:
        ``(ngram, total_count)`` pairs sorted by count, highest first.
    """
    vec = CountVectorizer(ngram_range=n_gram)
    # fit_transform learns the vocabulary and produces the counts in one pass,
    # instead of tokenising the whole corpus twice (fit, then transform).
    bag_of_words = vec.fit_transform(corpus)
    # Flatten the 1 x vocabulary matrix of column sums into a 1-D array so the
    # per-term lookup below is plain array indexing.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [(word, totals[idx]) for word, idx in tqdm(vec.vocabulary_.items(), desc=f"Processing {n_gram}-grams")]
    return sorted(words_freq, key=lambda x: x[1], reverse=True)


# sanitize() overwrites "new_text" on the frame it is given, so it works on a
# copy and train_ds keeps the raw text.
cleaned_ds = Preprocessor().sanitize(train_ds.copy())
# Label 1 marks fake articles and label 0 real ones.
fake_corpus = cleaned_ds.loc[cleaned_ds["label"] == 1, "new_text"].values
real_corpus = cleaned_ds.loc[cleaned_ds["label"] == 0, "new_text"].values

Converting to lowercase: 100%|██████████| 20800/20800 [00:00<00:00, 29465.38it/s]
Removing punctuation: 100%|██████████| 20800/20800 [00:11<00:00, 1877.25it/s]
Removing stopwords: 100%|██████████| 20800/20800 [00:27<00:00, 753.29it/s]


In [None]:
# Uni-, bi- and tri-gram frequency tables, first for the fake corpus and then
# for the real one (same call order as writing the six calls out by hand).
fake_words, fake_bigrams, fake_trigrams = [
    parse_ngrams(fake_corpus, (size, size)) for size in (1, 2, 3)
]

real_words, real_bigrams, real_trigrams = [
    parse_ngrams(real_corpus, (size, size)) for size in (1, 2, 3)
]

Processing (1, 1)-grams: 100%|██████████| 104650/104650 [00:00<00:00, 876033.41it/s]
Processing (2, 2)-grams: 100%|██████████| 1966308/1966308 [00:02<00:00, 804288.36it/s]
Processing (3, 3)-grams: 100%|██████████| 2833894/2833894 [00:03<00:00, 814531.33it/s]
Processing (1, 1)-grams: 100%|██████████| 94884/94884 [00:00<00:00, 851787.90it/s]
Processing (2, 2)-grams: 100%|██████████| 2901277/2901277 [00:03<00:00, 838669.96it/s]
Processing (3, 3)-grams: 100%|██████████| 4576831/4576831 [00:06<00:00, 751860.69it/s]


In [None]:
# Fine-tuning configuration shared by the cells below
MODEL_NAME = "roberta-base"  # checkpoint name given to from_pretrained for tokenizer and model
BATCH_SIZE = 8  # per-device batch size for both training and evaluation
EPOCHS = 5  # passed as num_train_epochs
LOG_AND_EVAL_STEPS = 1000  # step interval used for logging and for evaluation

In [None]:
# Keep only the columns the model consumes.
train_ds = train_ds[['new_text', 'label']]
test_ds = test_ds[['new_text']]

# Hold out the last 10% of the rows, in file order, for validation.
# NOTE(review): rows are not shuffled before the split — confirm the CSV is not
# ordered by label or source, otherwise the validation set is not representative.
train_size = int(0.9 * len(train_ds))
train_df, val_df = train_ds.iloc[:train_size], train_ds.iloc[train_size:]

splits = {"train": train_df, "val": val_df, "test": test_ds}
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame) for split_name, frame in splits.items()
})
dataset

DatasetDict({
    train: Dataset({
        features: ['new_text', 'label'],
        num_rows: 18720
    })
    val: Dataset({
        features: ['new_text', 'label'],
        num_rows: 2080
    })
    test: Dataset({
        features: ['new_text'],
        num_rows: 5200
    })
})

In [None]:
# Load the tokenizer for the MODEL_NAME checkpoint.
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

In [None]:
def preprocess_function(examples):
    """Tokenize the "new_text" field of a batch with the notebook's tokenizer.

    Inputs are truncated and padded with ``padding="max_length"``; no explicit
    ``max_length`` is given, so the tokenizer's own limit applies.

    Args:
        examples: Mapping that provides the texts under "new_text".

    Returns:
        The tokenizer's output for those texts.
    """
    texts = examples["new_text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded


# Tokenize every split of the DatasetDict; batched=True passes
# preprocess_function several rows per call instead of one.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/18720 [00:00<?, ? examples/s]

Map:   0%|          | 0/2080 [00:00<?, ? examples/s]

Map:   0%|          | 0/5200 [00:00<?, ? examples/s]

In [None]:
# Human-readable names for the two classes, stored in the model config.
id2label = {0: "real", 1: "fake"}
label2id = {"real": 0, "fake": 1}

# `trust_remote_code` is intentionally not passed: it only applies to the Auto
# classes and is ignored by RobertaForSequenceClassification (the library says
# so in a warning), so setting it here had no effect.
model = RobertaForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(pred):
    """Compute accuracy, ROC AUC and F1 for a binary evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores with the class axis last;
            index 1 is treated as the positive class).

    Returns:
        Dict with the keys 'accuracy', 'auc' and 'f1_score'.
    """
    labels = pred.label_ids
    logits = np.asarray(pred.predictions)
    preds = logits.argmax(-1)

    # ROC AUC is a ranking metric and needs a continuous score per sample.
    # Feeding it the hard argmax labels collapses the curve to one operating
    # point. The logit margin orders samples exactly like the softmax
    # probability of class 1 does, so it yields the same AUC.
    positive_scores = logits[..., 1] - logits[..., 0]

    accuracy = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, positive_scores)
    f1 = f1_score(labels, preds)

    return {
        'accuracy': accuracy,
        'auc': auc,
        'f1_score': f1
    }

In [None]:
pip install --upgrade transformers accelerate



In [None]:
# Adjust TrainingArguments
# The evaluation-strategy keyword was renamed from `evaluation_strategy` to
# `eval_strategy` across transformers releases; use whichever name the
# installed version defines so the cell runs on both.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    output_dir="./logs",
    report_to="none",
    seed=27,
    data_seed=27,
    **{eval_strategy_kwarg: "steps"},
    load_best_model_at_end=True,
    eval_steps=LOG_AND_EVAL_STEPS,
    logging_steps=LOG_AND_EVAL_STEPS,
    # Checkpoint at every evaluation. With load_best_model_at_end only a saved
    # checkpoint can be restored, so saving every second evaluation meant the
    # best-scoring step could fall between checkpoints and be lost.
    save_steps=LOG_AND_EVAL_STEPS,
    # Bound disk usage now that checkpoints are written twice as often.
    save_total_limit=2,
    metric_for_best_model="accuracy"
)

In [None]:
# NOTE(review): newer transformers releases appear to deprecate `tokenizer=` in
# favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    compute_metrics=compute_metrics,
    # Stop once the tracked metric has failed to improve for 3 evaluations in a row.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()
# Keep the final validation metrics instead of discarding the return value.
eval_metrics = trainer.evaluate()
trainer.save_model("roberta-news")

# Last expression of the cell, so the metrics are displayed.
eval_metrics

Step,Training Loss,Validation Loss,Accuracy,Auc,F1 Score
1000,0.0402,0.010906,0.999038,0.999037,0.999074
2000,0.0149,0.005963,0.999519,0.9995,0.999537
3000,0.0134,7e-06,1.0,1.0,1.0
4000,0.0154,0.010548,0.998558,0.998611,0.998609
5000,0.0124,0.010008,0.999038,0.999074,0.999073
6000,0.0054,0.005343,0.999519,0.999537,0.999537
