In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np

import nltk, re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
from scipy.sparse import hstack

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier


In [None]:
df = pd.read_csv('src/fake_news_dataset.csv')

In [None]:
tfidf = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.9
)
X_tfidf = tfidf.fit_transform(df['text'])
y = df['label'].map({'fake': 1, 'real': 0}).values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
clf_lr = LogisticRegression(
    max_iter=200,
    C=2.0,
    n_jobs=-1
)
clf_lr.fit(X_train, y_train)
pred = clf_lr.predict(X_test)

In [None]:
acc = accuracy_score(y_test, pred)
f1 = f1_score(y_test, pred)
prec = precision_score(y_test, pred)
recall = recall_score(y_test, pred)

print(f'acc = {acc}\nf1 = {f1}\nprecision_score = {prec}\nrecall_score = {recall}')

acc = 0.5035
f1 = 0.5165530671859786
precision_score = 0.5059608965188365
recall_score = 0.5275982098458478


In [None]:
clf_svm = LinearSVC()
clf_svm.fit(X_train, y_train)
pred = clf_svm.predict(X_test)

In [None]:
acc = accuracy_score(y_test, pred)
f1 = f1_score(y_test, pred)
prec = precision_score(y_test, pred)
recall = recall_score(y_test, pred)

print(f'acc = {acc}\nf1 = {f1}\nprecision_score = {prec}\nrecall_score = {recall}')

acc = 0.5045
f1 = 0.5146914789422136
precision_score = 0.5069946936806561
recall_score = 0.5226255594231726


In [None]:
rf = RandomForestClassifier(
    n_estimators=300,
    class_weight='balanced'
)
rf.fit(X_train, y_train)
pred = rf.predict(X_test)

In [None]:
acc = accuracy_score(y_test, pred)
f1 = f1_score(y_test, pred)
prec = precision_score(y_test, pred)
recall = recall_score(y_test, pred)

print(f'acc = {acc}\nf1 = {f1}\nprecision_score = {prec}\nrecall_score = {recall}')

acc = 0.512
f1 = 0.5601622352410996
precision_score = 0.5121549237742068
recall_score = 0.618100447538538


In [None]:
stop_words = set(stopwords.words('english'))
stop_words.update(['mr', 'else'])
lemmatizer = WordNetLemmatizer()

def preprocess(text, reg=r'[^a-zA-Z\s]'):
    text = re.sub(reg, '', text.lower())
    tokens = nltk.word_tokenize(text)
    return [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]

In [None]:
df['tokens'] = df['text'].apply(preprocess)

In [None]:
labels = df['label'].tolist()

feature_names = np.array(tfidf.get_feature_names_out())

def top_tfidf_words(label_value, top_n=20):
    idx = np.where(np.array(labels) == label_value)[0]
    class_tfidf = X_tfidf[idx].mean(axis=0).A1
    top_idx = np.argsort(class_tfidf)[::-1][:top_n]
    return pd.DataFrame({
        'term': feature_names[top_idx],
        'mean_tfidf': class_tfidf[top_idx]
    })

top_fake = top_tfidf_words('fake')
top_real = top_tfidf_words('real')

In [None]:
fake_terms = top_fake['term'].tolist()
real_terms = top_real['term'].tolist()

fake_set = set(fake_terms)
real_set = set(real_terms)

def has_fake_tfidf(tokens):
    return int(any(t in fake_set for t in tokens))

def has_real_tfidf(tokens):
    return int(any(t in real_set for t in tokens))

df['has_fake_tfidf'] = df['tokens'].apply(has_fake_tfidf)
df['has_real_tfidf'] = df['tokens'].apply(has_real_tfidf)

In [None]:
def tfidf_counts(tokens):
    c = Counter(tokens)
    fake_count = sum(c[w] for w in fake_set)
    real_count = sum(c[w] for w in real_set)
    return pd.Series({'fake_tfidf_count': fake_count,
                      'real_tfidf_count': real_count})

df[['fake_tfidf_count', 'real_tfidf_count']] = df['tokens'].apply(tfidf_counts)

df['len_tokens'] = df['tokens'].apply(len)
df['fake_tfidf_frac'] = df['fake_tfidf_count'] / df['len_tokens'].clip(lower=1)
df['real_tfidf_frac'] = df['real_tfidf_count'] / df['len_tokens'].clip(lower=1)

In [None]:
e_cols = ['has_fake_tfidf', 'has_real_tfidf',
              'fake_tfidf_frac', 'real_tfidf_frac', 'len_tokens']
e_X = df[e_cols].to_numpy().astype(float)

In [None]:
X = hstack([X_tfidf, e_X])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
clf = LogisticRegression(
    max_iter=200,
    C=2.0,
    n_jobs=-1
)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

In [None]:
acc = accuracy_score(y_test, pred)
f1 = f1_score(y_test, pred)
prec = precision_score(y_test, pred)
recall = recall_score(y_test, pred)

print(f'acc = {acc}\nf1 = {f1}\nprecision_score = {prec}\nrecall_score = {recall}')

acc = 0.507
f1 = 0.5201946472019465
precision_score = 0.5092901381610291
recall_score = 0.5315763301839881


In [None]:
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    BertTokenizer,
    BertForSequenceClassification
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch




In [None]:
df['label_id'] = df['label'].map({'real': 0, 'fake': 1}).astype(int)

train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['label_id']
)

train_ds = Dataset.from_pandas(train_df[['text', 'label_id']])
test_ds  = Dataset.from_pandas(test_df[['text', 'label_id']])


In [None]:
MODEL_NAME = 'distilbert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

MAX_LEN = 128

def tokenize_batch(batch):
    return tokenizer(
        batch['text'],
        padding='max_length',
        truncation=True,
        max_length=MAX_LEN
    )

train_ds_tok = train_ds.map(tokenize_batch, batched=True)
test_ds_tok  = test_ds.map(tokenize_batch, batched=True)

train_ds_tok = train_ds_tok.rename_column('label_id', 'label')
test_ds_tok  = test_ds_tok.rename_column('label_id', 'label')

train_ds_tok.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'label']
)
test_ds_tok.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'label']
)


Map: 100%|██████████| 16000/16000 [00:02<00:00, 6881.18 examples/s]
Map: 100%|██████████| 4000/4000 [00:00<00:00, 7818.87 examples/s]


In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

batch_size = 8

training_args = TrainingArguments(
    output_dir='lesson31',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=0.001,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=100,
    report_to="none"
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    report = classification_report(labels, preds, output_dict=True, zero_division=0)
    return {
        "precision_fake": report["1"]["precision"],
        "recall_fake": report["1"]["recall"],
        "f1_fake": report["1"]["f1-score"],
        "accuracy": report["accuracy"],
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds_tok,
    eval_dataset=test_ds_tok,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [None]:
trainer.train()

pred_out = trainer.predict(test_ds_tok)
logits = pred_out.predictions
y_true = pred_out.label_ids
y_pred = np.argmax(logits, axis=-1)

classification_report(y_true, y_pred, target_names=['real', 'fake'])

Epoch,Training Loss,Validation Loss,Precision Fake,Recall Fake,F1 Fake,Accuracy
1,0.6946,0.693135,0.50275,1.0,0.669107,0.50275
2,0.6932,0.693345,0.50275,1.0,0.669107,0.50275
3,0.6932,0.693133,0.50275,1.0,0.669107,0.50275


'              precision    recall  f1-score   support\n\n        real       0.00      0.00      0.00      1989\n        fake       0.50      1.00      0.67      2011\n\n    accuracy                           0.50      4000\n   macro avg       0.25      0.50      0.33      4000\nweighted avg       0.25      0.50      0.34      4000\n'

С БЕРТ тоже печаль...