In [1]:

!pip install -q kagglehub spacy
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_trf

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m81.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy-curated-tra

In [2]:

import kagglehub, os, pandas as pd


# Download the Kaggle dataset and show where it landed and what it contains.
path = kagglehub.dataset_download("clmentbisaillon/fake-and-real-news-dataset")
print("Dataset files are in:", path)
print(os.listdir(path))

# Read each CSV and tag every row with its class: 0 = fake, 1 = real.
fake = pd.read_csv(os.path.join(path, "Fake.csv")).assign(label=0)
true = pd.read_csv(os.path.join(path, "True.csv")).assign(label=1)

# Stack both frames into one corpus with a fresh 0..n-1 index.
df = pd.concat((fake, true), ignore_index=True)

print(df.shape)
df.head(2)

Downloading from https://www.kaggle.com/api/v1/datasets/download/clmentbisaillon/fake-and-real-news-dataset?dataset_version_number=1...


100%|██████████| 41.0M/41.0M [00:00<00:00, 148MB/s]

Extracting files...





Dataset files are in: /root/.cache/kagglehub/datasets/clmentbisaillon/fake-and-real-news-dataset/versions/1
['True.csv', 'Fake.csv']
(44898, 5)


Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0


In [3]:

import spacy


# Load both English pipelines up front: the small model (nlp_sm) and the
# transformer-based model (nlp_trf). Later cells run them on the same text.
nlp_sm, nlp_trf = (
    spacy.load(model_name)
    for model_name in ("en_core_web_sm", "en_core_web_trf")
)

In [4]:

# Pick one article (row label 10) and show the first 500 characters of it.
sample_text = df.loc[10, "text"]
print(sample_text[:500], "...")

# Run each pipeline on the same text and list every entity it finds as
# "<text> -> <label>", small model first, transformer model second.
parsed_docs = []
for header, pipeline in (
    ("\nEntities (small model):", nlp_sm),
    ("\nEntities (transformer model):", nlp_trf),
):
    parsed = pipeline(sample_text)
    parsed_docs.append(parsed)
    print(header)
    for span in parsed.ents:
        print(span.text, "->", span.label_)

# Keep both parsed documents under their original names for later cells.
doc_sm, doc_trf = parsed_docs

A centerpiece of Donald Trump s campaign, and now his presidency, has been his white supremacist ways. That is why so many of the public feuds he gets into involve people of color. One of his favorite targets, is, of course, the players in the National Football League who dare to exercise their First Amendment rights by kneeling during the national anthem in protest of racist police brutality. Well, there is one person who has figured out that racism is bad for business, even if it did get the o ...

Entities (small model):
Donald Trump -> PERSON
One -> CARDINAL
the National Football League -> ORG
First Amendment -> LAW
one -> CARDINAL
Papa John s.This -> PERSON
doesn -> PERSON
Papa John Schnatter -> PERSON
Papa John's -> PERSON
John Schnatter -> PERSON
weeks -> DATE
NFL -> ORG
AP Business News -> ORG
December 21 -> DATE
2017The -> CARDINAL
boycotts -> GPE
NFL -> ORG
America -> GPE
the Statue of Liberty -> FAC
Trump -> PERSON
Schnatter -> PERSON
Rob Kim/Getty Images -> PERSON

Entities

In [5]:
from spacy import displacy


# Render the named entities (style="ent") of the transformer model's parse of
# the sample article inline in the notebook output (jupyter=True).
displacy.render(doc_trf, style="ent", jupyter=True)

In [6]:

# Number of real-news articles (label == 1) to preview.
N_PREVIEW_ARTICLES = 5

# Filter once, outside the loop: the boolean mask over the whole frame does not
# change between iterations. `.head()` also copes with fewer than
# N_PREVIEW_ARTICLES matching rows instead of raising an IndexError.
real_texts = df.loc[df["label"] == 1, "text"].head(N_PREVIEW_ARTICLES)

# Print the (text, label) pairs the small model finds in each article.
for i, text in enumerate(real_texts, start=1):
    doc = nlp_sm(text)
    print(f"\n🔹 Article {i}")
    print([(ent.text, ent.label_) for ent in doc.ents])


🔹 Article 1
[('WASHINGTON', 'GPE'), ('Reuters', 'ORG'), ('Republican', 'NORP'), ('the U.S. Congress', 'ORG'), ('this month', 'DATE'), ('Sunday', 'DATE'), ('2018', 'DATE'), ('Republicans', 'NORP'), ('U.S.', 'GPE'), ('Mark Meadows', 'PERSON'), ('CBS', 'ORG'), ('Face the Nation', 'WORK_OF_ART'), ('January', 'DATE'), ('the holidays on', 'DATE'), ('Wednesday', 'DATE'), ('November', 'DATE'), ('Republicans', 'NORP'), ('Congress', 'ORG'), ('Donald Trump', 'PERSON'), ('Republicans', 'NORP'), ('Democrats', 'NORP'), ('Trump', 'ORG'), ('about 7 percent', 'PERCENT'), ('House', 'ORG'), ('Democrats', 'NORP'), ('10 to 11 percent', 'PERCENT'), ('Republicans', 'NORP'), ('late December', 'DATE'), ('about $1.5 trillion', 'MONEY'), ('10 years', 'DATE'), ('$20 trillion', 'MONEY'), ('Mark', 'PERSON'), ('Democratic', 'NORP'), ('U.S.', 'GPE'), ('Joseph Crowley', 'PERSON'), ('CBS', 'ORG'), ('Crowley', 'ORG'), ('Republican', 'NORP'), ('United States', 'GPE'), ('$1.5 trillion', 'MONEY'), ('the House of Represent

In [8]:

import re

def regex_rule_based_entities(text):
    """Extract simple entity candidates from ``text`` with regular expressions.

    Parameters
    ----------
    text : str
        Raw text to scan.

    Returns
    -------
    dict[str, list[str]]
        One entry per category, in this order: ``"dates"``, ``"money"``,
        ``"emails"``, ``"percentages"``, ``"capitalized"``. Each value is the
        list of full matched substrings in order of appearance; it is empty
        when nothing matches.
    """
    # Every pattern uses only non-capturing groups, so re.findall returns the
    # whole match instead of the content of a group.
    patterns = {
        # Numeric dates such as 12/31/2017, or a capitalized word followed by
        # four digits such as "December 2017". The second alternative is not
        # restricted to month names, so it also matches e.g. "Since 2017".
        "dates": r"\b(?:\d{1,2}\/\d{1,2}\/\d{2,4}|[A-Z][a-z]+ \d{4})\b",
        # Currency symbol, optional space, digits with optional thousands
        # separators, and an optional decimal part of any length so that
        # amounts like "$1.5" are not truncated to "$1".
        "money": r"[$€£]\s?\d+(?:,\d{3})*(?:\.\d+)?",
        # Basic user@domain.tld shape.
        "emails": r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}",
        # Integer or decimal number followed by "%" (optional space between).
        # The decimal part must be a non-capturing group: with a capturing
        # group findall would return only '' or '.5' instead of '5%' / '3.5 %'.
        "percentages": r"\d+(?:\.\d+)?\s?%",
        # Runs of one or more capitalized words separated by whitespace.
        "capitalized": r"\b(?:[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b",
    }

    return {name: re.findall(pattern, text) for name, pattern in patterns.items()}



# Try the regex extractor on one article (row label 150).
sample_text = df.loc[150, "text"]
preview = sample_text[:300]
print("Sample text (preview):\n", preview, "...\n")

entities = regex_rule_based_entities(sample_text)

# Show at most the first 10 matches of every category.
print("🔹 Rule-based Entities (Regex):")
for category, matches in entities.items():
    shown = matches[:10]
    print(f"{category}: {shown}")

Sample text (preview):
 When Donald Trump allowed popular Nazi blog The Gateway Pundit to have a correspondent show up at the White House, many questioned that decision   but if The Donald was going for comic relief, we re forced to admit that the inclusion of Lucian Wintrich is a complete success.Recently, the White House ...

🔹 Rule-based Entities (Regex):
dates: []
money: []
emails: []
percentages: []
capitalized: ['When Donald Trump', 'Nazi', 'The Gateway Pundit', 'White House', 'The Donald', 'Lucian Wintrich', 'Recently', 'White House', 'Twitter', 'Krang']
