In [22]:
import pandas as pd, pyarrow
from pathlib import Path
from sympy import init_printing
init_printing()

In [13]:
RAW_PATH = Path("/Users/angryissues/python_projects/ML/data/raw/News_Category_Dataset_v3.json")
df = pd.read_json(RAW_PATH, lines=True)

df["text"] = df["headline"] + " " + df["short_description"]
df[["category", "text", "date"]].head()

Unnamed: 0,category,text,date
0,U.S. NEWS,Over 4 Million Americans Roll Up Sleeves For O...,2022-09-23
1,U.S. NEWS,"American Airlines Flyer Charged, Banned For Li...",2022-09-23
2,COMEDY,23 Of The Funniest Tweets About Cats And Dogs ...,2022-09-23
3,PARENTING,The Funniest Tweets From Parents This Week (Se...,2022-09-23
4,U.S. NEWS,Woman Who Called Cops On Black Bird-Watcher Lo...,2022-09-22


In [14]:
df["date"] = pd.to_datetime(df["date"])

In [2]:
import nltk, json
from nltk.corpus import stopwords

In [3]:
nltk.download("stopwords")
STOPWORDS = set(stopwords.words("english"))

CUSTOM_STOP = {"u", "us", "’s", "“", "”", "–", "…"}
STOPWORDS.update(CUSTOM_STOP)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/angryissues/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
import re, emoji, spacy
from string import punctuation
from pathlib import Path

In [5]:
NLP = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [6]:
URL_RE   = re.compile(r"https?://\S+|www\.\S+")
HTML_RE  = re.compile(r"<.*?>")
NUM_RE   = re.compile(r"\d+")
PUNCT_TBL = str.maketrans("", "", punctuation)

In [7]:
def clean_text(text: str) -> list[str]:
    text = text.lower()                    # 1) lowercase
    text = URL_RE.sub(" ", text)           # 2) remove urls
    text = HTML_RE.sub(" ", text)          # 3) strip html tags
    text = emoji.replace_emoji(text, " ")  # 4) drop emoji
    text = NUM_RE.sub(" ", text)           # 5) drop pure numbers
    text = text.translate(PUNCT_TBL)       # 6) drop punctuation

    doc = NLP(text)                        # 7) tokenize & lemmatize
    tokens = [
        tok.lemma_ for tok in doc
        if tok.is_alpha                  # только буквенные
        and tok.lemma_ not in STOPWORDS  # не стоп-слова
        and len(tok) > 2                 # длина ≥ 3 символа
    ]
    return tokens

In [9]:
from tqdm.auto import tqdm
tqdm.pandas()

In [15]:
df["tokens"] = df["text"].progress_apply(clean_text)

100%|██████████| 209527/209527 [06:30<00:00, 536.34it/s]


Промежуточный результат

In [16]:
df[["headline", "tokens"]].head(2)

Unnamed: 0,headline,tokens
0,Over 4 Million Americans Roll Up Sleeves For O...,"[million, americans, roll, sleeve, omicrontarg..."
1,"American Airlines Flyer Charged, Banned For Li...","[american, airlines, flyer, charge, ban, life,..."


In [20]:
PROCESSED = Path("data/raw/news_tokens.parquet")
df[["category", "date", "tokens"]].to_parquet(PROCESSED, index=False)

In [23]:
# Сколько слов в словаре
vocab = {tok for row in df["tokens"] for tok in row}
print("Vocabulary size:", len(vocab))

Vocabulary size: 94238


In [24]:
# Средняя длина документа
print("Avg tokens per doc:", df["tokens"].str.len().mean())

Avg tokens per doc: 16.89762178621371
