In [1]:
! pip install pandas nltk beautifulsoup4 contractions datasets

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-

In [2]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import contractions
from collections import Counter
from datasets import load_dataset

# 确保下载所需的 NLTK 资源
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [6]:
# 加载 AG News 数据集
dataset = load_dataset('ag_news')
train_data = dataset['train']
test_data = dataset['test']

# 将 Dataset 转换为 Pandas DataFrame
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

In [4]:
# 定义停用词，词形还原工具
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# 预处理函数
def preprocess_text(text):
    # 去除HTML标签
    text = BeautifulSoup(text, "html.parser").get_text()

    # 扩展缩写
    text = contractions.fix(text)

    # 去除特殊字符和数字
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # 转为小写
    text = text.lower()

    # 分词
    tokens = word_tokenize(text)

    # 去除停用词
    tokens = [word for word in tokens if word not in stop_words]

    # 词形还原
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # 去除多余的空格并重组文本
    text = ' '.join(tokens)

    return text

In [8]:
!pip install tqdm



In [10]:
from tqdm import tqdm

# 添加进度条并应用预处理函数
tqdm.pandas(desc="Processing train dataset")
train_df['processed_text'] = train_df['text'].progress_apply(preprocess_text)

# 去除重复文本
train_df = train_df.drop_duplicates(subset=['processed_text'])

# 保存预处理后的数据
# train_df.to_csv('preprocessed_ag_news_train.csv', index=False)

# 输出预处理后的数据
train_df

  text = BeautifulSoup(text, "html.parser").get_text()
Processing train dataset: 100%|██████████| 119523/119523 [01:04<00:00, 1854.25it/s]


Unnamed: 0,text,label,processed_text
0,Wall St. Bears Claw Back Into the Black (Reute...,2,wall st bear claw back black reuters reuters s...
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2,carlyle look toward commercial aerospace reute...
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2,oil economy cloud stock outlook reuters reuter...
3,Iraq Halts Oil Exports from Main Southern Pipe...,2,iraq halt oil export main southern pipeline re...
4,"Oil prices soar to all-time record, posing new...",2,oil price soar alltime record posing new menac...
...,...,...,...
119995,Pakistan's Musharraf Says Won't Quit as Army C...,0,pakistan musharraf say quit army chief karachi...
119996,Renteria signing a top-shelf deal Red Sox gene...,1,renteria signing topshelf deal red sox general...
119997,Saban not going to Dolphins yet The Miami Dolp...,1,saban going dolphin yet miami dolphin put cour...
119998,Today's NFL games PITTSBURGH at NY GIANTS Time...,1,today nfl game pittsburgh ny giant time pm lin...


In [11]:
from tqdm import tqdm

# 添加进度条并应用预处理函数
tqdm.pandas(desc="Processing test dataset")
test_df['processed_text'] = test_df['text'].progress_apply(preprocess_text)

# 去除重复文本
test_df = test_df.drop_duplicates(subset=['processed_text'])

# # 保存预处理后的数据
# test_df.to_csv('preprocessed_ag_news_test.csv', index=False)

# 输出预处理后的数据
test_df

  text = BeautifulSoup(text, "html.parser").get_text()
Processing test dataset: 100%|██████████| 7599/7599 [00:03<00:00, 2124.51it/s]


Unnamed: 0,text,label,processed_text
0,Fears for T N pension after talks Unions repre...,2,fear n pension talk union representing worker ...
1,The Race is On: Second Private Team Sets Launc...,3,race second private team set launch date human...
2,Ky. Company Wins Grant to Study Peptides (AP) ...,3,ky company win grant study peptide ap ap compa...
3,Prediction Unit Helps Forecast Wildfires (AP) ...,3,prediction unit help forecast wildfire ap ap b...
4,Calif. Aims to Limit Farm-Related Smog (AP) AP...,3,calif aim limit farmrelated smog ap ap souther...
...,...,...,...
7595,Around the world Ukrainian presidential candid...,0,around world ukrainian presidential candidate ...
7596,Void is filled with Clement With the supply of...,1,void filled clement supply attractive pitching...
7597,Martinez leaves bitter Like Roger Clemens did ...,1,martinez leaf bitter like roger clemens almost...
7598,5 of arthritis patients in Singapore take Bext...,2,arthritis patient singapore take bextra celebr...
