In [1]:
import os
from extract import load_csv, load_csv_welfake, load_csv_summary
from filter import clean_text, clean_text_welfake, clean_text_summary
from trans import normalize_text, normalize_text_welfake, normalize_text_summary
from load import save_jsonl
from split import split_data

def _split_and_save(df, output_dir):
    """Split ``df`` with ``split_data`` and write each part as JSONL.

    Shared final stage of every ``run_etl_*`` pipeline in this cell.

    Args:
        df: The fully cleaned and normalized frame to split.
        output_dir: Directory that receives ``train.jsonl``, ``val.jsonl`` and
            ``late_data.jsonl``. It is created if it does not exist yet.
    """
    train, val, late = split_data(df)

    os.makedirs(output_dir, exist_ok=True)
    # os.path.join (rather than f"{output_dir}/...") so that an output_dir
    # passed with a trailing separator does not yield a doubled slash.
    outputs = (
        (train, "train.jsonl"),
        (val, "val.jsonl"),
        (late, "late_data.jsonl"),
    )
    for part, filename in outputs:
        save_jsonl(part, os.path.join(output_dir, filename))


def run_etl_classification(output_dir):
    """Run the classification pipeline and write its splits to ``output_dir``.

    Stages: ``load_csv`` -> ``clean_text`` -> ``normalize_text``, then a short
    preview (head and column index) is printed before the frame is split and
    saved as JSONL.

    Args:
        output_dir: Directory for the resulting JSONL files (created on demand).
    """
    df = load_csv()
    df = clean_text(df)
    df = normalize_text(df)
    print(df.head())
    print(df.columns)
    _split_and_save(df, output_dir)


def run_etl_welfake(output_dir):
    """Run the WELFake pipeline and write its splits to ``output_dir``.

    Stages: ``load_csv_welfake`` -> ``clean_text_welfake`` ->
    ``normalize_text_welfake``, then a short preview (head and column index)
    is printed before the frame is split and saved as JSONL.

    Args:
        output_dir: Directory for the resulting JSONL files (created on demand).
    """
    df = load_csv_welfake()
    df = clean_text_welfake(df)
    df = normalize_text_welfake(df)
    print(df.head())
    print(df.columns)
    _split_and_save(df, output_dir)


def run_etl_summary(output_dir):
    """Run the summarization pipeline and write its splits to ``output_dir``.

    Stages: ``load_csv_summary`` -> ``clean_text_summary`` ->
    ``normalize_text_summary``. The first five values of the ``summary``
    column are printed as a preview before the frame is split and saved.

    Args:
        output_dir: Directory for the resulting JSONL files (created on demand).
    """
    df = load_csv_summary()
    df = clean_text_summary(df)
    df = normalize_text_summary(df)
    for i, summary in enumerate(df["summary"].head(5)):
        print(f"{i}: {summary}\n")

    _split_and_save(df, output_dir)

if __name__ == "__main__":
    # Run the three ETL pipelines one after another, in this order, each
    # writing into its own output directory.
    for pipeline, target_dir in (
        (run_etl_classification, "classification"),
        (run_etl_welfake, "welfake"),
        (run_etl_summary, "summary"),
    ):
        pipeline(target_dir)
    

   category                                               text
0  WELLNESS  143 Miles in 35 Days: Lessons Learned Resting ...
1  WELLNESS  Talking to Yourself: Crazy or Crazy Helpful? T...
2  WELLNESS  Crenezumab: Trial Will Gauge Whether Alzheimer...
3  WELLNESS  Oh, What a Difference She Made If you want to ...
4  WELLNESS  Green Superfoods First, the bad news: Soda bre...
Index(['category', 'text'], dtype='object')
Train size: 32939
Val size: 8235
Late data size: 4575
   label                                               text
0      1  LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1      1  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
2      0  Bobby Jindal, raised Hindu, uses story of Chri...
3      1  SATAN 2: Russia unvelis an image of its terrif...
4      1  About Time! Christian Group Sues Amazon and SP...
Index(['label', 'text'], dtype='object')
Train size: 51506
Val size: 12877
Late data size: 7154
0: Harry. Potter star. Daniel. Radcliffe gets £20M fortune as he tur

In [6]:
from datasets import load_dataset

# Load the training split of CNN/DailyMail (config "3.0.0")
dataset = load_dataset("cnn_dailymail", "3.0.0", split="train")

# Convert to a pandas DataFrame
df = dataset.to_pandas()

# Save as a CSV file (pandas writes UTF-8 by default); index=False omits the row index
df.to_csv("cnn_dailymail_train.csv", index=False)
# Print the first record as a quick sanity check
print(dataset[0]) 

{'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office char

In [7]:
from datasets import load_dataset

# Load the training split of XSum
dataset = load_dataset("EdinburghNLP/xsum", split="train")

# Convert to a pandas DataFrame
df = dataset.to_pandas()

# Save as a CSV file (pandas writes UTF-8 by default); index=False omits the row index
df.to_csv("xsum_train.csv", index=False)
# Print the first record as a quick sanity check
print(dataset[0]) 

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 204045/204045 [00:00<00:00, 443393.35 examples/s]
Generating validation split: 100%|██████████| 11332/11332 [00:00<00:00, 422371.19 examples/s]
Generating test split: 100%|██████████| 11334/11334 [00:00<00:00, 379029.36 examples/s]




In [11]:
from datasets import load_dataset
import pandas as pd

# Destination of the merged corpus. Defined once so the file that is written
# and the file named in the confirmation message cannot drift apart
# (previously the message reported a different name than the one written).
COMBINED_CSV_PATH = "summarization_combined——new.csv"

# Load the first 50000 CNN/DailyMail training samples
cnn = load_dataset("cnn_dailymail", "3.0.0", split="train[:50000]").to_pandas()
# Rename to the column names XSum uses so the two frames can be concatenated
cnn = cnn.rename(columns={"article": "document", "highlights": "summary"})
cnn = cnn[["document", "summary"]]  # keep only these two columns

# Load the first 50000 XSum training samples
xsum = load_dataset("EdinburghNLP/xsum", split="train[:50000]").to_pandas()
xsum = xsum[["document", "summary"]]  # keep only these two columns

# Concatenate the two datasets (CNN/DailyMail rows first) with a fresh index
combined = pd.concat([cnn, xsum], ignore_index=True)

# Save as a CSV file and report the path that was actually written
combined.to_csv(COMBINED_CSV_PATH, index=False)
print(f"✅ 成功生成 {COMBINED_CSV_PATH}")


✅ 成功生成 summarization_combined.csv
