In [1]:
BASE_DIR = r"D:\Columbia\Fall2025\5400\project"

RAW_PARQUET      = fr"{BASE_DIR}\news_raw.parquet"
CLEAN_PARQUET    = fr"{BASE_DIR}\news_clean.parquet"
NLP_INPUT_PARQUET  = fr"{BASE_DIR}\nlp_input.parquet"
NLP_OUTPUT_PARQUET = fr"{BASE_DIR}\nlp_enriched.parquet"


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id

spark = (
    SparkSession.builder
    .appName("5400-news-elt")
    .master("local[*]")
    .config("spark.driver.memory", "10g")     # 或 8g，看你机器内存
    .config("spark.executor.memory", "10g")
    .config("spark.sql.shuffle.partitions", "200")
    .config("spark.default.parallelism", "200")
    .config("spark.sql.parquet.compression.codec", "snappy")
    .getOrCreate()
)


BASE_DIR = r"D:\Columbia\Fall2025\5400\project"
RAW_PARQUET   = fr"{BASE_DIR}\news_raw.parquet"

# 1. 读原始 CSV
csv_path = r"D:\Columbia\Fall2025\5400\project\All_external.csv"  # 按你实际路径改
news_df = spark.read.csv(csv_path, header=True, inferSchema=True)

# print("原始行数:", news_df.count())

# 2. 加唯一 ID（以后 NLP join 要用）
news_df = spark.read.option("header", True).csv(csv_path)

# 先重分区
news_df = news_df.repartition(200)

news_df = news_df.withColumn("news_id", monotonically_increasing_id())

news_df.write.mode("overwrite").parquet(RAW_PARQUET)

In [3]:
from pyspark.sql.functions import col

BASE_DIR = r"D:\Columbia\Fall2025\5400\project"
RAW_PARQUET        = fr"{BASE_DIR}\news_raw.parquet"
CLEAN_PARQUET      = fr"{BASE_DIR}\news_clean.parquet"
NLP_INPUT_PARQUET  = fr"{BASE_DIR}\nlp_input.parquet"

news_df = spark.read.parquet(RAW_PARQUET)

# 1. 保留 Article 非空的
clean_df = news_df.filter(col("Article").isNotNull())
print("去除 Article 空值后行数:", clean_df.count())

# 2. 按需要下采样（比如先 100000 行）
nlp_input_df = clean_df.limit(100000)   # 你可以改成更多，比如 500000

# 3. 保存 clean layer（全量）
clean_df.write.mode("overwrite").parquet(CLEAN_PARQUET)

# 4. 保存 NLP 输入子集（带 news_id + 所有你关心的列）
nlp_input_df.write.mode("overwrite").parquet(NLP_INPUT_PARQUET)

print("clean parquet:", CLEAN_PARQUET)
print("nlp_input parquet:", NLP_INPUT_PARQUET)


去除 Article 空值后行数: 839348
clean parquet: D:\Columbia\Fall2025\5400\project\news_clean.parquet
nlp_input parquet: D:\Columbia\Fall2025\5400\project\nlp_input.parquet


In [1]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0))

CUDA available: True
GPU name: NVIDIA GeForce RTX 5070 Ti Laptop GPU


In [4]:
import os
import pandas as pd
from tqdm import tqdm
import torch
from transformers import pipeline

BASE_DIR = r"D:\Columbia\Fall2025\5400\project"
NLP_INPUT_PARQUET  = fr"{BASE_DIR}\nlp_input.parquet"
NLP_OUTPUT_PARQUET = fr"{BASE_DIR}\nlp_enriched.parquet"
NLP_OUTPUT_CSV     = fr"{BASE_DIR}\nlp_enriched.csv"

os.makedirs(BASE_DIR, exist_ok=True)

print("torch:", torch.__version__)
print("cuda available:", torch.cuda.is_available())
print("device count:", torch.cuda.device_count())
device = 0 if torch.cuda.is_available() and torch.cuda.device_count() > 0 else -1
print("using device =", device)


torch: 2.9.1+cu128
cuda available: True
device count: 1
using device = 0


In [5]:
df = pd.read_parquet(NLP_INPUT_PARQUET)
print(df.shape)
print(df.columns)


(100000, 12)
Index(['Date', 'Article_title', 'Stock_symbol', 'Url', 'Publisher', 'Author',
       'Article', 'Lsa_summary', 'Luhn_summary', 'Textrank_summary',
       'Lexrank_summary', 'news_id'],
      dtype='object')


In [6]:
# 情感分析：英文 binary sentiment
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=device
)

# 摘要模型：DistilBART
summ_pipe = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    device=device
)


Device set to use cuda:0
Device set to use cuda:0


In [7]:
def batch_summarize(texts, max_length=60, min_length=10):
    """输入 list[str]，返回 list[str] 的摘要结果"""
    # 处理空值
    cleaned = [t if isinstance(t, str) and t.strip() != "" else "" for t in texts]
    # Huggingface pipeline 支持 list 输入
    outputs = summ_pipe(
        cleaned,
        max_length=max_length,
        min_length=min_length,
        truncation=True
    )
    return [o["summary_text"] for o in outputs]

def batch_sentiment(texts):
    """输入 list[str]，返回 (label, score) 两个 list"""
    cleaned = [t if isinstance(t, str) and t.strip() != "" else "" for t in texts]
    outputs = sentiment_pipe(cleaned)
    labels = [o["label"] for o in outputs]
    scores = [o["score"] for o in outputs]
    return labels, scores


In [8]:
BATCH_SIZE = 16

articles = df["Article"].fillna("").astype(str).tolist()
n = len(articles)

summary_list = []
sent_label_list = []
sent_score_list = []

for i in tqdm(range(0, n, BATCH_SIZE), desc="Running NLP"):
    batch = articles[i:i+BATCH_SIZE]

    # 摘要
    batch_summ = batch_summarize(batch, max_length=80, min_length=15)
    # 情感
    batch_labels, batch_scores = batch_sentiment(batch)

    summary_list.extend(batch_summ)
    sent_label_list.extend(batch_labels)
    sent_score_list.extend(batch_scores)

# 加回 df
df["article_summary"] = summary_list
df["sentiment_label"] = sent_label_list
df["sentiment_score"] = sent_score_list


Running NLP:   0%|          | 0/6250 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (712 > 512). Running this sequence through the model will result in indexing errors
Running NLP:   0%|          | 0/6250 [00:08<?, ?it/s]


RuntimeError: The size of tensor a (712) must match the size of tensor b (512) at non-singleton dimension 1