In [3]:
import pandas as pd

nlp_layer_path = r"D:\Columbia\Fall2025\5400\project\layer\silver_for_nlp"
df = pd.read_parquet(nlp_layer_path)

print("原始行数:", len(df))
print("列名：", df.columns.tolist())

cols = ["Article_title", "Article", "Lsa_summary", "Luhn_summary", "Textrank_summary", "Lexrank_summary"]

print("\n====== 各列非空数量 ======")
for col in cols:
    if col in df.columns:
        n_nonnull = df[col].notna().sum()
        print(f"{col}: {n_nonnull}")

print("\n====== 各列非空前 3 行 ======")
for col in cols:
    if col in df.columns:
        print(f"\n=== {col}（非空前 3 行） ===")
        nonnull_rows = df[df[col].notna() & (df[col].astype(str).str.strip() != "")]
        print(nonnull_rows[col].head(3))


原始行数: 3242351
列名： ['Date', 'Article_title', 'Stock_symbol', 'Publisher', 'Author', 'Url', 'Article']

Article_title: 3242351
Article: 0


=== Article_title（非空前 3 行） ===
0                     LightInTheBox reports Q3 results
1    Qualcomm Looks to Reject Broadcom Offer, and M...
2         Top Performing Industries For April 21, 2016
Name: Article_title, dtype: object

=== Article（非空前 3 行） ===
Series([], Name: Article, dtype: object)


In [4]:
import torch

print("torch:", torch.__version__, "cuda:", torch.cuda.is_available())
print("device count:", torch.cuda.device_count())

if torch.cuda.is_available() and torch.cuda.device_count() > 0:
    print("device 0 name:", torch.cuda.get_device_name(0))
else:
    print("⚠ 当前 PyTorch 没有可用的 CUDA 设备，将使用 CPU。")

device = 0 if (torch.cuda.is_available() and torch.cuda.device_count() > 0) else -1
print("using device =", device)


torch: 2.9.1+cu128 cuda: True
device count: 1
device 0 name: NVIDIA GeForce RTX 5070 Ti Laptop GPU
using device = 0


In [14]:
from sentence_transformers import SentenceTransformer
import torch


model = SentenceTransformer("all-mpnet-base-v2")
print(model.device)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


KeyboardInterrupt: 

In [2]:
import torch
print(torch.cuda.is_available())


True


In [1]:
import torch
print(torch.cuda.get_device_name(0))


NVIDIA GeForce RTX 5070 Ti Laptop GPU


In [5]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cuda")
emb = model.encode("hello world")
print(type(emb), len(emb))


  from .autonotebook import tqdm as notebook_tqdm


<class 'numpy.ndarray'> 384


In [13]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"# 用GPU
os.environ["HF_HOME"] = r"D:\Columbia\huggingface_cache"  # 保证在 import 之前

from transformers import pipeline
import torch

device = 0
print("device =", device)

# ① 情感分析（英文）
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=device
)

# ② 摘要模型 —— 换成小的 DistilBART，而不是 facebook/bart-large-cnn
summ_pipe = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    device=device
)

txt = "The market fell sharply today, but some analysts remain optimistic about the long-term outlook."
print("sentiment:", sentiment_pipe(txt))
print("summary:", summ_pipe(txt, max_length=40, min_length=10, truncation=True)[0]["summary_text"])


device = 0


Device set to use cuda:0
Device set to use cuda:0
Your max_length is set to 40, but your input_length is only 20. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=10)


sentiment: [{'label': 'POSITIVE', 'score': 0.9784606099128723}]
summary:  The market fell sharply today, but some analysts remain optimistic about the long-term outlook .


In [6]:
import os
import pandas as pd
from transformers import pipeline
import torch
from tqdm import tqdm

OUT_DIR = r"D:\Columbia\project_layers"
os.makedirs(OUT_DIR, exist_ok=True)   # ⭐ 先确保目录存在

OUT_PARQUET = os.path.join(OUT_DIR, "nlp_enriched_sample.parquet")
OUT_CSV     = os.path.join(OUT_DIR, "nlp_enriched_sample.csv")


# 1. 一些环境变量（可选）
os.environ["CUDA_VISIBLE_DEVICES"] = ""   # 强制不用 GPU
os.environ["HF_HOME"] = r"D:\Columbia\huggingface_cache"  # 模型缓存位置

print("torch version:", torch.__version__)
print("cuda available:", torch.cuda.is_available())

# 2. 读取 Spark 导出的 Parquet（注意：这里是 Windows 路径，不要 file:///）
NLP_LAYER_DIR = r"D:\Columbia\Fall2025\5400\project\layer\silver_for_nlp"

df = pd.read_parquet(NLP_LAYER_DIR)
print("原始行数:", len(df))

df = df.dropna(subset=["Article_title"]).reset_index(drop=True)
print("去除 article 空值后:", len(df))

# 为了先验证流程，可以先抽样一小部分
df_sample = df.sample(n=500, random_state=1031).reset_index(drop=True)

# 3. 准备两个 pipeline（都在 CPU 上）
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=-1
)

summ_pipe = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    device=-1
)

# 4. 对每一篇文章跑情感 + 摘要
sentiment_labels = []
sentiment_scores = []
summaries = []

for text in tqdm(df_sample["Article"].fillna(""), desc="Running NLP"):
    txt = text.strip()
    if not txt:
        sentiment_labels.append("NEUTRAL")
        sentiment_scores.append(0.0)
        summaries.append("")
        continue

    # 情感（先截断一点，避免特别长）
    try:
        s = sentiment_pipe(txt[:512])[0]
        sentiment_labels.append(s["label"])
        sentiment_scores.append(float(s["score"]))
    except Exception as e:
        sentiment_labels.append("ERROR")
        sentiment_scores.append(0.0)

    # 摘要
    try:
        sum_text = summ_pipe(
            txt,
            max_length=80,
            min_length=20,
            truncation=True
        )[0]["summary_text"]
    except Exception as e:
        sum_text = ""
    summaries.append(sum_text)

# 5. 写回 DataFrame
df_sample["sentiment_label"] = sentiment_labels
df_sample["sentiment_score"] = sentiment_scores
df_sample["article_summary"] = summaries

# 6. 保存结果
OUT_PARQUET = r"D:\Columbia\project_layers\nlp_enriched_sample.parquet"
OUT_CSV = r"D:\Columbia\project_layers\nlp_enriched_sample.csv"

df_sample.to_parquet(OUT_PARQUET, index=False)
df_sample.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")

print("已保存：")
print("  ", OUT_PARQUET)
print("  ", OUT_CSV)

print(df_sample[["Article_title", "sentiment_label", "sentiment_score", "article_summary"]].head())


torch version: 2.9.1+cu128
cuda available: True
原始行数: 3242351
去除 article 空值后: 3242351


Device set to use cpu
Device set to use cpu
Running NLP: 100%|██████████| 500/500 [00:00<?, ?it/s]

已保存：
   D:\Columbia\project_layers\nlp_enriched_sample.parquet
   D:\Columbia\project_layers\nlp_enriched_sample.csv
                                       Article_title sentiment_label  \
0  IDEXX (IDXX) Q4 Earnings Top Estimates, Raises...         NEUTRAL   
1          Orbitz Rewards Program Exceeds 4M Members         NEUTRAL   
2                    Gabelli Maintains Buy on Cameco         NEUTRAL   
3         Spain 10-Year Government Bond Yield 6.674%         NEUTRAL   
4  Benchmark Reiterates Sell on Marathon Oil, Red...         NEUTRAL   

   sentiment_score article_summary  
0              0.0                  
1              0.0                  
2              0.0                  
3              0.0                  
4              0.0                  





In [8]:
print(df_sample[["Article_title", "sentiment_label", "sentiment_score", "article_summary"]].head(50))

                                        Article_title sentiment_label  \
0   IDEXX (IDXX) Q4 Earnings Top Estimates, Raises...         NEUTRAL   
1           Orbitz Rewards Program Exceeds 4M Members         NEUTRAL   
2                     Gabelli Maintains Buy on Cameco         NEUTRAL   
3          Spain 10-Year Government Bond Yield 6.674%         NEUTRAL   
4   Benchmark Reiterates Sell on Marathon Oil, Red...         NEUTRAL   
5   Huntington to Strengthen Michigan Network - An...         NEUTRAL   
6            OncoGenex Pharmaceuticals beats by $0.14         NEUTRAL   
7                 BGC Partners Reiterates Q2 Forecast         NEUTRAL   
8   Stock Upgrades: Molina Healthcare Shows Rising...         NEUTRAL   
9   NFLX And Chill With These 8 Wall Street Shows ...         NEUTRAL   
10  Barron's Recap: The World's Most Respected Com...         NEUTRAL   
11                      44 Biggest Movers From Friday         NEUTRAL   
12  DPW Holdings, Inc. Late Tuesday Announced Noti.

In [9]:
print(df_sample[["sentiment_label"]].value_counts())

sentiment_label
NEUTRAL            500
Name: count, dtype: int64
