In [1]:
# Required packages: pip install feedparser pandas requests beautifulsoup4 tqdm
import feedparser
from datetime import datetime, timedelta
import pandas as pd

In [3]:
def generate_google_rss_url(query, start_date, end_date):
    """Build a Google News RSS search URL restricted to a date window.

    Args:
        query: Search terms. Spaces become ``+`` and reserved characters
            (``&``, ``#``, non-ASCII text, ...) are percent-encoded. A literal
            ``+`` is left untouched so callers that already join terms with
            ``+`` get the same URL as before.
        start_date: Value placed after the ``after:`` search operator,
            e.g. ``"2025-01-01"``.
        end_date: Value placed after the ``before:`` search operator.

    Returns:
        The complete URL as a string, with the en-US locale parameters
        appended.
    """
    # Function-scope import so this definition does not depend on another cell.
    from urllib.parse import quote_plus

    base_url = "https://news.google.com/rss/search?"
    # An unencoded '&' or '#' in the query would end the `q` parameter early,
    # and an unencoded space yields an invalid URL.
    encoded_query = quote_plus(str(query), safe="+")
    q = f"q={encoded_query}+after:{start_date}+before:{end_date}"
    params = "&hl=en-US&gl=US&ceid=US:en"
    return base_url + q + params

def fetch_news_rss_day(query, day: datetime):
    """Fetch the Google News RSS results for ``query`` over a one-day window.

    The search window runs from ``day`` to the following day, both formatted
    as ``YYYY-MM-DD``.

    Args:
        query: Search terms forwarded to ``generate_google_rss_url``.
        day: First day of the window.

    Returns:
        A list of dicts with the keys ``date`` (``YYYY-MM-DD`` string taken
        from the entry's parsed publication time), ``title``, ``link`` and
        ``source`` (``"Unknown"`` when the entry has no source). Entries whose
        publication time cannot be turned into a ``datetime`` are skipped.
    """
    window_start = day.strftime("%Y-%m-%d")
    window_end = (day + timedelta(days=1)).strftime("%Y-%m-%d")
    parsed = feedparser.parse(
        generate_google_rss_url(query, window_start, window_end)
    )

    records = []
    for item in parsed.entries:
        try:
            published = datetime(*item.published_parsed[:6])
        except Exception:
            # No usable publication timestamp: leave this entry out.
            continue

        record = {"date": published.strftime("%Y-%m-%d")}
        record["title"] = item.title
        record["link"] = item.link
        if "source" in item:
            record["source"] = item.source.title
        else:
            record["source"] = "Unknown"
        records.append(record)

    return records

def get_news_data(start_day, query="Tesla", end_day=None):
    """Collect news items day by day from ``start_day`` through ``end_day``.

    Args:
        start_day: ``datetime`` of the first day to fetch.
        query: Search terms passed to ``fetch_news_rss_day``. Defaults to
            ``"Tesla"``, the value that used to be hard-coded here.
        end_day: ``datetime`` of the last day to fetch (inclusive). Defaults
            to the current time, so the crawl runs up to today.

    Returns:
        A flat list with the item dicts of every fetched day, in day order.
        Empty when ``start_day`` is later than ``end_day``.
    """
    if end_day is None:
        end_day = datetime.now()

    all_news = []
    current_day = start_day

    # NOTE(review): recorded runs show many days with exactly 100 items, which
    # suggests the feed caps the results per request — confirm before relying
    # on per-day counts.
    while current_day <= end_day:
        daily_news = fetch_news_rss_day(query, current_day)
        all_news.extend(daily_news)
        print(f"✅ {current_day.strftime('%Y-%m-%d')} - {len(daily_news)} items")
        current_day += timedelta(days=1)
    return all_news

def remove_duplicate_titles_by_prefix(df, prefix_length=50):
    """Drop rows whose title starts the same way as an earlier row's title.

    Two titles count as duplicates when their first ``prefix_length``
    characters are equal after stripping surrounding whitespace and
    lower-casing. The first occurrence is kept.

    Args:
        df: DataFrame with a ``title`` column of strings.
        prefix_length: Number of leading title characters to compare.

    Returns:
        A new DataFrame with the surviving rows, the original columns, and a
        fresh 0..n-1 index. ``df`` itself is not modified.
    """
    # Vectorized comparison key; replaces a Python-level iterrows() loop that
    # rebuilt the frame from row Series and lost all columns on empty input.
    # Missing titles all map to the same missing key, so only the first such
    # row survives.
    prefix_keys = df["title"].str[:prefix_length].str.strip().str.lower()
    first_occurrence = ~prefix_keys.duplicated(keep="first")
    return df[first_occurrence].reset_index(drop=True)

In [4]:
# Crawl every day from 2025-01-01 up to the moment this cell runs.
all_news = get_news_data(start_day=datetime(2025, 1, 1))

✅ 2025-01-01 - 100 items
✅ 2025-01-02 - 100 items
✅ 2025-01-03 - 80 items
✅ 2025-01-04 - 21 items
✅ 2025-01-05 - 31 items
✅ 2025-01-06 - 78 items
✅ 2025-01-07 - 87 items
✅ 2025-01-08 - 73 items
✅ 2025-01-09 - 100 items
✅ 2025-01-10 - 100 items
✅ 2025-01-11 - 37 items
✅ 2025-01-12 - 57 items
✅ 2025-01-13 - 85 items
✅ 2025-01-14 - 73 items
✅ 2025-01-15 - 63 items
✅ 2025-01-16 - 57 items
✅ 2025-01-17 - 41 items
✅ 2025-01-18 - 19 items
✅ 2025-01-19 - 30 items
✅ 2025-01-20 - 52 items
✅ 2025-01-21 - 60 items
✅ 2025-01-22 - 73 items
✅ 2025-01-23 - 100 items
✅ 2025-01-24 - 73 items
✅ 2025-01-25 - 24 items
✅ 2025-01-26 - 51 items
✅ 2025-01-27 - 78 items
✅ 2025-01-28 - 100 items
✅ 2025-01-29 - 100 items
✅ 2025-01-30 - 100 items
✅ 2025-01-31 - 47 items
✅ 2025-02-01 - 18 items
✅ 2025-02-02 - 63 items
✅ 2025-02-03 - 99 items
✅ 2025-02-04 - 96 items
✅ 2025-02-05 - 35 items
✅ 2025-02-06 - 53 items
✅ 2025-02-07 - 32 items
✅ 2025-02-08 - 16 items
✅ 2025-02-09 - 28 items
✅ 2025-02-10 - 44 items
✅ 2025-0

In [7]:
# Assemble the crawled records into a DataFrame, sorted by date.
# Passing `columns` to the constructor fixes the column order and keeps the
# expected columns even when the crawl returned nothing; an empty frame would
# otherwise have no "date" column and sort_values would raise KeyError.
# The chain avoids inplace=True, which was previously applied to a frame
# derived by column selection.
df = (
    pd.DataFrame(all_news, columns=["date", "title", "link", "source"])
    .sort_values(by="date")
    # First pass: drop rows whose title is an exact duplicate.
    .drop_duplicates(subset="title")
)

# Second pass: drop rows whose first 50 title characters match an earlier row.
df = remove_duplicate_titles_by_prefix(df, prefix_length=50)

# Optional checkpoint of the headline table:
# df.to_csv("tesla_news.csv", index=False)


In [8]:
import requests
from bs4 import BeautifulSoup

def extract_article_text(url, timeout=5):
    """Download a page and return the text of all its ``<p>`` elements.

    This is a best-effort helper: it never raises. Any failure (connection
    error, timeout, HTTP error status, parsing problem) is reported through
    the return value instead.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        The paragraph texts joined by newlines and stripped of surrounding
        whitespace, or a string of the form ``"[ERROR] <message>"`` when the
        download or parsing failed.
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        res = requests.get(url, headers=headers, timeout=timeout)
        # Treat 4xx/5xx responses as failures; otherwise the <p> text of an
        # error page would be stored as if it were the article body.
        res.raise_for_status()

        # Hand BeautifulSoup the raw bytes so it can detect the encoding from
        # the document itself; res.text may fall back to a wrong default
        # charset when the server sends none.
        soup = BeautifulSoup(res.content, 'html.parser')

        # Article body heuristic: concatenate every <p> element on the page.
        paragraphs = soup.find_all("p")
        text = '\n'.join(p.get_text() for p in paragraphs)

        return text.strip()
    except Exception as e:
        # Deliberately broad: one bad URL must not abort a long scraping run.
        return "[ERROR] " + str(e)

In [None]:
from tqdm import tqdm

# Download the body text for every remaining headline, then save the result.
# The list must have one entry per row of `df`: slicing the links (e.g. to the
# first 10) while assigning to the full column raises a length-mismatch
# ValueError as soon as `df` has more rows than the slice.
# The recorded run over 5998 links took about 1.5 hours.
df["content"] = [
    extract_article_text(url) for url in tqdm(df["link"])
]
df.to_csv("tesla_news_with_body_2025.csv", index=False)

100%|██████████| 5998/5998 [1:32:56<00:00,  1.08it/s] 
