In [None]:
import os

from google.colab import drive

# Attach Google Drive to the Colab VM's filesystem.
drive.mount('/content/drive')

# Switch the working directory to the notebook folder on Drive so that the
# relative paths used by later cells (e.g. "data_storage/...") resolve there.
project_path = "/content/drive/MyDrive/Colab Notebooks"
os.chdir(project_path)

# Echo the working directory as a quick sanity check.
print(os.getcwd())

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks


In [None]:
!pip install feedparser pandas requests vaderSentiment tqdm

Collecting feedparser
  Downloading feedparser-6.0.12-py3-none-any.whl.metadata (2.7 kB)
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading feedparser-6.0.12-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.5/81.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6046 sha256=c6d2474c09d348f117bc769f481cce0a9cc7b7795f34da2ee43dfb3fc2e17dcb
  Stored in directory: /root/.cach

In [None]:
import feedparser
import pandas as pd
from datetime import datetime, timedelta
from tqdm import tqdm
import time

# ---------------------------------
# Date Range
# ---------------------------------
import os  # local import: only needed here to create the output folder

# Inclusive range; one Google News RSS query is issued per calendar day.
start_date = datetime(2024, 1, 1)
end_date = datetime(2025, 12, 31)

output_path = "data_storage/tesla_daily_news_2024_2025.csv"
article_columns = ["date", "headline", "source", "link"]

all_articles = []
failed_days = []  # days whose feed could not be fetched, so gaps are visible

current_date = start_date
total_days = (end_date - start_date).days + 1

# ---------------------------------
# Fetch one RSS feed per day
# ---------------------------------
with tqdm(total=total_days, desc="Fetching Tesla news") as progress:
    while current_date <= end_date:

        next_date = current_date + timedelta(days=1)

        # Google News query with daily date filter
        query = f"https://news.google.com/rss/search?q=Tesla+after:{current_date.strftime('%Y-%m-%d')}+before:{next_date.strftime('%Y-%m-%d')}&hl=en-US&gl=US&ceid=US:en"

        feed = feedparser.parse(query)

        # feedparser reports fetch problems on the result object instead of
        # raising: an HTTP error code in `status`, or `bozo` set with nothing
        # parsed. Record those days rather than silently leaving a hole.
        status = feed.get("status")
        http_error = status is not None and status >= 400
        unparsable = bool(feed.get("bozo")) and not feed.entries
        if http_error or unparsable:
            failed_days.append(current_date.date())

        for entry in feed.entries:

            # Entries without a parsed timestamp or a headline are unusable
            # for a dated headline dataset, so skip them.
            published = entry.get("published_parsed")
            headline = entry.get("title")
            if not published or not headline:
                continue

            all_articles.append({
                "date": datetime(*published[:6]),
                "headline": headline,
                "source": entry.get("source", {}).get("title", "Unknown"),
                "link": entry.get("link")
            })

        current_date = next_date
        progress.update(1)

        # Light rate limiting
        time.sleep(0.5)

if failed_days:
    print(f"Warning: {len(failed_days)} day(s) could not be fetched "
          f"(first: {failed_days[0]}, last: {failed_days[-1]}).")

# ---------------------------------
# Create DataFrame
# ---------------------------------
# Passing explicit columns keeps the frame well-formed even when nothing was
# collected, so the de-duplication and sort below cannot raise KeyError.
df = pd.DataFrame(all_articles, columns=article_columns)

# Remove duplicates
df = df.drop_duplicates(subset=["headline", "date"])

# Sort chronologically
df = df.sort_values("date")

# Save to CSV (create the target folder first; to_csv does not create it)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)

print(f"Saved {len(df)} articles.")


Saved 25507 articles.


In [None]:
# Preview the first five rows of `df`; as the cell's last expression the
# result is rendered as a table. Assumes `df` still holds the news DataFrame
# built by the scraping cell above.
df.head()

Unnamed: 0,date,headline,source,link
66,2024-01-01 08:00:00,You Can Now Buy A Tesla Model 3 Performance Fo...,InsideEVs,https://news.google.com/rss/articles/CBMiakFVX...
44,2024-01-01 08:00:00,Tesla lashes Reuters for 'wildly misleading' s...,CarExpert,https://news.google.com/rss/articles/CBMiqgFBV...
55,2024-01-01 08:00:00,"Tesla Model 3 Battery Size, Voltage, And Charg...",Top Speed,https://news.google.com/rss/articles/CBMihgFBV...
35,2024-01-01 08:00:00,The Tesla Cybertruck Has Made 0 To 60 MPH Time...,CarBuzz,https://news.google.com/rss/articles/CBMikgFBV...
32,2024-01-01 08:00:00,Tesla's EV Business Faces 'Challenges' In 2024...,Investor's Business Daily,https://news.google.com/rss/articles/CBMiygFBV...


In [None]:
import requests
import pandas as pd
import os
from datetime import datetime

# -------------------------
# Configuration
# -------------------------
# CIK is kept zero-padded to 10 digits because it is embedded verbatim in the
# submissions URL below.
CIK = "0001318605"
# Requests to SEC endpoints are sent with a User-Agent identifying the caller.
HEADERS = {"User-Agent": "Ben Goodman ben.goodman@example.com"}
SAVE_DIR = "sec_filings"
os.makedirs(SAVE_DIR, exist_ok=True)

# -------------------------
# Fetch Tesla submissions JSON
# -------------------------
url = f"https://data.sec.gov/submissions/CIK{CIK}.json"
# A timeout prevents the cell from hanging indefinitely, and raise_for_status
# surfaces HTTP errors here instead of as a confusing JSON/KeyError later.
resp = requests.get(url, headers=HEADERS, timeout=30)
resp.raise_for_status()
data = resp.json()

filings = data["filings"]["recent"]

# NOTE: this rebinds `df`, the same name the news-scraping cell uses.
df = pd.DataFrame({
    "form": filings["form"],
    "filing_date": filings["filingDate"],
    "accession_number": filings["accessionNumber"],
    "primary_document": filings["primaryDocument"]
})

# -------------------------
# Filter 10-K and 10-Q filings from 2024–2025
# -------------------------
df["filing_date"] = pd.to_datetime(df["filing_date"])
df_filtered = df[
    (df["form"].isin(["10-K", "10-Q"])) &
    (df["filing_date"] >= "2024-01-01") &
    (df["filing_date"] <= "2025-12-31")
].copy()

# -------------------------
# Manually add Dec 31, 2025 10-K (if missing)
# -------------------------
# NOTE(review): the "-26-" segment of the accession number and the document
# name suggest 2025-12-31 is the fiscal period end rather than the actual
# filing date, which would explain why the date filter above does not pick
# this filing up. Confirm before relying on this row's `filing_date`.
manual_entry = {
    "form": "10-K",
    "filing_date": pd.to_datetime("2025-12-31"),
    "accession_number": "0001628280-26-003952",
    "primary_document": "tsla-20251231.htm"
}

# Check if it's already in dataframe. The accession number uniquely identifies
# a filing, so match on it rather than on the (date, form) pair.
already_present = (
    df_filtered["accession_number"] == manual_entry["accession_number"]
).any()
if not already_present:
    df_filtered = pd.concat([df_filtered, pd.DataFrame([manual_entry])], ignore_index=True)

# -------------------------
# Construct document URLs
# -------------------------
def build_sec_url(accession, doc, cik="1318605"):
    """Return the EDGAR archive URL of one document within a filing.

    Args:
        accession: Accession number, with or without dashes
            (e.g. "0001628280-26-003952").
        doc: File name of the document inside the filing folder.
        cik: Company CIK as a string or int; leading zeros are dropped
            because the archive path uses the unpadded number. Defaults to
            the CIK this notebook previously hard-coded.

    Returns:
        The full https://www.sec.gov/Archives/edgar/data/... URL as a string.

    Raises:
        ValueError: If ``cik`` is not numeric.
    """
    accession_nodash = accession.replace("-", "")
    # int() round-trip strips zero padding ("0001318605" -> "1318605").
    cik_path = str(int(cik))
    return f"https://www.sec.gov/Archives/edgar/data/{cik_path}/{accession_nodash}/{doc}"

# Build one archive URL per filing. Pairing the two columns directly avoids
# the row-wise apply, which is slower and returns a DataFrame (not a Series)
# when the frame is empty.
df_filtered["document_url"] = [
    build_sec_url(accession, doc)
    for accession, doc in zip(
        df_filtered["accession_number"], df_filtered["primary_document"]
    )
]

# -------------------------
# Save CSV of URLs
# -------------------------
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs("data_storage", exist_ok=True)
df_filtered.to_csv("data_storage/tesla_sec_filings_urls_2024_2025.csv", index=False)
print("CSV with URLs saved!")


CSV with URLs saved!
