# Data Collection - Stock Prices & News Headlines

This notebook covers:
1. Downloading historical stock price data from Yahoo Finance
2. Collecting news headlines for sentiment analysis
3. Data preprocessing and storage


## 1. Setup and Imports


In [6]:
import yfinance as yf
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import datetime
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Create data directory if it doesn't exist
Path('../data').mkdir(exist_ok=True)

print("Setup complete!")


Setup complete!


## 2. Download Stock Price Data

In [7]:
# Define stock tickers - starting with major tech stocks
tickers = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'META', 'NVDA', 'TSLA', 'NFLX', 'CRM', 'ADBE']

# Date range for historical data
start_date = "2022-01-01"
end_date = "2024-12-31"

print(f"Downloading price data for {len(tickers)} stocks from {start_date} to {end_date}...")


Downloading price data for 10 stocks from 2022-01-01 to 2024-12-31...


In [8]:
# Download all data at once
data = yf.download(tickers, start=start_date, end=end_date, group_by='ticker')

if data is not None and not data.empty:
    print(f"Downloaded data shape: {data.shape}")
    print(f"Date range: {data.index[0]} to {data.index[-1]}")
    print(f"Available data for {len(tickers)} stocks")
else:
    print("❌ Failed to download data. Check your internet connection and try again.")
    print("Note: Yahoo Finance may sometimes block requests or have temporary issues.")


[*********************100%***********************]  10 of 10 completed

Downloaded data shape: (752, 50)
Date range: 2022-01-03 00:00:00 to 2024-12-30 00:00:00
Available data for 10 stocks





In [None]:
# Check if data download was successful
if data is not None and not data.empty:
    
    # Debug: Check data structure first
    print("=== DATA STRUCTURE DEBUG ===")
    print(f"Data shape: {data.shape}")
    print(f"Column names: {data.columns.names}")
    print(f"First few columns: {list(data.columns[:10])}")
    print(f"Sample ticker data columns: {list(data[tickers[0]].columns)}")
    print("Sample data:")
    print(data[tickers[0]].head(3))
    print("=" * 50)
    # Save individual stock data with technical indicators
    for ticker in tickers:
        try:
            # Extract data for this ticker
            df = data[ticker].copy()
            
            # Add basic technical indicators (using Close since Adj Close not available)
            df['Returns'] = df['Close'].pct_change()
            df['Volume_MA_20'] = df['Volume'].rolling(20).mean()
            df['Price_MA_20'] = df['Close'].rolling(20).mean()
            df['Price_MA_5'] = df['Close'].rolling(5).mean()
            df['Volatility_20'] = df['Returns'].rolling(20).std()
            
            # Simple RSI calculation
            delta = df['Close'].diff()
            gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
            loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
            rs = gain / loss
            df['RSI'] = 100 - (100 / (1 + rs))
            
            # Save to CSV
            df.to_csv(f"../data/{ticker}_price.csv")
            print(f"Saved {ticker}: {len(df)} rows")
            
        except Exception as e:
            print(f"Error processing {ticker}: {e}")
            continue
    
    print("\n✅ Price data collection complete!")
else:
    print("❌ Cannot process data - download failed or data is empty")


=== DATA STRUCTURE DEBUG ===
Data shape: (752, 50)
Column names: ['Ticker', 'Price']
First few columns: [('CRM', 'Open'), ('CRM', 'High'), ('CRM', 'Low'), ('CRM', 'Close'), ('CRM', 'Volume'), ('META', 'Open'), ('META', 'High'), ('META', 'Low'), ('META', 'Close'), ('META', 'Volume')]
Sample ticker data columns: ['Open', 'High', 'Low', 'Close', 'Volume']
Sample data:
Price             Open        High         Low       Close     Volume
Date                                                                 
2022-01-03  174.542917  179.499574  174.425140  178.645645  104487900
2022-01-04  179.254190  179.558457  175.809061  176.378342   99310400
2022-01-05  176.290048  176.839695  171.411914  171.686737   94537600
Error processing AAPL: 'Adj Close'
Error processing MSFT: 'Adj Close'
Error processing GOOGL: 'Adj Close'
Error processing AMZN: 'Adj Close'
Error processing META: 'Adj Close'
Error processing NVDA: 'Adj Close'
Error processing TSLA: 'Adj Close'
Error processing NFLX: 'Adj Close'
E

## 3. News Headlines Collection


In [10]:
def get_yahoo_headlines(ticker, max_headlines=10):
    """
    Scrape recent headlines for a given stock ticker from Yahoo Finance
    Note: This is a basic example - in production, you'd want to use proper APIs
    """
    try:
        url = f"https://finance.yahoo.com/quote/{ticker}?p={ticker}"
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find headline elements (this may need adjustment based on Yahoo's current structure)
        headlines = []
        for h in soup.find_all(['h3', 'h4'], limit=max_headlines * 2):
            text = h.text.strip()
            if text and len(text) > 10 and len(text) < 200:
                headlines.append(text)
                if len(headlines) >= max_headlines:
                    break
        
        return headlines
    except Exception as e:
        print(f"Error getting headlines for {ticker}: {e}")
        return []

# Test the function
sample_headlines = get_yahoo_headlines('AAPL', max_headlines=3)
print(f"Sample headlines for AAPL ({len(sample_headlines)} found):")
for i, headline in enumerate(sample_headlines):
    print(f"{i+1}. {headline}")


Sample headlines for AAPL (2 found):
1. Entertainment
2. New on Yahoo


In [11]:
# Collect headlines for all tickers
import time

all_headlines = []

for ticker in tickers:
    print(f"Collecting headlines for {ticker}...")
    headlines = get_yahoo_headlines(ticker, max_headlines=5)
    
    for headline in headlines:
        all_headlines.append({
            'ticker': ticker,
            'headline': headline,
            'date': datetime.datetime.now().strftime('%Y-%m-%d'),
            'timestamp': datetime.datetime.now().isoformat(),
            'source': 'yahoo_finance'
        })
    
    # Be nice to the server
    time.sleep(1)

# Create DataFrame and save
headlines_df = pd.DataFrame(all_headlines)
headlines_df.to_csv('../data/headlines_sample.csv', index=False)

print(f"\n✅ Collected {len(headlines_df)} headlines total")
print(f"Headlines per stock: {headlines_df['ticker'].value_counts().to_dict()}")
print(f"Saved to: ../data/headlines_sample.csv")


Collecting headlines for AAPL...
Collecting headlines for MSFT...
Collecting headlines for GOOGL...
Collecting headlines for AMZN...
Collecting headlines for META...
Collecting headlines for NVDA...
Collecting headlines for TSLA...
Collecting headlines for NFLX...
Collecting headlines for CRM...
Collecting headlines for ADBE...

✅ Collected 23 headlines total
Headlines per stock: {'TSLA': 5, 'AAPL': 2, 'MSFT': 2, 'GOOGL': 2, 'AMZN': 2, 'META': 2, 'NVDA': 2, 'NFLX': 2, 'CRM': 2, 'ADBE': 2}
Saved to: ../data/headlines_sample.csv


In [12]:
# Display sample of collected headlines
print("=== SAMPLE HEADLINES ===")
print(headlines_df.head(10))


=== SAMPLE HEADLINES ===
  ticker       headline        date                   timestamp         source
0   AAPL  Entertainment  2025-06-29  2025-06-29T23:09:56.466219  yahoo_finance
1   AAPL   New on Yahoo  2025-06-29  2025-06-29T23:09:56.466226  yahoo_finance
2   MSFT  Entertainment  2025-06-29  2025-06-29T23:09:58.181409  yahoo_finance
3   MSFT   New on Yahoo  2025-06-29  2025-06-29T23:09:58.181414  yahoo_finance
4  GOOGL  Entertainment  2025-06-29  2025-06-29T23:09:59.858205  yahoo_finance
5  GOOGL   New on Yahoo  2025-06-29  2025-06-29T23:09:59.858210  yahoo_finance
6   AMZN  Entertainment  2025-06-29  2025-06-29T23:10:01.586185  yahoo_finance
7   AMZN   New on Yahoo  2025-06-29  2025-06-29T23:10:01.586190  yahoo_finance
8   META  Entertainment  2025-06-29  2025-06-29T23:10:03.289127  yahoo_finance
9   META   New on Yahoo  2025-06-29  2025-06-29T23:10:03.289132  yahoo_finance
