# Stock Market Volatility Prediction using Sentiment Analysis

Overview:
This script predicts next-day stock volatility using:
- Historical OHLCV stock data (via yfinance)
- News headlines with sentiment analysis (FinBERT)
- Technical indicators (RSI, MACD, EMA)
- Time-series models (ARIMA, LSTM)

In [14]:
# Core data manipulation and analysis
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Date and time handling
from datetime import datetime, timedelta
import yfinance as yf

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Set display options for better data viewing
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# File handling and web requests
import requests
import os
from pathlib import Path

In [15]:

# Stock selection 
STOCK_TICKER = "AAPL" 
PERIOD = "2y" 
INTERVAL = "1d"  


NEWS_API_KEY = "8f160b733e194588a433aa97baafa4fa"  
COMPANY_NAME = "Apple" 

DATA_DIR = Path("data")
RAW_DATA_DIR = DATA_DIR / "raw"

# Create directories
for dir_path in [DATA_DIR, RAW_DATA_DIR]:
    dir_path.mkdir(parents=True, exist_ok=True)

In [None]:

print("COLLECTING STOCK PRICE DATA")
print("="*50)

def collect_stock_data(ticker, period="2y", interval="1d"):
    """
    Collect stock price data using yfinance
    
    Parameters:
    -----------
    ticker : str
        Stock ticker symbol (e.g., 'AAPL', 'TSLA')
    period : str
        Data period ('1y', '2y', '5y', etc.)
    interval : str
        Data interval ('1d', '1h', etc.)
    
    Returns:
    --------
    pandas.DataFrame
        Stock price data with OHLCV columns
    """
    try:
        print(f"🔄 Fetching {ticker} data for period: {period}")
        
        stock = yf.Ticker(ticker)
        df = stock.history(period=period, interval=interval)
        
        df = df.reset_index()
        
        df.columns = [col.replace(' ', '_').lower() for col in df.columns]
        
        df['ticker'] = ticker
        
        print(f"Successfully collected {len(df)} days of data")
        print(f"Date range: {df['date'].min().date()} to {df['date'].max().date()}")
        
        return df
        
    except Exception as e:
        print(f" Error collecting data for {ticker}: {str(e)}")
        return None

# Collect the stock data
df_stock = collect_stock_data(STOCK_TICKER, PERIOD, INTERVAL)



COLLECTING STOCK PRICE DATA
🔄 Fetching AAPL data for period: 2y
Successfully collected 502 days of data
Date range: 2023-06-06 to 2025-06-05


In [17]:
# Display basic information about the collected data
if df_stock is not None:
    print(f"\nStock Data Shape: {df_stock.shape}")
    print(f"Columns: {list(df_stock.columns)}")
    
    # Display first few rows
    print("\nFirst 5 rows of stock data:")
    print(df_stock.head())
    
    # Display last few rows
    print("\nLast 5 rows of stock data:")
    print(df_stock.tail())
    
    # Basic statistics
    print("\n Stock Price Statistics:")
    print(df_stock[['open', 'high', 'low', 'close', 'volume']].describe())


Stock Data Shape: (502, 9)
Columns: ['date', 'open', 'high', 'low', 'close', 'volume', 'dividends', 'stock_splits', 'ticker']

First 5 rows of stock data:
                       date        open        high         low       close    volume  dividends  stock_splits ticker
0 2023-06-06 00:00:00-04:00  178.185932  178.334439  175.671103  177.433472  64848400        0.0           0.0   AAPL
1 2023-06-07 00:00:00-04:00  176.671115  179.413660  175.562223  176.057266  61944600        0.0           0.0   AAPL
2 2023-06-08 00:00:00-04:00  176.136468  179.047327  175.700843  178.780014  50214900        0.0           0.0   AAPL
3 2023-06-09 00:00:00-04:00  179.700778  180.423537  178.839407  179.166138  48870700        0.0           0.0   AAPL
4 2023-06-12 00:00:00-04:00  179.473070  182.067093  179.176041  181.968079  54274900        0.0           0.0   AAPL

Last 5 rows of stock data:
                         date        open        high         low       close    volume  dividends  stock_sp

News Headline Data Collection

In [31]:
print("\n" + "="*50)
print("COLLECTING NEWS HEADLINES FROM NEWSAPI")
print("="*50)

import requests
from datetime import timedelta
import time

def collect_news_data(api_key, start_date, end_date, query="AAPL", sleep_time=1):
    """
    Collect daily financial news headlines using NewsAPI

    Parameters:
    -----------
    api_key : str
        Your NewsAPI key
    start_date : datetime.date
        Start of the date range
    end_date : datetime.date
        End of the date range
    query : str
        Search query (company name, stock symbol, etc.)
    sleep_time : int
        Time delay between API calls (seconds)

    Returns:
    --------
    pandas.DataFrame
        DataFrame with Date and all_headlines columns
    """
    print(f"🔄 Fetching headlines from {start_date} to {end_date} for query: {query}")

    url = "https://newsapi.org/v2/everything"
    headers = {"Authorization": api_key}

    results = []

    current_date = start_date
    while current_date <= end_date:
        from_str = current_date.strftime('%Y-%m-%d')
        to_str = (current_date + timedelta(days=1)).strftime('%Y-%m-%d')

        params = {
            "q": query,
            "from": from_str,
            "to": to_str,
            "language": "en",
            "sortBy": "relevancy",
            "pageSize": 100,
            "apiKey": api_key
        }

        try:
            response = requests.get(url, params=params)
            data = response.json()
            if "articles" in data:
                headlines = [
                    a.get("title", "") + ". " + (a.get("description") or "")
                    for a in data["articles"]
                ]
                full_text = " ".join(headlines)
                results.append({"date": current_date, "all_headlines": full_text})
                print(f"✅ {from_str} | {len(headlines)} articles")
            else:
                print(f"⚠️ {from_str} | No articles found or error: {data.get('message')}")
                results.append({"date": current_date, "all_headlines": ""})

        except Exception as e:
            print(f"❌ Error on {from_str}: {e}")
            results.append({"date": current_date, "all_headlines": ""})

        time.sleep(sleep_time)
        current_date += timedelta(days=1)

    df_news = pd.DataFrame(results)
    return df_news



COLLECTING NEWS HEADLINES FROM NEWSAPI


In [34]:
STOCK_TICKER = "AAPL"
PERIOD = "2y"
INTERVAL = "1d"
NEWSAPI_KEY = "8f160b733e194588a433aa97baafa4fa"  # Replace with your real key
NEWS_QUERY = "Apple OR AAPL"

# Step 1: Get stock data
df_stock = collect_stock_data(STOCK_TICKER, PERIOD, INTERVAL)

# Step 2: Get safe date range for NewsAPI
today = datetime.today().date()
newsapi_earliest = today - timedelta(days=30)

start_date = max(df_stock['date'].min().date(), newsapi_earliest)
end_date = df_stock['date'].max().date()

print(f"\n📅 NewsAPI Date Range Adjusted to: {start_date} → {end_date} (limited to past 30 days)")

# Step 3: Get news headlines
df_news = collect_news_data(NEWSAPI_KEY, start_date, end_date, query=NEWS_QUERY)

🔄 Fetching AAPL data for period: 2y
Successfully collected 502 days of data
Date range: 2023-06-06 to 2025-06-05

📅 NewsAPI Date Range Adjusted to: 2025-05-07 → 2025-06-05 (limited to past 30 days)
🔄 Fetching headlines from 2025-05-07 to 2025-06-05 for query: Apple OR AAPL
✅ 2025-05-07 | 97 articles
✅ 2025-05-08 | 100 articles
✅ 2025-05-09 | 100 articles
✅ 2025-05-10 | 100 articles
✅ 2025-05-11 | 100 articles
✅ 2025-05-12 | 100 articles
✅ 2025-05-13 | 100 articles
✅ 2025-05-14 | 100 articles
✅ 2025-05-15 | 100 articles
✅ 2025-05-16 | 100 articles
✅ 2025-05-17 | 100 articles
✅ 2025-05-18 | 100 articles
✅ 2025-05-19 | 100 articles
✅ 2025-05-20 | 100 articles
✅ 2025-05-21 | 100 articles
✅ 2025-05-22 | 100 articles
✅ 2025-05-23 | 100 articles
✅ 2025-05-24 | 100 articles
✅ 2025-05-25 | 100 articles
✅ 2025-05-26 | 100 articles
✅ 2025-05-27 | 100 articles
✅ 2025-05-28 | 100 articles
✅ 2025-05-29 | 100 articles
✅ 2025-05-30 | 100 articles
✅ 2025-05-31 | 100 articles
✅ 2025-06-01 | 100 articles

In [36]:
print("\n📰 Sample news data:")
print(df_news.head())


📰 Sample news data:
         date                                      all_headlines
0  2025-05-07  Apple has a new ‘Viral’ playlist on Apple Musi...
1  2025-05-08  Apple has a new ‘Viral’ playlist on Apple Musi...
2  2025-05-09  How to turn on Lockdown Mode for your iPhone a...
3  2025-05-10  Apple may release a ‘mostly glass, curved iPho...
4  2025-05-11  Apple may release a ‘mostly glass, curved iPho...


In [None]:

print("COLLECTING STOCK PRICE DATA")
print("="*50)

def collect_stock_data(ticker, period="2y", interval="1d"):
    """
    Collect stock price data using yfinance
    
    Parameters:
    -----------
    ticker : str
        Stock ticker symbol (e.g., 'AAPL', 'TSLA')
    period : str
        Data period ('1y', '2y', '5y', etc.)
    interval : str
        Data interval ('1d', '1h', etc.)
    
    Returns:
    --------
    pandas.DataFrame
        Stock price data with OHLCV columns
    """
    try:
        print(f"🔄 Fetching {ticker} data for period: {period}")
        
        stock = yf.Ticker(ticker)
        df = stock.history(period=period, interval=interval)
        
        df = df.reset_index()
        
        df.columns = [col.replace(' ', '_').lower() for col in df.columns]
        
        df['ticker'] = ticker
        
        print(f"Successfully collected {len(df)} days of data")
        print(f"Date range: {df['date'].min().date()} to {df['date'].max().date()}")
        
        return df
        
    except Exception as e:
        print(f" Error collecting data for {ticker}: {str(e)}")
        return None

# Collect the stock data
df_stock = collect_stock_data(STOCK_TICKER, PERIOD, INTERVAL)



COLLECTING STOCK PRICE DATA
🔄 Fetching AAPL data for period: 2y
Successfully collected 502 days of data
Date range: 2023-06-06 to 2025-06-05


In [None]:

print("COLLECTING STOCK PRICE DATA")
print("="*50)

def collect_stock_data(ticker, period="2y", interval="1d"):
    """
    Collect stock price data using yfinance
    
    Parameters:
    -----------
    ticker : str
        Stock ticker symbol (e.g., 'AAPL', 'TSLA')
    period : str
        Data period ('1y', '2y', '5y', etc.)
    interval : str
        Data interval ('1d', '1h', etc.)
    
    Returns:
    --------
    pandas.DataFrame
        Stock price data with OHLCV columns
    """
    try:
        print(f"🔄 Fetching {ticker} data for period: {period}")
        
        stock = yf.Ticker(ticker)
        df = stock.history(period=period, interval=interval)
        
        df = df.reset_index()
        
        df.columns = [col.replace(' ', '_').lower() for col in df.columns]
        
        df['ticker'] = ticker
        
        print(f"Successfully collected {len(df)} days of data")
        print(f"Date range: {df['date'].min().date()} to {df['date'].max().date()}")
        
        return df
        
    except Exception as e:
        print(f" Error collecting data for {ticker}: {str(e)}")
        return None

# Collect the stock data
df_stock = collect_stock_data(STOCK_TICKER, PERIOD, INTERVAL)



COLLECTING STOCK PRICE DATA
🔄 Fetching AAPL data for period: 2y
Successfully collected 502 days of data
Date range: 2023-06-06 to 2025-06-05
