# Stock Correlation Research
This notebook fetches daily closing prices from the Polygon API and calculates the correlations of daily returns between every pair of stocks.

In [None]:
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from itertools import combinations
import os
from typing import List, Tuple, Dict

In [None]:
# Notebook settings

# Polygon credential: read from the POLYGON_API_KEY environment variable,
# falling back to a placeholder string when the variable is unset
POLYGON_API_KEY = os.environ.get('POLYGON_API_KEY', 'YOUR_API_KEY_HERE')

# Ticker symbols whose pairwise correlations are analyzed
SYMBOLS = [
    'AAPL', 'MSFT', 'GOOGL', 'AMZN',
    'META', 'TSLA', 'NVDA', 'JPM',
    'V', 'WMT', 'JNJ', 'PG',
    'MA', 'UNH', 'HD', 'BAC',
    'DIS', 'ADBE', 'CRM', 'NFLX',
]

# Size of each aggregate bar requested from the API: MULTIPLIER x TIMESPAN
MULTIPLIER = 1
TIMESPAN = 'day'

# Analysis window: the 365 days ending at the moment this cell is executed
END_DATE = datetime.now()
START_DATE = END_DATE - timedelta(days=365)

In [None]:
def fetch_stock_data(symbol: str, start_date: datetime, end_date: datetime,
                     timeout: float = 30.0) -> pd.DataFrame:
    """
    Fetch closing prices for one symbol from the Polygon aggregates API

    The API key is sent in an ``Authorization: Bearer`` header instead of the
    query string, so it is not part of the request URL and therefore cannot
    show up in the error text printed when a request fails.

    Args:
        symbol: Stock ticker symbol
        start_date: Start date for data
        end_date: End date for data
        timeout: Seconds to wait for the API before giving up on the request

    Returns:
        DataFrame indexed by bar timestamp ('date') with a single column,
        named after the symbol, holding the 'c' value of each bar.
        An empty DataFrame is returned when the request fails or the
        response contains no results.
    """
    date_from = start_date.strftime('%Y-%m-%d')
    date_to = end_date.strftime('%Y-%m-%d')
    url = (
        f"https://api.polygon.io/v2/aggs/ticker/{symbol}"
        f"/range/{MULTIPLIER}/{TIMESPAN}/{date_from}/{date_to}"
    )

    params = {
        'adjusted': 'true',
        'sort': 'asc'
    }
    # Keep the credential out of the URL (see docstring)
    headers = {'Authorization': f'Bearer {POLYGON_API_KEY}'}

    try:
        # Without a timeout a stalled connection would block the notebook forever
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        if 'results' not in data or not data['results']:
            print(f"No data returned for {symbol}")
            return pd.DataFrame()

        df = pd.DataFrame(data['results'])
        # 't' is interpreted as a Unix timestamp in milliseconds
        df['date'] = pd.to_datetime(df['t'], unit='ms')
        df = df.set_index('date')
        # Keep only the 'c' column and label it with the ticker
        df = df[['c']].rename(columns={'c': symbol})

        return df

    except Exception as e:
        # Deliberate best-effort: one failing symbol must not abort the whole
        # run, so the failure is reported and an empty frame is returned
        print(f"Error fetching data for {symbol}: {e}")
        return pd.DataFrame()

In [None]:
def fetch_all_stocks(symbols: List[str]) -> pd.DataFrame:
    """
    Fetch data for all stocks and combine into a single DataFrame

    Symbols for which no data could be fetched are left out of the result.

    Args:
        symbols: List of stock ticker symbols

    Returns:
        DataFrame with one price column per successfully fetched symbol,
        indexed by timestamp and containing no NaN values. Empty when no
        symbol returned any data.
    """
    all_data = []

    for symbol in symbols:
        print(f"Fetching data for {symbol}...")
        df = fetch_stock_data(symbol, START_DATE, END_DATE)
        if not df.empty:
            all_data.append(df)

    if not all_data:
        return pd.DataFrame()

    # Combine all dataframes side by side, aligned on their timestamps
    combined_df = pd.concat(all_data, axis=1)

    # Forward filling assumes chronological row order
    combined_df = combined_df.sort_index()

    # Carry the last known price forward over timestamps that a symbol is
    # missing. DataFrame.ffill() replaces fillna(method='ffill'), whose
    # `method` argument is deprecated in recent pandas releases.
    combined_df = combined_df.ffill()

    # Drop any remaining NaN values (rows before a symbol's first price)
    combined_df = combined_df.dropna()

    return combined_df

In [None]:
def calculate_returns(price_df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert a table of prices into period-over-period percentage returns

    Args:
        price_df: DataFrame with stock prices, one column per symbol

    Returns:
        DataFrame of percentage changes between consecutive rows, with
        every row that contains a NaN removed (this always includes the
        first row, which has no predecessor)
    """
    pct_moves = price_df.pct_change()
    complete_rows = pct_moves.dropna()
    return complete_rows

In [None]:
def find_correlations(returns_df: pd.DataFrame) -> List[Tuple[str, str, float]]:
    """
    Calculate correlations between all stock pairs

    Args:
        returns_df: DataFrame with stock returns, one column per symbol

    Returns:
        List of tuples (stock1, stock2, correlation), one per unordered pair
        of columns, sorted by absolute correlation with the strongest first.
        Pairs whose correlation is undefined (NaN, e.g. when a column has no
        variation) are placed at the end of the list.
    """
    def _strength_rank(entry):
        # NaN cannot be ordered against numbers, so a plain abs() key would
        # leave the list in an arbitrary order; rank NaN pairs after all others
        corr = entry[2]
        if pd.isna(corr):
            return (1, 0.0)
        return (0, -abs(corr))

    symbols = returns_df.columns.tolist()

    # Calculate correlation for each pair
    correlations = [
        (stock1, stock2, returns_df[stock1].corr(returns_df[stock2]))
        for stock1, stock2 in combinations(symbols, 2)
    ]

    # Strongest absolute correlation first; the sort is stable, so ties keep
    # the order in which the pairs were generated
    correlations.sort(key=_strength_rank)

    return correlations

In [None]:
# Download prices for every configured symbol into one table
print("Fetching stock data from Polygon API...\n")
price_data = fetch_all_stocks(SYMBOLS)

# Summarize what was retrieved, or point at the likely causes when nothing was
if not price_data.empty:
    print(f"\nSuccessfully fetched data for {price_data.shape[1]} stocks")
    print(f"Date range: {price_data.index.min()} to {price_data.index.max()}")
    print(f"Total data points: {price_data.shape[0]} days")
else:
    print("No data retrieved. Please check your API key and symbols.")

In [None]:
# Calculate returns
# Converts the price table into percentage changes between consecutive rows;
# calculate_returns drops every row containing NaN, including the first one
returns_data = calculate_returns(price_data)
print(f"Calculated daily returns for {len(returns_data)} trading days")

In [None]:
# Calculate correlations
# find_correlations returns one (stock1, stock2, correlation) tuple for each
# unordered pair of columns in returns_data, sorted by absolute correlation
print("\nCalculating correlations between all stock pairs...\n")
correlations = find_correlations(returns_data)

# Number of pairs, i.e. the length of the list built above
print(f"Total pairs analyzed: {len(correlations)}\n")

In [None]:
# Display top 20 highest correlations
# `correlations` is sorted by absolute value, so the first 20 entries are the
# strongest relationships regardless of sign
print("\n" + "="*60)
print("TOP 20 HIGHEST CORRELATIONS (by absolute value)")
print("="*60)
print(f"{'Stock 1':<10} {'Stock 2':<10} {'Correlation':>15}")
print("-"*60)

# Fixed-width rows matching the header: symbols left-aligned in 10 characters,
# correlation right-aligned in 15 characters with 4 decimal places
for stock1, stock2, corr in correlations[:20]:
    print(f"{stock1:<10} {stock2:<10} {corr:>15.4f}")

In [None]:
# Tabulate every pair for further analysis: one row per pair, plus the
# magnitude of its correlation in a separate column
correlation_df = (
    pd.DataFrame(correlations, columns=['Stock 1', 'Stock 2', 'Correlation'])
    .assign(Abs_Correlation=lambda frame: frame['Correlation'].abs())
)

# Summary statistics of the signed correlation values
print("\n" + "=" * 60)
print("CORRELATION SUMMARY STATISTICS")
print("=" * 60)
print(f"Mean correlation: {correlation_df['Correlation'].mean():.4f}")
print(f"Median correlation: {correlation_df['Correlation'].median():.4f}")
print(f"Max correlation: {correlation_df['Correlation'].max():.4f}")
print(f"Min correlation: {correlation_df['Correlation'].min():.4f}")
print(f"Std deviation: {correlation_df['Correlation'].std():.4f}")

# Last expression of the cell, so the notebook renders the first 10 rows
correlation_df.head(10)

In [None]:
# Optional: draw the full correlation matrix of the returns as a heatmap
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlations between all columns of the returns table
correlation_matrix = returns_data.corr()

plt.figure(figsize=(14, 12))
sns.heatmap(
    correlation_matrix,
    annot=True,               # write each coefficient into its cell
    fmt='.2f',                # with two decimal places
    cmap='coolwarm',
    center=0,                 # colormap centered on zero correlation
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
plt.title('Stock Returns Correlation Matrix', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Show the 10 pairs with the lowest (most negative) signed correlation
print("\n" + "=" * 60)
print("TOP 10 MOST NEGATIVE CORRELATIONS")
print("=" * 60)
print(f"{'Stock 1':<10} {'Stock 2':<10} {'Correlation':>15}")
print("-" * 60)

# Re-order a copy by signed value, ascending; `correlations` itself keeps its
# absolute-value ordering
sorted_by_value = list(correlations)
sorted_by_value.sort(key=lambda pair: pair[2])
for stock1, stock2, corr in sorted_by_value[:10]:
    print(f"{stock1:<10} {stock2:<10} {corr:>15.4f}")

In [None]:
# Save results to CSV
# Writes every row and column of correlation_df to a file whose relative path
# resolves against the current working directory; index=False leaves the
# DataFrame's row index out of the file
output_file = 'stock_correlations.csv'
correlation_df.to_csv(output_file, index=False)
print(f"\nResults saved to {output_file}")