# Market Predictor: Data Collection

This notebook demonstrates the data collection process from various sources:
1. Market Data (price and volume)
2. News Data (financial news and sentiment)
3. Social Media Data (Reddit and Twitter)
4. Macroeconomic Data (economic indicators)

## Setup and Configuration

In [None]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Add project root to path
sys.path.append('..')

# Import project modules
from src.data import DataLoader
from src.integrations import create_data_clients
from src.utils import setup_project_logger
from config import Config, load_validated_config

# Plotting settings
plt.style.use('seaborn')
%matplotlib inline
sns.set_theme(style="whitegrid")

## Load Configuration

Load and validate the project configuration from `config/parameters.yaml`.

In [None]:
# Load and validate the project configuration.
# NOTE(review): this path is resolved against the current working directory,
# while the imports above add '..' to sys.path — confirm the notebook is
# launched from a directory where 'config/parameters.yaml' exists.
config_path = 'config/parameters.yaml'
config = load_validated_config(config_path)

# Create the logger for this notebook and record the start of the run
logger = setup_project_logger('data_collection')
logger.info('Starting data collection process')

## 1. Market Data Collection

Collect market data (OHLCV: open, high, low, close, volume) for the S&P 500.

In [None]:
# Initialize data loader
data_loader = DataLoader(config)

# Get market data
# (assumed to be a pandas DataFrame with 'Close' and 'Volume' columns, as the
# calls below require — TODO confirm against DataLoader.get_market_data)
market_data = data_loader.get_market_data()

# Display basic information.
# DataFrame.info() writes its summary straight to stdout and returns None, so
# it is called directly; wrapping it in print() would emit a stray "None" line.
print("Market Data Info:")
market_data.info()

# Plot price and volume as two stacked panels
fig, (price_ax, volume_ax) = plt.subplots(2, 1, figsize=(15, 10))

# Price plot
market_data['Close'].plot(ax=price_ax, title='S&P 500 Price')
price_ax.set_ylabel('Price')

# Volume plot
market_data['Volume'].plot(ax=volume_ax, title='Trading Volume')
volume_ax.set_ylabel('Volume')

plt.tight_layout()

## 2. News Data Collection

Collect and analyze financial news from multiple sources

In [None]:
# Initialize data clients
clients = create_data_clients(config)

# Fetch market news from every provider that is present in `clients`,
# always in the same order: newsapi, alphavantage, finnhub.
news_data = {}
for provider in ('newsapi', 'alphavantage', 'finnhub'):
    if provider in clients:
        news_data[provider] = clients[provider].get_market_news()

# Display news statistics: article count for every source, plus the mean
# sentiment for sources whose result carries a 'sentiment_score' column.
for source, data in news_data.items():
    print(f"\n{source.upper()} News Statistics:")
    print(f"Total articles: {len(data)}")
    if 'sentiment_score' not in data.columns:
        continue
    print(f"Average sentiment: {data['sentiment_score'].mean():.3f}")

## 3. Social Media Data Collection

Collect and analyze social media sentiment

In [None]:
# Get social media data, keyed by source name
social_data = {}

if 'reddit' in clients:
    # Get Reddit sentiment
    social_data['reddit'] = clients['reddit'].get_subreddit_sentiment('wallstreetbets')

if 'twitter' in clients:
    # Get Twitter sentiment
    social_data['twitter'] = clients['twitter'].search_cashtag('SPY')

# Plot sentiment distribution on an explicit Axes so every histogram is drawn
# on the same panel regardless of which figure happens to be "current".
fig, ax = plt.subplots(figsize=(12, 6))

# Track which sources were actually drawn: a source is skipped when its
# result has no 'sentiment_score' column.
plotted_sources = []
for source, data in social_data.items():
    if 'sentiment_score' in data.columns:
        sns.histplot(data=data, x='sentiment_score', label=source, alpha=0.5, ax=ax)
        plotted_sources.append(source)

ax.set_title('Sentiment Distribution by Source')
ax.set_xlabel('Sentiment Score')
ax.set_ylabel('Count')

# Calling legend() with no labelled artists only produces a warning and an
# empty box, so add it only when at least one histogram was drawn.
if plotted_sources:
    ax.legend()
else:
    print("No social media sentiment data available to plot")

plt.show()

## 4. Macroeconomic Data Collection

Collect and analyze macroeconomic indicators

In [None]:
# Get macro data
# (assumed to be a pandas DataFrame with one column per indicator, as the
# calls below require — TODO confirm against DataLoader.get_macro_data)
macro_data = data_loader.get_macro_data()

# Display macro data info.
# DataFrame.info() prints its own summary and returns None, so it is called
# directly rather than through print(), which would add a stray "None" line.
print("Macroeconomic Data Info:")
macro_data.info()

# Plot key indicators, restricted to those actually present in the data so
# that missing series do not leave blank panels in the figure.
key_indicators = ['GDP', 'UNRATE', 'CPI', 'FEDFUNDS']
available_indicators = [name for name in key_indicators if name in macro_data.columns]

if available_indicators:
    # squeeze=False keeps `axes` two-dimensional even for a single panel, so
    # the indexing below works for any number of available indicators.
    # The height scales per panel; with all four present this is the
    # original 15x15 figure.
    fig, axes = plt.subplots(
        len(available_indicators),
        1,
        figsize=(15, 3.75 * len(available_indicators)),
        squeeze=False,
    )

    for ax, indicator in zip(axes[:, 0], available_indicators):
        macro_data[indicator].plot(ax=ax, title=f'{indicator} Over Time')
        ax.set_ylabel(indicator)

    plt.tight_layout()
else:
    # plt.subplots() cannot build a zero-row grid, so report instead of plotting
    print("None of the key indicators are present in the macro data")

## 5. Data Quality Analysis

Analyze the quality of collected data

In [None]:
from src.utils import DataQualityMetrics

# Report the missing-value percentages for each collected frame, one section
# per dataset, in the order: market data, then macro data.
print("\nMissing Values Analysis:")
for heading, frame in (
    ("\nMarket Data:", market_data),
    ("\nMacro Data:", macro_data),
):
    print(heading)
    print(DataQualityMetrics.calculate_missing_percentages(frame))

# Check data staleness of the market data, passing the name of its index as
# the second argument.
# NOTE(review): index.name is None when the index is unnamed — confirm that
# calculate_data_staleness accepts that.
print("\nData Staleness:")
date_column = market_data.index.name
staleness = DataQualityMetrics.calculate_data_staleness(market_data, date_column)
print(f"Market data staleness: {staleness}")

## 6. Save Processed Data

Save the collected and processed data for feature engineering

In [None]:
# Make sure the output directory exists before anything is written.
# NOTE(review): 'data/processed' is relative to the current working
# directory — confirm that is the intended location.
import os
os.makedirs('data/processed', exist_ok=True)

# Write the two core frames (market first, then macro) as parquet files
for frame, target in (
    (market_data, 'data/processed/market_data.parquet'),
    (macro_data, 'data/processed/macro_data.parquet'),
):
    frame.to_parquet(target)

# Write one parquet file per news source ...
for source, news_frame in news_data.items():
    news_frame.to_parquet(f'data/processed/news_{source}.parquet')

# ... and one per social media source
for source, social_frame in social_data.items():
    social_frame.to_parquet(f'data/processed/social_{source}.parquet')

logger.info('Data collection and processing completed')

## Next Steps

1. Proceed to `02_feature_engineering.ipynb` for feature generation
2. Document any data quality issues encountered
3. Consider additional data sources if needed