# AttentionTrader Data Analysis

Basic analysis of the financial time series dataset collected from Yahoo Finance.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime

# Plot styling applied to every figure below: seaborn's white-grid theme and a
# wide default canvas.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Folder that holds the CSV files analysed in this notebook.
data_dir = Path('data/yfinance')

## 1. Dataset Overview

In [None]:
# Collect every CSV file in the data directory. Directory globbing makes no
# promise about ordering, so sort once here: the listing below and any later
# loop over csv_files then run in the same, reproducible order.
csv_files = sorted(data_dir.glob('*.csv'))
print(f"Total number of files: {len(csv_files)}")
print("\nFiles in dataset:")
for i, file in enumerate(csv_files, 1):
    print(f"{i:2d}. {file.stem}")

## 2. Load All Data and Analyze Coverage

In [None]:
# Load all datasets and collect metadata
datasets = {}
metadata = []

# Column layout of the coverage table. Passing it explicitly keeps the table
# well-formed (and sortable) even when no file contributes a row.
metadata_columns = ['Ticker', 'Rows', 'Start Date', 'End Date', 'Days', 'Columns']

for file in csv_files:
    try:
        # Read CSV with header rows (yfinance creates multi-level headers)
        df = pd.read_csv(file, header=[0, 1, 2])

        # Keep only the top header level; the column labelled 'Price' is then
        # used as the index and renamed to 'Date'.
        df.columns = df.columns.get_level_values(0)
        df = df.set_index('Price')
        df.index.name = 'Date'

        # Convert index to datetime. Unparseable entries become NaT so they
        # can be dropped row by row instead of failing the whole file.
        df.index = pd.to_datetime(df.index, errors='coerce')

        # Remove any rows with invalid dates, and say so when it happens.
        valid_dates = df.index.notna()
        dropped = int((~valid_dates).sum())
        if dropped:
            print(f"Note: dropped {dropped} row(s) with invalid dates from {file.stem}")
        df = df[valid_dates]

        ticker_name = file.stem
        datasets[ticker_name] = df

        # Frames left with no rows stay in `datasets` but have no date range
        # to report, so they are excluded from the coverage table.
        if len(df) > 0:
            start_date = df.index.min()
            end_date = df.index.max()
            metadata.append({
                'Ticker': ticker_name,
                'Rows': len(df),
                'Start Date': start_date,
                'End Date': end_date,
                'Days': (end_date - start_date).days,
                'Columns': ', '.join(df.columns)
            })
    except Exception as e:
        # Best effort per file: report the failure and keep loading the rest.
        print(f"Error loading {file.stem}: {e}")

# Create metadata DataFrame (longest history first)
metadata_df = pd.DataFrame(metadata, columns=metadata_columns)
metadata_df = metadata_df.sort_values('Days', ascending=False)

print(f"Successfully loaded {len(datasets)} datasets")
if metadata_df.empty:
    print("\nNo rows were loaded; check that the data directory contains yfinance CSV files.")
else:
    print("\nDataset Statistics:")
    print(f"Total rows across all files: {metadata_df['Rows'].sum():,}")
    print(f"Average rows per file: {metadata_df['Rows'].mean():.0f}")
    print(f"Median rows per file: {metadata_df['Rows'].median():.0f}")

In [None]:
# Show the coverage table with thousands separators on the two count columns.
# The Styler must stay the last expression so the notebook renders it.
print("\nDetailed Dataset Information:")
print("=" * 100)
thousands = '{:,}'
metadata_df.style.format({'Rows': thousands, 'Days': thousands})

## 3. Temporal Coverage Analysis

In [None]:
# Coverage summary. The first and last rows of metadata_df are reported as the
# longest and shortest histories, so this relies on the table being ordered by
# 'Days' in descending order.
print("Temporal Coverage Statistics:")
print(f"\nEarliest data point: {metadata_df['Start Date'].min()}")
print(f"Latest data point: {metadata_df['End Date'].max()}")

longest = metadata_df.iloc[0]
print(f"\nLongest history: {longest['Ticker']} ({longest['Days']:,} days)")
shortest = metadata_df.iloc[-1]
print(f"Shortest history: {shortest['Ticker']} ({shortest['Days']:,} days)")

mean_days = metadata_df['Days'].mean()
print(f"\nAverage coverage: {mean_days:.0f} days ({mean_days/365:.1f} years)")
median_days = metadata_df['Days'].median()
print(f"Median coverage: {median_days:.0f} days ({median_days/365:.1f} years)")

## 4. Visualize Data Coverage

In [None]:
# Coverage timeline: one horizontal bar per ticker, starting at its first date
# and as wide as the number of days it covers.
fig, ax = plt.subplots(figsize=(14, 10))

# Order the bars by the date each ticker's data begins.
plot_df = metadata_df.sort_values('Start Date')

for ticker, first_day, last_day in zip(
    plot_df['Ticker'], plot_df['Start Date'], plot_df['End Date']
):
    ax.barh(
        ticker,
        width=(last_day - first_day).days,
        left=first_day,
        height=0.7,
        alpha=0.7,
    )

ax.set_title('Historical Data Coverage Timeline', fontsize=14, fontweight='bold')
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Ticker', fontsize=12)
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

In [None]:
# Two histograms side by side: rows per dataset, and coverage in years.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (values, bar colour, x-axis label, title) for each panel, left to right.
histogram_panels = (
    (metadata_df['Rows'], 'steelblue', 'Number of Rows',
     'Distribution of Dataset Sizes'),
    (metadata_df['Days']/365, 'coral', 'Years of Data',
     'Distribution of Temporal Coverage'),
)

for panel, (values, bar_color, x_label, panel_title) in zip(axes, histogram_panels):
    panel.hist(values, bins=20, alpha=0.7, color=bar_color, edgecolor='black')
    panel.set_xlabel(x_label, fontsize=11)
    panel.set_ylabel('Frequency', fontsize=11)
    panel.set_title(panel_title, fontsize=12, fontweight='bold')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Top 10 Datasets by Coverage

In [None]:
# Ten datasets with the largest 'Days' span, plus that span expressed in years.
coverage_columns = ['Ticker', 'Days', 'Rows', 'Start Date', 'End Date']
top_10_days = (
    metadata_df.nlargest(10, 'Days')[coverage_columns]
    .assign(Years=lambda frame: (frame['Days'] / 365).round(1))
)

print("Top 10 Datasets by Historical Coverage:")
print(top_10_days.to_string(index=False))

In [None]:
# Ten datasets with the most rows, plus their 'Days' span expressed in years.
size_columns = ['Ticker', 'Rows', 'Days', 'Start Date', 'End Date']
top_10_rows = (
    metadata_df.nlargest(10, 'Rows')[size_columns]
    .assign(Years=lambda frame: (frame['Days'] / 365).round(1))
)

print("Top 10 Datasets by Number of Data Points:")
print(top_10_rows.to_string(index=False))

## 6. Sample Data Examination

In [None]:
# Inspect one dataset in detail (the 'apple' entry), if it was loaded.
if 'apple' in datasets:
    apple_df = datasets['apple']

    print("Apple (AAPL) Dataset Sample:")
    print(f"\nShape: {apple_df.shape}")
    print(f"\nColumns: {list(apple_df.columns)}")

    # Each heading is printed, then the matching view is built and displayed.
    sample_views = (
        ("\nFirst 5 rows:", apple_df.head),
        ("\nLast 5 rows:", apple_df.tail),
        ("\nBasic Statistics:", apple_df.describe),
    )
    for heading, build_view in sample_views:
        print(heading)
        display(build_view())

## 7. Data Quality Check

In [None]:
# Check for missing values across all datasets
# Explicit column layout keeps the summary table sortable even when no
# dataset was loaded.
missing_columns = ['Ticker', 'Missing Values', 'Total Cells', 'Missing %']
missing_data = []

for name, df in datasets.items():
    missing_count = int(df.isnull().sum().sum())
    total_cells = df.size
    # Guard the division: a frame with no rows or no columns has zero cells.
    missing_pct = (missing_count / total_cells) * 100 if total_cells > 0 else 0

    missing_data.append({
        'Ticker': name,
        'Missing Values': missing_count,
        'Total Cells': total_cells,
        'Missing %': round(missing_pct, 2)
    })

missing_df = pd.DataFrame(missing_data, columns=missing_columns)
missing_df = missing_df.sort_values('Missing %', ascending=False)

# Boolean mask of the datasets that contain at least one missing cell.
has_missing = missing_df['Missing Values'] > 0

print("Data Quality Summary:")
print(f"\nDatasets with missing values: {int(has_missing.sum())}")
print(f"Total missing values across all datasets: {int(missing_df['Missing Values'].sum()):,}")

if has_missing.any():
    print("\nDatasets with missing values:")
    print(missing_df[has_missing].to_string(index=False))
else:
    print("\n✓ No missing values found in any dataset!")

## 8. Category Breakdown

In [None]:
# Hand-maintained grouping of dataset names into categories. Names are matched
# against the 'Ticker' column of metadata_df; names with no match simply do
# not contribute to a category's figures.
categories = {
    'Big Tech': ['apple', 'microsoft', 'alphabet', 'nvidia', 'meta', 'amazon'],
    'Finance': ['jpmorgan', 'visa', 'berkshire_hathaway'],
    'Healthcare': ['unitedhealth', 'johnson_and_johnson', 'pfizer'],
    'Consumer': ['tesla', 'mcdonalds', 'walmart', 'coca_cola', 'procter_gamble'],
    'Energy': ['exxonmobil', 'chevron'],
    'Industrials': ['caterpillar', 'union_pacific', 'boeing'],
    'Utilities': ['nextera_energy', 'duke_energy'],
    'Real Estate': ['prologis', 'american_tower'],
    'Materials': ['linde', 'freeport_mcmoran'],
    'Telecom': ['verizon', 'tmobile'],
    'International': ['taiwan_semiconductor', 'asml', 'toyota', 'alibaba'],
    'Crypto': ['bitcoin', 'ethereum', 'solana'],
    'Indices': ['sp500', 'nasdaq100', 'dow_jones', 'russell2000', 'vix_volatility']
}

# One summary record per category, built from the matching metadata rows.
category_stats = [
    {
        'Category': label,
        'Count': len(members),
        'Avg Days': members['Days'].mean(),
        'Avg Rows': members['Rows'].mean(),
        'Total Rows': members['Rows'].sum()
    }
    for label, members in (
        (group, metadata_df[metadata_df['Ticker'].isin(names)])
        for group, names in categories.items()
    )
]

# Largest categories (by total rows) first.
category_df = pd.DataFrame(category_stats).sort_values('Total Rows', ascending=False)

print("Dataset Breakdown by Category:")
print(category_df.to_string(index=False))

In [None]:
# Two horizontal bar charts side by side: ticker count and total rows for
# each category.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# (column to plot, bar colour, x-axis label, title) for each panel.
bar_panels = (
    ('Count', 'steelblue', 'Number of Tickers', 'Tickers per Category'),
    ('Total Rows', 'coral', 'Total Data Points', 'Total Data Points per Category'),
)

for panel, (column, bar_color, x_label, panel_title) in zip(axes, bar_panels):
    category_df.plot(x='Category', y=column, kind='barh', ax=panel,
                     color=bar_color, legend=False)
    panel.set_xlabel(x_label, fontsize=11)
    panel.set_ylabel('Category', fontsize=11)
    panel.set_title(panel_title, fontsize=12, fontweight='bold')
    panel.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

## 9. Price and Volume Visualization by Category

Visualize close prices and trading volumes for all tickers within each category.

In [None]:
def plot_category_chart(category_name, ticker_list, datasets):
    """
    Create a dual-axis chart for a category showing:
    - Close prices as line plots (left axis)
    - Volume, scaled per ticker by its own maximum, as a shaded area in the
      background (right axis)

    Parameters
    ----------
    category_name : str
        Label used in the chart title and in the "no data" message.
    ticker_list : iterable of str
        Dataset names to draw; names missing from `datasets` are skipped.
    datasets : dict
        Maps a dataset name to a DataFrame. The 'Close' and 'Volume' columns
        are plotted against the frame's index when they are present.

    Returns
    -------
    None
        The figure is shown with plt.show(). If none of the requested names
        is in `datasets`, a message is printed and nothing is drawn.
    """
    # Filter available tickers
    available_tickers = [t for t in ticker_list if t in datasets]

    if not available_tickers:
        print(f"No data available for {category_name}")
        return

    # Create figure with dual y-axes
    fig, ax1 = plt.subplots(figsize=(16, 8))
    ax2 = ax1.twinx()

    # A twin axes is drawn after (on top of) the axes it was created from.
    # Raise the price axes above the volume axes so the shaded volume really
    # sits behind the price lines, and move the visible background patch to
    # the lower layer so it does not hide the volume.
    ax1.set_zorder(ax2.get_zorder() + 1)
    ax1.patch.set_visible(False)
    ax2.patch.set_visible(True)

    # One colour per ticker, shared by its price line and its volume area
    colors = plt.cm.tab10(np.linspace(0, 1, len(available_tickers)))

    for idx, ticker in enumerate(available_tickers):
        df = datasets[ticker]

        # Volume on the secondary axis
        if 'Volume' in df.columns:
            # Normalize volume for better visualization across different
            # scales; if the maximum is not positive the raw values are used.
            peak_volume = df['Volume'].max()
            volume_normalized = df['Volume'] / peak_volume if peak_volume > 0 else df['Volume']
            ax2.fill_between(df.index, 0, volume_normalized,
                             alpha=0.1, color=colors[idx],
                             label=f'{ticker.replace("_", " ").title()} Vol')

        # Close price on the primary axis
        if 'Close' in df.columns:
            ax1.plot(df.index, df['Close'],
                     label=ticker.replace('_', ' ').title(),
                     linewidth=2, color=colors[idx], alpha=0.8)

    # Customize primary axis (prices)
    ax1.set_xlabel('Date', fontsize=12, fontweight='bold')
    ax1.set_ylabel('Close Price (USD)', fontsize=12, fontweight='bold')
    ax1.set_title(f'{category_name} - Historical Close Prices with Volume',
                  fontsize=14, fontweight='bold', pad=20)
    ax1.grid(True, alpha=0.3, linestyle='--')
    ax1.legend(loc='upper left', fontsize=9, framealpha=0.9)

    # Customize secondary axis (volume)
    ax2.set_ylabel('Normalized Volume', fontsize=12, fontweight='bold', alpha=0.7)
    ax2.set_ylim(0, 1.2)  # Normalized volume range
    ax2.tick_params(axis='y', labelcolor='gray', labelsize=9)

    # Format x-axis
    fig.autofmt_xdate()

    plt.tight_layout()
    plt.show()

print("Function defined: plot_category_chart()")

### 9.1 Big Tech

In [None]:
# Draw the close-price / normalized-volume chart for the 'Big Tech' tickers.
plot_category_chart('Big Tech', categories['Big Tech'], datasets)

### 9.2 Finance

In [None]:
# Draw the close-price / normalized-volume chart for the 'Finance' tickers.
plot_category_chart('Finance', categories['Finance'], datasets)

### 9.3 Healthcare

In [None]:
# Draw the close-price / normalized-volume chart for the 'Healthcare' tickers.
plot_category_chart('Healthcare', categories['Healthcare'], datasets)

### 9.4 Consumer

In [None]:
# Draw the close-price / normalized-volume chart for the 'Consumer' tickers.
plot_category_chart('Consumer', categories['Consumer'], datasets)

### 9.5 Energy

In [None]:
# Draw the close-price / normalized-volume chart for the 'Energy' tickers.
plot_category_chart('Energy', categories['Energy'], datasets)

### 9.6 Industrials

In [None]:
# Draw the close-price / normalized-volume chart for the 'Industrials' tickers.
plot_category_chart('Industrials', categories['Industrials'], datasets)

### 9.7 Utilities

In [None]:
# Draw the close-price / normalized-volume chart for the 'Utilities' tickers.
plot_category_chart('Utilities', categories['Utilities'], datasets)

### 9.8 Real Estate

In [None]:
# Draw the close-price / normalized-volume chart for the 'Real Estate' tickers.
plot_category_chart('Real Estate', categories['Real Estate'], datasets)

### 9.9 Materials

In [None]:
# Draw the close-price / normalized-volume chart for the 'Materials' tickers.
plot_category_chart('Materials', categories['Materials'], datasets)

### 9.10 Telecom

In [None]:
# Draw the close-price / normalized-volume chart for the 'Telecom' tickers.
plot_category_chart('Telecom', categories['Telecom'], datasets)

### 9.11 International

In [None]:
# Draw the close-price / normalized-volume chart for the 'International' tickers.
plot_category_chart('International', categories['International'], datasets)

### 9.12 Crypto

In [None]:
# Draw the close-price / normalized-volume chart for the 'Crypto' tickers.
plot_category_chart('Crypto', categories['Crypto'], datasets)

### 9.13 Market Indices

In [None]:
# Draw the chart for the 'Indices' category; the title shown on the figure is
# 'Market Indices', which differs from the dictionary key.
plot_category_chart('Market Indices', categories['Indices'], datasets)

## Summary

This notebook provides a comprehensive overview of the AttentionTrader dataset:
- Dataset coverage and temporal analysis
- Data quality assessment
- Category distribution
- Visual representations of the data structure
- Price and volume charts for all categories

The dataset is ready for further analysis including:
- Price trend analysis
- Volatility studies
- Correlation analysis
- Feature engineering for machine learning models