# 1. Data Exploration Notebook

This notebook is for exploring the raw and processed financial datasets. The goal is to understand the data's structure, identify any quality issues, and visualize key features.

## 1.1 Setup

Import necessary libraries and set up the environment.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Apply seaborn's "whitegrid" theme to every figure created below.
# `sns.set` is only a legacy alias; `set_theme` is the preferred entry point.
sns.set_theme(style="whitegrid")

# All data locations are derived from the directory one level above the
# current working directory (the notebook is expected to be run from a
# sub-folder of the project).
project_root = Path('.').resolve().parent

# IMPORTANT: replace 'your_raw_data.csv' with the name of your actual raw data file.
raw_data_path = project_root.joinpath('data', 'raw', 'your_raw_data.csv')

# This file should be the output of the ml_pipeline.
processed_data_path = project_root.joinpath('data', 'processed', 'feature_rich_data.csv')

## 1.2 Load Raw Data

Load the original, untouched data file.

In [None]:
# Read the raw CSV into `df_raw` when the file is present; otherwise explain
# where it was expected so the user can put it in place.
if raw_data_path.exists():
    df_raw = pd.read_csv(raw_data_path)
    print("Raw data loaded successfully.")
    # A bare expression inside an `if` block is never rendered by Jupyter, so
    # the preview must be shown explicitly. `display` is provided by the
    # IPython/Jupyter kernel without an import.
    display(df_raw.head())
else:
    # Adjacent string literals are concatenated; the explicit "\n" puts the
    # hint on its own output line while keeping each literal on one source line.
    print(f"Raw data file not found at: {raw_data_path}\n"
          "Please run the data collection scripts or place your data file there.")

## 1.3 Load Processed & Feature-Rich Data

Load the data after it has been cleaned and had features added by the `ml_pipeline` scripts.

In [None]:
# Read the processed CSV into `df`, using the 'date' column as a parsed
# datetime index; otherwise explain which pipeline scripts produce the file.
if processed_data_path.exists():
    df = pd.read_csv(processed_data_path, index_col='date', parse_dates=True)
    print("Processed data loaded successfully.")
    # A bare expression inside an `if` block is never rendered by Jupyter, so
    # the preview must be shown explicitly. `display` is provided by the
    # IPython/Jupyter kernel without an import.
    display(df.head())
else:
    # Adjacent string literals are concatenated; the explicit "\n" puts the
    # hint on its own output line while keeping each literal on one source line.
    print(f"Processed data file not found at: {processed_data_path}\n"
          "Please run the ml_pipeline/data_processor.py and ml_pipeline/feature_engineering.py scripts first.")

## 1.4 Data Overview

In [None]:
# Summarise the processed frame: column dtypes / non-null counts, followed by
# descriptive statistics. Skipped when `df` was never created.
if 'df' in locals():
    print("Data Information:")
    df.info()  # writes its report straight to stdout
    # The leading "\n" separates the two reports with a blank line.
    print("\nData Description:")
    # `display` (provided by the IPython/Jupyter kernel) renders the summary
    # as a rich table instead of plain text.
    display(df.describe())

## 1.5 Visualizations

### Close Price Over Time

In [None]:
# Line plot of the 'close' column against the DataFrame index.
# The column check mirrors the guard used for the RSI plot, so a processed
# file without a 'close' column skips the figure instead of raising KeyError.
if 'df' in locals() and 'close' in df.columns:
    fig, ax = plt.subplots(figsize=(15, 7))
    df['close'].plot(ax=ax)
    ax.set_title('Close Price Over Time')
    ax.set_xlabel('Date')
    ax.set_ylabel('Price')
    plt.show()

### Volume Over Time

In [None]:
# Line plot of the 'volume' column against the DataFrame index.
# The column check mirrors the guard used for the RSI plot, so a processed
# file without a 'volume' column skips the figure instead of raising KeyError.
if 'df' in locals() and 'volume' in df.columns:
    fig, ax = plt.subplots(figsize=(15, 7))
    df['volume'].plot(ax=ax)
    ax.set_title('Trading Volume Over Time')
    ax.set_xlabel('Date')
    ax.set_ylabel('Volume')
    plt.show()

### Feature Distribution (RSI)

In [None]:
# Histogram (50 bins, with a KDE overlay) of the 'rsi' column, drawn only when
# the processed frame exists and actually contains that column.
if 'df' in locals() and 'rsi' in df.columns:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.histplot(df['rsi'], bins=50, kde=True, ax=ax)
    ax.set_title('RSI Distribution')
    # Dashed reference lines at the levels labelled overbought / oversold.
    ax.axvline(70, color='r', linestyle='--', label='Overbought (70)')
    ax.axvline(30, color='g', linestyle='--', label='Oversold (30)')
    ax.legend()
    plt.show()

### Correlation Heatmap

In [None]:
# Annotated heatmap of pairwise correlations between the numeric features.
if 'df' in locals():
    # Restrict to numeric columns before correlating: with pandas >= 2.0,
    # `DataFrame.corr()` no longer drops non-numeric columns silently and
    # raises if any are present. `select_dtypes` works on every pandas version.
    correlation_matrix = df.select_dtypes(include='number').corr()
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
    ax.set_title('Feature Correlation Matrix')
    plt.show()