# Data Exploration - BTC 15m

This is the first notebook in the series: it loads, cleans, and explores cryptocurrency OHLCV (open, high, low, close, volume) data.

## Steps:
1. Load data from HuggingFace
2. Explore the raw data
3. Clean and validate
4. Basic statistics
5. Visualize OHLC patterns
6. Returns distribution
7. Test windowing
8. Train/val/test split
9. Summary

## Setup

In [None]:
!pip install -q huggingface-hub pandas numpy matplotlib seaborn scikit-learn

import sys
from pathlib import Path

# Add repo to path.
# The guard checks for the exact entry that gets inserted, so re-running this
# cell does not push a duplicate onto sys.path.
repo_root = '/root/v2bot'
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)

## Import and Setup

In [None]:
import warnings
# NOTE: with no category given, this silences every warning for the rest of
# the session, including deprecation notices.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Set style
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*';
# fall back to the pre-3.6 name when the new one is not registered.
plot_style = 'seaborn-v0_8-darkgrid'
if plot_style not in plt.style.available:
    plot_style = 'seaborn-darkgrid'
plt.style.use(plot_style)
sns.set_palette('husl')

print('[Setup] All imports successful')

## Step 1: Load Data from HuggingFace

In [None]:
from huggingface_hub import hf_hub_download

# Dataset coordinates: which repo, which symbol, which candle interval
repo_id = 'zongowo111/v2-crypto-ohlcv-data'
symbol, interval = 'BTC', '15m'

print(f'[Step 1] Downloading {symbol}_{interval} from {repo_id}...')

# Fetch the CSV from the dataset repo; the returned path is read with pandas below
csv_name = f'klines/{symbol}/{symbol}_{interval}.csv'
file_path = hf_hub_download(repo_id=repo_id, filename=csv_name, repo_type='dataset')

df = pd.read_csv(file_path)
print(f'[Step 1] SUCCESS: Loaded {len(df)} rows')

## Step 2: Explore Raw Data

In [None]:
# Overview of the unmodified frame: banner, then shape, schema and null counts.
rule = '=' * 70
print('\n' + rule)
print('RAW DATA EXPLORATION')
print(rule)

raw_overview = (
    f'\nShape: {df.shape}',
    f'\nColumns: {df.columns.tolist()}',
    f'\nData types:\n{df.dtypes}',
    f'\nMissing values:\n{df.isnull().sum()}',
)
for overview_line in raw_overview:
    print(overview_line)

print('\nFirst 5 rows:')
# Last expression of the cell, so the notebook renders it as the cell output.
df.head()

## Step 3: Data Cleaning

In [None]:
# Clean data: work on a copy so the raw frame `df` stays untouched
df_clean = df.copy()
numeric_cols = ['open', 'high', 'low', 'close', 'volume']
has_timestamp = 'timestamp' in df_clean.columns

# Parse timestamps and order the rows chronologically
if has_timestamp:
    df_clean['timestamp'] = pd.to_datetime(df_clean['timestamp'])
    df_clean = df_clean.sort_values('timestamp')

# Drop repeated timestamps (or fully identical rows when there is no timestamp column)
initial_len = len(df_clean)
dedupe_key = ['timestamp'] if has_timestamp else None
df_clean = df_clean.drop_duplicates(subset=dedupe_key)
print(f'[Cleaning] Removed {initial_len - len(df_clean)} duplicate rows')

# Drop rows that lack any of the OHLCV fields
initial_len = len(df_clean)
df_clean = df_clean.dropna(subset=numeric_cols)
print(f'[Cleaning] Removed {initial_len - len(df_clean)} rows with missing OHLCV')

# Keep only rows where every OHLCV value is strictly positive
initial_len = len(df_clean)
all_positive = (df_clean[numeric_cols] > 0).all(axis=1)
df_clean = df_clean[all_positive]
print(f'[Cleaning] Removed {initial_len - len(df_clean)} rows with zero/negative values')

# Renumber rows 0..n-1 after the filtering above
df_clean = df_clean.reset_index(drop=True)

print(f'\n[Cleaning] Final shape: {df_clean.shape}')
if has_timestamp:
    first_ts = df_clean["timestamp"].min()
    last_ts = df_clean["timestamp"].max()
    print(f'[Cleaning] Date range: {first_ts} to {last_ts}')

## Step 4: Statistics

In [None]:
rule = '=' * 70
print('\n' + rule)
print('STATISTICS')
print(rule + '\n')

# Summary statistics for the OHLCV columns, rounded to two decimals
numeric_cols = ['open', 'high', 'low', 'close', 'volume']
ohlcv_stats = df_clean[numeric_cols].describe()
print(ohlcv_stats.round(2))

## Step 5: Visualization

In [None]:
# Plot OHLC
# Top panel: close price with the low-high range shaded. Bottom panel: volume.
# Both panels use the candle's row position in df_clean as the x coordinate.
candle_pos = range(len(df_clean))
fig, axes = plt.subplots(2, 1, figsize=(14, 8))

# Price trend
axes[0].plot(df_clean['close'], linewidth=0.8, label='Close')
axes[0].fill_between(candle_pos, df_clean['low'], df_clean['high'], alpha=0.2)
axes[0].set_title(f'{symbol}_{interval} - Price Trend')
axes[0].set_ylabel('Price (USD)')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Volume
# Axes.bar creates one Rectangle patch per candle, which becomes very slow on a
# long candle history. A filled 'mid' step draws the same unit-width columns
# as a single polygon.
axes[1].fill_between(candle_pos, df_clean['volume'], step='mid', alpha=0.6)
axes[1].set_ylim(bottom=0)  # keep the zero baseline that bar() provided
axes[1].set_title(f'{symbol}_{interval} - Volume')
axes[1].set_xlabel('Time (15min candles)')
axes[1].set_ylabel('Volume')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print('[Visualization] Chart displayed')

## Step 6: Returns Distribution

In [None]:
# Candle-to-candle change of the close price, expressed in percent
df_clean['returns'] = df_clean['close'].pct_change() * 100

returns = df_clean['returns']
mean_return = returns.mean()

fig, axes = plt.subplots(1, 2, figsize=(14, 4))
hist_ax, series_ax = axes

# Left panel: histogram of returns with the mean marked
hist_ax.hist(returns.dropna(), bins=100, alpha=0.7, edgecolor='black')
hist_ax.set_title('Returns Distribution')
hist_ax.set_xlabel('Returns (%)')
hist_ax.set_ylabel('Frequency')
hist_ax.axvline(mean_return, color='r', linestyle='--', label=f'Mean: {mean_return:.4f}%')
hist_ax.legend()

# Right panel: returns in row order with a zero reference line
series_ax.plot(returns, linewidth=0.5, alpha=0.7)
series_ax.set_title('Returns Over Time')
series_ax.set_xlabel('Time')
series_ax.set_ylabel('Returns (%)')
series_ax.axhline(0, color='k', linestyle='-', linewidth=0.3)
series_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f'[Returns] Mean: {mean_return:.6f}%')
print(f'[Returns] Std: {returns.std():.6f}%')
print(f'[Returns] Min: {returns.min():.6f}%')
print(f'[Returns] Max: {returns.max():.6f}%')

## Step 7: Test Data Windowing

In [None]:
# Create windows
window_size = 100  # 100 * 15min = 1500min = 25 hours

numeric_cols = ['open', 'high', 'low', 'close', 'volume']
data_matrix = df_clean[numeric_cols].values
n_features = data_matrix.shape[1]

# One window per start offset (stride 1). When there are fewer rows than
# window_size the range is empty and no windows are produced.
windows = [
    data_matrix[start:start + window_size]
    for start in range(len(data_matrix) - window_size + 1)
]

print(f'[Windowing] Created {len(windows)} windows')
if windows:
    # Report sizes from the actual values so the message stays correct when
    # window_size or the column list changes.
    print(f'[Windowing] Window size: {window_size} (shape: {windows[0].shape})')
    print(f'[Windowing] Total features per window: {window_size * n_features} '
          f'({window_size} candles * {n_features} OHLCV)')

    # Show sample window
    print('\n[Sample] First window (first 5 candles):')
    print(pd.DataFrame(windows[0][:5], columns=['Open', 'High', 'Low', 'Close', 'Volume']))
else:
    # Previously this case crashed with IndexError on windows[0].
    print(f'[Windowing] Not enough rows ({len(data_matrix)}) for a window of size {window_size}')

## Step 8: Train/Val/Test Split

In [None]:
# Time-based split: rows are taken in their existing order, earliest block for
# training, the next for validation, and the remainder for testing.
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# test_ratio is only reported, never used to slice, so make sure the three
# ratios describe the split that is actually produced.
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, \
    'train/val/test ratios must sum to 1'

n = len(df_clean)
train_end = int(n * train_ratio)
val_end = train_end + int(n * val_ratio)

train_df = df_clean[:train_end]
val_df = df_clean[train_end:val_end]
test_df = df_clean[val_end:]  # takes whatever is left after train and val

# The three partitions must cover every row exactly once.
assert len(train_df) + len(val_df) + len(test_df) == n

print(f'[Split] Total: {len(df_clean)} rows')
print(f'[Split] Train: {len(train_df)} ({train_ratio*100:.1f}%)')
print(f'[Split] Val:   {len(val_df)} ({val_ratio*100:.1f}%)')
print(f'[Split] Test:  {len(test_df)} ({test_ratio*100:.1f}%)')

if 'timestamp' in df_clean.columns:
    print(f'\n[Train] {train_df["timestamp"].min()} to {train_df["timestamp"].max()}')
    print(f'[Val]   {val_df["timestamp"].min()} to {val_df["timestamp"].max()}')
    print(f'[Test]  {test_df["timestamp"].min()} to {test_df["timestamp"].max()}')

## Step 9: Summary

In [None]:
rule = '=' * 70
print('\n' + rule)
print('DATA EXPLORATION COMPLETE')
print(rule)

# Collect the headline numbers produced by the earlier cells
returns = df_clean['returns']
summary = {
    'Dataset': f'{symbol}_{interval}',
    'Total Rows': len(df_clean),
    'Features': 'OHLCV',
    'Window Size': window_size,
    'Total Windows': len(windows),
    'Train Rows': len(train_df),
    'Val Rows': len(val_df),
    'Test Rows': len(test_df),
    'Avg Return': f"{returns.mean():.6f}%",
    'Return Std': f"{returns.std():.6f}%",
}

# Dot-padded label column (30 wide) followed by the value
for label, value in summary.items():
    print(f'{label:.<30} {value}')

print(rule)