# Trading Data Exploration

This notebook explores the USATECH index data and performs initial analysis to understand the dataset.

In [None]:
# Add parent directory to path to import from src.
# os.path.abspath('..') is resolved against the current working directory, so the
# `src` imports below only work when the kernel runs one level below the directory
# that contains the `src` package (presumably the project root — TODO confirm).
import sys
import os
sys.path.append(os.path.abspath('..'))

# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import from src modules (project-local; not visible from this notebook)
from src.data.loader import load_data, preprocess_data
from src.utils.helpers import set_pandas_display_options

# Set display options
# NOTE(review): the exact pandas options changed are defined in src.utils.helpers.
set_pandas_display_options()

# Matplotlib settings
# NOTE(review): the 'seaborn-v0_8-*' style names require matplotlib >= 3.6 — confirm
# the pinned version, since older releases only know 'seaborn-whitegrid'.
plt.style.use('seaborn-v0_8-whitegrid')
# Default figure size in inches (width, height) for plots that do not set their own
plt.rcParams['figure.figsize'] = [12, 6]
%matplotlib inline

## 1. Load Data

Load the 15-minute and 5-minute USATECH index data.

In [None]:
# Candlestick CSV exports, located one directory above the working directory
file_path_15m = '../USATECH.IDXUSD_Candlestick_15_M_BID_01.01.2023-18.01.2025.csv'
file_path_5m = '../USATECH.IDXUSD_Candlestick_5_M_BID_01.01.2023-18.01.2025.csv'

# Read each export and pass it straight through the project's preprocessing step
df_15m = preprocess_data(load_data(file_path_15m))
df_5m = preprocess_data(load_data(file_path_5m))

# Row count and first/last index value for each timeframe
print(f"15-minute data: {len(df_15m)} rows from {df_15m.index.min()} to {df_15m.index.max()}")
print(f"5-minute data: {len(df_5m)} rows from {df_5m.index.min()} to {df_5m.index.max()}")

## 2. Data Overview

Let's examine the first few rows of the data to understand its structure.

In [None]:
# Peek at the top five rows of the 15-minute frame to see its columns and index
print("15-minute data sample:")
df_15m.head(5)

In [None]:
# Count the NaN entries in every column of the 15-minute frame
print("Missing values in 15-minute data:")
missing_mask_15m = df_15m.isna()
missing_mask_15m.sum()

In [None]:
# Count, mean, spread and quartiles for each numeric column of the 15-minute frame
summary_15m = df_15m.describe()
summary_15m

## 3. Price Visualization

Let's visualize the price data to get a better understanding of the trends and patterns.

In [None]:
# Line chart of every 15-minute close over the whole sample
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(df_15m.index, df_15m['Close'])
ax.set_title('USATECH Index 15-Minute Closing Price')
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.grid(True)
plt.show()

In [None]:
# Simple moving averages of the close; each look-back length is counted in 15-minute bars
sma_windows = {'SMA20': 20, 'SMA50': 50, 'SMA200': 200}
for sma_column, span in sma_windows.items():
    df_15m[sma_column] = df_15m['Close'].rolling(window=span).mean()

# Chart only the latest 500 bars so the three averages can be told apart
recent_data = df_15m.iloc[-500:]
fig, ax = plt.subplots(figsize=(14, 7))
for series_name in ('Close', 'SMA20', 'SMA50', 'SMA200'):
    ax.plot(recent_data.index, recent_data[series_name], label=series_name)
ax.set_title('USATECH Index with Moving Averages (Last 500 Periods)')
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.legend()
ax.grid(True)
plt.show()

## 4. 15-Minute Returns Analysis

Let's analyze the bar-to-bar (15-minute) returns to understand the volatility and distribution.

In [None]:
# Percentage change between consecutive 15-minute closes.
# This is a row-to-row change (not a daily return), so it also spans any gap
# between the last bar of one session and the first bar of the next.
close_15m = df_15m['Close']
df_15m['Returns'] = close_15m.pct_change() * 100

# Histogram of the returns; the leading NaN from pct_change is dropped first
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(df_15m['Returns'].dropna(), bins=100, alpha=0.75)
ax.set_title('Distribution of 15-Minute Returns (%)')
ax.set_xlabel('Return (%)')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

In [None]:
# Descriptive statistics of the 15-minute percentage returns
returns_15m = df_15m['Returns']
returns_stats = returns_15m.describe()
print("Returns Statistics:")
print(returns_stats)

# Shape of the distribution: asymmetry and tail weight
# (pandas reports excess kurtosis, i.e. a normal distribution scores 0)
returns_skew = returns_15m.skew()
returns_kurtosis = returns_15m.kurtosis()
print(f"Skewness: {returns_skew:.4f}")
print(f"Kurtosis: {returns_kurtosis:.4f}")

## 5. Volatility Analysis

Let's analyze the volatility of the index over time.

In [None]:
# Rolling volatility: standard deviation of the 15-minute percentage returns over a
# trailing window, multiplied by sqrt(window) to express it over the window's own horizon.
# This is NOT an annualized figure; annualizing would need sqrt(bars per year) instead.
# 480 bars of 15 minutes = 120 hours of trading. How many sessions that spans depends
# on the instrument's trading hours — TODO confirm before quoting it as "N trading days".
window = 20 * 24
df_15m['Volatility'] = df_15m['Returns'].rolling(window=window).std() * np.sqrt(window)

# Drop the warm-up NaNs once and plot the series against its OWN index. Pairing the
# shortened series with the full df_15m.index makes Matplotlib raise
# "x and y must have same first dimension".
volatility = df_15m['Volatility'].dropna()

# Plot volatility over time
plt.figure(figsize=(14, 7))
plt.plot(volatility.index, volatility)
plt.title(f'USATECH Index {window}-Period Rolling Volatility')
plt.xlabel('Date')
plt.ylabel('Volatility (%)')
plt.grid(True)
plt.show()

## 6. Volume Analysis

Let's analyze the trading volume patterns.

In [None]:
# Bar chart of the per-bar volume over the whole sample.
# NOTE(review): no bar width is given, so Matplotlib's default of 0.8 x-units applies;
# on a date axis that is 0.8 days per bar, far wider than one 15-minute candle.
fig, ax = plt.subplots(figsize=(14, 7))
ax.bar(df_15m.index, df_15m['Volume'], alpha=0.5)
ax.set_title('USATECH Index Trading Volume')
ax.set_xlabel('Date')
ax.set_ylabel('Volume')
ax.grid(True)
plt.show()

In [None]:
# Calculate rolling average volume (20 bars of 15 minutes = 5 hours)
df_15m['Volume_SMA20'] = df_15m['Volume'].rolling(window=20).mean()

# Plot recent volume with moving average
recent_data = df_15m.iloc[-500:]

# On a date axis one x-unit is one day, so Matplotlib's default bar width of 0.8 would
# draw every 15-minute bar 0.8 days wide and bury its neighbours. Size each bar to
# exactly one 15-minute candle instead.
bar_width_days = 15 / (24 * 60)

plt.figure(figsize=(14, 7))
plt.bar(recent_data.index, recent_data['Volume'], width=bar_width_days, alpha=0.5, label='Volume')
plt.plot(recent_data.index, recent_data['Volume_SMA20'], color='red', label='20-Period MA')
plt.title('USATECH Index Trading Volume (Last 500 Periods)')
plt.xlabel('Date')
plt.ylabel('Volume')
plt.legend()
plt.grid(True)
plt.show()

## 7. Time-of-Day Analysis

Let's analyze if there are patterns based on the time of day.

In [None]:
# Tag every bar with the clock hour taken from its timestamp index
df_15m['Hour'] = df_15m.index.hour

# Mean, standard deviation and sample size of the 15-minute returns within each hour
hourly_returns = (
    df_15m.groupby('Hour')['Returns']
    .agg(['mean', 'std', 'count'])
    .rename(columns={
        'mean': 'Average Return (%)',
        'std': 'Std Dev (%)',
        'count': 'Count',
    })
)
hourly_returns

In [None]:
# One bar per clock hour showing the mean 15-minute return in that hour
fig, ax = plt.subplots(figsize=(14, 7))
ax.bar(hourly_returns.index, hourly_returns['Average Return (%)'])
ax.set(
    title='Average Returns by Hour of Day',
    xlabel='Hour of Day',
    ylabel='Average Return (%)',
)
ax.grid(True)
# Show a tick for all 24 hours, including hours that have no data
ax.set_xticks(range(24))
plt.show()

In [None]:
# Mean per-bar volume within each clock hour
hourly_volume = df_15m['Volume'].groupby(df_15m['Hour']).mean()

fig, ax = plt.subplots(figsize=(14, 7))
ax.bar(hourly_volume.index, hourly_volume)
ax.set(
    title='Average Volume by Hour of Day',
    xlabel='Hour of Day',
    ylabel='Average Volume',
)
ax.grid(True)
# Show a tick for all 24 hours, including hours that have no data
ax.set_xticks(range(24))
plt.show()

## 8. Day of Week Analysis

Let's analyze if there are patterns based on the day of the week.

In [None]:
# Weekday code of each bar's timestamp: 0 = Monday ... 6 = Sunday
df_15m['DayOfWeek'] = df_15m.index.dayofweek
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Mean, standard deviation and sample size of the 15-minute returns per weekday
day_returns = (
    df_15m.groupby('DayOfWeek')['Returns']
    .agg(['mean', 'std', 'count'])
    .rename(columns={
        'mean': 'Average Return (%)',
        'std': 'Std Dev (%)',
        'count': 'Count',
    })
)
# Swap the numeric weekday codes for readable names (only weekdays present in the data)
day_returns.index = list(map(days.__getitem__, day_returns.index))
day_returns

In [None]:
# One bar per weekday showing the mean 15-minute return on that day
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(day_returns.index, day_returns['Average Return (%)'])
ax.set(
    title='Average Returns by Day of Week',
    xlabel='Day of Week',
    ylabel='Average Return (%)',
)
ax.grid(True)
plt.show()

## 9. Correlation Analysis

Let's analyze the correlation between different variables.

In [None]:
# Pairwise correlation of prices, volume, returns and rolling volatility.
# Rows containing any NaN (e.g. the rolling warm-up period) are removed first.
corr_columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'Returns', 'Volatility']
complete_rows = df_15m[corr_columns].dropna()
df_corr = complete_rows.corr()
df_corr

In [None]:
# Annotated heatmap of the correlation matrix with a diverging colour map
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df_corr, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

## 10. Moving Average Crossover Analysis

Let's identify and visualize moving average crossovers, which are common trading signals.

In [None]:
# Current and previous-bar values of the fast (50) and slow (200) moving averages
fast_now, slow_now = df_15m['SMA50'], df_15m['SMA200']
fast_prev, slow_prev = fast_now.shift(1), slow_now.shift(1)

# Golden cross: fast average was below the slow one and is now at or above it.
# Death cross: fast average was above the slow one and is now at or below it.
# Comparisons against NaN (rolling warm-up) are False, so those bars flag nothing.
crossed_up = fast_prev.lt(slow_prev) & fast_now.ge(slow_now)
df_15m['Golden_Cross'] = crossed_up.astype(int)
crossed_down = fast_prev.gt(slow_prev) & fast_now.le(slow_now)
df_15m['Death_Cross'] = crossed_down.astype(int)

# Timestamps of the flagged bars
golden_cross_dates = df_15m.loc[df_15m['Golden_Cross'] == 1].index
death_cross_dates = df_15m.loc[df_15m['Death_Cross'] == 1].index

print(f"Found {len(golden_cross_dates)} golden crosses and {len(death_cross_dates)} death crosses")

# List up to five timestamps of each kind
crossover_listings = (
    ("\nSome Golden Cross dates:", golden_cross_dates),
    ("\nSome Death Cross dates:", death_cross_dates),
)
for heading, cross_dates in crossover_listings:
    print(heading)
    for stamp in cross_dates[:5]:
        print(stamp)

In [None]:
# Close and both moving averages over the full sample, with crossover bars marked
fig, ax = plt.subplots(figsize=(14, 7))
for series_name, opacity in (('Close', 0.5), ('SMA50', 0.8), ('SMA200', 0.8)):
    ax.plot(df_15m.index, df_15m[series_name], label=series_name, alpha=opacity)

# Markers sit at the closing price of the bar on which the crossover was flagged
ax.scatter(golden_cross_dates, df_15m.loc[golden_cross_dates, 'Close'],
           color='green', s=50, label='Golden Cross')
ax.scatter(death_cross_dates, df_15m.loc[death_cross_dates, 'Close'],
           color='red', s=50, label='Death Cross')

ax.set_title('USATECH Index with Moving Average Crossovers')
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Analyze returns after crossovers: for each look-ahead horizon (in 15-minute bars),
# measure the percentage move in the close from the crossover bar to `periods` bars later.
forward_periods = [1, 5, 10, 20, 50]


def _forward_returns(close, event_positions, periods):
    """Percentage change in `close` from each event bar to `periods` bars later.

    Args:
        close: 1-D NumPy array of closing prices in row order.
        event_positions: integer row positions of the events within `close`.
        periods: look-ahead distance in bars.

    Returns:
        NumPy array of percentage returns, one per event that still has
        `periods` bars after it; events too close to the end are left out.
    """
    usable = event_positions[event_positions + periods < len(close)]
    return (close[usable + periods] / close[usable] - 1) * 100


def _mean_or_nan(values):
    """Mean of `values`, or NaN when there are none (no empty-mean warning)."""
    return np.mean(values) if len(values) else np.nan


# Resolve the event rows once, by position. This avoids repeating an index lookup for
# every event on every horizon, and stays correct if the index has duplicate timestamps
# (where a label lookup returns a slice or mask instead of a single integer).
close_values = df_15m['Close'].to_numpy()
golden_positions = np.flatnonzero(df_15m['Golden_Cross'].to_numpy() == 1)
death_positions = np.flatnonzero(df_15m['Death_Cross'].to_numpy() == 1)

results = []
for periods in forward_periods:
    golden_future_returns = _forward_returns(close_values, golden_positions, periods)
    death_future_returns = _forward_returns(close_values, death_positions, periods)

    results.append({
        'Periods': periods,
        'Golden_Cross_Avg_Return': _mean_or_nan(golden_future_returns),
        'Death_Cross_Avg_Return': _mean_or_nan(death_future_returns),
        # A golden cross "wins" if price rose afterwards; a death cross if it fell
        'Golden_Cross_Win_Rate': _mean_or_nan(golden_future_returns > 0) * 100,
        'Death_Cross_Win_Rate': _mean_or_nan(death_future_returns < 0) * 100,
    })

# Display results
df_results = pd.DataFrame(results)
df_results

## Summary and Conclusions

Based on the analysis above, here are some key observations about the USATECH index data:

1. The dataset provides good coverage from January 2023 to January 2025 with both 5-minute and 15-minute timeframes.
2. [Add your observations here based on the analysis results]
3. [Add more observations]

These insights will inform our feature engineering and model development process in subsequent notebooks.