# Tick data analysis

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
import time
from joblib import Memory
# Silence library warnings so they do not clutter the cell output.
warnings.filterwarnings(action='ignore')

# Disk-backed joblib cache used to memoize expensive loads between runs.
memory = Memory(location=".cachedir", verbose=0)

# Tick-data source file; alternative locations are kept below for reference.
path = Path("EURUSD.parquet")
# path = Path("EURUSD.uncompressed.h5")
# path = Path("/media/pete/ramdisk/EURUSD.uncompressed.parquet")

@memory.cache
def load_data(source: Path = path) -> pd.DataFrame:
    """Read the parquet file into a DataFrame, memoized through ``memory``.

    Args:
        source: Parquet file to read. Defaults to the module-level ``path``
            as it was when this definition was executed.

    Returns:
        The DataFrame returned by ``pd.read_parquet`` using the pyarrow engine.
    """
    # The cache entry is keyed on the call arguments (defaults included), so
    # the file location has to be an argument: reading the global directly
    # would hand back the previously cached frame after `path` is changed.
    # NOTE: the key does not cover the file's contents; clear the cache if the
    # file is rewritten in place.
    return pd.read_parquet(source, engine="pyarrow")

if path.exists():
    print(f"Loading parquet file: {path}")
    %time df = load_data()
else:
    print(f"File {path} not found!")


# Display basic information about the dataset
print("=== DATASET INFO ===")
print(f"Shape: {df.shape}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print("\n=== COLUMN TYPES ===")
print(df.dtypes)
print("\n=== MISSING VALUES ===")
%time missing_data = df.isnull().sum()
print(missing_data[missing_data > 0]) if any(missing_data > 0) else print("No missing values found")

Loading parquet file: EURUSD.parquet
CPU times: user 1.42 s, sys: 1.16 s, total: 2.59 s
Wall time: 2.59 s
=== DATASET INFO ===
Shape: (29186310, 5)
Memory usage: 3284.44 MB

=== COLUMN TYPES ===
UTC           object
AskPrice     float64
BidPrice     float64
AskVolume    float64
BidVolume    float64
dtype: object

=== MISSING VALUES ===
CPU times: user 779 ms, sys: 12.1 ms, total: 791 ms
Wall time: 789 ms
No missing values found


## Display Top and Tail of Data

In [None]:
# Show the first and the last ten rows, each under its own heading.
previews = (
    ("=== TOP 10 ROWS ===", df.head(10)),
    ("\n=== BOTTOM 10 ROWS ===", df.tail(10)),
)
for heading, rows in previews:
    print(heading)
    display(rows)

## Statistical Summary

In [None]:
# Describe every numeric column; `numeric_cols` is reused by later cells.
print("=== STATISTICAL SUMMARY ===")
numeric_cols = df.select_dtypes(include=[np.number]).columns
if numeric_cols.empty:
    print("No numeric columns found for statistical summary")
else:
    summary = df.loc[:, numeric_cols].describe()
    display(summary)

## Data Quality Checks

In [None]:
# Count rows that are exact duplicates of an earlier row.
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# Distinct (non-null) values per column, computed for the whole frame at once.
print("\n=== UNIQUE VALUES PER COLUMN ===")
for col, unique_count in df.nunique().items():
    print(f"{col}: {unique_count} unique values")

# Minimum and maximum of each numeric column.
print("\n=== DATA RANGES FOR NUMERIC COLUMNS ===")
for col in numeric_cols:
    values = df[col]
    print(f"{col}: {values.min():.4f} to {values.max():.4f}")

## Time Series Analysis (if applicable)

In [None]:
# Summarize datetime columns; when none is typed as datetime yet, try to
# convert the first column whose name suggests it holds timestamps.
# 'datetimetz' is listed as well so timezone-aware columns are not overlooked.
datetime_cols = df.select_dtypes(include=['datetime64', 'datetimetz']).columns
if len(datetime_cols) > 0:
    print("=== TIME SERIES ANALYSIS ===")
    for col in datetime_cols:
        # Compute the extremes once; each is a full pass over the column.
        first_stamp = df[col].min()
        last_stamp = df[col].max()
        span_hours = (last_stamp - first_stamp).total_seconds() / 3600
        print(f"Column: {col}")
        print(f"Date range: {first_stamp} to {last_stamp}")
        print(f"Frequency: {len(df)} records over {span_hours:.1f} hours")
        print()
else:
    # Name fragments that mark a column as a timestamp candidate. 'utc' is
    # included because this dataset keeps its timestamps in an object column
    # called "UTC", which 'date'/'time' alone would never match.
    name_hints = ('date', 'time', 'utc')
    for col in df.columns:
        # str() keeps the check working for non-string column labels.
        label = str(col).lower()
        if not any(hint in label for hint in name_hints):
            continue
        try:
            converted = pd.to_datetime(df[col])
        except (ValueError, TypeError, OverflowError):
            # Not parseable as datetimes; leave the column as is and move on.
            # Only conversion errors are caught so that interrupts and
            # unrelated failures still surface.
            continue
        df[col] = converted
        print(f"Converted {col} to datetime")
        print(f"Date range: {df[col].min()} to {df[col].max()}")
        # Stop after the first successful conversion.
        break

## Correlation Analysis

In [None]:
# Pairwise correlations need at least two numeric columns.
if len(numeric_cols) >= 2:
    print("=== CORRELATION MATRIX ===")
    correlation_matrix = df.loc[:, numeric_cols].corr()
    display(correlation_matrix)

    # Draw the matrix as an annotated heatmap centred on zero.
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
    ax.set_title('Correlation Matrix')
    fig.tight_layout()
    plt.show()

## Distribution Analysis

In [None]:
# One histogram per numeric column, laid out on a grid at most three wide.
if len(numeric_cols) > 0:
    print("=== DISTRIBUTION PLOTS ===")
    column_total = len(numeric_cols)
    n_cols = min(3, column_total)
    full_rows, leftover = divmod(column_total, n_cols)
    n_rows = full_rows + (1 if leftover else 0)

    # squeeze=False always yields a 2-D array of axes, so a single ravel()
    # covers the one-plot case as well.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, numeric_cols):
        ax.hist(df[col], bins=30, alpha=0.7, edgecolor='black')
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

    # Grid cells beyond the last column stay blank.
    for unused_ax in axes[column_total:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## Outlier Detection

In [None]:
# Flag values lying more than 1.5 * IQR outside the quartiles of each numeric
# column and record the count, share and bounds per column.
print("=== OUTLIER DETECTION (IQR Method) ===")
outlier_summary = {}
total_rows = len(df)

for col in numeric_cols:
    # Both quartiles in a single pass over the column.
    Q1, Q3 = df[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Count on the boolean mask directly; indexing `df` with it would copy
    # every outlier row (all columns) just to measure its length.
    is_outlier = (df[col] < lower_bound) | (df[col] > upper_bound)
    outlier_count = int(is_outlier.sum())
    # An empty frame has no outliers; avoid dividing by zero.
    outlier_pct = (outlier_count / total_rows) * 100 if total_rows else 0.0

    outlier_summary[col] = {
        'count': outlier_count,
        'percentage': outlier_pct,
        'lower_bound': lower_bound,
        'upper_bound': upper_bound
    }

    print(f"{col}: {outlier_count} outliers ({outlier_pct:.2f}%)")

# Display outlier details for columns with outliers
for col, info in outlier_summary.items():
    if info['count'] > 0:
        print(f"\n{col} outlier bounds: {info['lower_bound']:.4f} to {info['upper_bound']:.4f}")

## Summary Report

In [None]:
# Final recap: size, memory, data-quality counts and mean ± std per numeric column.
print("=== SUMMARY REPORT ===")
row_count, column_count = df.shape
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
missing_total = df.isnull().sum().sum()
duplicate_total = df.duplicated().sum()

print(f"Dataset: {path}")
print(f"Total records: {row_count:,}")
print(f"Total columns: {column_count}")
print(f"Memory usage: {memory_mb:.2f} MB")
print(f"Missing values: {missing_total}")
print(f"Duplicate rows: {duplicate_total}")

if len(numeric_cols) > 0:
    print(f"\nNumeric columns: {len(numeric_cols)}")
    for col in numeric_cols:
        values = df[col]
        print(f"  - {col}: {values.mean():.4f} ± {values.std():.4f}")

print("\n=== ANALYSIS COMPLETE ===")