In [None]:
#!pip install pandas pyarrow scipy

In [None]:
import os
import random
import sys
import time
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import cpu_count

import numpy as np
import pandas as pd
import plotly.express as px
from tqdm import tqdm

from data_downloader import download, get_filename
from pattern_analysis import get_alpha_lambda, get_rmse, create_window
from pattern_searcher import PatternSearcher

In [None]:
# Dataset selection: instrument symbol, sampling interval and year to load.
ticker, interval, year = 'AAPL', '5s', '2024'
# Seed Python's global RNG so the random draws made later are repeatable.
random.seed(42)

In [None]:
# Resolve the on-disk path for (ticker, interval, year) once, fetch the data
# only when that file is not there yet, then load it into a DataFrame.
# NOTE(review): assumes download(...) writes to the path get_filename(...) returns.
filename = get_filename(ticker, interval, year)
if not os.path.exists(filename):
    download(ticker, interval, year)
df = pd.read_parquet(filename)

# show data.
# fig = px.line(df[-1000:], y='open', title=f'{ticker} Open Prices')
# fig.show()
# total_bars = df.shape[0]
# print(f'Total bars: {total_bars} ({total_bars:,})')

In [None]:
# Window size: handed to PatternSearcher as template_length and used as the
# length of every window cut with create_window.
m = 60

In [None]:
# Build the PatternSearcher over the 'open' column, with the template length
# fixed to the window size m. This object is later queried via get_rs_above().
searcher = PatternSearcher(df['open'], template_length=m)

# # Print searcher statistics
# print("PatternSearcher Statistics:")
# stats = searcher.get_stats()
# for key, value in stats.items():
#     print(f"  {key}: {value}")

In [None]:
N = 100
correlations = []
# Sequential baseline: draw N random window starts and, for each one, store
# what the searcher returns for that window at the 0.97 threshold.
for _ in tqdm(range(N)):
    start_index = random.randrange(len(df) - m)
    pattern = create_window(df, start_index, m)
    correlations.append(
        dict(
            start_index=start_index,
            similar=searcher.get_rs_above(pattern, 0.97),
        )
    )


In [None]:
# Worker for the parallel pattern search: handles one batch of start indices.
def process_pattern_batch(searcher, batch_indices, df_data, template_length, threshold=0.97):
    """
    Run the pattern search for every start index in one batch.

    Intended to be submitted to a worker process. The searcher is received as
    an argument (matching how the batch is submitted to the executor) instead
    of being read from a notebook-level global.

    Args:
        searcher: Object exposing ``get_rs_above(pattern, threshold)``
        batch_indices: Iterable of start indices for patterns
        df_data: Data handed to ``create_window`` to cut each pattern
        template_length: Length of the pattern window
        threshold: Threshold forwarded to ``searcher.get_rs_above``

    Returns:
        A list with one dict per start index, in the order the indices were
        given: ``{'start_index': <index>, 'similar': <get_rs_above result>}``.
        An empty batch yields an empty list.
    """
    return [
        {
            'start_index': start_index,
            'similar': searcher.get_rs_above(
                create_window(df_data, start_index, template_length),
                threshold,
            ),
        }
        for start_index in batch_indices
    ]

def parallel_pattern_search(N, df, m, threshold=0.97, max_workers=8):
    """
    Parallel version of the pattern search loop.

    Draws N random start indices, splits them into batches and submits each
    batch to a process pool as
    ``process_pattern_batch(searcher, batch, df, m, threshold)``.
    ``searcher`` is read from the enclosing (notebook) scope.

    Args:
        N: Number of patterns to analyze
        df: DataFrame with data
        m: Window size
        threshold: Correlation threshold
        max_workers: Number of worker processes (None = auto-detect CPU count)

    Returns:
        A flat list of the per-pattern result dicts. Batches are concatenated
        in submission order (not completion order), so the output order is
        the same on every run.

    Raises:
        ValueError: if ``max_workers`` is smaller than 1.

    Note:
        Calls ``random.seed(42)``, i.e. it resets the global RNG state.
    """
    if max_workers is None:
        max_workers = cpu_count()
    if max_workers < 1:
        raise ValueError("max_workers must be at least 1")

    # Generate all random indices at once for reproducibility
    random.seed(42)  # Reset seed for consistency
    all_indices = [random.randrange(0, len(df) - m) for _ in range(N)]

    # Split indices into batches for parallel processing
    batch_size = max(1, N // max_workers)
    batches = [all_indices[i:i + batch_size] for i in range(0, N, batch_size)]

    # One slot per batch so results can be put back in submission order.
    results_by_batch = [None] * len(batches)

    # Use ProcessPoolExecutor for parallel execution
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Submit all batches, remembering which batch each future belongs to
        future_to_batch = {
            executor.submit(process_pattern_batch, searcher, batch, df, m, threshold): i
            for i, batch in enumerate(batches)
        }

        # Collect results with progress bar; futures finish in arbitrary order
        with tqdm(total=len(batches), desc="Processing batches") as pbar:
            for future in as_completed(future_to_batch):
                results_by_batch[future_to_batch[future]] = future.result()
                pbar.update(1)

    # Flatten, preserving batch order and the order within each batch.
    return [
        result
        for batch_results in results_by_batch
        for result in batch_results
    ]

In [None]:
# Run the parallel version and time it
N = 1000

# Measure elapsed time with perf_counter, the clock intended for timing
# short durations (time.time() is wall-clock time).
start_time = time.perf_counter()
correlations_parallel = parallel_pattern_search(N, df, m, threshold=0.97)
parallel_time = time.perf_counter() - start_time

print(f"Parallel execution completed in {parallel_time:.2f} seconds")
print(f"Processed {len(correlations_parallel)} patterns")

# Summary: total number of entries across all 'similar' results
total_analyzed_parallel = sum(len(corr['similar']) for corr in correlations_parallel)
print(f'Total analyzed points (parallel): {total_analyzed_parallel}')

In [None]:
# total_analyzed = 0
# for corr in correlations:
#     print(f'index = {corr["start_index"]}, similar patterns: {len(corr["similar"])}')
#     total_analyzed += len(corr['similar'])
# print(f'Total analyzed points: {total_analyzed}')