In [None]:
#!pip install pandas pyarrow scipy

In [None]:
import numpy as np
import sys, os
import pandas as pd
import plotly.express as px
import random
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import cpu_count
import plotly.graph_objects as go

from data_downloader import download, get_filename
from pattern_analysis import get_alpha_lambda, get_rmse, create_window
from pattern_searcher import PatternSearcher
from trailing_stop_loss import generate_stop_loss_levels
from trailing_stop_loss import calculate_pl

In [None]:
year = '2024'
ticker = 'AAPL'
interval = '5s'

# Seed every random source used by the notebook so a fresh Restart & Run All
# is reproducible: the stdlib `random` module and NumPy's global generator,
# which all `np.random.*` convenience functions draw from.
random.seed(42)
np.random.seed(42)

# Download the data file only when it is not already on disk.
filename = get_filename(ticker, interval, year)
if not os.path.exists(filename):
    download(ticker, interval, year)
# Read the data from the parquet file on disk.
df = pd.read_parquet(filename)

# show data.
# fig = px.line(df[-1000:], y='open', title=f'{ticker} Open Prices')
# fig.show()
total_bars = df.shape[0]
print(f'Total bars: {total_bars} ({total_bars:,})')

In [None]:
# Window size: passed to create_window() and to PatternSearcher as
# template_length, and added to a window start to obtain its entry point.
m = 60 # window size

In [None]:
# Build the PatternSearcher over the 'open' column; templates have the fixed
# length m.
searcher = PatternSearcher(
    df['open'],
    template_length=m,
)

# # Print searcher statistics
# print("PatternSearcher Statistics:")
# stats = searcher.get_stats()
# for key, value in stats.items():
#     print(f"  {key}: {value}")

In [None]:
# Inputs for the stop-loss grid.
# NOTE(review): the units of max_sl (percent vs. fraction) and the exact meaning
# of sl_steps are defined by generate_stop_loss_levels — confirm there.
max_sl, sl_steps = 0.3, 30
stop_loss_percents, stop_loss_levels = generate_stop_loss_levels(max_sl, sl_steps)

In [None]:
# Profit/loss table for the bars currently in df. Later cells index it as
# tsl_profits[:, entry_points] and reduce over axis 1, so it is a 2-D array
# whose second axis is addressed by bar position.
# NOTE(review): presumably one row per entry of stop_loss_levels — confirm in calculate_pl.
tsl_profits = calculate_pl(df, stop_loss_levels)

In [None]:
# Random visiting order over all bar positions.
# A locally seeded Generator is used instead of the global np.random state:
# the notebook only seeds the stdlib `random` module, so np.random.shuffle gave
# a different order on every kernel start and on every re-run of this cell.
# NOTE(review): the range includes starts in the last m bars, where a full
# m-long window may not fit — confirm create_window handles that.
all_indices = np.arange(len(df['open']), dtype=np.int64)
np.random.default_rng(42).shuffle(all_indices)
all_indices

In [None]:
# State shared with the search loop; re-running this cell resets the progress.
start_from = 0  # position in all_indices where the next batch begins
results = []  # one tuple per pattern that had enough similar windows
seen = np.zeros(df.shape[0], dtype=int)  # set to 1 for analysed / matched bar positions

In [None]:
N = 1000           # how many shuffled start indices to analyse per run of this cell
r_limit = 0.98     # a window counts as similar when |r| exceeds this
min_similar = 100  # a pattern is recorded only with more similar windows than this

# Clamp the batch to the end of the shuffled index array; with the plain bound
# start_from + N the loop raises IndexError once fewer than N indices remain.
stop_at = min(start_from + N, len(all_indices))
# Number of columns of tsl_profits, i.e. the largest valid entry point + 1.
n_bars = tsl_profits.shape[1]

tqdm.write(f'starting from {start_from}')
for i in tqdm(range(start_from, stop_at)):
    start_index = all_indices[i]
    # # (optional) optimization to not analyse similar points
    # if seen[start_index] > 0:
    #     start_from += 1
    #     continue
    pattern = create_window(df, start_index, m)
    r = searcher.search(pattern) # correlations
    above_limit_mask = np.abs(r) > r_limit
    # start points of similar windows
    similar_starts = np.where(above_limit_mask)[0]
    if len(similar_starts) > min_similar:
        # entry points for trading (after pattern ends); entries that would fall
        # past the last column of tsl_profits cannot be evaluated and are dropped
        entry_points = similar_starts + m
        entry_points = entry_points[entry_points < n_bars]
        # slice once and reuse it for both statistics
        entry_profits = tsl_profits[:, entry_points]
        profits_means = entry_profits.mean(axis=1)
        profits_max = profits_means.max()
        profits_stds = entry_profits.std(axis=1)
        results.append((start_index, profits_max, len(similar_starts), profits_means, profits_stds))
    # if there is only one point with r > r_limit then it's the start_index with r == 1
    if len(similar_starts) == 1:
        # only one match (the pattern itself), mark just this index
        seen[start_index] = 1
    else:
        # multiple matches, mark all highly correlated patterns (same threshold
        # as above, so the already computed starts are reused)
        seen[similar_starts] = 1
    # advance the resume pointer only after the index was fully processed, so an
    # interrupted run continues with the unfinished index
    start_from += 1

# Report the three patterns with the highest maximum mean profit.
for best in sorted(results, key=lambda x: x[1], reverse=True)[:3]:
    print(f'start_index = {best[0]}, max mean profit = {best[1]}, similar points = {best[2]}')
    print(f'mean tsl profits:\n{best[3]}')


In [None]:
# Number of bar positions marked as seen. ndarray.sum() is used instead of the
# builtin sum(), which iterates the whole array element by element in Python.
seen.sum()

In [None]:
# Number of patterns recorded by the search loop so far.
len(results)

In [None]:
# # Function to process a single pattern search (for parallel execution)
# def process_pattern_batch(batch_indices, df_data, template_length, threshold=0.97):
#     """
#     Process a batch of pattern searches.
#     This function will be executed in parallel processes.
    
#     Args:
#         batch_indices: List of start indices for patterns
#         df_data: DataFrame with the data (passed to avoid pickle issues)
#         template_length: Length of the pattern window
#         threshold: Correlation threshold
#     """
#     results = []
#     for start_index in batch_indices:
#         pattern = create_window(df_data, start_index, template_length)
#         results.append({
#             'start_index': start_index,
#             'similar': searcher.get_rs_above(pattern, threshold)
#         })
    
#     return results

# Parallel version of pattern search

# def parallel_pattern_search(N, df, m, threshold=0.97):
#     """
#     Parallel version of the pattern search loop.
    
#     Args:
#         N: Number of patterns to analyze
#         df: DataFrame with data
#         m: Window size
#         threshold: Correlation threshold
#         max_workers: Number of worker processes (None = auto-detect CPU count)
#     """
#     # Generate all random indices at once for reproducibility
#     random.seed(42)  # Reset seed for consistency
#     all_indices = [random.randrange(0, len(df) - m) for _ in range(N)]
    
#     # Split indices into batches for parallel processing
#     max_workers = 8
#     batch_size = max(1, N // max_workers)
#     batches = [all_indices[i:i + batch_size] for i in range(0, N, batch_size)]
    
#     correlations = []
    
#     # Use ProcessPoolExecutor for parallel execution
#     with ProcessPoolExecutor(max_workers=max_workers) as executor:
#         # Submit all batches
#         future_to_batch = {
#             executor.submit(process_pattern_batch, searcher, batch, df, m, threshold): i 
#             for i, batch in enumerate(batches)
#         }
        
#         # Collect results with progress bar
#         with tqdm(total=len(batches), desc="Processing batches") as pbar:
#             for future in as_completed(future_to_batch):
#                 batch_results = future.result()
#                 correlations.extend(batch_results)
#                 pbar.update(1)
    
#     return correlations

In [None]:
# # Run parallel version
# import time

# N = 1000

# # Measure time for parallel execution
# start_time = time.time()
# correlations_parallel = parallel_pattern_search(N, df, m, threshold=0.97)
# parallel_time = time.time() - start_time

# print(f"Parallel execution completed in {parallel_time:.2f} seconds")
# print(f"Processed {len(correlations_parallel)} patterns")

# # Verify results are consistent (optional)
# total_analyzed_parallel = sum(len(corr['similar']) for corr in correlations_parallel)
# print(f'Total analyzed points (parallel): {total_analyzed_parallel}')

In [None]:
# total_analyzed = 0
# for corr in correlations:
#     print(f'index = {corr["start_index"]}, similar patterns: {len(corr["similar"])}')
#     total_analyzed += len(corr['similar'])
# print(f'Total analyzed points: {total_analyzed}')

# TODO: test one result on the next year's data.

In [None]:
# `year` is a string, matching the first data-loading cell, so get_filename and
# download receive the same argument type everywhere in the notebook.
year = '2024'
filename = get_filename(ticker, interval, year)
if not os.path.exists(filename):
    download(ticker, interval, year)
# read data from the file on disk.
df = pd.read_parquet(filename)

# # show data.
# fig = px.line(df[-1000:], y='open', title=f'{ticker} Open Prices')
# fig.show()
total_bars = df.shape[0]
print(f'Total bars: {total_bars} ({total_bars:,})')

# Start of the window that is used as the template in the following cells.
# NOTE(review): hard-coded; presumably taken from an earlier `results` run — confirm.
start_index = 834009
# The misspelled name `temaplate` is kept because a later cell reads it.
temaplate = create_window(df, start_index, m)

fig = go.Figure()
fig.add_trace(go.Scatter(y=temaplate))
# A title so the figure is understandable on its own.
fig.update_layout(title=f'{ticker} {year} template window (start_index = {start_index}, length = {m})')
fig.show()

In [None]:
# `year` is a string, matching the first data-loading cell, so get_filename and
# download receive the same argument type everywhere in the notebook.
year = '2025'
filename = get_filename(ticker, interval, year)
if not os.path.exists(filename):
    download(ticker, interval, year)
# read data from the file on disk; this replaces the frame previously held in df.
df = pd.read_parquet(filename)

# # show data.
# fig = px.line(df[-1000:], y='open', title=f'{ticker} Open Prices')
# fig.show()
total_bars = df.shape[0]
print(f'Total bars: {total_bars} ({total_bars:,})')

In [None]:
# Rebuild the searcher over the 'open' column of the frame currently in df,
# then search it for the saved template; r holds the search result
# (correlations) and is displayed as the cell output.
searcher = PatternSearcher(
    df['open'],
    template_length=m,
)
r = searcher.search(temaplate)
r

In [None]:
# Recompute the profit/loss table for the frame currently in df, reusing the
# stop-loss levels generated earlier, and display its shape.
tsl_profits = calculate_pl(df, stop_loss_levels)
tsl_profits.shape

In [None]:
# Keep only the windows whose absolute correlation exceeds the threshold.
r_limit = 0.98
above_limit_mask = r_limit < np.abs(r)
# start indices of the windows that pass the threshold
similar_starts = np.where(above_limit_mask)[0]

# number of similar windows found
similar_starts.shape[0]

In [None]:
# entry points for trading (after pattern ends)
entry_points = similar_starts + m
# Entries that would fall past the last column of tsl_profits cannot be
# evaluated (indexing them raises IndexError), so they are dropped.
entry_points = entry_points[entry_points < tsl_profits.shape[1]]

if entry_points.size == 0:
    # Averaging an empty selection would only yield NaNs and a RuntimeWarning.
    print(f'start_index = {start_index}: no usable entry points among {len(similar_starts)} similar windows')
else:
    # slice once and reuse it for both statistics
    entry_profits = tsl_profits[:, entry_points]
    profits_means = entry_profits.mean(axis=1)
    profits_max = profits_means.max()
    profits_stds = entry_profits.std(axis=1)
    print(f'start_index = {start_index}, max mean profit = {profits_max}, similar points = {len(similar_starts)}')
    # print(f'mean tsl profits:\n{list(zip(stop_loss_percents, profits_means))}')
    for sl_pct, profit_mean in zip(stop_loss_percents, profits_means):
        print(f'  {sl_pct:.3f}: {profit_mean:.6f}')