In [None]:
import numpy as np
import plotly.graph_objects as go
from collections import defaultdict

from checkpoints import load_ckpt

# Print NumPy values in fixed-point notation (suppress scientific notation)
# with 6 digits of precision, so printed arrays are easier to compare by eye.
np.set_printoptions(suppress=True, precision=6)

In [None]:
# Load the checkpoint to analyse; load_ckpt is called here with a directory and a file name.
# NOTE(review): the .pkl.gz suffix suggests a gzipped pickle — only load checkpoints from a trusted source.
ckpt = load_ckpt('data', 'ckpt_20250819_014553.pkl.gz')
# Alternative checkpoint (single-path call form), kept for quick switching:
#ckpt = load_ckpt('data/ckpt_AAPL_5s_2024')
# Show which top-level entries the checkpoint provides.
ckpt.keys()

#### check duplicates

In [None]:
# Drop repeated starting points, keeping the first row seen for each one.

raw_results = ckpt['results_pickle']  # rows look like (starting_point, similar_count)
orig_len = len(raw_results)

# A dict keeps insertion order, so the first row per starting point wins and
# the surviving rows stay in their original relative order.
first_row_by_start = {}
for row in raw_results:
    first_row_by_start.setdefault(row[0], row)

results_filtered = list(first_row_by_start.values())
results = results_filtered

print(f"Original length: {orig_len}")
print(f"After dups are removed: {len(results)}")

#### first look into results

In [None]:
# Rank rows by similar_count (row[1]), largest first; row[0] is the starting point.
sorted_by_similar_number = list(results)
sorted_by_similar_number.sort(key=lambda row: row[1], reverse=True)
# Preview the ten rows with the highest similar_count.
sorted_by_similar_number[:10]

#### distribution of similar counts

In [None]:
import plotly.graph_objects as go
import plotly.express as px
from collections import Counter

# similar_count (row[1]) of every row, and how often each distinct value occurs.
similar_numbers = [row[1] for row in results]
counter = Counter(similar_numbers)

# x = distinct similar_count values, y = number of rows having that value.
values = list(counter.keys())
frequencies = list(counter.values())

# Styling shared by the trace and by both axes.
marker_style = dict(size=8, opacity=0.7, color='blue')
grid_style = dict(showgrid=True, gridwidth=1, gridcolor='lightgray')

frequency_trace = go.Scatter(
    x=values,
    y=frequencies,
    mode='markers',
    marker=marker_style,
    name='Frequency',
)

figWidget = go.Figure()
figWidget.add_trace(frequency_trace)

# Frequencies are drawn on a log scale (yaxis_type="log").
figWidget.update_layout(
    title='Distribution of Similar Pattern Counts',
    xaxis_title='Similar Number (count of similar patterns)',
    yaxis_title='Frequency (log scale)',
    yaxis_type="log",
    showlegend=False,
    height=600,
    width=900,
)

figWidget.update_xaxes(**grid_style)
figWidget.update_yaxes(**grid_style)

figWidget.show()

# Arithmetic mean of similar_count over all rows.
mean_similar_number = sum(similar_numbers) / len(similar_numbers)
print(f"Mean value: {mean_similar_number:.2f}")

#### distribution of profits

In [None]:
# # Group data by similar_number (index 1)
# grouped_data = defaultdict(list)
# for item in results:
#     similar_num = item[1]
#     max_profit = item[1]
#     grouped_data[similar_num].append(max_profit)

# # Calculate statistics for each group
# similar_numbers = []
# min_profits = []
# max_profits = []
# mean_profits = []

# for similar_num in sorted(grouped_data.keys()):
#     profits = grouped_data[similar_num]
#     similar_numbers.append(similar_num)
#     min_profits.append(min(profits))
#     max_profits.append(max(profits))
#     mean_profits.append(np.mean(profits))

# # Create chart with Plotly
# figWidget = go.Figure()

# # Minimum points
# figWidget.add_trace(go.Scatter(
#     x=similar_numbers,
#     y=min_profits,
#     mode='markers',
#     marker=dict(color='blue', size=6, opacity=0.7),
#     name='Minimum'
# ))

# # Maximum points
# figWidget.add_trace(go.Scatter(
#     x=similar_numbers,
#     y=max_profits,
#     mode='markers',
#     marker=dict(color='green', size=6, opacity=0.7),
#     name='Maximum'
# ))

# # Mean value line
# figWidget.add_trace(go.Scatter(
#     x=similar_numbers,
#     y=mean_profits,
#     mode='lines+markers',
#     line=dict(color='red', width=2),
#     marker=dict(color='red', size=4),
#     name='Mean Value'
# ))

# figWidget.update_layout(
#     title='Range of Maximum Profits by Similar Number',
#     xaxis_title='Similar Number (count of similar patterns)',
#     yaxis_title='Maximum Profit',
#     height=600,
#     width=1000,
#     legend=dict(x=0.02, y=0.98)
# )

# figWidget.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')
# figWidget.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')

# figWidget.show()

#### export indexes with their similar counts

In [None]:
# Order rows by similar_count (row[1]) descending; rows with the same count are
# ordered by ascending starting point (row[0]) because its negation is compared
# under reverse=True.
sorted_results = list(results)
sorted_results.sort(key=lambda row: (row[1], -row[0]), reverse=True)

# Write the ranking as an integer CSV; comments='' keeps the header line free
# of the default comment prefix.
csv_options = dict(
    delimiter=',',
    fmt='%d',
    header='start_index,similar_number',
    comments='',
)
np.savetxt('data/AAPL_2024_5s_index_similar.csv', sorted_results, **csv_options)

# Print the sorted rows as an array to eyeball the ordering.
sorted_results_np = np.array(sorted_results)
print(np.array2string(sorted_results_np))

#### check the top result

In [None]:
from data_downloader import download, get_filename
from pattern_searcher import PatternSearcher
from pattern_analysis import create_window
import pandas as pd
import sys, os

ticker = 'AAPL'
interval = '5s'
year = '2024'
m = 60 # window size

print('loading prices')
# Download the price file only when it is not on disk yet, then read it.
# The path is computed once and reused for both the existence check and the read.
filename = get_filename(ticker, interval, year)
if not os.path.exists(filename):
    download(ticker, interval, year)
df = pd.read_parquet(filename)

# Re-run the search for the top-ranked starting point and compare the number
# of hits with the count stored in the checkpoint results.
start_index = sorted_results[0][0]
expected_similar_count = sorted_results[0][1]
print(f'for the data point {start_index} we expect to have {expected_similar_count} similar points')
searcher = PatternSearcher(df['open'], template_length=m)
pattern = create_window(df, start_index, m)
r = searcher.search(pattern) # correlations
# |r| is compared here, so strongly anti-correlated windows count as hits too.
# count_nonzero counts the True entries without building a filtered copy of r.
found_similar_count = int(np.count_nonzero(np.abs(r) > 0.98))
print(f'found similar: {found_similar_count}. {"match" if found_similar_count == expected_similar_count else "missmatch"}')
# Positions of the closest matches (|r| > 0.99).
indices = np.where(np.abs(r) > 0.99)[0]
print(f'{len(indices)} points with r above 0.99: {indices}')

In [None]:
# check that all r > 0.99 are present in the results
# Index the rows by starting point once, so each lookup is a dict access
# instead of a full scan of sorted_results per index.
rows_by_start = {}
for row in sorted_results:
    rows_by_start.setdefault(row[0], []).append(row)

# Same per-index output as before: the matching rows, or [] when absent.
for index in indices:
    print(rows_by_start.get(index, []))

# Explicit summary so a missing index does not have to be spotted by eye.
missing_indices = [int(index) for index in indices if index not in rows_by_start]
print(f'missing from results: {missing_indices}')

#### tSNE results

In [None]:
# Rows fed to t-SNE. The whole list is used as-is; the commented-out slice
# at the end of the line can be re-enabled to reduce the number of data points.
sorted_results_slice = sorted_results#[:1000]

In [None]:
# Extract pattern features. The window length is taken from the shared `m`
# (also used for PatternSearcher and create_window) instead of a repeated literal,
# so changing `m` in one place cannot leave this call out of sync.
# NOTE(review): extract_pattern_features, get_tsne and get_pattern_info are not
# imported in the visible part of the notebook — import them explicitly so a
# fresh-kernel Run All works.
patterns = extract_pattern_features(df, sorted_results_slice, m=m)
# Calc t-SNE (takes 35 seconds)
tsne_coords = get_tsne(patterns)
# info for the popup on mouse hovering over a point
pattern_info = get_pattern_info(sorted_results_slice)

In [None]:
from trailing_stop_loss import calculate_pl, generate_stop_loss_levels

# Inputs handed to generate_stop_loss_levels (their exact meaning is defined there).
max_sl = 1
sl_steps = 10
stop_loss_percents, stop_loss_levels = generate_stop_loss_levels(max_sl, sl_steps)
# P/L computed from the price data for the generated stop-loss levels; later
# cells select its columns with a boolean mask and average along axis 1.
tsl_profits = calculate_pl(df, stop_loss_levels)
# Display the generated stop-loss percentages.
stop_loss_percents

In [None]:
searcher = PatternSearcher(df['open'], template_length=m)
r_limit = 0.98  # correlation threshold above which a window counts as similar

# Entry points sit m positions after the start of each similar window, so the
# mask is padded with m leading False values and its last element is dropped
# to keep the length unchanged. This is the same shift print_avg_pls applies;
# the previous m - 1 padding placed entries one position earlier than both the
# comment and print_avg_pls say.
# NOTE(review): confirm that "m positions forward" is the intended entry bar.
# The padding does not depend on the pattern, so it is built once.
entry_padding = np.zeros(m, dtype=bool)

for i, result_item in enumerate(sorted_results_slice):
    start_index = result_item[0]
    pattern = create_window(df, start_index, m)
    # get similar starting points as a mask
    r = searcher.search(pattern) # correlations
    mask = r > r_limit
    # shift mask m points forward to get entry points.
    entries_mask = np.concatenate([entry_padding, mask[:-1]])
    # apply the mask on tsl_profits (columns = entry points)
    masked_profits = tsl_profits[:, entries_mask]
    # average P/L per row of tsl_profits over the selected entries
    averages = masked_profits.mean(axis=1)
    # best average and its row index; stored as a plain int, like max_pnl is
    # stored as a plain float, so pattern_info holds only built-in numbers.
    max_pnl_index = int(np.argmax(averages))
    max_pnl = averages[max_pnl_index]
    # add it to the pattern_info to use in tSNE visualization later.
    info = pattern_info[i]
    info['max_pnl'] = max_pnl.item()
    info['max_pnl_index'] = max_pnl_index
    info['sl'] = stop_loss_percents[max_pnl_index]
    info['side'] = 'long' if info['sl'] < 0 else 'short'

In [None]:
# plot tSNE
# Build the figure from the t-SNE coordinates and the per-pattern info, passing
# 'max_pnl' as the third argument (presumably the pattern_info field used to
# colour the points — confirm against visualize_clusters_tsne).
plot_data, figWidget = visualize_clusters_tsne(tsne_coords, pattern_info, 'max_pnl')
# Leaving the figure as the last expression renders it in the cell output.
figWidget

In [None]:
# Same t-SNE figure call as for 'max_pnl', this time passing 'max_pnl_index'
# (presumably the field used to colour the points — confirm).
plot_data, figWidget = visualize_clusters_tsne(tsne_coords, pattern_info, 'max_pnl_index')
# Leaving the figure as the last expression renders it in the cell output.
figWidget

In [None]:
# NOTE(review): plot_selected is not defined in the visible cells; presumably it
# plots the point currently selected on the t-SNE figure — confirm. The inline
# list appears to record starting points of interest.
plot_selected() # starting point 770975, 1259400, 1244137

In [None]:
def print_avg_pls():
    """Print averaged P/Ls for the point most recently clicked on the t-SNE plot.

    Reads the newest entry of ``click_log``, takes the starting point from the
    clicked trace's ``customdata``, marks the windows whose correlation with
    that pattern exceeds ``r_limit`` and prints the mean of ``tsl_profits``
    (one value per row) over the corresponding entry points. When nothing has
    been clicked yet, prints a hint instead.
    """
    if len(click_log) == 0:
        print('select a point on tSNE')
        return

    latest_click = click_log[-1]
    trace = latest_click['trace']
    point_position = latest_click['index']
    start_point = int(trace['customdata'][point_position][0])
    #start_point = 1244137
    window = create_window(df, start_point, m)
    # Correlation of the window with every candidate position.
    correlations = searcher.search(window)
    similar = correlations > r_limit
    print(f'mask sum: {similar.sum()}')
    # Move each hit m positions forward to obtain entry points: pad the front
    # with m False values and drop the last hit so the length stays the same.
    padding = np.array([False] * m)
    entries = np.concatenate([padding, similar[:-1]])
    # Keep only the entry columns, then average along axis 1.
    avg_pls = tsl_profits[:, entries].mean(axis=1)
    print(f'starting point: {start_point}, PLs: {avg_pls}')

# Print the averaged P/Ls for the point most recently clicked on the t-SNE plot.
print_avg_pls()