In [None]:
import numpy as np
import plotly.graph_objects as go
from collections import defaultdict

from checkpoints import load_ckpt

In [None]:
# Load the saved checkpoint; later cells read its 'results_pickle' entry.
# NOTE(review): the file name suggests a gzip-compressed pickle - only load
# checkpoints that come from a trusted source (confirm against load_ckpt).
ckpt = load_ckpt('data', 'ckpt_20250819_014553.pkl.gz')
# Alternative checkpoint location, kept for reference (disabled).
#ckpt = load_ckpt('data/ckpt_AAPL_5s_2024')
# Show the available keys as the cell output.
ckpt.keys()

#### check duplicates

In [None]:
# Drop repeated start indices, keeping the first record seen for each one.

results = ckpt['results_pickle']
orig_len = len(results)

# Single pass over the records in their stored order.
seen_indices = set()
results_filtered = []
for record in results:
    record_key = record[0]  # field 0 is the start index
    if record_key in seen_indices:
        continue  # a record with this start index was already kept
    seen_indices.add(record_key)
    results_filtered.append(record)

# From here on `results` refers to the de-duplicated list.
results = results_filtered

print(f"Original length: {orig_len}")
print(f"After dups are removed: {len(results)}")

#### first look into results

In [None]:
# Rank a copy of the records by their similar count (field 1), largest first,
# and show the ten best as the cell output.
sorted_by_similar_number = list(results)
sorted_by_similar_number.sort(key=lambda row: row[1], reverse=True)
sorted_by_similar_number[:10]

#### distribution of similar counts

In [None]:
import plotly.graph_objects as go
import plotly.express as px
from collections import Counter

# Similar count (field 1) of every record.
similar_numbers = [row[1] for row in results]

# How many records share each similar count.
counter = Counter(similar_numbers)

# Parallel x / y lists for the scatter plot.
values = list(counter)
frequencies = [counter[value] for value in values]

# One marker per distinct similar count.
figWidget = go.Figure(
    data=[
        go.Scatter(
            x=values,
            y=frequencies,
            mode='markers',
            marker=dict(size=8, opacity=0.7, color='blue'),
            name='Frequency',
        )
    ]
)

# Frequencies are drawn on a log axis.
figWidget.update_layout(
    title='Distribution of Similar Pattern Counts',
    xaxis_title='Similar Number (count of similar patterns)',
    yaxis_title='Frequency (log scale)',
    yaxis_type="log",
    showlegend=False,
    height=600,
    width=900,
)

# Same light grid on both axes.
for update_axes in (figWidget.update_xaxes, figWidget.update_yaxes):
    update_axes(showgrid=True, gridwidth=1, gridcolor='lightgray')

figWidget.show()

mean_similar_number = sum(similar_numbers) / len(similar_numbers)
print(f"Mean value: {mean_similar_number:.2f}")

#### distribution of profits

In [None]:
# NOTE(review): this whole cell is disabled. Before re-enabling it, check the
# `max_profit` assignment below: it reads item[1], the same field as
# `similar_num`, so each group would only ever contain its own key.
# # Group data by similar_number (index 1)
# grouped_data = defaultdict(list)
# for item in results:
#     similar_num = item[1]
#     max_profit = item[1]
#     grouped_data[similar_num].append(max_profit)

# # Calculate statistics for each group
# similar_numbers = []
# min_profits = []
# max_profits = []
# mean_profits = []

# for similar_num in sorted(grouped_data.keys()):
#     profits = grouped_data[similar_num]
#     similar_numbers.append(similar_num)
#     min_profits.append(min(profits))
#     max_profits.append(max(profits))
#     mean_profits.append(np.mean(profits))

# # Create chart with Plotly
# figWidget = go.Figure()

# # Minimum points
# figWidget.add_trace(go.Scatter(
#     x=similar_numbers,
#     y=min_profits,
#     mode='markers',
#     marker=dict(color='blue', size=6, opacity=0.7),
#     name='Minimum'
# ))

# # Maximum points
# figWidget.add_trace(go.Scatter(
#     x=similar_numbers,
#     y=max_profits,
#     mode='markers',
#     marker=dict(color='green', size=6, opacity=0.7),
#     name='Maximum'
# ))

# # Mean value line
# figWidget.add_trace(go.Scatter(
#     x=similar_numbers,
#     y=mean_profits,
#     mode='lines+markers',
#     line=dict(color='red', width=2),
#     marker=dict(color='red', size=4),
#     name='Mean Value'
# ))

# figWidget.update_layout(
#     title='Range of Maximum Profits by Similar Number',
#     xaxis_title='Similar Number (count of similar patterns)',
#     yaxis_title='Maximum Profit',
#     height=600,
#     width=1000,
#     legend=dict(x=0.02, y=0.98)
# )

# figWidget.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')
# figWidget.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')

# figWidget.show()

#### export indexes with their similar counts

In [None]:
def _export_sort_key(row):
    """Key for the export order: similar count first, negated start index second."""
    return (row[1], -row[0])

# With reverse=True this gives: highest similar count first and, within equal
# counts, the smallest start index first.
sorted_results = sorted(results, key=_export_sort_key, reverse=True)

# Write the ranking as integer CSV with a plain (un-commented) header row.
np.savetxt(
    fname='data/AAPL_2024_5s_index_similar.csv',
    X=sorted_results,
    fmt='%d',
    delimiter=',',
    header='start_index,similar_number',
    comments='',
)

# Print the ranking as an array to eyeball the ordering.
sorted_results_np = np.array(sorted_results)
print(np.array2string(sorted_results_np))

#### check the top result

In [None]:
from data_downloader import download, get_filename
from pattern_searcher import PatternSearcher
from pattern_analysis import create_window
import pandas as pd
import sys, os

# Dataset selection for the sanity check of the best-ranked pattern.
ticker, interval, year = 'AAPL', '5s', '2024'
m = 60 # window size

print('loading prices')
# Fetch the price file only when it is not on disk yet.
filename = get_filename(ticker, interval, year)
price_file_missing = not os.path.exists(filename)
if price_file_missing:
    download(ticker, interval, year)
# Resolve the path again and read the prices from disk.
filename = get_filename(ticker, interval, year)
df = pd.read_parquet(filename)

# First row of the ranking: its start index and the similar count stored for it.
top_row = sorted_results[0]
start_index = top_row[0]
expected_similar_count = top_row[1]
print(f'for the data point {start_index} we expect to have {expected_similar_count} similar points')

# Re-run the search for that window and count |r| above 0.98.
searcher = PatternSearcher(df['open'], template_length=m)
pattern = create_window(df, start_index, m)
r = searcher.search(pattern) # correlations
abs_r = np.abs(r)
found_similar_count = len(r[abs_r > 0.98])
verdict = "match" if found_similar_count == expected_similar_count else "missmatch"
print(f'found similar: {found_similar_count}. {verdict}')

# Positions whose |r| exceeds the stricter 0.99 threshold.
indices = np.where(abs_r > 0.99)[0]
print(f'{len(indices)} points with r above 0.99: {indices}')

In [None]:
# For every position with |r| > 0.99, print the ranking rows that start there
# (an empty list means the position is missing from the results).
for strong_index in indices:
    matching_rows = [entry for entry in sorted_results if entry[0] == strong_index]
    print(matching_rows)

#### tSNE results

In [None]:
# Click history for the t-SNE plot: handle_click appends one record per clicked
# point and plot_selected reads the last one. Re-running this cell clears it.
click_log = []
# !pip install scikit-learn pandas numpy plotly
import numpy as np
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go
import plotly.express as px
from collections import defaultdict
import pandas as pd

def extract_pattern_features(df, sorted_results, m=60):
    """
    Build one normalised feature vector per row of ``sorted_results``.

    Each feature vector is the window of ``df['open']`` that starts at the
    row's start index and spans ``m`` positions, shifted to zero mean and
    scaled to unit standard deviation.

    Parameters
    ----------
    df : table-like
        Must provide an ``'open'`` column supporting ``.iloc`` slicing.
    sorted_results : sequence
        Rows whose item 0 is the window start index and item 1 the similar
        count.
    m : int, default 60
        Window length.

    Returns
    -------
    tuple
        ``(patterns, pattern_info)``: ``patterns`` is ``np.array`` of the
        normalised windows; ``pattern_info`` is a list of dicts with the keys
        ``'start_index'``, ``'similar_count'`` and ``'original_rank'`` (the
        row's position in ``sorted_results``).
    """
    patterns = []
    pattern_info = []

    print(f"Extracting features for top {len(sorted_results)} patterns...")

    open_prices = df['open']  # column lookup hoisted out of the loop

    for rank, row in enumerate(sorted_results):
        start_index = row[0]
        similar_count = row[1]

        # Extract the pattern window
        pattern = open_prices.iloc[start_index:start_index + m].values

        # Normalize pattern (mean=0, std=1) for better comparison.
        centered = pattern - pattern.mean()
        spread = pattern.std()
        if spread > 0:
            pattern_normalized = centered / spread
        else:
            # A flat window has zero spread; dividing by it would fill the
            # row with NaN. Keep the centred values (all zeros) instead.
            pattern_normalized = centered

        patterns.append(pattern_normalized)
        pattern_info.append({
            'start_index': start_index,
            'similar_count': similar_count,
            'original_rank': rank
        })

    return np.array(patterns), pattern_info

def handle_click(trace, points, state):
    """
    Click callback: append one record per clicked point to ``click_log``.

    Every record stores the point's position in the trace, its x / y values,
    the trace object itself and the point's customdata entry (``None`` when
    the trace has no customdata). ``state`` is accepted but not used.
    """
    for point_idx in points.point_inds:
        if trace.customdata is None:
            custom_entry = None
        else:
            custom_entry = trace.customdata[point_idx]
        click_log.append({
            "index": point_idx,
            "x": trace.x[point_idx],
            "y": trace.y[point_idx],
            "trace": trace,
            "custom": custom_entry,
        })

def get_tsne(patterns):
    """
    Project ``patterns`` to two dimensions with t-SNE and return the coordinates.

    The perplexity is 30, lowered to ``len(patterns) - 1`` when fewer patterns
    are supplied; the random state is fixed at 42.
    """
    print("Computing t-SNE embedding...")
    perplexity = min(30, len(patterns)-1)
    reducer = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    return reducer.fit_transform(patterns)

def visualize_clusters_tsne(tsne_coords, pattern_info):
    """
    Build an interactive scatter plot of the t-SNE embedding.

    Points are coloured by similar count; the hover data carries the start
    index, similar count and rank. ``handle_click`` is registered as the click
    callback of the scatter trace.

    Returns ``(plot_data, figWidget)``: the DataFrame behind the plot and the
    ``go.FigureWidget`` that displays it.
    """
    # Assemble the plot table column by column: coordinates first, then the
    # per-pattern metadata.
    columns = {
        'x': tsne_coords[:, 0],
        'y': tsne_coords[:, 1],
    }
    metadata_columns = (
        ('start_index', 'start_index'),
        ('similar_count', 'similar_count'),
        ('rank', 'original_rank'),
    )
    for column_name, info_key in metadata_columns:
        columns[column_name] = [info[info_key] for info in pattern_info]
    plot_data = pd.DataFrame(columns)

    # Scatter coloured by similar count (width left at the default).
    fig_px = px.scatter(
        plot_data,
        x='x',
        y='y',
        color='similar_count',
        hover_data=['start_index', 'similar_count', 'rank'],
        height=400,
    )
    fig_px.update_traces(marker=dict(size=8, opacity=0.7))
    fig_px.update_layout(margin=dict(l=20, r=20, t=20, b=20))

    # Wrap the figure in a widget so the first trace can report clicks.
    figWidget = go.FigureWidget(fig_px)
    figWidget.data[0].on_click(handle_click)
    return plot_data, figWidget

def plot_window(starting_point):
    """Plot the window returned by ``create_window(df, starting_point, m)`` as a line chart."""
    window = create_window(df, starting_point, m)
    fig = go.Figure(data=[go.Scatter(y=window)])
    fig.update_layout(height=400, margin=dict(l=20, r=20, t=20, b=20))
    fig.show()

def plot_selected():
    """
    Plot the window behind the most recently clicked t-SNE point.

    Reads the last entry of ``click_log``; prints a hint when nothing has been
    clicked yet.
    """
    if not click_log:
        print('select a point on tSNE')
        return

    latest = click_log[-1]
    point_custom = latest['trace']['customdata'][latest['index']]
    # customdata entry layout: start index, similar count, rank.
    selected_starting_point = int(point_custom[0])
    selected_similar_count = int(point_custom[1])
    selected_rank = int(point_custom[2])

    plot_window(selected_starting_point)
    print(f'starting point: {selected_starting_point}, similar count: {selected_similar_count}, rank: {selected_rank}')

In [None]:
# Feature vectors for the first 1000 rows of the ranking.
top_ranked_rows = sorted_results[:1000]
patterns, pattern_info = extract_pattern_features(df, top_ranked_rows, m=60)

# 2-D embedding of those vectors (takes 35 seconds).
tsne_coords = get_tsne(patterns)

In [None]:
# Plot the t-SNE embedding; the widget is the cell output, and clicking a point
# records it via the click callback registered in visualize_clusters_tsne.
plot_data, figWidget = visualize_clusters_tsne(tsne_coords, pattern_info)
figWidget

In [None]:
# Run after clicking a point in the t-SNE plot above to draw its window.
plot_selected() # starting point 770975, 1259400 (presumably points inspected earlier - confirm)