In [2]:
from datetime import timedelta, datetime
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from dtaidistance.dtw import distance_fast
from joblib import Parallel, delayed

from data_utils import prepare_formated_data
from vectorrepr.datasets.timeseries import TimeSeriesDataset


[32m2025-05-13 08:25:23.465[0m | [1mINFO    [0m | [36mvectorrepr.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/ytqiang/workspace/vectorrepr[0m


In [3]:
# Tunable constants for this cell.
POSITIVE_LABEL_THRESHOLD = 1.05  # a row is positive when its largest post-window ratio reaches this value
DTW_SUBSAMPLE_SIZE = 100000  # upper bound on the number of training rows kept for the DTW search
SUBSAMPLE_SEED = 42  # fixed seed so the DTW subsample is identical after a kernel restart

# Load the pre-computed ratio matrices. The context manager closes each .npz
# archive once the requested array has been read into memory.
with np.load("../data/interim/stock500_2023_2024_previous_increase_ratio.npz") as archive:
    train_previous_increase_ratio = archive["previous_increase_ratio"]
with np.load("../data/interim/stock500_2023_2024_post_increase_ratio.npz") as archive:
    train_post_increase_ratio = archive["post_increase_ratio"]

# Keep only the rows whose values are strictly positive in both windows.
valid_train_samples = (np.min(train_previous_increase_ratio, axis=1) > 0) & (np.min(train_post_increase_ratio, axis=1) > 0)
train_previous_increase_ratio = train_previous_increase_ratio[valid_train_samples]
train_post_increase_ratio = train_post_increase_ratio[valid_train_samples]

# Data labels: the score is the row-wise maximum of the post window, and the
# boolean label marks rows whose score reaches the threshold.
train_scores = np.max(train_post_increase_ratio, axis=1)
train_labels = train_scores >= POSITIVE_LABEL_THRESHOLD
print("Shape of train_previous_increase_ratio: ", train_previous_increase_ratio.shape)
print("Shape of train_post_increase_ratio: ", train_post_increase_ratio.shape)
print("Number of positive samples in train data: ", train_labels.sum())

# Build the NearestNeighbors index (cosine metric) over the full filtered training set.
regressor = NearestNeighbors(n_neighbors=100, metric="cosine")
regressor.fit(train_previous_increase_ratio)

# Draw a reproducible subsample (without replacement) for the DTW search.
# The size is capped so the draw cannot fail when fewer rows survive filtering.
subsample_rng = np.random.default_rng(SUBSAMPLE_SEED)
subsample_size = min(DTW_SUBSAMPLE_SIZE, train_previous_increase_ratio.shape[0])
selected_train_indices = subsample_rng.choice(train_previous_increase_ratio.shape[0], subsample_size, replace=False)
train_previous_increase_ratio2 = train_previous_increase_ratio[selected_train_indices]
train_scores2 = train_scores[selected_train_indices]
# Index the existing boolean labels so they stay aligned with train_scores2.
# (Comparing the boolean array to 1.05 again would make every label False.)
train_labels2 = train_labels[selected_train_indices]

Shape of train_previous_increase_ratio:  (108998, 30)
Shape of train_post_increase_ratio:  (108998, 5)
Number of positive samples in train data:  17986


In [17]:
# Anchor date for the prediction run and the 60-calendar-day look-back that
# bounds the data handed to the dataset.
today_date = datetime.strptime("2025-05-14", "%Y-%m-%d")
lookback_window = timedelta(days=60)
start_date = (today_date - lookback_window).strftime("%Y-%m-%d")
end_date = today_date.strftime("%Y-%m-%d")

df = prepare_formated_data(
    start_date=start_date,
    end_date=end_date,
    stock_datapath="../data/external/stock.parquet.gz",
)
print("today :", today_date, "start_date:", start_date, "lenght of data:", len(df))

# Dataset settings: 30 input steps per sample, no prediction steps, and each
# item also returns its group and time (see return_group_time).
dataset_options = dict(
    time_idx="DateIdx",
    group_ids="Ticker",
    feature_columns=["Open", "High", "Low", "Close", "Volume", "Adj Close"],
    na_handling=-1,
    input_steps=30,
    predict_steps=0,
    return_group_time=True,
)
predict_dataset = TimeSeriesDataset(df, **dataset_options)

[32m2025-05-15 08:07:32.024[0m | [1mINFO    [0m | [36mdata_utils[0m:[36mprepare_formated_data[0m:[36m24[0m - [1mData prepared: Data shape: (21126, 9)[0m
today : 2025-05-14 00:00:00 start_date: 2025-03-15 lenght of data: 21126


In [18]:
def selected_tickers(predictions, tickers, features, threshold):
    """Print every entry whose prediction is strictly greater than ``threshold``.

    Entries are printed from the highest prediction to the lowest. For each
    one, three lines are written to standard output: the ticker, its feature
    row, and its prediction.

    Args:
        predictions: NumPy array of scores.
        tickers: NumPy array of ticker identifiers aligned with ``predictions``.
        features: NumPy array whose leading axis is aligned with ``predictions``.
        threshold: Exclusive lower bound a prediction must exceed to be printed.

    Returns:
        None. The result is only printed.
    """
    # Positions of the predictions that clear the threshold.
    keep = np.nonzero(predictions > threshold)
    kept_predictions = predictions[keep]
    kept_tickers = tickers[keep]
    kept_features = features[keep]

    # Ascending argsort, flipped, walks the survivors from best to worst.
    for rank in np.flip(np.argsort(kept_predictions)):
        print(kept_tickers[rank], kept_features[rank], kept_predictions[rank], sep="\n")

In [19]:
CLOSE_COLUMN = 3  # position of "Close" in the feature_columns given to TimeSeriesDataset
KNN_NEIGHBORS = 50  # neighbours queried from the cosine index per sample
DTW_NEIGHBORS = 100  # nearest DTW neighbours whose scores are aggregated

# Time index of the prediction date, computed once outside the loop.
# NOTE(review): presumably DateIdx counts days since 2023-01-01 — confirm against prepare_formated_data.
today_time_idx = (today_date - datetime.strptime("2023-01-01", "%Y-%m-%d")).days

tickers = []
features = []
knncosine_predictions = []
dtw_predictions = []
for i in range(len(predict_dataset)):
    sample, target, group, time = predict_dataset[i]
    # Only the window whose time index matches the prediction date is scored.
    if not (time == today_time_idx):
        continue

    closes = sample[:, CLOSE_COLUMN]
    # Skip windows containing a non-positive close, in line with the
    # strictly-positive filter applied to the training rows.
    # NOTE(review): presumably na_handling=-1 fills gaps with -1, which would
    # otherwise produce meaningless ratios — confirm.
    if not (closes.min() > 0):
        continue

    # Close series of the window divided by its first close.
    previous_increase_ratio = closes / sample[0, CLOSE_COLUMN:CLOSE_COLUMN + 1]

    # Cosine KNN: median training score of the nearest neighbours.
    knncosine_neighbors = regressor.kneighbors(
        previous_increase_ratio.reshape(1, -1), n_neighbors=KNN_NEIGHBORS, return_distance=False
    )
    knncosine_median_score = np.median(train_scores[knncosine_neighbors])
    knncosine_predictions.append(knncosine_median_score)

    # DTW distance from this window to every row of the training subsample.
    distances = Parallel(n_jobs=-1)(
        delayed(distance_fast)(previous_increase_ratio, y) for y in train_previous_increase_ratio2
    )
    # Indices of the nearest DTW_NEIGHBORS samples.
    dtw_neighbors = np.argsort(distances)[:DTW_NEIGHBORS]
    dtw_predictions.append(np.median(train_scores2[dtw_neighbors]))

    tickers.append(group)
    features.append(previous_increase_ratio)


In [20]:
# Print the tickers whose KNN-cosine prediction is above 1.058.
selected_tickers(
    predictions=np.array(knncosine_predictions),
    tickers=np.array(tickers),
    features=np.array(features),
    threshold=1.058,
)

UNH
[1.         1.03295105 1.00353589 1.00286697 1.0571101  1.10569569
 1.13608565 1.14577591 1.12205654 1.11542434 1.11819565 0.8679472
 0.81293956 0.8164755  0.81796633 0.81087536 0.80015291 0.80275227
 0.78216743 0.78639142 0.76582566 0.7643731  0.7737194  0.75403288
 0.74743882 0.73690745 0.72752295 0.72391053 0.59514526 0.58870413]
1.0596205450045073


In [21]:
# Print the tickers whose DTW prediction is above 1.057.
selected_tickers(
    predictions=np.array(dtw_predictions),
    tickers=np.array(tickers),
    features=np.array(features),
    threshold=1.057,
)

UNH
[1.         1.03295105 1.00353589 1.00286697 1.0571101  1.10569569
 1.13608565 1.14577591 1.12205654 1.11542434 1.11819565 0.8679472
 0.81293956 0.8164755  0.81796633 0.81087536 0.80015291 0.80275227
 0.78216743 0.78639142 0.76582566 0.7643731  0.7737194  0.75403288
 0.74743882 0.73690745 0.72752295 0.72391053 0.59514526 0.58870413]
1.059756095266728
LLY
[1.         0.9643984  0.90221462 0.88451762 0.88758527 0.92115818
 0.8810711  0.895126   0.92194031 0.92539906 0.89816926 1.02656993
 0.99975563 1.01139059 1.01368827 1.05073209 1.08105401 1.07219331
 1.08186068 1.09866545 0.97052138 1.00659972 1.00395988 0.94732471
 0.94928014 0.91839608 0.89776592 0.92343139 0.91180859 0.87453255]
1.0581989341358868
PLTR
[1.         0.95597486 0.84631223 0.89010862 0.88416241 1.05214414
 1.01303601 1.01257869 1.05911957 1.12521446 1.06014868 1.07238424
 1.03830768 1.0747856  1.1528874  1.23247573 1.28965126 1.31103494
 1.32738715 1.354374   1.3287593  1.42115498 1.41532305 1.24482567
 1.26335057