In [1]:
from datetime import timedelta, datetime
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from dtaidistance.dtw import distance_fast
from joblib import Parallel, delayed

from data_utils import prepare_formated_data
from vectorrepr.datasets.timeseries import TimeSeriesDataset


[32m2025-05-10 17:27:22.336[0m | [1mINFO    [0m | [36mvectorrepr.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/ytqiang/workspace/vectorrepr[0m


In [10]:
train_previous_increase_ratio = np.load("../data/interim/stock500_2023_2024_previous_increase_ratio.npz")["previous_increase_ratio"]
train_post_increase_ratio = np.load("../data/interim/stock500_2023_2024_post_increase_ratio.npz")["post_increase_ratio"]
valid_train_samples = (np.min(train_previous_increase_ratio, axis=1) > 0) & (np.min(train_post_increase_ratio, axis=1) > 0)
train_previous_increase_ratio = train_previous_increase_ratio[valid_train_samples]
train_post_increase_ratio = train_post_increase_ratio[valid_train_samples]
# 数据标签
train_scores = np.max(train_post_increase_ratio, axis=1)
train_labels = np.max(train_post_increase_ratio, axis=1) >= 1.05
print("Shape of train_previous_increase_ratio: ", train_previous_increase_ratio.shape)
print("Shape of train_post_increase_ratio: ", train_post_increase_ratio.shape)
print("Number of positive samples in train data: ", train_labels.sum())
# 构建NearestNeighbors
regressor = NearestNeighbors(n_neighbors=100, metric=f"cosine")
regressor.fit(train_previous_increase_ratio)

selected_train_indices = np.random.choice(train_previous_increase_ratio.shape[0], 100000, replace=False)
train_previous_increase_ratio2 = train_previous_increase_ratio[selected_train_indices]
train_scores2 = train_scores[selected_train_indices]
train_labels2 = train_labels >= 1.05

Shape of train_previous_increase_ratio:  (108998, 30)
Shape of train_post_increase_ratio:  (108998, 5)
Number of positive samples in train data:  17986


In [42]:
today_date = datetime.strptime("2025-05-08", "%Y-%m-%d")
start_date = (today_date - timedelta(days=60)).strftime("%Y-%m-%d")
df = prepare_formated_data(start_date=start_date, end_date=today_date.strftime("%Y-%m-%d"), stock_datapath="../data/external/stock.parquet.gz")
print("today :", today_date, "start_date:", start_date, "lenght of data:", len(df))
predict_dataset = TimeSeriesDataset(
    df,
    time_idx="DateIdx",
    group_ids="Ticker",
    feature_columns=["Open", "High", "Low", "Close", "Volume", "Adj Close"],
    na_handling=-1,
    input_steps=30,
    predict_steps=0,
    return_group_time=True
)

[32m2025-05-11 11:27:23.116[0m | [1mINFO    [0m | [36mdata_utils[0m:[36mprepare_formated_data[0m:[36m24[0m - [1mData prepared: Data shape: (21629, 9)[0m
today : 2025-05-08 00:00:00 start_date: 2025-03-09 lenght of data: 21629


In [43]:
def selected_tickers(predictions, tickers, features, threshold):
    selected_indices = np.where(predictions > threshold)
    predictions = predictions[selected_indices]
    tickers = tickers[selected_indices]
    features = features[selected_indices]

    sorted_index = np.argsort(predictions)[::-1]
    for idx in sorted_index:
        print(tickers[idx])
        print(features[idx])
        print(predictions[idx])

In [44]:
tickers = []
features = []
knncosine_predictions = []
dtw_predictions = []
for i in range(len(predict_dataset)):
    sample, target, group, time = predict_dataset[i]
    if time ==  (today_date - datetime.strptime("2023-01-01", "%Y-%m-%d")).days:
        previous_increase_ratio = sample[:, 3]/sample[0, 3:4]
        knncosine_neighbors = regressor.kneighbors(previous_increase_ratio.reshape(1, -1), n_neighbors=50,return_distance=False)
        knncosine_median_score = np.median(train_scores[knncosine_neighbors])
        knncosine_predictions.append(knncosine_median_score)

        # distances = Parallel(n_jobs=-1)(
        #     delayed(distance_fast)(previous_increase_ratio, y) for y in train_previous_increase_ratio2
        # )
        # # 获取最近的100个样本索引
        # dtw_neighbors = np.argsort(distances)[:100]
        # dtw_predictions.append(np.median(train_scores2[dtw_neighbors]))

        tickers.append(group)
        features.append(previous_increase_ratio)


In [45]:
selected_tickers(np.array(knncosine_predictions), np.array(tickers), np.array(features), 1.058)

WDC
[  1.         -40.61999893 -40.43000031 -40.88999939 -41.77999878
 -34.15000153 -30.54000092 -31.15999985 -31.54999924 -36.29999924
 -34.86999893 -34.40000153 -35.81999969 -35.93000031 -35.61999893
 -36.50999832 -35.95999908 -36.68000031 -37.70000076 -40.16999817
 -40.77999878 -40.90999985 -40.61999893 -43.86000061 -43.95000076
 -44.68999863 -45.02999878 -44.29000092 -44.29999924 -44.29999924]
1.1005000370243696


In [46]:
selected_tickers(np.array(dtw_predictions), np.array(tickers), np.array(features), 1.06)