In [10]:
import numpy as np
from scipy.stats import pearsonr
from sklearn.neighbors import BallTree

# 生成时间序列数据
np.random.seed(0)
X = np.random.randn(100000, 20)
y = np.random.randn(20)

# 计算pearson相关系数矩阵
corr_matrix = np.zeros((X.shape[0],))
for i in range(X.shape[0]):
    corr, _ = pearsonr(X[i], y)
    corr_matrix[i] = corr

# 构建ball tree
tree = BallTree(X, metric='euclidean')


In [11]:

# 查询相似的时间序列
k = 5  # 返回5个最相似的时间序列
distances, indices = tree.query(y.reshape(1, -1), k=k, return_distance=True)

# 打印结果
print("相似度最高的{}个序列：".format(k))
for i in range(k):
    idx = indices[0][i]
    corr = corr_matrix[idx]
    print("序列{}，pearson相关系数为{}，欧氏距离为{}".format(idx, corr, distances[0][i]))

相似度最高的5个序列：
序列21380，pearson相关系数为0.7153313588959008，欧氏距离为2.8319087341686444
序列63124，pearson相关系数为0.7345123265226512，欧氏距离为2.8538328229663183
序列75469，pearson相关系数为0.7117811811682344，欧氏距离为2.8609330662226364
序列42154，pearson相关系数为0.717717381060927，欧氏距离为3.0341871779769183
序列22683，pearson相关系数为0.683495183014961，欧氏距离为3.0910342381034983


In [41]:
def ball_tree(X, y):
    # 计算pearson相关系数矩阵
    corr_matrix = np.zeros((X.shape[0],))
    for i in range(X.shape[0]):
        corr, _ = pearsonr(X[i], y)
        corr_matrix[i] = corr

    # 构建ball tree
    tree = BallTree(X, metric='euclidean')
    return corr_matrix, tree

In [15]:
def normal_by_row(data):

    # 计算每列的均值和标准差
    mean = np.mean(data, axis=1, keepdims=True)
    std = np.std(data, axis=1, keepdims=True)

    # 标准化数据
    data_standardized = (data - mean) / std
    return data_standardized


In [16]:
x_train = np.load("sh510050_train.npy")

In [18]:
x_train_normal = normal_by_row(x_train)

In [22]:
x_test = np.load("sh510050_test.npy")
x_test_normal = normal_by_row(x_test)

In [46]:
cor_mat, tree = ball_tree(x_train, x_test[1000])

In [54]:
len(cor_mat[cor_mat>0.85]) 

0

In [57]:
for xt in x_test[1000:]:
    cor_mat, tree = ball_tree(x_train, xt)
    print(len(cor_mat[cor_mat>0.85]) )

0
0
3
6
7
8
9
10
11
11
12
13
14
15
16
18
19
20
26
29
31
33
33
34
34
34
34
35
35
35


KeyboardInterrupt: 

In [49]:
k = 50  # 返回5个最相似的时间序列
distances, indices = tree.query(x_test[1000].reshape(1, -1), k=k, return_distance=True)

# 打印结果
print("相似度最高的{}个序列：".format(k))
for i in range(k):
    idx = indices[0][i]
    corr = corr_matrix[idx]
    print("序列{}，pearson相关系数为{}，欧氏距离为{}".format(idx, corr, distances[0][i]))

相似度最高的50个序列：
序列18151，pearson相关系数为-0.12683094472925563，欧氏距离为0.8126971145512951
序列18152，pearson相关系数为0.1018752286736739，欧氏距离为0.8127115847580865
序列18150，pearson相关系数为0.18122574946656028，欧氏距离为0.8132695494115104
序列18153，pearson相关系数为-0.212663581968511，欧氏距离为0.8134253991608582
序列18154，pearson相关系数为-0.3648515766318374，欧氏距离为0.814441428219366
序列18149，pearson相关系数为0.26421210310802595，欧氏距离为0.8146611074551192
序列18155，pearson相关系数为-0.007955179394242198，欧氏距离为0.8161035473516832
序列18148，pearson相关系数为0.10006624443073732，欧氏距离为0.8163975502168052
序列18156，pearson相关系数为0.06290927586801941，欧氏距离为0.8178740000758069
序列18147，pearson相关系数为-0.14263921580226804，欧氏距离为0.8187453083835056
序列18157，pearson相关系数为0.019447762145249403，欧氏距离为0.8197299311358596
序列18158，pearson相关系数为0.07025435938304901，欧氏距离为0.8214900121121386
序列18146，pearson相关系数为-0.12626373087649567，欧氏距离为0.8215905793033419
序列18159，pearson相关系数为-0.09040561727255804，欧氏距离为0.8234576613281348
序列18145，pearson相关系数为-0.43483280023907495，欧氏距离为0.8248977876076535
序列18160，pearson相关系数为0.

array([2.5682, 2.567 , 2.5656, 2.5638, 2.5632, 2.5638, 2.5638, 2.564 ,
       2.5642, 2.564 , 2.5684, 2.5744, 2.5822, 2.59  , 2.598 , 2.6018,
       2.6056, 2.609 , 2.6114, 2.6134, 2.6154, 2.615 , 2.6142, 2.6154,
       2.6156, 2.6166, 2.6184, 2.6194, 2.6196, 2.6214, 2.6226, 2.6236,
       2.6252, 2.6254, 2.6262, 2.6264, 2.6274, 2.628 , 2.6302, 2.6312,
       2.6324, 2.6334, 2.6338, 2.6332, 2.6322, 2.6296, 2.6274, 2.6262,
       2.6252, 2.6256, 2.6262, 2.6274, 2.628 , 2.6284, 2.6278, 2.628 ,
       2.6268, 2.6258, 2.6244, 2.6216, 2.6172, 2.6144, 2.611 , 2.6078,
       2.6066, 2.6078, 2.608 , 2.608 , 2.6084, 2.6084, 2.6082, 2.6082,
       2.6092, 2.6106, 2.6114, 2.6118, 2.6122, 2.6126, 2.6124, 2.6128,
       2.6138, 2.6144, 2.6142, 2.6158, 2.6166, 2.6174, 2.6186, 2.6198,
       2.62  , 2.6208, 2.6212, 2.6214, 2.6224, 2.623 , 2.6228, 2.6224,
       2.622 , 2.6216, 2.6218, 2.6218, 2.6222, 2.6226, 2.6228, 2.6228,
       2.6234, 2.6236, 2.6258, 2.627 , 2.6274, 2.628 , 2.6278, 2.6256,
      

In [25]:
X = np.random.randn(10, 20)

In [32]:
X[1]

array([ 0.32109835, -0.72023966,  0.79705145,  0.96982949, -0.05928517,
       -0.26239031,  0.01473281,  0.01069014,  0.26606428,  0.63608734,
       -1.77592115,  1.3669944 , -0.39379399, -0.49758993,  1.06441791,
       -0.09557093,  0.34037208,  0.27619099, -1.71726505,  0.67911003])