Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This is an advanced version of time series clustering. Every time series has two kinds of values: 1) time-invariant values such as features and locations, and 2) time-variant values such as prices and temperature; the time-variant values make up the time series itself. Time series and features have different characteristics and represent different aspects of a system's identity, so to cluster the series we must measure both the time series and the features. This is generally called a spatio-temporal clustering method. In this part, we use the clustering method to forecast the future time-variant values in the test set. The experimental steps are as follows. First, we use a spatio-temporal clustering technique, similar to k-means clustering, to partition the series in the training set into groups, and we find the central time series — analogous to the centroids in k-means — that represents the common pattern of each group. Second, we predict which group the future prices belong to. Then we use that group's central time series as the forecast of the future return. Finally, we use the evaluation function as our performance criterion.
- Loading branch information
Showing
1 changed file
with
168 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,168 @@ | ||
from __future__ import division | ||
import matplotlib.pylab as plt | ||
import numpy as np | ||
import random | ||
import pandas as pd | ||
from scipy import spatial | ||
from sklearn.decomposition import PCA | ||
from sklearn import preprocessing | ||
|
||
class ts_cluster(object):
    '''
    Spatio-temporal k-means-style clustering of time series whose rows hold
    time-invariant features followed by a time-variant value sequence.

    Distance between two rows is sqrt(Euclidean(features) + DTW(values));
    LB_Keogh provides a cheap lower bound used to prune full distance
    computations.
    '''

    def __init__(self, num_clust=100):
        '''
        num_clust   -- number of clusters for the k-means algorithm
        assignments -- maps cluster index -> list of member data-point indices
        centroids   -- central series of each cluster
        '''
        self.num_clust = num_clust
        self.assignments = {}
        self.centroids = []

    def compa_clust(self, s1, centroid, w):
        '''
        Assign every series in s1 to its nearest centroid and return, for each
        series, the forecast portion of that centroid.

        s1       -- iterable of rows (features followed by values)
        centroid -- 2-D array of centroids; columns [:5] are matched against,
                    columns [6:] are emitted as the forecast
                    NOTE(review): [:5] vs [6:] skips column 5 and does not match
                    the 2-feature split used in SpatioTemporalDis -- confirm the
                    intended column layout.
        w        -- DTW locality window

        Returns an ndarray with one forecast list per input series.
        '''
        centroid_part = centroid[:, :5]
        self.assign = pd.Series([[] for _ in range(len(centroid))],
                                index=np.arange(len(centroid)))

        for ind, series in enumerate(s1):
            min_dist = float('inf')
            closest_clust = None
            for c_ind, cent in enumerate(centroid_part):
                # LB_Keogh is a cheap lower bound on the full distance: only
                # evaluate SpatioTemporalDis when the bound could beat min_dist.
                if self.LB_Keogh(series, cent, 5) < min_dist:
                    cur_dist = self.SpatioTemporalDis(series, cent, w)
                    if cur_dist < min_dist:
                        min_dist = cur_dist
                        closest_clust = c_ind

            self.assign[closest_clust].append(ind)

        print(self.assign)
        # For every input series emit the forecast columns of its centroid.
        self.s2 = pd.Series([[] for _ in range(len(s1))],
                            index=np.arange(len(s1)))
        for key in self.assign.index:
            for k in self.assign[key]:
                self.s2[k] = centroid[key, 6:].tolist()
        self.s2 = np.array(self.s2)
        return self.s2

    def k_means_clust(self, data, num_iter, w, progress=True):
        '''
        k-means clustering for time-series rows (features + values), using the
        spatio-temporal distance as the similarity measure and LB_Keogh to
        prune DTW evaluations.

        data     -- sequence of rows; each row is features followed by values
                    (rows must support elementwise '+', e.g. numpy arrays)
        num_iter -- number of k-means iterations
        w        -- DTW locality window
        progress -- print the iteration number when True
        '''
        self.centroids = random.sample(data, self.num_clust)
        print(len(self.centroids))
        for n in range(num_iter):
            if progress:
                print('iteration ' + str(n + 1))

            # assign data points to clusters
            self.assignments = {}
            for ind, series in enumerate(data):
                min_dist = float('inf')
                closest_clust = None
                for c_ind, cent in enumerate(self.centroids):
                    if self.LB_Keogh(series, cent, 5) < min_dist:
                        cur_dist = self.SpatioTemporalDis(series, cent, w)
                        if cur_dist < min_dist:
                            min_dist = cur_dist
                            closest_clust = c_ind

                # BUG FIX: the original seeded a new cluster with an empty
                # list, silently dropping its first member; include the
                # current index when the cluster is first created.
                if closest_clust in self.assignments:
                    self.assignments[closest_clust].append(ind)
                else:
                    self.assignments[closest_clust] = [ind]

            print(len(self.assignments))
            # recalculate centroids of clusters as the mean of their members
            for key in self.assignments:
                clust_sum = 0
                for k in self.assignments[key]:
                    clust_sum = clust_sum + data[k]
                self.centroids[key] = [m / len(self.assignments[key])
                                       for m in clust_sum]

    def get_centroids(self):
        '''Return the list of current cluster centroids.'''
        return self.centroids

    def get_assignments(self):
        '''Return the mapping of cluster index -> member data-point indices.'''
        return self.assignments

    def plot_centroids(self):
        '''Plot every centroid series on one figure and show it.'''
        for c in self.centroids:
            plt.plot(c)
        plt.show()

    def SpatioTemporalDis(self, s1, s2, w):
        '''
        Combined distance: Euclidean on the first two (time-invariant) entries
        plus DTW on the remaining (time-variant) entries, square-rooted.

        NOTE(review): DTWDistance returns a squared cost while InvarDistance
        returns a non-squared Euclidean distance, so the two terms are on
        different scales -- confirm this mixing is intended.
        '''
        f1, f2 = s1[0:2], s2[0:2]
        v1, v2 = s1[2:], s2[2:]
        disInvari = self.InvarDistance(f1, f2)
        disVari = self.DTWDistance(v1, v2, w)

        return np.sqrt(disInvari + disVari)

    def InvarDistance(self, f1, f2):
        '''Euclidean distance between two time-invariant feature vectors.'''
        return spatial.distance.euclidean(f1, f2)

    def DTWDistance(self, s1, s2, w=None):
        '''
        Dynamic-time-warping cost between two sequences, with an optional
        locality-constraint window w. Returns the accumulated squared cost
        (no final square root is taken).
        '''
        DTW = {}

        if w:
            # Window must at least bridge the length difference.
            w = max(w, abs(len(s1) - len(s2)))
            for i in range(-1, len(s1)):
                for j in range(-1, len(s2)):
                    DTW[(i, j)] = float('inf')
        else:
            for i in range(len(s1)):
                DTW[(i, -1)] = float('inf')
            for i in range(len(s2)):
                DTW[(-1, i)] = float('inf')

        DTW[(-1, -1)] = 0

        for i in range(len(s1)):
            if w:
                # NOTE(review): upper bound i+w (not i+w+1) makes the window
                # asymmetric -- confirm this matches the intended constraint.
                for j in range(max(0, i - w), min(len(s2), i + w)):
                    dist = (s1[i] - s2[j]) ** 2
                    DTW[(i, j)] = dist + min(DTW[(i - 1, j)],
                                             DTW[(i, j - 1)],
                                             DTW[(i - 1, j - 1)])
            else:
                for j in range(len(s2)):
                    dist = (s1[i] - s2[j]) ** 2
                    DTW[(i, j)] = dist + min(DTW[(i - 1, j)],
                                             DTW[(i, j - 1)],
                                             DTW[(i - 1, j - 1)])

        return (DTW[len(s1) - 1, len(s2) - 1])

    def LB_Keogh(self, s1, s2, r):
        '''
        LB_Keogh lower bound to dynamic time warping: linear complexity
        compared to the quadratic complexity of DTW. r is the envelope reach.
        '''
        LB_sum = 0
        for ind, val in enumerate(s1):
            # Envelope of s2 around position ind.
            # NOTE(review): slice end ind+r (not ind+r+1) excludes the r-th
            # neighbour on the right -- confirm intended.
            lower_bound = min(s2[(ind - r if ind - r >= 0 else 0):(ind + r)])
            upper_bound = max(s2[(ind - r if ind - r >= 0 else 0):(ind + r)])

            if val > upper_bound:
                LB_sum = LB_sum + (val - upper_bound) ** 2
            elif val < lower_bound:
                LB_sum = LB_sum + (val - lower_bound) ** 2

        return np.sqrt(LB_sum)