In [None]:
# Import and load all concatenated time series into data #

import os
import ast

# Directory holding the training vectors: one '.txt' file per run, each
# containing a Python literal that is a sequence of per-feature series.
path = os.getcwd()

# Keep only regular '.txt' files. os.listdir() returns names in arbitrary
# order, so sort them to keep the row order of `data` reproducible.
vectors = sorted(f for f in os.listdir(path)
                 if os.path.isfile(os.path.join(path, f)) and f.endswith('.txt'))
print('Reading {0} vectors...'.format(len(vectors)))

data = list()

# True: concatenate every feature series into one long vector per run.
# False: testing option that keeps only the first feature series.
useAllFeatures = True

# Number of feature series concatenated per run.
featureCount = 6

for r in vectors:
    # Open the file that was actually listed. Rebuilding the name from the
    # text before the first '.' picks the wrong file for names like 'a.b.txt'.
    rPath = os.path.join(path, r)

    with open(rPath, 'r') as saveFile:
        line = saveFile.read()

    # literal_eval only accepts Python literals, so file content is never executed.
    multivec = ast.literal_eval(line)

    if useAllFeatures:
        # Concatenating the vectors for different features into one long vector.
        # As each feature has the same length within a single point, they should
        # all be scaled equally.
        vec = [value for i in range(featureCount) for value in multivec[i]]
        data.append(vec)
    else:
        # Testing option for single feature.
        data.append(multivec[0])
    
# Build the path with os.path.join so the separator is correct on every
# platform (a hard-coded backslash only works on Windows).
path = os.path.join(os.getcwd(), 'samples')

# Here we get additional samples from the samples/ subdirectory to validate clustering.
# Remove this if no samples.
# os.listdir() returns names in arbitrary order; sorting keeps `samples` and
# `sampleNames` in a stable, reproducible order.
vectors = sorted(f for f in os.listdir(path)
                 if os.path.isfile(os.path.join(path, f)) and f.endswith('.txt'))
print('Reading {0} additional samples...'.format(len(vectors)))
samples = list()
sampleNames = list()

# Number of feature series concatenated per sample.
featureCount = 6

for r in vectors:
    rPath = os.path.join(path, r)
    # Display name: the file name up to its first '.'.
    name = r.split('.', 1)[0]

    with open(rPath, 'r') as saveFile:
        line = saveFile.read()

    # literal_eval only accepts Python literals, so file content is never executed.
    multivec = ast.literal_eval(line)
    # Concatenating the vectors for different features into one long vector.
    # As each feature has the same length within a single point, they should
    # all be scaled equally.
    vec = [value for i in range(featureCount) for value in multivec[i]]
    samples.append(vec)
    sampleNames.append(name)

print(sampleNames)

In [5]:
from tslearn.clustering import TimeSeriesKMeans
# from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance, \
    TimeSeriesResampler
import numpy

In [6]:
# Resample every run to a common length n so that the training vectors and
# the validation samples all have the same number of time steps.
n = 100
X_train, scaled_samples = (
    TimeSeriesResampler(sz=n).fit_transform(series)
    for series in (data, samples)
)

# Length of the resampled series, read back from the transformed array.
sz = X_train.shape[1]

# Fixed seed, also passed to the clustering estimators as random_state.
seed = 0
numpy.random.seed(seed)

In [None]:
# K-means and graph #

import matplotlib.pyplot as plt

# Unit types shown in the figure title; presumably listed in the same order
# in which the feature series were concatenated -- confirm against the data.
troops = ["drones", "zerglings", "roaches", "hydralisks", "mutalisks", "banelings"]
dmetric = "softdtw"
clustercount = 3

km = TimeSeriesKMeans(n_clusters=clustercount,
                      metric=dmetric,
                      # metric_params={"gamma": .01},
                      verbose=False,
                      random_state=seed,
                      n_jobs=-1)
y_pred = km.fit_predict(X_train)

plt.figure(dpi=1200)

# Lay the panels out three per row, using ceil(clustercount / 3) rows.
# The previous int(clustercount / 3) + 1 reserved an empty extra row whenever
# clustercount was a multiple of three (including the default of 3).
plotColumns = 3
plotRows = (clustercount + plotColumns - 1) // plotColumns

# The title goes on the second panel (top-centre), or on the only panel when
# there is a single cluster.
titlePanel = min(1, clustercount - 1)

# Render a graph for each cluster
for yi in range(clustercount):

    # Graph layout
    plt.subplot(plotRows, plotColumns, 1 + yi)
    # Draw every series assigned to this cluster as a faint black line
    for xx in X_train[y_pred == yi]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
    # Overlay the cluster centre in red
    plt.plot(km.cluster_centers_[yi].ravel(), "r-")
    plt.xlim(0, sz)
    # One unlabelled tick per troop type, spaced evenly across the series
    plt.xticks(numpy.arange(0, sz, sz / len(troops)), [])
    plt.ylim(0, 50)
    plt.text(0.55, 0.85, 'Cluster %d' % (yi + 1),
             transform=plt.gca().transAxes)
    if yi == titlePanel:
        plt.title("Troops over game time(" + ", ".join(troops) + ")")

plt.tight_layout()
plt.savefig('cluster.png')
plt.show()

In [20]:
# Assign each additional sample to one of the fitted clusters and report the
# result, using 1-based cluster numbers to match the figure labels.
clustered_samples = km.predict(scaled_samples)
for name, cluster in zip(sampleNames, clustered_samples):
    print('"{0}" is in cluster {1}'.format(name, cluster + 1))

"base baneling bust" is in cluster 1
"muta ling bane" is in cluster 1
"roach hydra" is in cluster 1
"roach nydus" is in cluster 3
"zergling baneling" is in cluster 2
"zergling muta" is in cluster 1


In [None]:
# Demonstration used in video. Irrelevant to paper #

import matplotlib.pyplot as plt
# Stretch the first two cluster centres to a common length and draw them on
# one axis, shifted vertically so the two curves do not overlap.
demoLength = 140
demoResampler = TimeSeriesResampler(sz=demoLength)

# Both centres are passed as a one-element list of series, so the resampler
# receives the same input shape in both calls.
firstCenter = [km.cluster_centers_[0].ravel()]
a = demoResampler.fit_transform(firstCenter)[0]
a = [x + 50 for x in a]

secondCenter = [km.cluster_centers_[1].ravel()]
b = demoResampler.fit_transform(secondCenter)[0]
b = [x + 2 for x in b]

plt.figure(dpi=1200)
plt.plot(a, 'b-')
plt.plot(b, 'r-')
plt.xlim(0, demoLength)
# Keep y tick marks at 0, 50 and 100 but hide their labels; hide x ticks entirely.
plt.yticks(numpy.arange(0, 150, 50), [])
plt.xticks([])
plt.ylim(0, 120)
plt.title("After resampling")
plt.show()

In [None]:
# Finding optimal K #

from tslearn.metrics import cdist_dtw
import tslearn.clustering
# Silhouette score for each evaluated k, in evaluation order.
scores = list()
# Metric shared by the clustering and the silhouette computation.
dmetric = "softdtw"


def kmeans(clustercount):
    """Cluster X_train into `clustercount` groups and score the result.

    Fits a TimeSeriesKMeans model with the module-level `dmetric` and `seed`,
    computes the silhouette score of the resulting labels with the same
    metric, prints it, and appends it to the module-level `scores` list.

    Parameters
    ----------
    clustercount : int
        Number of clusters to fit.

    Returns
    -------
    The silhouette score that was appended to `scores`.
    """
    model = TimeSeriesKMeans(n_clusters=clustercount,
                             metric=dmetric,
                             # metric_params={"gamma": .01},
                             verbose=False,
                             random_state=seed,
                             n_jobs=-1)
    clstrs = model.fit_predict(X_train)

    # Score with the same metric the model was fitted with, so changing
    # `dmetric` cannot leave the two out of sync.
    sil = tslearn.clustering.silhouette_score(X_train, clstrs, metric=dmetric, n_jobs=-1)
    print("(k, sil) = ({0}, {1:.4f})".format(clustercount, sil))
    # list.append mutates the existing list, so no `global` statement is needed.
    scores.append(sil)
    return sil


# Evaluate k = 2 .. 8 (the upper bound of range is exclusive).
krange = range(2, 9)
for k in krange:
    kmeans(k)

In [None]:
# Graphing k vs silscore #
import matplotlib.pyplot as plt

# Cluster counts and their silhouette scores. The scores are hard-coded here
# (presumably copied from an earlier run of the k sweep -- confirm), so this
# cell replaces whatever `scores` held before.
k = range(2, 9)
scores = [0.498, 0.510, 0.420, 0.441, 0.414, 0.400, 0.357]

print("(k, sil) = ({0}, {1:.4f})".format(2, scores[0]))

# Draw on an explicit Axes: a line through the scores plus a marker at each k.
fig = plt.figure(dpi=1200)
ax = fig.add_subplot(1, 1, 1)
ax.set_xlim(0, 9)
ax.set_ylim(0, 0.6)
ax.plot(k, scores)
ax.scatter(k, scores)

ax.set_xlabel("k")
ax.set_ylabel("silhouette")
ax.set_title("K-means Cluster Count vs Silhouette Score")
plt.show()