In [1]:

import random
from copy import deepcopy

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import interpolate
from scipy.stats import spearmanr
from sklearn.preprocessing import robust_scale

In [2]:
data = pd.read_csv('data/1.data.csv', index_col=0)

In [3]:
data.head(25)

Unnamed: 0,108221-10193841,108221-10194165,108221-10194399,108221-10194849,108221-10195021,108221-10195045,108221-10196465,108221-10197027,108221-10198179,108221-10198181,...,406709-10198044,406709-10199332,406709-10202755,406709-10276692,406709-10300859,406709-10321261,406709-10369694,406709-10369695,406709-10403427,406709-10431975
2017-01-01,,,,,,,,,,,...,,,,,,,,,,
2017-01-02,,,,,,,,,,,...,,,,,,,,,,
2017-01-03,,,,,,,,,,,...,,,,,,,,,,
2017-01-04,,,,,,,,,,,...,,,,,,,,,,
2017-01-05,,,,,,,,,,,...,,,,,,,,,,
2017-01-06,,,,,,,,,,,...,,,,,,,,,,
2017-01-07,,,,,,,,,,,...,,,,,,,,,,
2017-01-08,,,,,,3.0,,1.0,,,...,,,,,,,,,,
2017-01-09,,,,,,,,,,,...,,,,,,,,,,
2017-01-10,,,,,,,,,,9.0,...,,,,,,,,,,


In [4]:
tempDictionary = {}
for column in list(data):
    tempArray = tuple([str(value) for value in data[column].values])
    
    if tempArray not in tempDictionary.keys():
        tempDictionary[tempArray] = [column]
    
    else:
        tempDictionary[tempArray].append(column)

for key, value in tempDictionary.items():
    
    if len(value) > 1:
        data.drop(value, axis = 1, inplace = True)
        
        value = [subValue.split('|') for subValue in value]
        print(len(value), value)
        
        value = [item for subValue in value for item in subValue]
        data['|'.join(sorted(list(set(value))))] = [float(value) for value in key]
        
        print(data.values.shape)

2 [['108568-10441280'], ['243038-10441280']]
(1087, 3228)
2 [['255386-10441280'], ['363247-10441280']]
(1087, 3227)
6 [['266558-10441280'], ['266571-10441280'], ['266985-10441280'], ['333536-10441280'], ['361279-10441280'], ['361291-10441280']]
(1087, 3222)
2 [['266571-10194399'], ['266571-10275149']]
(1087, 3221)
2 [['273751-10441280'], ['273753-10441280']]
(1087, 3220)
3 [['273775-10194399'], ['273775-10197027'], ['273775-10275149']]
(1087, 3218)
2 [['299860-10393842'], ['330303-10393842']]
(1087, 3217)
2 [['313611-10441280'], ['328999-10441280']]
(1087, 3216)
2 [['332887-10393842'], ['332922-10393842']]
(1087, 3215)
2 [['332920-10393842'], ['355193-10393842']]
(1087, 3214)
2 [['333534-10393842'], ['340907-10393842']]
(1087, 3213)
2 [['351273-10441280'], ['385075-10441280']]
(1087, 3212)
2 [['364995-10441280'], ['370544-10441280']]
(1087, 3211)
2 [['366244-10193841'], ['366244-10200452']]
(1087, 3210)
2 [['374518-10194165'], ['378627-10275149']]
(1087, 3209)
2 [['374518-10194399'], [

In [5]:
columnsToDelete = []
for column in list(data):
    tempArray = [value for value in data[column].values if np.isfinite(value)]
    
    if len(tempArray) == 1:
        columnsToDelete.append(column)

print(len(columnsToDelete))
data.drop(columnsToDelete, axis = 1, inplace = True)

138


In [6]:
ewmParameter = 1

In [7]:
trajectoriesSet, trajectoriesSmoothOriginal, trajectoriesRaw = {}, {}, {}
for column in list(data):
    
    if column == 'index':
        continue
    
    tempDF = data[[column]]
    
    if len(list(set([value for value in tempDF[column].values if np.isfinite(value)]))) == 1:
        continue
    
    tempDF.index = pd.to_datetime(tempDF.index, yearfirst = True)
    
    tempDF = tempDF.resample('W').agg(np.nansum)
    tempDF.replace(0, np.nan, inplace = True)
    
    firstIndex = tempDF.first_valid_index()
    lastIndex = tempDF.last_valid_index()
    tempDFCut = tempDF.loc[firstIndex: lastIndex]
    
    tempDFCut = tempDFCut.ewm(span = np.max([1, len(tempDFCut.index) * ewmParameter])).mean()
    tempDF = tempDF.ewm(span = np.max([1, len(tempDFCut.index) * ewmParameter])).mean()
    
    trajectoriesSet[column] = [list(robust_scale(tempDFCut[column].values))]
    trajectoriesSmoothOriginal[column] = list(tempDF[column].values)
    trajectoriesRaw[column] = list(data[column].fillna(0))

In [8]:
maxLength = max([len(value[0]) for _, value in trajectoriesSet.items()])

In [9]:
trajectoriesSetProcessed = {}
for key, value in trajectoriesSet.items():
    value = value[0]
    
    if len(value) == maxLength:
        trajectoriesSetProcessed[(key,)] = [value]
        continue
    
    oldScale = np.arange(0, maxLength, maxLength / len(value))
    
    oldScale = oldScale[: min([len(oldScale), len(value)])]
    value = value[: min([len(oldScale), len(value)])]
    
    try:
        interpolationFunction = interpolate.interp1d(oldScale, value)
    
    except:
        print(value)
        continue
    
    cutOff = 0
    while True:
        newScale = np.linspace(0, maxLength - cutOff, maxLength)
        
        try:
            value = interpolationFunction(newScale)
            break
        
        except:
            cutOff += 1
    
    trajectoriesSetProcessed[(key,)] = [value]

[0.0]
[0.0]


In [10]:
trajectories = deepcopy(trajectoriesSetProcessed)
trajectoriesValues = np.array([value[0] for value in list(trajectories.values())])
trajectoriesKeys = list(trajectoriesSetProcessed.keys())

In [11]:
dm, _ = spearmanr(trajectoriesValues, axis = 1)
dm = -dm + 1

In [12]:
distanceMatrixDictionary = {}
for index1, filter1 in enumerate(trajectoriesKeys):
    for index2, filter2 in enumerate(trajectoriesKeys):
        unionFilter = filter1 + filter2
        sorted(unionFilter)
        
        if unionFilter not in distanceMatrixDictionary.keys():
            distanceMatrixDictionary[unionFilter] = dm[index1][index2]

In [13]:
THRESHOLD = 0.10

iteration = 1
while True:
    distanceMatrix = np.empty((len(trajectories), len(trajectories),))
    distanceMatrix[:] = np.nan
    
    for index1, (filter1, trajectory1) in enumerate(trajectories.items()):
        tempArray = []
        
        for index2, (filter2, trajectory2) in enumerate(trajectories.items()):
            
            if index1 > index2:
                continue
            
            elif index1 == index2:
                continue
            
            else:
                unionFilter = filter1 + filter2
                sorted(unionFilter)
                
                if unionFilter in distanceMatrixDictionary.keys():
                    distanceMatrix[index1][index2] = distanceMatrixDictionary.get(unionFilter)
                    
                    continue
                
                metric = []
                for subItem1 in trajectory1:
                    length1 = len(subItem1)
                    
                    for subItem2 in trajectory2:                        
                        metric.append(-spearmanr(subItem1, subItem2)[0] + 1)
                
                metric = max(metric)
                
                distanceMatrix[index1][index2] = metric
    
    minValue = np.nanmin(distanceMatrix.flatten())
    
    print(minValue)
    
    if minValue > THRESHOLD:
        print(minValue, THRESHOLD)
        break
    
    minIndices = np.where(distanceMatrix == minValue)
    minIndices = list(zip(minIndices[0], minIndices[1]))
    
    minIndex = minIndices[0]
    
    filter1 = list(trajectories.keys())[minIndex[0]]
    filter2 = list(trajectories.keys())[minIndex[1]]
    
    trajectory1 = trajectories.get(filter1)
    trajectory2 = trajectories.get(filter2)
    
    unionFilter = filter1 + filter2
    sorted(unionFilter)
    
    trajectoryGroup = trajectory1 + trajectory2
    
    trajectories = {key: value for key, value in trajectories.items()
                    if all(value not in unionFilter for value in key)}
    
    distanceMatrixDictionary = {key: value for key, value in distanceMatrixDictionary.items()
                                if all(value not in unionFilter for value in key)}
    
    trajectories[unionFilter] = trajectoryGroup
    
    print(iteration, 'finished!')
    iteration += 1
    
    if len(list(trajectories.keys())) == 1:
        break

0.0
1 finished!
0.0
2 finished!
0.0
3 finished!
0.0
4 finished!
0.0
5 finished!
0.0
6 finished!
0.0
7 finished!
0.0
8 finished!
0.0
9 finished!
0.0
10 finished!
0.0
11 finished!
0.0
12 finished!
0.0
13 finished!
0.0
14 finished!
0.0
15 finished!
0.0
16 finished!
0.0
17 finished!
0.0
18 finished!
0.0
19 finished!
0.0
20 finished!
0.0
21 finished!
0.0
22 finished!
0.0
23 finished!
0.0
24 finished!
0.0
25 finished!
0.0
26 finished!
0.0
27 finished!
0.0
28 finished!
0.0
29 finished!
0.0
30 finished!
0.0
31 finished!
0.0
32 finished!
0.0
33 finished!
0.0
34 finished!
0.0
35 finished!
0.0
36 finished!
0.0
37 finished!
0.0
38 finished!
0.0
39 finished!
0.0
40 finished!
0.0
41 finished!
0.0
42 finished!
0.0
43 finished!
0.0
44 finished!
0.0
45 finished!
0.0
46 finished!
0.0
47 finished!
0.0
48 finished!
0.0
49 finished!
0.0
50 finished!
0.0
51 finished!


KeyboardInterrupt: 

In [19]:
minIndices = np.where(distanceMatrix == minValue)

In [28]:
distanceMatrix

array([[       nan, 0.21178134, 0.12544871, ..., 1.95598052, 1.48101413,
        1.23528972],
       [       nan,        nan, 0.16602594, ..., 1.73059984, 1.48216694,
        1.23837214],
       [       nan,        nan,        nan, ..., 1.9204357 , 1.48211145,
        1.23631156],
       ...,
       [       nan,        nan,        nan, ...,        nan, 0.51784176,
        0.7606006 ],
       [       nan,        nan,        nan, ...,        nan,        nan,
        0.50348375],
       [       nan,        nan,        nan, ...,        nan,        nan,
               nan]])

In [22]:
from collections import Counter
print(Counter(list(minIndices)))

TypeError: unhashable type: 'numpy.ndarray'

In [14]:
for key, value in trajectories.items():
    print(key, len(value))

('108221-10193841',) 1
('108221-10194165',) 1
('108221-10194399',) 1
('108221-10194849',) 1
('108221-10195021',) 1
('108221-10195045',) 1
('108221-10196465',) 1
('108221-10197027',) 1
('108221-10198179',) 1
('108221-10198181',) 1
('108221-10198182',) 1
('108221-10199623',) 1
('108221-10199712',) 1
('108221-10199988',) 1
('108221-10200452',) 1
('108221-10200791',) 1
('108221-10201968',) 1
('108221-10202449',) 1
('108221-10202452',) 1
('108221-10202693',) 1
('108221-10202694',) 1
('108221-10203008',) 1
('108221-10203056',) 1
('108221-10203295',) 1
('108221-10275149',) 1
('108221-10297121',) 1
('108221-10304712',) 1
('108221-10307669',) 1
('108221-10328772',) 1
('108221-10367469',) 1
('108221-10377726',) 1
('108221-10395872',) 1
('108221-10408882',) 1
('108221-10428062',) 1
('108260-10202693',) 1
('108260-10202694',) 1
('108290-10193841',) 1
('108290-10194165',) 1
('108290-10194399',) 1
('108290-10194511',) 1
('108290-10195045',) 1
('108290-10197027',) 1
('108290-10199623',) 1
('108290-10

('313611-10199332',) 1
('313611-10199565',) 1
('313611-10199988',) 1
('313611-10201968',) 1
('313611-10202452',) 1
('313611-10202755',) 1
('313611-10276692',) 1
('313611-10300859',) 1
('313611-10304712',) 1
('313611-10307669',) 1
('313611-10321261',) 1
('313611-10328772',) 1
('313611-10367469',) 1
('313611-10369694',) 1
('313611-10369695',) 1
('313611-10377726',) 1
('313611-10395872',) 1
('313611-10403427',) 1
('313611-10428062',) 1
('313611-10431975',) 1
('318431-10195440',) 1
('318431-10196261',) 1
('318431-10196916',) 1
('318431-10197997',) 1
('318431-10198044',) 1
('318431-10199332',) 1
('318431-10199565',) 1
('318431-10202755',) 1
('318431-10276692',) 1
('318431-10300859',) 1
('318431-10321261',) 1
('318431-10369694',) 1
('318431-10369695',) 1
('318431-10403427',) 1
('318431-10431975',) 1
('318794-10193841',) 1
('318794-10194165',) 1
('318794-10194399',) 1
('318794-10194849',) 1
('318794-10195021',) 1
('318794-10195045',) 1
('318794-10196465',) 1
('318794-10197027',) 1
('318794-10

('361201-10395872',) 1
('361201-10403427',) 1
('361201-10428062',) 1
('361201-10431975',) 1
('361203-10194849',) 1
('361203-10195021',) 1
('361203-10195440',) 1
('361203-10196261',) 1
('361203-10196465',) 1
('361203-10196916',) 1
('361203-10197997',) 1
('361203-10198044',) 1
('361203-10198179',) 1
('361203-10198181',) 1
('361203-10198182',) 1
('361203-10199332',) 1
('361203-10199565',) 1
('361203-10199988',) 1
('361203-10201968',) 1
('361203-10202452',) 1
('361203-10202755',) 1
('361203-10276692',) 1
('361203-10300859',) 1
('361203-10304712',) 1
('361203-10307669',) 1
('361203-10321261',) 1
('361203-10328772',) 1
('361203-10367469',) 1
('361203-10369694',) 1
('361203-10369695',) 1
('361203-10377726',) 1
('361203-10395872',) 1
('361203-10403427',) 1
('361203-10428062',) 1
('361203-10431975',) 1
('361279-10194849',) 1
('361279-10195021',) 1
('361279-10195440',) 1
('361279-10196261',) 1
('361279-10196465',) 1
('361279-10196916',) 1
('361279-10197997',) 1
('361279-10198044',) 1
('361279-10

('361727-10202694',) 1
('361727-10203008',) 1
('361727-10203056',) 1
('361727-10203295',) 1
('361727-10266310',) 1
('361727-10275149',) 1
('361727-10297121',) 1
('361727-10323903',) 1
('361727-10331233',) 1
('361727-10336881',) 1
('361727-10408882',) 1
('363247-10193841',) 1
('363247-10194165',) 1
('363247-10194399',) 1
('363247-10194511',) 1
('363247-10194849',) 1
('363247-10195021',) 1
('363247-10195045',) 1
('363247-10196465',) 1
('363247-10197027',) 1
('363247-10198179',) 1
('363247-10198181',) 1
('363247-10198182',) 1
('363247-10199623',) 1
('363247-10199712',) 1
('363247-10199988',) 1
('363247-10200452',) 1
('363247-10200791',) 1
('363247-10201968',) 1
('363247-10202449',) 1
('363247-10202452',) 1
('363247-10202694',) 1
('363247-10203008',) 1
('363247-10203056',) 1
('363247-10203295',) 1
('363247-10266310',) 1
('363247-10275149',) 1
('363247-10297121',) 1
('363247-10304712',) 1
('363247-10307669',) 1
('363247-10323903',) 1
('363247-10328772',) 1
('363247-10331233',) 1
('363247-10

('380102-10307669',) 1
('380102-10328772',) 1
('380102-10367469',) 1
('380102-10377726',) 1
('380102-10395872',) 1
('380102-10428062',) 1
('384012-10193841',) 1
('384012-10194165',) 1
('384012-10194399',) 1
('384012-10194849',) 1
('384012-10195021',) 1
('384012-10195045',) 1
('384012-10196465',) 1
('384012-10197027',) 1
('384012-10198179',) 1
('384012-10198181',) 1
('384012-10198182',) 1
('384012-10199623',) 1
('384012-10199712',) 1
('384012-10199988',) 1
('384012-10200452',) 1
('384012-10200791',) 1
('384012-10201968',) 1
('384012-10202449',) 1
('384012-10202452',) 1
('384012-10202693',) 1
('384012-10203008',) 1
('384012-10203056',) 1
('384012-10203295',) 1
('384012-10275149',) 1
('384012-10297121',) 1
('384012-10304712',) 1
('384012-10307669',) 1
('384012-10328772',) 1
('384012-10367469',) 1
('384012-10377726',) 1
('384012-10395872',) 1
('384012-10408882',) 1
('384012-10428062',) 1
('384023-10193841',) 1
('384023-10194165',) 1
('384023-10194399',) 1
('384023-10194849',) 1
('384023-10

('406709-10198044',) 1
('406709-10199332',) 1
('406709-10276692',) 1
('406709-10300859',) 1
('406709-10321261',) 1
('406709-10369694',) 1
('406709-10369695',) 1
('406709-10403427',) 1
('406709-10431975',) 1
('366244-10193841|366244-10200452',) 1
('263733-10431975', '348730-10431975') 2
('266571-10195045', '266571-10202694') 2
('318794-10202694', '318799-10202694') 2
('332876-10195319', '355193-10195319') 2
('333534-10195319', '340907-10195319') 2
('366244-10194165', '366244-10297121') 2
('374518-10408882', '378627-10408882') 2
('378627-10197027', '273775-10203295', '374518-10275149') 3
('378627-10199712', '374518-10199712', '374518-10202449') 3
('389153-10196261', '389153-10276692') 2
('391954-10431975', '401583-10431975') 2
('401583-10197997', '401583-10198044') 2
('266571-10193841', '266571-10194165', '366244-10203008', '366244-10203056') 4
('328996-10201968', '328996-10304712', '328996-10377726', '328996-10195021', '328996-10196465') 5
('401583-10321261', '401583-10369695', '401583-

In [16]:
clusterNames = sorted(list(set(trajectories.keys())))

nameColumn, idColumn = [], []
for clusterIndex, clusterName in enumerate(clusterNames):
    
    if len(clusterName) == 1:
        continue
    
    nameColumn.append('|'.join(clusterName))
    idColumn.append(clusterIndex + 1)

resultDF = pd.DataFrame()
resultDF['name'] = nameColumn
resultDF['id'] = idColumn

resultDF.to_csv('2.1.clusteringWeekly[correlation+interpolation].csv')

In [None]:
for clusterIndex, clusterName in enumerate(clusterNames):
    
    if len(clusterName) == 1:
        continue
    
    figure = make_subplots(rows=3, cols=1)
    colors = ["#"+''.join([random.choice('0123456789ABCDEF') for j in range(6)]) for i in range(len(clusterName))]
    
    value = []
    for subKey in clusterName:
        value.append(trajectoriesSetProcessed.get((subKey, ))[0])
    
    for index, subValue in enumerate(value):
        figure.add_trace(go.Scatter(x=list(range(0, len(subValue))), y=subValue,
                                    mode='lines', marker_color=colors[index], line = dict(width=2.5), line_shape='spline'), row=1, col=1)
    
    #
    
    value = []
    for subKey in clusterName:
        value.append(trajectoriesSmoothOriginal.get(subKey))
    
    for index, subValue in enumerate(value):
        figure.add_trace(go.Scatter(x=list(range(0, len(subValue))), y=subValue,
                                    mode='lines', marker_color=colors[index], line = dict(width=2.5), line_shape='spline'), row=2, col=1)
    
    value = []
    for subKey in clusterName:
        value.append(trajectoriesRaw.get(subKey))
        # value.append(trajectoriesSetProcessed.get((subKey, ))[0])
    
    for index, subValue in enumerate(value):
        figure.add_trace(go.Scatter(x=list(range(0, len(subValue))), y=subValue,
                                    mode='lines', marker_color=colors[index], line = dict(width=2.5), line_shape='spline'), row=3, col=1)
    
    #    
    
    figure.update_layout(title = 'Cluster ' + str(clusterIndex + 1), showlegend=False, height=1200, width=1200)
    figure.show()
    
    # figure.write_image('images/cluster_' + str(clusterIndex + 1) + '.png')
    
    # break