In [1]:

import random
from copy import deepcopy

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import interpolate
from scipy.stats import spearmanr
from sklearn.preprocessing import robust_scale

In [2]:
data = pd.read_csv('data/1.data.csv', index_col=0)

In [3]:
data.head(25)

Unnamed: 0,108221-10193841,108221-10194165,108221-10194399,108221-10194849,108221-10195021,108221-10195045,108221-10196465,108221-10197027,108221-10198179,108221-10198181,...,406709-10198044,406709-10199332,406709-10202755,406709-10276692,406709-10300859,406709-10321261,406709-10369694,406709-10369695,406709-10403427,406709-10431975
2017-01-01,,,,,,,,,,,...,,,,,,,,,,
2017-01-02,,,,,,,,,,,...,,,,,,,,,,
2017-01-03,,,,,,,,,,,...,,,,,,,,,,
2017-01-04,,,,,,,,,,,...,,,,,,,,,,
2017-01-05,,,,,,,,,,,...,,,,,,,,,,
2017-01-06,,,,,,,,,,,...,,,,,,,,,,
2017-01-07,,,,,,,,,,,...,,,,,,,,,,
2017-01-08,,,,,,3.0,,1.0,,,...,,,,,,,,,,
2017-01-09,,,,,,,,,,,...,,,,,,,,,,
2017-01-10,,,,,,,,,,9.0,...,,,,,,,,,,


In [4]:
tempDictionary = {}
for column in list(data):
    tempArray = tuple([str(value) for value in data[column].values])
    
    if tempArray not in tempDictionary.keys():
        tempDictionary[tempArray] = [column]
    
    else:
        tempDictionary[tempArray].append(column)

for key, value in tempDictionary.items():
    
    if len(value) > 1:
        data.drop(value, axis = 1, inplace = True)
        
        value = [subValue.split('|') for subValue in value]
        print(len(value), value)
        
        value = [item for subValue in value for item in subValue]
        data['|'.join(sorted(list(set(value))))] = [float(value) for value in key]
        
        print(data.values.shape)

2 [['108568-10441280'], ['243038-10441280']]
(1087, 3228)
2 [['255386-10441280'], ['363247-10441280']]
(1087, 3227)
6 [['266558-10441280'], ['266571-10441280'], ['266985-10441280'], ['333536-10441280'], ['361279-10441280'], ['361291-10441280']]
(1087, 3222)
2 [['266571-10194399'], ['266571-10275149']]
(1087, 3221)
2 [['273751-10441280'], ['273753-10441280']]
(1087, 3220)
3 [['273775-10194399'], ['273775-10197027'], ['273775-10275149']]
(1087, 3218)
2 [['299860-10393842'], ['330303-10393842']]
(1087, 3217)
2 [['313611-10441280'], ['328999-10441280']]
(1087, 3216)
2 [['332887-10393842'], ['332922-10393842']]
(1087, 3215)
2 [['332920-10393842'], ['355193-10393842']]
(1087, 3214)
2 [['333534-10393842'], ['340907-10393842']]
(1087, 3213)
2 [['351273-10441280'], ['385075-10441280']]
(1087, 3212)
2 [['364995-10441280'], ['370544-10441280']]
(1087, 3211)
2 [['366244-10193841'], ['366244-10200452']]
(1087, 3210)
2 [['374518-10194165'], ['378627-10275149']]
(1087, 3209)
2 [['374518-10194399'], [

In [5]:
columnsToDelete = []
for column in list(data):
    tempArray = [value for value in data[column].values if np.isfinite(value)]
    
    if len(tempArray) == 1:
        columnsToDelete.append(column)

print(len(columnsToDelete))
data.drop(columnsToDelete, axis = 1, inplace = True)

138


In [7]:
ewmParameter = 1

In [8]:
trajectoriesSet, trajectoriesSmoothOriginal, trajectoriesRaw = {}, {}, {}
for column in list(data):
    
    if column == 'index':
        continue
    
    tempDF = data[[column]]
    
    if len(list(set([value for value in tempDF[column].values if np.isfinite(value)]))) == 1:
        continue
    
    tempDF.index = pd.to_datetime(tempDF.index, yearfirst = True)
    
    tempDF = tempDF.resample('W').agg(np.nansum)
    tempDF.replace(0, np.nan, inplace = True)
    
    firstIndex = tempDF.first_valid_index()
    lastIndex = tempDF.last_valid_index()
    tempDFCut = tempDF.loc[firstIndex: lastIndex]
    
    tempDFCut = tempDFCut.ewm(span = np.max([1, len(tempDFCut.index) * ewmParameter])).mean()
    tempDF = tempDF.ewm(span = np.max([1, len(tempDFCut.index) * ewmParameter])).mean()
    
    trajectoriesSet[column] = [list(robust_scale(tempDFCut[column].values))]
    trajectoriesSmoothOriginal[column] = list(tempDF[column].values)
    trajectoriesRaw[column] = list(data[column].fillna(0))

In [9]:
maxLength = max([len(value[0]) for _, value in trajectoriesSet.items()])

In [10]:
trajectoriesSetProcessed = {}
for key, value in trajectoriesSet.items():
    value = value[0]
    
    if len(value) == maxLength:
        trajectoriesSetProcessed[(key,)] = [value]
        continue
    
    oldScale = np.arange(0, maxLength, maxLength / len(value))
    
    oldScale = oldScale[: min([len(oldScale), len(value)])]
    value = value[: min([len(oldScale), len(value)])]
    
    try:
        interpolationFunction = interpolate.interp1d(oldScale, value)
    
    except:
        print(value)
        continue
    
    cutOff = 0
    while True:
        newScale = np.linspace(0, maxLength - cutOff, maxLength)
        
        try:
            value = interpolationFunction(newScale)
            break
        
        except:
            cutOff += 1
    
    trajectoriesSetProcessed[(key,)] = [value]

[0.0]
[0.0]


In [11]:
trajectories = deepcopy(trajectoriesSetProcessed)
trajectoriesValues = np.array([value[0] for value in list(trajectories.values())])
trajectoriesKeys = list(trajectoriesSetProcessed.keys())

In [15]:
dm, _ = spearmanr(trajectoriesValues, axis = 1)
dm = -dm + 1

In [17]:
distanceMatrixDictionary = {}
for index1, filter1 in enumerate(trajectoriesKeys):
    for index2, filter2 in enumerate(trajectoriesKeys):
        unionFilter = filter1 + filter2
        sorted(unionFilter)
        
        if unionFilter not in distanceMatrixDictionary.keys():
            distanceMatrixDictionary[unionFilter] = dm[index1][index2]

In [55]:
THRESHOLD = 0.50

iteration = 1
while True:
    distanceMatrix = np.empty((len(trajectories), len(trajectories),))
    distanceMatrix[:] = np.nan
    
    for index1, (filter1, trajectory1) in enumerate(trajectories.items()):
        tempArray = []
        
        for index2, (filter2, trajectory2) in enumerate(trajectories.items()):
            
            if index1 > index2:
                continue
            
            elif index1 == index2:
                continue
            
            else:
                unionFilter = filter1 + filter2
                sorted(unionFilter)
                
                if unionFilter in distanceMatrixDictionary.keys():
                    distanceMatrix[index1][index2] = distanceMatrixDictionary.get(unionFilter)
                    
                    continue
                
                metric = []
                for subItem1 in trajectory1:
                    length1 = len(subItem1)
                    
                    for subItem2 in trajectory2:                        
                        metric.append(-spearmanr(subItem1, subItem2)[0] + 1)
                
                metric = max(metric)
                
                distanceMatrix[index1][index2] = metric
                distanceMatrixDictionary[unionFilter] = metric
    
    minValue = np.min(list(distanceMatrixDictionary.values()))
    
    print(minValue)
    
    if minValue > THRESHOLD:
        print(minValue, THRESHOLD)
        break
    
    minIndices = np.where(distanceMatrix == minValue)
    minIndices = list(zip(minIndices[0], minIndices[1]))
    
    minIndex = minIndices[0]
    
    filter1 = list(trajectories.keys())[minIndex[0]]
    filter2 = list(trajectories.keys())[minIndex[1]]
    
    trajectory1 = trajectories.get(filter1)
    trajectory2 = trajectories.get(filter2)
    
    unionFilter = filter1 + filter2
    sorted(unionFilter)
    
    trajectoryGroup = trajectory1 + trajectory2
    
    trajectories = {key: value for key, value in trajectories.items()
                    if all(value not in unionFilter for value in key)}
    
    distanceMatrixDictionary = {key: value for key, value in distanceMatrixDictionary.items()
                                if all(value not in unionFilter for value in key)}
    
    trajectories[unionFilter] = trajectoryGroup
    
    print(iteration, 'finished!')
    iteration += 1
    
    if len(list(trajectories.keys())) == 1:
        break

0.0
1 finished!
0.0
2 finished!


KeyboardInterrupt: 

In [32]:
for key, value in trajectories.items():
    print(key, len(value))

('DG 96910 - DOVE MILK CHOC SINGLES 1.44OZ 18CT 12/CA',) 1
('DG 96920 - 3 MUSKETEERS MULTI-PIECE KINGSIZE 6/24CT',) 1
('DG DC 6300 - DOVE MILK CHOC SINGLES 1.44OZ 18CT 12/CA',) 1
('DG DC 6930 - MILKY WAY SIMPLY CRML SNGL 1.91OZ 12/24C', 'DG DC 6140 - M&MS MILK CHOC KINGSIZE 3.14OZ 24CT 6/CA', 'DG DC 6930 - SKITTLES SOURS SINGLES 1.8OZ 24CT 12/CA') 3
('DG DC 6500 - STARBURST ORIGINAL SINGLES 36CT 10/CA', 'DG DC 96970 - STB ORG JELBN BAG 14OZ 12/CS') 2
('DG DC 6800 - MILKY WAY SIMPLY CRML SNGL 1.91OZ 12/24C', 'DG DC 96970 - MILKYWAY SMP CARAMEL FS 6PK 4.42OZ 24CA') 2
('DG DC 6900 - 3 MUSKETEERS MULTI-PIECE KINGSIZE 6/24CT', 'DG DC 6930 - 3 MUSKETEERS MULTI-PIECE KINGSIZE 6/24CT', 'DG DC 6120 - 3 MUSKETEERS MULTI-PIECE KINGSIZE 6/24CT', 'DG DC 6300 - 3 MUSKETEERS MULTI-PIECE KINGSIZE 6/24CT') 4
('DG DC 6900 - TWIX FS 6PK 3.28OZ 24CA', 'DG DC 6000 - TWIX FS 6PK 3.28OZ 24CA', 'DG DC 96970 - 3 MUSKETEERS FS 6PK 2.93OZ 24CA') 3
('DG 96910 - MILKYWAY SMP CARAMEL FS 6PK 4.42OZ 24CA', 'DG DC 613

In [33]:
clusterNames = sorted(list(set(trajectories.keys())))

nameColumn, idColumn = [], []
for clusterIndex, clusterName in enumerate(clusterNames):
    
    if len(clusterName) == 1:
        continue
    
    nameColumn.append('|'.join(clusterName))
    idColumn.append(clusterIndex + 1)

resultDF = pd.DataFrame()
resultDF['name'] = nameColumn
resultDF['id'] = idColumn

resultDF.to_csv('2.1.clusteringWeekly[correlation+interpolation].csv')

In [34]:
for clusterIndex, clusterName in enumerate(clusterNames):
    
    if len(clusterName) == 1:
        continue
    
    figure = make_subplots(rows=3, cols=1)
    colors = ["#"+''.join([random.choice('0123456789ABCDEF') for j in range(6)]) for i in range(len(clusterName))]
    
    value = []
    for subKey in clusterName:
        value.append(trajectoriesSetProcessed.get((subKey, ))[0])
    
    for index, subValue in enumerate(value):
        figure.add_trace(go.Scatter(x=list(range(0, len(subValue))), y=subValue,
                                    mode='lines', marker_color=colors[index], line = dict(width=2.5), line_shape='spline'), row=1, col=1)
    
    #
    
    value = []
    for subKey in clusterName:
        value.append(trajectoriesSmoothOriginal.get(subKey))
    
    for index, subValue in enumerate(value):
        figure.add_trace(go.Scatter(x=list(range(0, len(subValue))), y=subValue,
                                    mode='lines', marker_color=colors[index], line = dict(width=2.5), line_shape='spline'), row=2, col=1)
    
    value = []
    for subKey in clusterName:
        value.append(trajectoriesRaw.get(subKey))
        # value.append(trajectoriesSetProcessed.get((subKey, ))[0])
    
    for index, subValue in enumerate(value):
        figure.add_trace(go.Scatter(x=list(range(0, len(subValue))), y=subValue,
                                    mode='lines', marker_color=colors[index], line = dict(width=2.5), line_shape='spline'), row=3, col=1)
    
    #    
    
    figure.update_layout(title = 'Cluster ' + str(clusterIndex + 1), showlegend=False, height=1200, width=1200)
    # figure.show()
    
    figure.write_image('images/cluster_' + str(clusterIndex + 1) + '.png')
    
    # break