-
Notifications
You must be signed in to change notification settings - Fork 1
/
Co-eye.py
359 lines (293 loc) · 13.2 KB
/
Co-eye.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
# import necessarily libraries
#!pip install pyts
#!pip install imblearn
import warnings
warnings.filterwarnings("ignore")
from pyts.approximation import SymbolicAggregateApproximation, SymbolicFourierApproximation
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from pyts.preprocessing import StandardScaler
import random
from collections import Counter
from imblearn.over_sampling import BorderlineSMOTE, SMOTE
from sklearn.model_selection import LeaveOneOut
import heapq
from sklearn.model_selection import cross_val_score
import os
# ### Define functions required for Co-eye
#Find the most confident label across lenses in one representation (either SAX or SFA)
def mostConf_one(matrcies, labels):
#Marerices is of length L: number of lenses/forests
#Each matrix in matrices is the probalistic accuracy of test data
#Matrix of size m X n: where m is the number of test instances,n is the number of classes
predLabels=[]
# For each test data
for row in range(len(matrcies[0])):
maxConf= 0
# For each Forest/lense
for mat in matrcies:
for col in range(len(labels)):
if(mat[row][col]>maxConf):
maxConf= mat[row][col]
Conflabel= labels[col]
#Return the most confident lable across lenses
predLabels.append(Conflabel)
return predLabels
# ### Implementation of Vote function described in Section 4.3 in the paper
# #Find the most confident labeles across lenses of the two representations ( SAX and SFA)
def mostConf_multi(matrcies, labels, sfa_n):
#Marerices is of length L: number of lenses/forests (total for SAX and SFA)
#Each matrix in matrices is the probalistic accuracy of test data
#Matrix of size m X n: where m is the number of test instances,n is the number of classes (labels)
#Sax_n number of lenses in SFA
predLabel=[]
multiSFA = False
multiSAX= False
# For each test data, we define the most confident and second-most confident for each presentation, then vote between them
for row in range(len(matrcies[0])):
#Initialise
maxConf_SFA= 0
SFAConf=[]
SFALables= []
SAXConf=[]
SAXLables= []
#Look into SFA only matrices
#Find the max confidence/ prbability in SFA
for mat in matrcies[:sfa_n]:
for col in range(len(labels)):
SFAConf.append(mat[row][col])
SFALables.append(labels[col])
if(mat[row][col]>maxConf_SFA):
maxConf_SFA= mat[row][col]
Conflabel_SFA= labels[col]
#Second best flag
SB1= False
#Special case: when multiple lenses have the same accuracy (most confident)
if(SFAConf.count(maxConf_SFA)>1):
indices = [i for i, x in enumerate(SFAConf) if x == maxConf_SFA]
l= [SFALables[i] for i in indices]
if(len(set(l))!=1):
maxIr=max(l.count(y) for y in set(l))
k= [l.count(y) for y in set(l)]
# Find the most common label
if(k.count (maxIr)==1):
cnt = Counter(l)
Conflabel_SFA= cnt.most_common(2)[0][0]
secondBestSFALabel= cnt.most_common(2)[1][0]
#If no common label, in case of tie, the label is chosen randomly.
else:
shuff=[]
for it in set(l):
if (l.count(it)==maxIr): shuff.append(it)
shuff= list(set(l))
random.shuffle(shuff)
Conflabel_SFA= shuff[0]
secondBestSFALabel= shuff[1]
SB1= True
#Set the second best
secondBestSFA= maxConf_SFA
if(SB1== False):
secondBestSFA= max(n for n in SFAConf if n!=maxConf_SFA)
secondBestSFALabel= SFALables[SFAConf.index(secondBestSFA)]
#Same steps for SAX
maxConf_SAX= 0
SB2= False
for mat in matrcies[sfa_n:]:
for col in range(len(labels)):
SAXConf.append(mat[row][col])
SAXLables.append(labels[col])
if(mat[row][col]>maxConf_SAX):
maxConf_SAX= mat[row][col]
Conflabel_SAX= labels[col]
if(SAXConf.count(maxConf_SAX)>1):
indices = [i for i, x in enumerate(SAXConf) if x == maxConf_SAX]
l= [SAXLables[i] for i in indices]
if(len(set(l))!=1):
print ("Conflict on max confident", l)
maxIr=max(l.count(y) for y in set(l))
k= [l.count(y) for y in set(l)]
# How many equal items with max confident value
if(k.count (maxIr)==1):
cnt = Counter(l)
Conflabel_SAX= cnt.most_common(2)[0][0]
secondBestSAXLabel= cnt.most_common(2)[1][0]
else:
shuff=[]
for it in set(l):
if (l.count(it)==maxIr): shuff.append(it)
print ("tie", shuff)
random.shuffle(shuff)
Conflabel_SAX= shuff[0]
secondBestSAXLabel= shuff[1]
secondBestSAX= maxConf_SAX
# print("most common", Conflabel_SAX, "Second Best", secondBestSAXLabel)
SB2= True
# print("indecies", indices, "labels" ,labels)
if(SB2== False):
secondBestSAX= max(n for n in SAXConf if n!=maxConf_SAX)
secondBestSAXLabel= SAXLables[SAXConf.index(secondBestSAX)]
# print ("Best SAX",maxConf_SAX , Conflabel_SAX)
# print ("Second Best SAX",secondBestSAX , secondBestSAXLabel)
#-----------------------------------
# In case of agreement between most Conf SAX and SFA
if(Conflabel_SAX==Conflabel_SFA):
best= Conflabel_SAX
# If no agreement, then second best is testes
elif(secondBestSAX>secondBestSFA): best= secondBestSAXLabel
else: best= secondBestSFALabel
#Accumulate labels with the best choice
predLabel.append(best)
# print("Best SFA, SAX", Conflabel_SAX, Conflabel_SFA,"Second best SFA, SAX", secondBestSAXLabel, secondBestSFALabel,"Best", best)
# print ("----------------------------------------")
return predLabel
# ### searchLenses [Algorithm 2 (Co-eye paper)]
# Finding best pairs for SFA transformation
def searchLense_SFA(X_train, y_train):
#Input is training data (X_train, y_train)
#Returns selected pairs for SFA transformation based on cross validation
#Set ranges (Seg, alpha) parameters
print("\n Generating SFA parameters....")
maxCoof= 130
if(X_train.shape[1]<maxCoof): maxCoof = X_train.shape[1]-1
if(X_train.shape[1]<100): n_segments=list(range(5, maxCoof,5))
else: n_segments=list(range(10, maxCoof,10))
maxBin= 26
if(X_train.shape[1]<maxBin): maxBin = X_train.shape[1]-2
if(X_train.shape[0]<maxBin): maxBin = X_train.shape[0]-2
alphas= range(3,maxBin)
pairs= []
# Learning parameteres using 5 folds cross validation
for alpha in alphas:
s= []
for seg in n_segments:
SFA= SymbolicFourierApproximation( n_coefs=seg, n_bins= alpha, alphabet= 'ordinal')
X_SFA = SFA.fit_transform(X_train)
scores= 0
RF_clf= RandomForestClassifier(n_estimators=100, random_state=0)
scores = cross_val_score(RF_clf, X_SFA, y_train, cv=5)
s.append(scores.mean())
winner = np.argwhere(s >= np.amax(s)-0.01)
for i in winner.flatten().tolist():
bestCof= n_segments[i]
pairs.append((bestCof, alpha))
print("No of selected pairs: ", len(pairs))
print("SFA pamaeter selection, done!")
return pairs
# In[13]:
# Finding best pairs for SAX transformation
def searchLense_SAX(X_train, y_train):
#Input is training data (X_train, y_train)
#Returns selected pairs for SAX transformation based on cross validation
#Set range (alpha) parameter
print("Generating SAX parameters....")
maxBin= 26
if(X_train.shape[1]<maxBin): maxBin = X_train.shape[1]
alphas= range(3,maxBin)
s= []
pairs= []
#Learning parameters using 5 folds cross validation
for alpha in alphas:
SAX = SymbolicAggregateApproximation(strategy= 'uniform', n_bins=alpha, alphabet= 'ordinal')
X_train_SAX = SAX.fit_transform(X_train)
scores= 0
RF_clf= RandomForestClassifier(n_estimators=100, random_state=0)
scores = cross_val_score(RF_clf, X_train_SAX, y_train, cv=5)
s.append(scores.mean())
winner = np.argwhere(s >= np.mean(s)-0.01)
for i in winner.flatten().tolist():
bestCof= alphas[i]
pairs.append(bestCof)
print("No of selected pairs: ", len(pairs))
print("SAX pamaeter selection, done!")
return pairs
# ### Co-eye Main Run
#Main Run
#Initialisation
All_Acc=[]
SFA_acc=[]
SAX_acc=[]
SFA_pairs= []
SAX_pairs= []
#List names of datasets to use/test
files= ['Coffee', 'Beef']
for f in files:
#Import and balance data
dirname = os.getcwd()
dftrain= pd.read_csv(dirname+'/UCR/'+f+'/'+f+'_TRAIN.tsv',header=None, sep='\t')
dftest= pd.read_csv(dirname+'/UCR/'+f+'/'+f+'_TEST.tsv',header=None, sep='\t')
print("\n-------------------\nDataset: ", f)
#Split data to X_train, y_train and X_test, y_test
X_train= dftrain.drop([0], axis=1).values
y_train= dftrain[0]
X_test= dftest.drop([0], axis=1).values
y_test= dftest[0]
print ( X_train.shape, X_test.shape, y_train.shape, y_test.shape)
#Check for class imbalance
print ("Class distribution ",sorted(Counter(y_train).items()))
min_neighbours= min(Counter(y_train).items(), key=lambda k: k[1])[1]
max_neighbours= max(Counter(y_train).items(), key=lambda k: k[1])[1]
if(min_neighbours==max_neighbours):
print ("Data has an equal distribution among classes \n")
SMOTE_Xtrain= X_train
SMOTE_ytrain= y_train
#Apply SMOTE if data is imbalanced
else:
if (min_neighbours>5): min_neighbours= 6
SMOTE_Xtrain, SMOTE_ytrain = SMOTE(sampling_strategy="all", k_neighbors= min_neighbours-1, random_state=42).fit_resample(X_train, y_train )
print("Class distribution after balance", sorted(Counter(SMOTE_ytrain).items()))
print("---------------------")
print ("After Balance", SMOTE_Xtrain.shape, X_test.shape, SMOTE_ytrain.shape, y_test.shape,"\n")
#Initialise parameters for each dataset
#Random forest probability matrices for all lenses
RFmatrices= []
#Training Phase
#--------------
pairs= searchLense_SFA (SMOTE_Xtrain, SMOTE_ytrain)
for n_coefs, n_bins in pairs:
SFA= SymbolicFourierApproximation( n_coefs=n_coefs, n_bins= n_bins, alphabet= 'ordinal')
#Transform to SFA
X_train_SFA = SFA.fit_transform(SMOTE_Xtrain)
X_test_SFA = SFA.fit_transform(X_test)
# Build RF on each lense
RF_clf= RandomForestClassifier(n_estimators=100, random_state=0)
RF_clf.fit(X_train_SFA,SMOTE_ytrain)
#Store prediction probability for test data
model_pred= RF_clf.predict_proba(X_test_SFA)
#accumulate RFmatrices for a lense
RFmatrices.append(model_pred)
length= len(RFmatrices)
#Best probablistic accuracy using SFA only
acc= accuracy_score(y_test, mostConf_one(RFmatrices,RF_clf.classes_))
SFA_acc.append(acc)
SFA_pairs.append(length)
print("*SFA Correct classification rate: {:2.4f} ".format(acc), "Error rate", 1-acc,"\n")
SAXacc=[]
# Transform to SAX Lenses
saxPairs= searchLense_SAX(SMOTE_Xtrain, SMOTE_ytrain)
for n_bins in saxPairs:
sax = SymbolicAggregateApproximation(strategy= 'uniform', n_bins=n_bins, alphabet= 'ordinal')
#Transform to SAX
X_sax = sax.fit_transform(SMOTE_Xtrain)
X_test_sax = sax.fit_transform(X_test)
#Build RF for each SAX lense
RF_clf= RandomForestClassifier(n_estimators=100, random_state=0)
RF_clf.fit(X_sax,SMOTE_ytrain)
#Store prediction probability for test data
model_pred= RF_clf.predict_proba(X_test_sax)
#Sax only lenses
SAXacc.append(model_pred)
#accumulate RFmatrices for all lenses
RFmatrices.append(model_pred)
#Classification Phase
#---------
saxAcc= accuracy_score(y_test, mostConf_one(SAXacc,RF_clf.classes_))
SAX_acc.append(saxAcc)
SAX_pairs.append(len(SAXacc))
print("*SAX only Correct classification rate: {:2.4f} ".format(saxAcc), "Error rate", 1-saxAcc)
#Combining both SAX and SFA using mostConf_multi method
acc= accuracy_score(y_test, mostConf_multi(RFmatrices,RF_clf.classes_,length))
print("\n Co-eye Correct classification rate: {:2.4f} ".format(acc), "Error rate", 1-acc)
print(" -----------------------------------------")
All_Acc.append(acc)