-
Notifications
You must be signed in to change notification settings - Fork 0
/
eCPDPClassfiler.py
138 lines (106 loc) · 4.23 KB
/
eCPDPClassfiler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import numpy as np
import warnings
from sklearn.feature_selection import SelectFromModel
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.utils.extmath import softmax
from sklearn.model_selection import GridSearchCV
# Silence every warning category for the whole process as soon as this module
# is imported (the filter is global, not limited to code in this file).
warnings.filterwarnings("ignore")
class SVD2Classifier(object):
    """Logistic-regression classifier operating on SVD-aligned data.

    The methods are written to be chained by the caller:
    ``align_data`` -> ``feature_select`` -> ``fit`` -> ``predict``.
    """

    def __init__(self, grid=False):
        """Build the underlying estimator.

        Args:
            grid: When truthy, ``self.clf`` is a ``GridSearchCV`` over
                ``C``, ``solver`` and ``penalty`` of a class-balanced
                ``LogisticRegression``, scored by balanced accuracy.
                Otherwise ``self.clf`` is the plain class-balanced
                ``LogisticRegression``.
        """
        # Commented-out alternatives (KNN, random forest, Gaussian naive
        # Bayes, SVC) used to live here; logistic regression is the only
        # active estimator.
        base_estimator = LogisticRegression(class_weight='balanced', n_jobs=4)
        if grid:
            # NOTE(review): LogisticRegression does not accept every
            # solver/penalty pairing listed here; presumably the failing
            # candidates are skipped via GridSearchCV's error_score
            # handling -- confirm against the installed scikit-learn.
            param_grid = {
                'C': [100, 10, 1.0, 0.1, 0.01],
                'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                'penalty': ['none', 'l1', 'l2', 'elasticnet'],
            }
            self.clf = GridSearchCV(
                base_estimator,
                param_grid=param_grid,
                scoring='balanced_accuracy',
                n_jobs=4,
            )
        else:
            self.clf = base_estimator
        # Set to True only by fit(); feature_select() also fits self.clf
        # but deliberately does not flip this flag.
        self.is_trained = False

    def align_data(self, X, Z):
        """Express ``X`` and ``Z`` in the singular-vector basis of ``X.T``.

        With ``X.T = U @ diag(D) @ V_T`` (thin SVD), the results are
        ``CX = (diag(D) @ V_T).T`` and ``CZ = (U.T @ Z.T).T``.

        Args:
            X: NumPy array whose transpose is decomposed.
            Z: NumPy array with the same number of columns as ``X``.

        Returns:
            Tuple ``(CX, CZ)``.
        """
        U, D, V_T = np.linalg.svd(X.T, full_matrices=False)
        CX = np.dot(np.diag(D), V_T).T
        # Each row of Z is projected independently of the others, so the
        # whole matrix is transformed in one product for convenience.
        CZ = np.dot(U.T, Z.T).T
        return CX, CZ

    def feature_select(self, CX, Y, CZ):
        """Keep the columns that permutation importance ranks highly.

        ``self.clf`` is fitted on ``(CX, Y)``, permutation importances are
        computed on the same data (balanced accuracy, 10 repeats), and
        ``SelectFromModel`` with its default threshold picks the columns.

        Args:
            CX: Training matrix used for fitting and scoring.
            Y: Labels for ``CX``.
            CZ: Second matrix reduced to the same columns as ``CX``.

        Returns:
            Tuple ``(trunCX, trunCZ, True)``; the third element is always
            ``True``.
        """
        self.clf.fit(CX, Y)
        imps = permutation_importance(
            self.clf, CX, Y,
            scoring='balanced_accuracy', n_repeats=10, n_jobs=4,
        )
        importances = imps.importances_mean

        # SelectFromModel reads importances from the estimator's ``coef_``,
        # so the permutation importances are exposed under that name.  The
        # override is temporary: leaving it in place would hand a
        # LogisticRegression bogus weights and leave a stale attribute on
        # a GridSearchCV.
        clf_state = vars(self.clf)
        had_coef = 'coef_' in clf_state
        fitted_coef = clf_state.get('coef_')
        self.clf.coef_ = importances
        try:
            sel_model = SelectFromModel(self.clf, prefit=True)
            # The selector consults the estimator at transform time, so
            # both transforms must run while the override is active.
            trunCX = sel_model.transform(CX)
            trunCZ = sel_model.transform(CZ)
        finally:
            if had_coef:
                self.clf.coef_ = fitted_coef
            else:
                del self.clf.coef_
        return trunCX, trunCZ, True

    def fit(self, trunCX, Y):
        """Fit ``self.clf`` on ``(trunCX, Y)`` and mark the model trained."""
        self.clf.fit(trunCX, Y)
        self.is_trained = True

    def predict(self, trunCZ):
        """Return ``(probs, preds)`` for ``trunCZ``.

        ``probs`` comes from ``self.clf.predict_proba`` and ``preds`` from
        ``self.clf.predict``.
        """
        probs = self.clf.predict_proba(trunCZ)
        preds = self.clf.predict(trunCZ)
        return probs, preds