-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
115 lines (97 loc) · 3.35 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import imghdr
import math
import os
from warnings import simplefilter
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm
simplefilter(action='ignore', category=FutureWarning)
def pad_or_cut(value: np.ndarray, target_length: int) -> np.ndarray:
    """Pad with trailing zeros or truncate a 1-D numpy array to a fixed length.

    Args:
        value: 1-D numpy array to normalize.
        target_length: desired output length.

    Returns:
        An array of exactly ``target_length`` elements.

    Note: the original implementation returned ``None`` when
    ``len(value) == target_length`` (neither branch assigned ``data_row``);
    this version returns the array unchanged in that case.
    """
    if len(value) < target_length:
        # Zero-pad on the right up to target_length.
        return np.pad(value, [(0, target_length - len(value))])
    # Truncation; when lengths are already equal this slice is a no-op.
    return value[:target_length]
def data_split(full_list, ratio, shuffle=True):
    """Stratified split of a labelled DataFrame into two parts.

    ``full_list`` must be a DataFrame with a binary ``label`` column
    (1 = positive, 0 = negative). The first returned part holds the first
    ``ratio`` fraction of each class; the second part holds the remainder.

    Args:
        full_list: input DataFrame with a ``label`` column.
        ratio: fraction of each class placed in the first part.
        shuffle: when True, the first part is randomly re-ordered
            (the second part keeps its original order).

    Returns:
        ``(part_a, part_b)`` DataFrames with their index reset (the old
        index is kept as an ``index`` column), or ``([], full_list)`` when
        the input is empty or either class would contribute zero rows.
    """
    pos = full_list[full_list.label == 1]
    neg = full_list[full_list.label == 0]
    take_pos = int(len(pos) * ratio)
    take_neg = int(len(neg) * ratio)

    # Guard clause: degenerate split — hand everything back untouched.
    if len(full_list) == 0 or take_pos < 1 or take_neg < 1:
        return [], full_list

    part_a = pd.concat([pos.iloc[:take_pos, :], neg.iloc[:take_neg, :]])
    part_b = pd.concat([pos.iloc[take_pos:, :], neg.iloc[take_neg:, :]])
    if shuffle:
        part_a = part_a.sample(frac=1)
    part_a.reset_index(inplace=True)
    part_b.reset_index(inplace=True)
    return part_a, part_b
def check_error_img(path):
    """Walk ``path`` recursively and report files that are not recognizable images.

    Uses ``imghdr.what`` to sniff each file's type; files it cannot identify
    are treated as broken. Prints the total file count, then either a success
    message or the list of broken files, and terminates the process via
    ``exit()`` when any broken file is found.

    NOTE(review): ``imghdr`` is deprecated since Python 3.11 and removed in
    3.13 — consider Pillow's ``Image.open``/``verify`` as a replacement.

    Args:
        path: root directory to scan.
    """
    original_images = []
    for root, dirs, filenames in os.walk(path):
        for filename in filenames:
            original_images.append(os.path.join(root, filename))
    original_images = sorted(original_images)
    print('num:', len(original_images))
    error_images = []
    for filename in tqdm(original_images):
        # imghdr.what returns None when no known image format matches.
        if imghdr.what(filename) is None:
            error_images.append(filename)
    if len(error_images) == 0:
        print('All images are normal!')
    else:
        for i in error_images:
            print(i)
        print('{} error images'.format(len(error_images)))
        exit()
def process_notes(notes):
    """Tokenize and lowercase free-text notes, dropping English stop words.

    Args:
        notes: iterable of strings.

    Returns:
        A list (one entry per note) of lowercase word-token lists with
        NLTK English stop words removed.

    Fix: the original tested the *original-cased* token against the
    all-lowercase NLTK stop-word list, so capitalized stop words such as
    "The" or "And" slipped through; the membership test now uses the
    lowercased token. The stop-word list is also converted to a set for
    O(1) lookups.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    notes_processed = [
        [word.lower() for word in tokenizer.tokenize(x) if word.lower() not in stop_words]
        for x in notes
    ]
    return notes_processed
def cal_weight(x):
    """Compute indicator weights with the entropy-weight method.

    For each column j, the entropy is
    ``e_j = -k * sum_i p_ij * ln(p_ij)`` with ``p_ij = x_ij / sum_i x_ij``
    and ``k = 1 / ln(rows)``; the redundancy ``d_j = 1 - e_j`` is then
    normalized into weights ``w_j = d_j / sum(d)``.

    Args:
        x: DataFrame of non-negative indicator values, rows = samples,
            columns = indicators. Assumes at least 2 rows (``ln(1) == 0``
            would divide by zero) and non-zero column sums — TODO confirm
            callers guarantee this.

    Returns:
        A ``(n_columns, 1)`` DataFrame of weights summing to 1.
    """
    rows = x.index.size
    cols = x.columns.size
    k = 1.0 / math.log(rows)

    data = np.array(x)
    # Hoisted out of the loop: the original recomputed the full column-sum
    # vector on every inner iteration. (A duplicate, unused lnf allocation
    # was also removed.)
    col_sums = data.sum(axis=0)

    # lnf[i][j] = -k * p_ij * ln(p_ij); zero entries contribute 0 by convention.
    lnf = np.zeros((rows, cols))
    for i in range(rows):
        for j in range(cols):
            if data[i][j] == 0:
                lnf[i][j] = 0.0
            else:
                p = data[i][j] / col_sums[j]
                lnf[i][j] = math.log(p) * p * (-k)

    # Redundancy per indicator: d_j = 1 - e_j.
    d = 1 - pd.DataFrame(lnf).sum(axis=0)
    # Normalize redundancies into weights.
    w = [d[j] / sum(d) for j in range(cols)]
    return pd.DataFrame(w)