-
Notifications
You must be signed in to change notification settings - Fork 0
/
load_data.py
105 lines (82 loc) · 2.68 KB
/
load_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import pandas as pd
import numpy as np
from imblearn.over_sampling import \
ADASYN, BorderlineSMOTE, KMeansSMOTE, \
RandomOverSampler, SMOTE, SMOTENC, SVMSMOTE
from imblearn.under_sampling import \
TomekLinks, RandomUnderSampler, CondensedNearestNeighbour
import os
def __load_data_first_time():
    """Read 'adult.data' and return it as numeric features plus hidden columns.

    The file is read from the current working directory as a header-less CSV
    with the fifteen column names listed below.

    Processing steps:
      * the 'fnlwgt' column is dropped;
      * the 'race' and 'sex' columns are split off, unmodified, into a
        separate frame and removed from the feature frame;
      * 'earnings' becomes 1.0 where the raw value equals ' >50K', else 0.0;
      * every other non-numeric column is replaced by one 0.0/1.0 indicator
        column per distinct value, named '<column>_<value>';
      * every numeric column is converted to float.

    Indicator names carry the source column as a prefix because the same raw
    value can occur in several columns (e.g. ' ?'); naming indicators by the
    bare value would make later columns overwrite earlier ones.

    Returns:
        A tuple ``(df, hidden_df)``: the all-float feature frame (which still
        contains the 'earnings' column) and the frame holding the untouched
        'race' and 'sex' columns, both sharing the same row index.
    """
    df: pd.DataFrame = pd.read_csv(
        'adult.data',
        names=[
            'age',
            'workclass',
            'fnlwgt',
            'education',
            'education-num',
            'marital-status',
            'occupation',
            'relationship',
            'race',
            'sex',
            'capital-gain',
            'capital-loss',
            'hours-per-week',
            'native-country',
            'earnings',
        ],
    )

    # Split to hidden and visible data, and remove unnecessary columns
    hidden_columns = ['race', 'sex']
    hidden_df = df[hidden_columns]
    df = df.drop(columns=['fnlwgt'] + hidden_columns)

    # Turning categorical data to numerical
    indicator_columns = {}
    for column in list(df.columns):
        if column == 'earnings':
            df[column] = (df[column] == ' >50K').astype(float)
        elif not pd.api.types.is_numeric_dtype(df[column]):
            # Sorted so the indicator order is deterministic.
            for unique_value in sorted(df[column].unique()):
                name = f'{column}_{unique_value}'
                indicator_columns[name] = (df[column] == unique_value).astype(float)
            del df[column]
        else:
            df[column] = df[column].astype(float)

    # Append all indicators at once, after the numeric columns, instead of
    # inserting them into the frame one at a time.
    df = pd.concat(
        [df, pd.DataFrame(indicator_columns, index=df.index)],
        axis=1,
    )

    return df, hidden_df
def load_data(mode: str, normalize: bool = True):
    """Load the dataset, optionally rebalance the classes, and return arrays.

    Args:
        mode: Resampling strategy applied to the features and labels.
            'vanilla' leaves the data untouched. 'smote', 'adasyn' and
            'bordersmote' use the corresponding imbalanced-learn
            over-samplers. 'randomover', 'randomunder', 'tomek' and 'knn'
            (CondensedNearestNeighbour with n_neighbors=3) use samplers that
            select existing rows.
        normalize: When true, each feature column is shifted to zero mean and
            scaled to unit standard deviation after resampling.

    Returns:
        A tuple ``(x, y, hidden)``: the float feature matrix, the integer
        'earnings' labels, and the hidden columns as a NumPy array.

        NOTE: for 'smote', 'adasyn' and 'bordersmote' the hidden array is
        returned exactly as loaded, i.e. it is NOT resampled, so it is not
        guaranteed to line up row-for-row with ``x`` and ``y`` in those modes.
        For the row-selecting modes ``hidden`` is re-indexed with the
        sampler's selected row indices and stays aligned.

    Raises:
        ValueError: If ``mode`` is not one of the supported names.
    """
    df, hidden_df = __load_data_first_time()

    # Extract x and y
    y = np.array(df['earnings'].to_numpy(), dtype=int)
    del df['earnings']
    x = np.array(df.to_numpy(), dtype=float)

    # Hidden to numpy
    hidden = hidden_df.to_numpy()

    # Samplers that may create new rows: no per-row index is available.
    synthetic_samplers = {
        'smote': (SMOTE, {}),
        'adasyn': (ADASYN, {}),
        'bordersmote': (BorderlineSMOTE, {}),
    }
    # Samplers that only pick existing rows and expose `sample_indices_`.
    selecting_samplers = {
        'randomover': (RandomOverSampler, {}),
        'randomunder': (RandomUnderSampler, {}),
        'tomek': (TomekLinks, {}),
        'knn': (CondensedNearestNeighbour, {'n_neighbors': 3}),
    }

    if mode in synthetic_samplers:
        sampler_cls, kwargs = synthetic_samplers[mode]
        x, y = sampler_cls(**kwargs).fit_resample(x, y)
    elif mode in selecting_samplers:
        sampler_cls, kwargs = selecting_samplers[mode]
        sampler = sampler_cls(**kwargs)
        x, y = sampler.fit_resample(x, y)
        # Apply the same row selection to the hidden attributes.
        hidden = hidden[sampler.sample_indices_]
    elif mode != 'vanilla':
        supported = ['vanilla', *synthetic_samplers, *selecting_samplers]
        raise ValueError(
            f'Unknown mode {mode!r}; expected one of {supported}'
        )

    if normalize:
        x -= np.mean(x, axis=0)
        std = np.std(x, axis=0)
        # A constant column (e.g. an indicator whose category vanished after
        # under-sampling) has zero std; dividing by it would produce NaN/inf.
        # Such a column is already all zeros after centering, so leave it.
        std[std == 0] = 1.0
        x /= std

    return x, y, hidden