-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataset.py
66 lines (53 loc) · 2.49 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from dt_learning import DecisionTree
from termcolor import colored
import random
def remove_data(examples, attributes_indexes, p):
    """Blank out attribute values at random to simulate missing data.

    Every cell ``ex[attr]``, for each ``ex`` in *examples* and each ``attr``
    in *attributes_indexes*, is independently replaced by the empty string
    ``''`` with probability *p*.

    Args:
        examples: Mutable sequences (one per example); modified in place.
        attributes_indexes: Indexes of the cells eligible for removal.
        p: Removal probability. ``0`` never removes, ``1`` always removes.

    Returns:
        The same *examples* object that was passed in.
    """
    for ex in examples:
        for attr in attributes_indexes:
            # random.random() lies in [0.0, 1.0), so a strict "<" yields
            # exactly probability p, including the boundaries p == 0
            # (never) and p == 1 (always).
            if random.random() < p:
                ex[attr] = ''
    return examples
def get_dts_name(dataset):
    """Return the display name of a dataset and print it as a red banner.

    The name is *dataset* with every ``".data"`` occurrence removed, converted
    to upper case. The banner printed to stdout is a dashed rule, the name,
    the word ``DATASET`` and another dashed rule, all coloured red.
    """
    dts_name = dataset.replace(".data", "").upper()
    rule = "-" * 25
    banner = [colored(text, "red") for text in (rule, dts_name, "DATASET", rule)]
    print("\n", *banner)
    return dts_name
def get_attributes(dataset):
    """Read the attribute names of a dataset and locate its class column.

    The names are taken from the first line of
    ``datasets/names/<dataset with ".data" replaced by ".names">``, which is
    a comma-separated list. The classification column is expected to be named
    ``Class`` when it comes first; otherwise it is assumed to be the last
    column.

    Args:
        dataset: File name of the dataset, e.g. ``"monks.data"``.

    Returns:
        A ``(attribute_names, target_pos)`` tuple, where *attribute_names*
        excludes the classification column and *target_pos* is that column's
        index (``0`` or the last index).
    """
    names_path = 'datasets/names/{}'.format(dataset.replace(".data", ".names"))
    # "with" guarantees the file is closed even if reading fails.
    with open(names_path) as f:
        # Strip the line ending so the last name does not carry a "\n".
        attributes = f.readline().strip().split(',')

    if attributes[0] != "Class":
        # Classification is at the end.
        target_pos = len(attributes) - 1
        names = attributes[:target_pos]
    else:
        # Classification is at the beginning.
        target_pos = 0
        names = attributes[target_pos + 1:]

    print(colored("\nAttributes' name:\n", "green"), names)
    return names, target_pos
def get_examples(dataset, target_position):
    """Load a dataset file and split each row into attributes and class label.

    Reads ``datasets/<dataset>`` line by line. Each non-blank line is stripped
    of surrounding whitespace and split into fields; the field at
    *target_position* is the row's classification and is removed from the
    example.

    Args:
        dataset: File name of the dataset inside the ``datasets`` directory.
        target_position: Index of the classification field: ``0`` when it
            comes first, otherwise the index of the last field.

    Returns:
        A ``(examples, classification)`` tuple of parallel lists:
        *examples* holds the attribute values of each row (a list of strings)
        and *classification* holds the matching class labels.
    """
    # The monks file is space-separated; every other dataset uses commas.
    separator = ' ' if dataset == 'monks.data' else ','
    examples = []
    classification = []
    # "with" guarantees the file is closed even if parsing fails.
    with open('datasets/{}'.format(dataset)) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at end of file) carry
                # no example and would otherwise fail the field lookup below.
                continue
            fields = line.split(separator)
            classification.append(fields[target_position])
            if target_position != 0:
                # Classification is at the end.
                examples.append(fields[:target_position])
            else:
                # Classification is at the beginning.
                examples.append(fields[target_position + 1:])
    return examples, classification
def get_attributes_values(examples, attributes_indexes):
    """Collect and print the values of each listed attribute.

    For every index in *attributes_indexes*, the result of
    ``DecisionTree.get_values(examples, index)`` is appended to the returned
    list, in the same order as the indexes (presumably the distinct values
    that attribute takes in *examples* - confirm against ``DecisionTree``).
    The list is also printed to stdout under a green heading.
    """
    attributes_val = []
    for index in attributes_indexes:
        attributes_val.append(DecisionTree.get_values(examples, index))
    heading = colored("Attributes values:\n", "green")
    print(heading, attributes_val)
    return attributes_val
def get_target_values(classification):
    """Return the distinct class labels in order of first appearance.

    The resulting list is also printed to stdout under a green heading,
    followed by a blank line.
    """
    distinct = []
    for label in classification:
        if label in distinct:
            continue  # already recorded
        distinct.append(label)
    heading = colored("Target values:\n", "green")
    print(heading, distinct, "\n")
    return distinct