/
splitDataset.py
55 lines (44 loc) · 1.84 KB
/
splitDataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# AUTHORS: Modolo Davide & Veenstra Arno
# DATE: 28 May 2010
# DESCRIPTION: As the test dataset has not been released yet,
# the train dataset is split into two parts: a train set and a test set.
# The function accepts one parameter as input:
# [testSet]: This parameter can take the following values:
# 0, 1, 2, 3 and 99 (if a 4 is passed it works like a 0, a 5 like a 1, and so on...)
# Values 0, 1, 2 and 3 split the trial train set, the "data_Weps3_Task2_Trial.txt" file,
# into four different train and test sets.
# Value 99 splits the new train set, "weps-3_task-2_training.tsv" (released a few days ago), into a train and a test set.
def split(testSet):
    """Split the labelled examples of a WePS-3 data file into train and test sets.

    The input file is read line by line; every line whose fifth
    tab-separated field is 'UNKNOWN' is dropped. The remaining lines are
    written to 'data/dataset.txt' and then distributed between
    'data/train.txt' and 'data/test.txt'.

    Args:
        testSet: Selects both the input file and which lines go to the
            test set.
            * 99 reads 'data/weps-3_task-2_training.tsv' and sends one
              line out of every 10 to the test set.
            * Any other value reads 'data/data_Weps3_Task2_Trial.txt' and
              sends one line out of every 4 to the test set:
                0 -> lines 0, 4, 8, 12, ... go in test
                1 -> lines 3, 7, 11, ... go in test
                2 -> lines 2, 6, 10, ... go in test
                3 -> lines 1, 5, 9, ... go in test
              (4 behaves like 0, 5 like 1, and so on).

    Raises:
        IndexError: If a non-blank line has fewer than five tab-separated
            fields.
    """
    if testSet != 99:
        sourcePath = 'data/data_Weps3_Task2_Trial.txt'
        splitFactor = 4
    else:
        sourcePath = 'data/weps-3_task-2_training.tsv'
        splitFactor = 10

    # Keep only the labelled examples. Blank lines (e.g. a trailing newline
    # at the end of the file) carry no example and are skipped instead of
    # crashing on the missing label column.
    datasetList = []
    with open(sourcePath, 'r') as data:
        for line in data:
            if not line.strip():
                continue
            if line.split('\t')[4].strip('\n') != 'UNKNOWN':
                datasetList.append(line)

    # Save the filtered dataset as a whole.
    with open('data/dataset.txt', 'w') as tempset:
        tempset.writelines(datasetList)

    # Every splitFactor-th line (shifted by testSet) goes to the test set,
    # all the others go to the train set. The context managers make sure
    # both files are flushed and closed before the function returns.
    with open('data/train.txt', 'w') as train, \
            open('data/test.txt', 'w') as test:
        for i, line in enumerate(datasetList):
            if (i + testSet) % splitFactor != 0:
                train.write(line)
            else:
                test.write(line)
# When run as a script, build the split selected by testSet 0
# (lines 0, 4, 8, ... of the trial data go to the test set).
if __name__ == "__main__":
    split(testSet=0)