-
Notifications
You must be signed in to change notification settings - Fork 3
/
gettraindev.py
75 lines (54 loc) · 1.69 KB
/
gettraindev.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#-*-encoding=utf8-*-
import random
import codecs
import sys
import argparse
def opt():
    """Parse this script's command-line options from ``sys.argv``.

    Recognised options:
        --infile  path of the corpus to read (string, defaults to "").
        --count   integer, defaults to 2000.
        --name    string, required; the script uses it to build output names.
        --s       integer, defaults to 0; passed to ``random.seed``.

    Returns:
        The ``argparse.Namespace`` with attributes ``infile``, ``count``,
        ``name`` and ``s``.
    """
    # One (flag, keyword-arguments) entry per option, registered in order.
    option_table = (
        ("--infile", {"type": str, "default": ""}),
        ("--count", {"type": int, "default": 2000}),
        ("--name", {"type": str, "required": True}),
        ("--s", {"type": int, "default": 0, "help": "rand"}),
    )
    cli = argparse.ArgumentParser()
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
def _pick_dev_ids(line_total, count):
    """Choose which line indices are routed to the dev set.

    Indices are drawn from the module-level ``random`` generator, so the
    caller controls reproducibility by seeding it beforehand.

    Args:
        line_total: Number of lines in the corpus; valid indices are
            ``0 .. line_total - 1``.
        count: Number of distinct indices requested.

    Returns:
        A set of distinct indices. It holds ``count`` entries, except that a
        request for ``line_total`` or more lines returns every index, and a
        non-positive ``count`` returns an empty set.
    """
    if count >= line_total:
        # Asking for the whole corpus (or more) used to spin forever because
        # the set can never grow past line_total entries, and an empty corpus
        # made randint(0, -1) raise ValueError. No sampling is needed here.
        return set(range(line_total))

    chosen = set()
    previous = -1  # -1 means "nothing drawn yet"
    while len(chosen) < count:
        candidate = random.randint(0, line_total - 1)
        if previous != -1 and candidate == previous:
            # Same index twice in a row: reseed from it and draw again. Kept
            # exactly as before so an existing --s value still yields the
            # same split.
            # NOTE(review): if the first draw after reseeding repeats
            # `previous`, this cannot make progress. random.sample() would
            # avoid that but would change every existing split.
            random.seed(previous)
            continue
        chosen.add(candidate)
        previous = candidate
    return chosen


def _main():
    """Split the input corpus into train and dev files.

    Reads every line of ``--infile`` (UTF-8), sends ``--count`` randomly
    chosen lines to ``training/<name>_dev.utf8`` and the remaining lines to
    ``training/<name>_training.utf8``. Each whitespace-separated token of the
    training lines is also written, one per line, to
    ``training/<name>_training_words.utf8``. The ``training/`` directory is
    not created here, so it has to exist already.
    """
    opts = opt()
    name = opts.name
    train_path = "training/" + name + "_training.utf8"
    dev_path = "training/" + name + "_dev.utf8"
    words_path = "training/" + name + "_training_words.utf8"

    # Seed before sampling so the same --s always produces the same split.
    random.seed(opts.s)

    # Read the corpus completely before any output file is opened, so an
    # unreadable input no longer truncates existing outputs.
    with codecs.open(opts.infile, "r", "utf8") as corpus:
        lines = corpus.readlines()

    dev_ids = _pick_dev_ids(len(lines), opts.count)

    # The with-statement closes all three outputs even if a write fails.
    with codecs.open(train_path, "w", "utf8") as train_out, \
            codecs.open(dev_path, "w", "utf8") as dev_out, \
            codecs.open(words_path, "w", "utf8") as words_out:
        for index, line in enumerate(lines):
            if index in dev_ids:
                dev_out.write(line)
                continue
            train_out.write(line)
            # split() with no argument already discards surrounding
            # whitespace and the trailing newline.
            for word in line.split():
                words_out.write(word + "\n")


if __name__ == "__main__":
    _main()