forked from dfm/aas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_data.py
42 lines (35 loc) · 1.13 KB
/
generate_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import (division, print_function, absolute_import,
unicode_literals)
import json
from math import log
from collections import defaultdict
from aas.data import Dataset
from aas.abstract_parse import words2dict
def document_frequencies(abstracts):
    """Count how many abstracts mention each word.

    Args:
        abstracts: iterable of dict-like documents, each carrying a
            ``"counts"`` mapping keyed by word.

    Returns:
        A ``defaultdict(int)`` mapping each word to the number of documents
        whose ``"counts"`` mapping contains it.
    """
    freq = defaultdict(int)
    for doc in abstracts:
        # Iterate keys only: a word counts once per document regardless of
        # how often it occurs in it.  (Plain key iteration also works on
        # both Python 2 and 3, unlike ``dict.iteritems``.)
        for word in doc["counts"]:
            freq[word] += 1
    return freq


def idf_weights(doc_freq, n_docs):
    """Turn document frequencies into inverse-document-frequency weights.

    Args:
        doc_freq: mapping of word -> number of documents containing it.
        n_docs: total number of documents in the corpus.

    Returns:
        A dict mapping each word to ``log(n_docs / doc_freq[word])``.  An
        empty ``doc_freq`` yields an empty dict.
    """
    # Relies on true division; the file enables it for Python 2 through
    # ``from __future__ import division``.
    return {word: log(n_docs / n) for word, n in doc_freq.items()}


def apply_idf(abstracts, idf):
    """Scale every per-document word count by its IDF weight, in place.

    Args:
        abstracts: iterable of documents with a ``"counts"`` mapping.
        idf: mapping of word -> weight covering every word in ``abstracts``.
    """
    for doc in abstracts:
        counts = doc["counts"]
        # Only values are reassigned, so iterating the mapping is safe.
        for word in counts:
            counts[word] *= idf[word]


def main(source="data/abstracts.json", target="aas/abstracts.json"):
    """Build the IDF-weighted abstracts file.

    Loads the dataset from ``source``, attaches a ``"counts"`` entry to each
    document (the result of ``words2dict`` on its ``"words"``), rescales
    those counts by IDF and dumps the documents to ``target`` as JSON.

    Args:
        source: path handed to ``Dataset``.
        target: path of the JSON file to write.
    """
    print("Loading dataset...")
    dataset = Dataset(source)
    print("Finished.")

    print("Parsing abstracts...")
    abstracts = []
    for doc in dataset:
        # NOTE(review): words2dict presumably returns a word -> count
        # mapping; confirm against aas.abstract_parse.
        doc["counts"] = words2dict(doc["words"])
        abstracts.append(doc)
    doc_freq = document_frequencies(abstracts)
    print("Finished.")

    print("Normalizing by IDF...")
    apply_idf(abstracts, idf_weights(doc_freq, len(abstracts)))
    print("Finished.")

    print("Saving data file...")
    with open(target, "w") as f:
        json.dump(abstracts, f, sort_keys=True, indent=2,
                  separators=(",", ": "))


if __name__ == "__main__":
    main()