-
Notifications
You must be signed in to change notification settings - Fork 0
/
ReadFile.py
110 lines (92 loc) · 3.49 KB
/
ReadFile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from City import City
from DocumentInfo import DocumentInfo
# ---- Indexes filled while corpus files are read -------------------------
# DOCNO -> DocumentInfo built for that document.
docs_dictionary = {}
# DOCNO -> raw text found inside the document's <TEXT> element.
dic_to_parse = {}
# Upper-cased city name -> City object.
city_dictionary = {}
# Every distinct language name seen so far, in first-seen order.
lang_list = []

# ---- Scratch state describing the document currently being processed ----
current_doc = ""
current_DOCNO = ""
current_CITY = ""
current_DATE = ""
current_LANG = ""

# Characters that are trimmed from both ends of a term.
__punctuations_set = set("[({`)<|&~+^@*?.>;_':]\\}!=#,\"-/")
def __extractDOCNO():
    """Store the current document's number in the module-level ``current_DOCNO``.

    The number is the whitespace-stripped text between ``<DOCNO>`` and the
    first ``</DOCNO>`` of ``current_doc``.

    Raises:
        IndexError: if ``<DOCNO>`` does not occur before the closing tag.
    """
    global current_DOCNO
    before_closing_tag = current_doc.split("</DOCNO>", 1)[0]
    raw_number = before_closing_tag.split("<DOCNO>")[1]
    current_DOCNO = raw_number.strip()
def __extractCITY():
    """Extract the city of the current document from its ``<F P=104>`` tag.

    The first whitespace-separated token after the tag is taken as the city.
    When that token is purely alphabetic it is upper-cased, stored in the
    module-level ``current_CITY`` and registered in ``city_dictionary``:
    a new ``City(current_CITY, current_DOCNO)`` is created for an unseen city,
    otherwise the existing entry's ``dic_doc_index`` gains
    ``{current_DOCNO: ['TAG']}``.

    In every other case (no tag, nothing after the tag, or a non-alphabetic
    token such as ``</F>``) ``current_CITY`` is set to ``""`` so that neither
    the previous document's city nor a raw markup token is carried along.
    """
    global current_CITY
    # Reset first: this is per-document state and must not leak between docs.
    current_CITY = ""
    if "<F P=104>" not in current_doc:
        return
    tokens = current_doc.split("<F P=104>")[1].split()
    # An empty tag yields '</F>' as the first token, which is not alphabetic;
    # a tag at the very end of the document yields no tokens at all.
    if not tokens or not tokens[0].isalpha():
        return
    current_CITY = tokens[0].upper()
    if current_CITY not in city_dictionary:
        city_dictionary[current_CITY] = City(current_CITY, current_DOCNO)
    else:
        city_dictionary[current_CITY].dic_doc_index.update({current_DOCNO: ['TAG']})
def __extractDATE():
    """Extract the date of the current document into ``current_DATE``.

    ``<DATE1>`` is preferred; ``<DATE>`` is used only when ``<DATE1>`` is
    absent. The stored value is the whitespace-stripped text between the
    opening tag and the first matching closing tag. When the document has
    neither tag, ``current_DATE`` is set to ``""`` instead of keeping the
    date of the previously processed document.
    """
    global current_DATE
    # Reset first: this is per-document state and must not leak between docs.
    current_DATE = ""
    for open_tag, close_tag in (("<DATE1>", "</DATE1>"), ("<DATE>", "</DATE>")):
        if open_tag in current_doc:
            before_closing_tag = current_doc.split(close_tag, 1)[0]
            current_DATE = before_closing_tag.split(open_tag)[1].strip()
            break
def __extractTEXT():
    """Record the body of the current document in ``dic_to_parse``.

    The body is the whitespace-stripped text between ``<TEXT>`` and the first
    ``</TEXT>`` of ``current_doc``; a document without ``</TEXT>`` is recorded
    with an empty string. The entry is keyed by ``current_DOCNO``.

    Raises:
        IndexError: if ``</TEXT>`` is present but no ``<TEXT>`` precedes it.
    """
    body = ""
    if "</TEXT>" in current_doc:
        before_closing_tag = current_doc.split("</TEXT>", 1)[0]
        body = before_closing_tag.split("<TEXT>")[1].strip()
    dic_to_parse[current_DOCNO] = body
def clean_term_from_punctuations(term):
    """Return *term* without leading and trailing punctuation characters.

    Every character of the module-level ``__punctuations_set`` is removed
    from both ends of the string; punctuation inside the term is kept.

    Args:
        term: the string to clean.

    Returns:
        The trimmed string, which is empty when *term* consists only of
        punctuation characters (or is empty itself).
    """
    # str.strip treats its argument as a set of characters, which is exactly
    # the trimming rule needed here, and it runs in a single pass.
    return term.strip("".join(__punctuations_set))
def __extractLANG():
    """Extract the language of the current document from its ``<F P=105>`` tag.

    The first whitespace-separated token after the tag is stripped of
    surrounding punctuation and lower-cased. Unless it is empty or purely
    numeric, it is stored with its first letter upper-cased in the
    module-level ``current_LANG`` and appended to ``lang_list`` when not
    already there. In every other case ``current_LANG`` is set to ``""``.
    """
    # Without this declaration the assignments below would only bind a local
    # name and the module-level value would never change.
    global current_LANG
    # Reset first: this is per-document state and must not leak between docs.
    current_LANG = ""
    if '<F P=105>' not in current_doc:
        return
    tokens = current_doc.split("<F P=105>")[1].split()
    if not tokens:
        return
    language = clean_term_from_punctuations(tokens[0]).lower()
    # An all-punctuation token cleans down to "", and numeric tokens are not
    # language names; both leave the language empty.
    if not language or language.isdigit():
        return
    current_LANG = language[0].upper() + language[1:]
    if current_LANG not in lang_list:
        lang_list.append(current_LANG)
def takeDocsInfoFromOneFile(path):
    """Read one corpus file and index every document it contains.

    The file content is split on ``</DOC>``. For each document the DOCNO,
    city, date, language and text extractors are run in that order, and a
    ``DocumentInfo(current_DATE, current_CITY, str(path), current_LANG)`` is
    stored in ``docs_dictionary`` under the document number.

    Args:
        path: location of the file to read; opened in text mode with the
            platform default encoding.
    """
    global current_doc
    # The context manager closes the file even if reading fails.
    with open(path, 'r') as corpus_file:
        text_of_file = corpus_file.read()
    # The piece after the last '</DOC>' is not a document (it has no
    # <DOCNO>), so it is dropped before the extractors run.
    list_of_docs = text_of_file.split('</DOC>')[:-1]
    for doc in list_of_docs:
        current_doc = doc
        __extractDOCNO()
        __extractCITY()
        __extractDATE()
        __extractLANG()
        __extractTEXT()
        docs_dictionary[current_DOCNO] = DocumentInfo(current_DATE, current_CITY, str(path), current_LANG)
def reset():
    """Return the module to its initial state.

    All four module-level containers are emptied in place, so references
    held elsewhere see the change, and the per-document scratch values are
    set back to empty strings.
    """
    global current_CITY, current_doc, current_DATE, current_DOCNO, current_LANG
    for container in (docs_dictionary, dic_to_parse, city_dictionary, lang_list):
        container.clear()
    current_doc = current_DOCNO = current_CITY = current_DATE = current_LANG = ""