-
Notifications
You must be signed in to change notification settings - Fork 0
/
Indexer.py
172 lines (148 loc) · 7.01 KB
/
Indexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import os
import pickle
import time

import TermInfo
# Letter groups: each posting file covers an alphabetic range of first letters.
# Maps a term's first character (either case) to its posting-file group name;
# characters not present here fall back to the 'others' group at lookup time.
__dictionary_of_posting_pointers = {}
for _group in ('abc', 'defgh', 'ijklmn', 'opqrs', 'tuvwxyz'):
    for _letter in _group:
        __dictionary_of_posting_pointers[_letter] = _group
        __dictionary_of_posting_pointers[_letter.upper()] = _group
del _group, _letter

main_dictionary = {}  # {term : TermInfo (df, tf, ptr to first occurrence in posting file)}
__posting_files_path = ""          # directory holding the pickled posting files
__current_posting = {}             # the in-memory posting group currently being filled
__current_posting_file_name = ''
__posting_from_disk = {}           # posting group most recently unpickled from disk
__stem_suffix = ''                 # distinguishes stemmed / unstemmed index files

# One in-memory posting dict per letter group; reset() clears these in place,
# so their identities must be preserved by __dictionary_of_postings below.
posting_abc = {}
posting_defgh = {}
posting_ijklmn = {}
posting_opqrs = {}
posting_tuvwxyz = {}
posting_others = {}
__dictionary_of_postings = {
    'abc': posting_abc,
    'defgh': posting_defgh,
    'ijklmn': posting_ijklmn,
    'opqrs': posting_opqrs,
    'tuvwxyz': posting_tuvwxyz,
    'others': posting_others,
}
def create_posting_files(stem_suffix):
    """Create one (empty) pickled posting file on disk per letter group.

    `stem_suffix` distinguishes stemmed/unstemmed index files and is
    remembered module-wide for every later read/write of the posting files.
    """
    global __posting_files_path
    global __dictionary_of_postings
    global __stem_suffix
    __stem_suffix = stem_suffix
    for key, posting in __dictionary_of_postings.items():
        # os.path.join is portable; the original hard-coded a '\\' separator.
        # The with-statement closes the file; no explicit close() needed.
        with open(os.path.join(__posting_files_path, str(key) + __stem_suffix), 'wb') as file:
            pickle.dump(posting, file)
def insert_to_posting(term, docID_tf_dic):
    """Merge `docID_tf_dic` ({doc id: tf}) into the current in-memory posting.

    If `term` already has an entry its map is updated; otherwise
    `docID_tf_dic` itself becomes the term's posting entry (no copy is made,
    matching the original behavior).
    """
    global __current_posting
    # setdefault inserts docID_tf_dic when the term is new; the subsequent
    # update is then a harmless no-op (updating the dict with itself).
    # (The original also had a dead local alias `post`, removed here.)
    __current_posting.setdefault(term, docID_tf_dic).update(docID_tf_dic)
def set_path_to_postiong_files(path):
    """Remember the directory that holds the posting files.

    The misspelled name is kept because existing callers use it; a
    correctly spelled alias is provided below. (A stray `pass` was removed.)
    """
    global __posting_files_path
    __posting_files_path = path


# Correctly spelled, backward-compatible alias.
set_path_to_posting_files = set_path_to_postiong_files
def calculate_tf(doc_id_tf):
    """Return the total term frequency: the sum of per-document tf values.

    `doc_id_tf` is a {doc id: tf} mapping; an empty mapping yields 0.
    """
    return sum(doc_id_tf.values())
def merge_dictionaries(dictionary):  # {term : {doc id : tf}}
    """Merge a per-batch index ({term: {doc id: tf}}) into the main dictionary
    and into the matching in-memory posting group.

    Case folding: a term and its lower-case spelling are treated as the same
    term, and once the lower-case spelling has been seen it wins — an earlier
    all-upper-case entry is relabeled to lower case in both the main
    dictionary and the current posting group.
    """
    global main_dictionary
    global __current_posting
    global __dictionary_of_posting_pointers
    for str_term, doc_tf in dictionary.items():
        # Route the term to the posting group of its first letter;
        # non-alphabetic first characters fall back to 'others'.
        group = __dictionary_of_posting_pointers.get(str_term[0], 'others')
        __current_posting = __dictionary_of_postings[group]
        if str_term in main_dictionary:
            # Exact spelling already known.
            target = str_term
        elif str_term.lower() in main_dictionary:
            # Upper-case sighting of a known lower-case term: fold onto it.
            target = str_term.lower()
        elif str_term.islower() and str_term.upper() in main_dictionary:
            # Lower-case sighting outranks an earlier upper-case entry:
            # relabel the dictionary entry and its posting entry in place.
            upper = str_term.upper()
            main_dictionary[str_term] = main_dictionary.pop(upper)
            if upper in __current_posting:
                __current_posting[str_term] = __current_posting.pop(upper)
            target = str_term
        else:
            # First sighting of this term in any casing.
            main_dictionary[str_term] = TermInfo.TermInfo()
            target = str_term
        # Single accumulation path (the original repeated these four lines
        # in every branch, with the last two branches byte-identical).
        term_info = main_dictionary[target]
        term_info.add_df(len(doc_tf))
        term_info.add_tf(calculate_tf(doc_tf))
        insert_to_posting(target, doc_tf)
# Merge an in-memory posting group into the copy loaded from disk.
# The in-memory dictionary holds the most up-to-date upper/lower-case
# spelling of each term, so its spelling takes precedence over the disk copy.
def mergePostingsAndSaveToDisk(key):  # postings are {term : {doc id : tf}}
    """Merge the in-memory posting group `key` into `__posting_from_disk`
    (populated by readPosting) and write the merged result back to disk.

    The in-memory group carries the most up-to-date upper/lower-case
    spelling of each term, so its spelling wins over the on-disk one.
    """
    global __dictionary_of_postings
    global __posting_from_disk
    dic = __dictionary_of_postings[key]
    for str_term, doc_tf in dic.items():
        if str_term in __posting_from_disk:
            # Same spelling already on disk: merge the {doc id: tf} maps.
            __posting_from_disk[str_term].update(doc_tf)
        elif str_term.upper() in __posting_from_disk:
            # Disk copy still has the stale upper-case spelling: relabel it.
            merged = __posting_from_disk.pop(str_term.upper())
            merged.update(doc_tf)
            __posting_from_disk[str_term] = merged
        else:
            # Term not on disk yet.
            __posting_from_disk[str_term] = doc_tf
    # (The original rebound __posting_from_disk to a local alias of itself —
    # a no-op, removed here.)
    write_posting_file_to_disk(key)
def write_posting_file_to_disk(key):
    """Pickle `__posting_from_disk` to the posting file of group `key`."""
    global __posting_files_path
    global __posting_from_disk
    global __stem_suffix
    # os.path.join is portable; the original hard-coded a '\\' separator.
    # The with-statement closes the file; no explicit close() needed.
    with open(os.path.join(__posting_files_path, str(key) + __stem_suffix), 'wb') as file:
        pickle.dump(__posting_from_disk, file)
def readPosting(key):
    """Load the posting file of group `key` into `__posting_from_disk`."""
    global __posting_files_path
    global __posting_from_disk
    global __stem_suffix
    # NOTE(review): pickle.load must only ever see files this indexer wrote
    # itself — unpickling untrusted data can execute arbitrary code.
    with open(os.path.join(__posting_files_path, str(key) + __stem_suffix), 'rb') as file:
        __posting_from_disk = pickle.load(file)
def SaveAndMergePostings():
    """Flush every in-memory posting group to disk.

    For each letter group: load its on-disk posting file, merge the
    in-memory group into it, rewrite the file, then empty the in-memory
    group so indexing can continue with a clean slate.
    """
    global __dictionary_of_postings
    for group_name, in_memory_posting in __dictionary_of_postings.items():
        readPosting(group_name)
        mergePostingsAndSaveToDisk(group_name)
        in_memory_posting.clear()
def reset():
    """Return the indexer's module-level state to its initial, empty condition.

    Dicts are cleared in place so the identities shared with
    `__dictionary_of_postings` remain valid.
    """
    global main_dictionary, __posting_files_path, __current_posting, __current_posting_file_name
    global posting_abc, posting_defgh, posting_ijklmn, posting_opqrs, posting_tuvwxyz, posting_others
    global __posting_from_disk, __stem_suffix
    main_dictionary.clear()  # {term : TermInfo}
    __posting_files_path = ""
    __current_posting.clear()
    __current_posting_file_name = ''
    # Bug fix: the stem suffix was previously left stale across resets,
    # making later posting-file reads/writes use the old suffix.
    __stem_suffix = ''
    for posting in (posting_abc, posting_defgh, posting_ijklmn,
                    posting_opqrs, posting_tuvwxyz, posting_others):
        posting.clear()
    __posting_from_disk.clear()