-
Notifications
You must be signed in to change notification settings - Fork 1
/
Part2.py
87 lines (77 loc) · 2.89 KB
/
Part2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from bs4 import BeautifulSoup #for finding relevant word
import os
from nltk import PorterStemmer
# Load both input files before touching the outputs, so that a missing input
# cannot leave term_index.txt / term_info.txt truncated. Each ``with`` block
# closes its input handle as soon as the file has been consumed.

# docInd holds one list of whitespace-separated fields per doc_index line.
# The code further down reads field 0 as a document ID, field 1 as a term ID
# and the remaining fields as positions.
docInd = []
with open("F:\\PycharmProjects\\SearchEngine\\files\\doc_index.txt", 'r', encoding='utf8', errors='ignore') as doc_ind:
    for sentence in doc_ind:
        fields = sentence.split()
        if fields:  # a blank line carries no data and would break row[1] lookups
            docInd.append(fields)
row = len(docInd)  # number of doc_index rows loaded

# tid holds the first field (the term ID) of every termids line, in file order.
tid = []
with open("F:\\PycharmProjects\\SearchEngine\\files\\termids.txt", mode="r", encoding="utf-8", errors='ignore') as termID:
    for sentence in termID:
        fields = sentence.split()
        if fields:  # skip blank lines instead of failing on fields[0]
            tid.append(fields[0])

# Output files. They are written by the code that follows, so they are
# deliberately left open here.
term_ind = open("F:\\PycharmProjects\\SearchEngine\\files\\term_index.txt", 'w', encoding='utf8', errors='ignore')
term_inf = open("F:\\PycharmProjects\\SearchEngine\\files\\term_info.txt", 'w', encoding='utf8', errors='ignore')
# Build term_index.txt: one line per term ID in tid, holding the term ID
# followed by tab-separated "doc_gap:position" entries.
#
# Group the doc_index rows by term ID once (keeping file order), so each term
# only visits its own rows instead of rescanning the whole index.
rows_by_term = {}
for fields in docInd:
    if len(fields) >= 2:  # need at least a document ID and a term ID
        rows_by_term.setdefault(fields[1], []).append(fields)

term_occurence = []  # total occurrence of term in corpus, parallel to tid
term_in_docs = []  # total no of docs (doc_index rows) in which term appears, parallel to tid
term_frequency = 0
termdoc = 0
for term in tid:
    postings = rows_by_term.get(term, [])
    termdoc = len(postings)
    term_frequency = 0
    previousdoc = 0
    entries = [str(term)]
    for fields in postings:
        positions = fields[2:]
        if not positions:
            # The row still counts towards termdoc, but there is nothing to
            # emit and the previous document ID is left unchanged.
            continue
        term_frequency += len(positions)
        # First position of a row: gap to the previously written document ID,
        # followed by the position exactly as stored in doc_index.
        entries.append(str(int(fields[0]) - int(previousdoc)) + ":" + positions[0])
        # Later positions belong to the same document, so the document gap is
        # always 0 and the position is stored as the gap to the one before it.
        for previouspos, position in zip(positions, positions[1:]):
            entries.append("0:" + str(int(position) - int(previouspos)))
        previousdoc = fields[0]
    term_occurence.append(term_frequency)
    term_in_docs.append(termdoc)
    term_ind.write("\t".join(entries) + "\n")
# Write term_info.txt: for every term ID, the 1-based offset at which its line
# starts in term_index.txt, its total occurrence count and its document count.

# Close the index first so everything written to it is on disk before its
# line offsets are measured.
term_ind.close()
try:
    # Binary mode makes tell() a real byte offset; in text mode it is only an
    # opaque cookie that is not guaranteed to be a byte position.
    with open("F:\\PycharmProjects\\SearchEngine\\files\\term_index.txt", 'rb') as term_ind:
        offset = 1
        for term, occurrences, doc_count in zip(tid, term_occurence, term_in_docs):
            # The two tabs between the offset and the occurrence count are
            # part of the existing output format and are kept as they are.
            term_inf.write(term + "\t" + str(offset) + "\t\t" + str(occurrences) + "\t" + str(doc_count) + "\n")
            # Step over this term's line; the next term starts right after it.
            term_ind.readline()
            offset = 1 + term_ind.tell()
finally:
    # Always release the output handle so buffered term_info data is flushed.
    term_inf.close()