slidingWindow.py
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import utility
import networkx as nx
"""
TODO:
-remove further words like 'mr.' or 'mrs.'
-optimize the windowsize
-maybe further preprocessing on top of lemmatization
-test other filters than Noun/Adjective
"""
def textRank(file):
    with open(file) as fp:
        # Read the text
        text = fp.read().lower()
    # Work on words
    words = word_tokenize(text)
    # Work on sentences
    # sentences = sent_tokenize(text)  (not used)
    def preprocess(list_of_strings):
        # Clean strings from punctuation and stopwords
        clean_strings = [x for x in list_of_strings if x not in [',', ':', '.', '?']]
        stop_words = set(stopwords.words("english"))  # build the set once instead of per word
        clean_strings = [x for x in clean_strings if x not in stop_words]
        # Remove additional words
        clean_strings = [x for x in clean_strings if x not in ['mr.', 'mrs.']]
        # Get the POS tags of the words
        posTags = utility.posTag(clean_strings)
        # Lemmatize the words using the POS tags
        stemmed_strings = utility.lemmatize(clean_strings, posTags)
        # Zip the result into [[word1, tag], [word2, tag], ...]
        result = []
        for i in range(len(stemmed_strings)):
            result.append([stemmed_strings[i], posTags[i][1]])
        return result
    def sim(word_pos_stream):
        similarities = {}
        windowsize = 5
        i = 0
        # The last full window starts at len(word_pos_stream) - windowsize
        while i <= len(word_pos_stream) - windowsize:
            # Get the sliding window
            window = word_pos_stream[i: i + windowsize]
            # Filter the sliding window: keep only nouns and adjectives
            window = [x[0] for x in window if x[1] in ['NN', 'NNP', 'NNS', 'NNPS', 'JJ', 'JJR', 'JJS']]
            for word in window:
                if word not in similarities:
                    similarities[word] = []
                for otherWord in window:
                    # Link each pair of distinct words that share a window, but only
                    # once (compare against the stored neighbour names, not the pairs)
                    if word != otherWord and otherWord not in [d[0] for d in similarities[word]]:
                        similarities[word].append([otherWord, 1])
            i += 1
        return similarities
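
    # For illustration (made-up values): given the stream
    # [['quick', 'JJ'], ['fox', 'NN'], ...], sim returns something like
    # {'quick': [['fox', 1], ...], 'fox': [['quick', 1], ...], ...},
    # i.e. each kept word mapped to its co-occurring neighbours.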
    def create_graph(edges):
        """
        This function receives a dict mapping each word to its
        [neighbour, weight] pairs and returns a networkx graph
        """
        G = nx.Graph()
        for src, destinations in edges.items():
            for dest, weight in destinations:
                G.add_edge(src, dest, weight=weight)
        return G
print("prepocess...")
preprocessedText = preprocess(words)#[0:500])
print("find similarities...")
similarities = sim(preprocessedText)
print("create graph...")
G = create_graph(similarities)
pr = nx.pagerank(G)
return G, pr
# print(sorted(pr))
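

if __name__ == "__main__":
    # Minimal usage sketch: 'input.txt' is a placeholder path, not a file
    # that ships with this repo; point it at a real plain-text document.
    G, pr = textRank("input.txt")
    # Print the ten highest-ranked candidate keywords
    for word in sorted(pr, key=pr.get, reverse=True)[:10]:
        print(word, pr[word])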