-
Notifications
You must be signed in to change notification settings - Fork 96
/
analysis.py
35 lines (27 loc) · 1.08 KB
/
analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import re
import string
import Stemmer
# Stopword list: the 25 most common English words plus "wikipedia",
# per https://en.wikipedia.org/wiki/Most_common_words_in_English
STOPWORDS = set(
    'the be to of and a in that have i it for not on with '
    'he as you do at this but his by from wikipedia'.split()
)
# Matches any single ASCII punctuation character (for stripping).
PUNCTUATION = re.compile(f'[{re.escape(string.punctuation)}]')
# Shared English Snowball stemmer instance (PyStemmer).
STEMMER = Stemmer.Stemmer('english')
def tokenize(text):
    """Split *text* into tokens on runs of whitespace."""
    pieces = text.split()
    return pieces
def lowercase_filter(tokens):
    """Return a new list with every token lowercased."""
    return list(map(str.lower, tokens))
def punctuation_filter(tokens):
    """Strip all punctuation characters from each token.

    Tokens that were pure punctuation become empty strings; callers
    are expected to drop empties later.
    """
    cleaned = (PUNCTUATION.sub('', tok) for tok in tokens)
    return list(cleaned)
def stopword_filter(tokens):
    """Drop any token that appears in the module-level STOPWORDS set."""
    kept = []
    for tok in tokens:
        if tok not in STOPWORDS:
            kept.append(tok)
    return kept
def stem_filter(tokens):
    """Stem each token with the shared English Snowball stemmer."""
    stemmed = STEMMER.stemWords(tokens)
    return stemmed
def analyze(text):
    """Run the full analysis pipeline on *text*.

    Tokenizes on whitespace, then lowercases, strips punctuation,
    removes stopwords, and stems; empty tokens produced along the
    way are dropped from the result.
    """
    pipeline = (
        lowercase_filter,
        punctuation_filter,
        stopword_filter,
        stem_filter,
    )
    tokens = tokenize(text)
    for step in pipeline:
        tokens = step(tokens)
    return [tok for tok in tokens if tok]