/
detectEnglish.py
54 lines (40 loc) · 1.68 KB
/
detectEnglish.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# Detect English module
# http://inventwithpython.com/codebreaker (BSD Licensed)
# To use, run:
# import detectEnglish
# detectEnglish.isEnglish(someString) # returns True or False
# (There must be a "dictionary.txt" file in this directory with all English
# words in it, one word per line.)
import re
# Load the word list once, at import time. ENGLISH_WORDS stays a dict whose
# values are all None: only key membership is used for lookups, and keeping
# the dict type avoids breaking callers that already rely on it.
ENGLISH_WORDS = {}
# The with-statement closes the file even if reading it raises. The name
# dictionaryFile remains bound (to the closed file object) afterwards.
with open('dictionary.txt') as dictionaryFile:
    for word in dictionaryFile.read().upper().split('\n'):
        ENGLISH_WORDS[word] = None

# Raw strings so that "\s" is passed to the regex engine untouched instead of
# being treated as an (invalid) string escape sequence.
# Matches every character that is neither an uppercase letter nor whitespace.
nonLettersOrSpacePattern = re.compile(r'[^A-Z\s]')
# Matches every character that is not an uppercase letter.
nonLettersPattern = re.compile(r'[^A-Z]')

# NOTE(review): the name is spelled with three T's. It is left unchanged
# because code outside this module may import it under this spelling.
LETTTERS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
def getEnglishCount(message):
    """Return the fraction of words in message that are in ENGLISH_WORDS.

    The message is upper-cased, every character other than A-Z and
    whitespace is removed, and the remainder is split on whitespace into
    words. Each word is then looked up in ENGLISH_WORDS.

    Returns the number of matching words divided by the total number of
    words. A message that contains no words at all (for example an empty
    string, or only digits and punctuation) yields 0.0.
    """
    message = message.upper()
    # Use a regular expression to get rid of everything except letters and
    # whitespace, so punctuation attached to a word does not spoil the lookup.
    message = nonLettersOrSpacePattern.sub('', message)
    words = message.split()

    if not words:
        # Nothing to score; returning early avoids dividing by zero below.
        return 0.0

    # Count how many of the words exist in the dictionary.
    matches = sum(1 for word in words if word in ENGLISH_WORDS)

    # Fraction of matching words out of total words.
    return matches / len(words)
def isEnglish(message, wordPercentage=20):
    """Return True if enough of the words in message are dictionary words.

    message: the text to test.
    wordPercentage: the minimum percentage (on a 0-100 scale) of the words
        in message that getEnglishCount() must recognize. Defaults to 20.

    Returns the result of comparing the recognized fraction against the
    threshold, i.e. True or False.
    """
    # getEnglishCount() reports a fraction, so scale the percentage to match.
    requiredFraction = wordPercentage / 100
    return getEnglishCount(message) >= requiredFraction