Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Split out functions into statistics package.
- Loading branch information
Showing
9 changed files
with
170 additions
and
170 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,8 +2,8 @@ | |
cmudict | ||
spacy | ||
numpy | ||
pyphen | ||
fire | ||
nltk | ||
# Development | ||
black | ||
pytest | ||
|
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
import cmudict | ||
import string | ||
from spacy import tokens | ||
from typing import Dict, Set | ||
from curses.ascii import isdigit | ||
|
||
# Average reading speeds (words per minute) for children and adults.
CHILD_WPM = 180
ADULT_WPM = 265
# CMU pronouncing dictionary: maps lowercase words to lists of phoneme
# sequences; loaded once at import time and used for syllable counting.
phoneme_dict: Dict = cmudict.dict()
# Translation table that replaces dashes with spaces and strips all
# special characters except sentence-ending punctuation.
# NOTE(review): clean_trans is not referenced anywhere in this module —
# confirm whether an external caller depends on it before removing.
clean_trans = str.maketrans("-", " ", '"#$%&()*+,/:;<=>@[\\]^_`{|}~')
# Translation table that removes sentence-ending punctuation.
punct_trans = str.maketrans("", "", ".!?")
# Vowels used by the naive syllable counter ("y" deliberately included).
vowels: Set = set("aeiouy")
|
||
|
||
def adult_reading_time(text: str) -> int:
    """Estimate how many whole minutes an average adult needs to read *text*.

    Uses the ADULT_WPM average reading rate; the result is truncated to
    an integer number of minutes.
    """
    word_total = count_words(text)
    return int(word_total / ADULT_WPM)
|
||
|
||
def child_reading_time(text: str) -> int:
    """Estimate how many whole minutes a child needs to read *text*.

    Uses the CHILD_WPM average reading rate; the result is truncated to
    an integer number of minutes.
    """
    word_total = count_words(text)
    return int(word_total / CHILD_WPM)
|
||
|
||
def count_syllables(doc: tokens.doc.Doc) -> int:
    """
    Count all the syllables in a piece of text.

    Uses the CMU phoneme dictionary when the word appears in it, and
    falls back to a naive vowel-counting heuristic otherwise.
    """
    syllable_count = 0
    for word in doc:
        # Bug fix: spaCy's Token.lower is an integer hash attribute, not
        # a method, so word.lower() raised TypeError. str(word) yields
        # the token's text (and is a no-op for plain strings), which we
        # then lowercase for the dictionary lookup.
        clean_word = str(word).lower()
        if phoneme_dict.get(clean_word):
            syllable_count += cmu_syllables_in_word(clean_word)
        else:
            syllable_count += naive_syllables_in_word(clean_word)
    return syllable_count
|
||
|
||
def cmu_syllables_in_word(word: str) -> int:
    """
    Count syllables via the CMU pronouncing dictionary.

    Takes the first listed pronunciation and tallies the phonemes that
    carry a trailing stress digit — only vowel phonemes do, so the tally
    equals the syllable count.
    """
    pronunciation = phoneme_dict[word][0]
    return sum(1 for phoneme in pronunciation if phoneme.strip(string.ascii_letters))
|
||
|
||
def naive_syllables_in_word(word: str) -> int:
    """
    Approximate the syllable count of *word* by tallying its vowels
    (a rough proxy; "y" is treated as a vowel).
    """
    return sum(1 for char in word if char in vowels)
|
||
|
||
def count_letters(text: str) -> int:
    """Count the letters in a document.

    Sentence-ending punctuation, spaces, and newlines are excluded from
    the count.
    """
    without_punct = text.translate(punct_trans)
    whitespace = text.count(" ") + text.count("\n")
    return len(without_punct) - whitespace
|
||
|
||
def count_words(text: str) -> int:
    """Count the whitespace-separated words in a document."""
    words = text.split()
    return len(words)
|
||
|
||
def count_sentences(text: str) -> int:
    """Count the sentences in a document by tallying terminal punctuation."""
    return sum(text.count(mark) for mark in ".!?")
|
||
|
||
def count_paragraphs(text: str) -> int:
    """Count the number of paragraphs in a document.

    Paragraphs are chunks of text separated by blank lines. The previous
    implementation returned the number of "\\n\\n" separators, which
    undercounts by one for any non-empty document (e.g. "a\\n\\nb" has two
    paragraphs but one separator) and overcounts trailing blank lines.
    Counting the non-empty chunks gives the actual paragraph count, and
    returns 0 for an empty or whitespace-only document.
    """
    return len([chunk for chunk in text.split("\n\n") if chunk.strip()])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
from .counters import * | ||
from spacy import tokens | ||
|
||
|
||
def coleman_liau_index(text: str) -> float:
    """
    Similar to the Automated Readability Index, but unlike most of the other
    grade-level predictors, the Coleman-Liau relies on characters instead of
    syllables per word. Instead of using syllable/word and sentence length
    indices, Meri Coleman and T. L. Liau believed that computerized assessments
    understand characters more easily and accurately than counting syllables
    and sentence length.
    CLI = (0.0588 x L) - (0.296 x S) - 15.8
    CLI: Coleman Liau Index
    L: average number of Letters per 100 words
    S: average number of sentences per 100 words
    """
    # Bug fixes versus the chunked implementation: it joined each
    # 100-word group with "" (merging words), used len() on the group so
    # punctuation counted as letters instead of using count_letters, and
    # let a short final group skew the average. Computing the per-100-word
    # rates over the whole text is equivalent to the formula's definition
    # and avoids all three problems.
    word_total = count_words(text)
    l = count_letters(text) / word_total * 100
    s = count_sentences(text) / word_total * 100
    return (0.0588 * l) - (0.296 * s) - 15.8
|
||
|
||
def flesch_reading_ease(text: str, doc: tokens.Doc) -> float:
    """
    The Flesch Reading Ease Formula is a simple approach to assess the
    grade-level of the reader, best used on school text. It has become a
    standard readability formula used by many US Government Agencies,
    including the US Department of Defense, to assess the difficulty of a
    reading passage written in English.
    RE = 206.835 - (1.015 x ASL) - (84.6 x ASW)
    RE: Reading Ease
    ASL: Average Sentence Length
    ASW: Average number of Syllables per Word
    """
    # Hoist the word count: it feeds both ratios.
    word_total = count_words(text)
    avg_sentence_length = word_total / count_sentences(text)
    avg_syllables_per_word = count_syllables(doc) / word_total
    return 206.835 - (1.015 * avg_sentence_length) - (84.6 * avg_syllables_per_word)
|
||
|
||
def flesch_kincaid_reading_age(text: str, doc: tokens.Doc) -> float:
    """
    Flesch Kincaid reading age is based on the Flesch Reading Ease formula
    but corresponds to grade school levels in the United States.
    FKRA = (0.39 x ASL) + (11.8 x ASW) - 15.59
    FKRA: Flesch Kincaid Reading Age
    ASL: Average Sentence Length
    ASW: Average number of Syllables per Word
    """
    # Hoist the word count: it feeds both ratios.
    word_total = count_words(text)
    avg_sentence_length = word_total / count_sentences(text)
    avg_syllables_per_word = count_syllables(doc) / word_total
    return (0.39 * avg_sentence_length) + (11.8 * avg_syllables_per_word) - 15.59
|
||
|
||
def automated_readablitity_index(text: str) -> float:
    """
    The Automated Readability Index is derived from ratios representing word
    difficulty (number of letters per word) and sentence difficulty (number
    of words per sentence). The sentence-structure factor (average words per
    sentence) is identical to that found in most currently used indices,
    such as the Coleman-Liau Index.
    ARI = 4.71 * (L/W) + 0.5 * (W/S) - 21.43
    ARI: Automated Readability Index
    L: number of letters
    W: number of words
    S: number of sentences

    NOTE(review): the function name misspells "readability"; it is kept
    as-is because callers may depend on it.
    """
    letters = count_letters(text)
    words = count_words(text)
    sentences = count_sentences(text)
    return 4.71 * (letters / words) + 0.5 * (words / sentences) - 21.43
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
from . import counters | ||
|
||
|
||
def test_naive_syllables_in_word():
    # A bare punctuation mark has no vowels; "hello" has two.
    assert counters.naive_syllables_in_word(";") == 0
    assert counters.naive_syllables_in_word("hello") == 2
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters