Skip to content

Commit

Permalink
Split out functions into statistics package.
Browse files Browse the repository at this point in the history
  • Loading branch information
c11z committed Oct 22, 2018
1 parent d5a69c2 commit a1a4727
Show file tree
Hide file tree
Showing 9 changed files with 170 additions and 170 deletions.
Binary file added data/original_moby_dick.doc
Binary file not shown.
164 changes: 0 additions & 164 deletions main.py
Expand Up @@ -12,170 +12,6 @@
import fire
import logging
import moby
import string
import cmudict # type: ignore
import spacy
from spacy import tokens
from typing import Tuple
from curses.ascii import isdigit
from typing import Dict, Set


# Average reading speeds, in words per minute (CHILD_WPM for a child,
# ADULT_WPM for an adult).
CHILD_WPM = 180
ADULT_WPM = 265

# CMU pronouncing dictionary: maps a word to a list of phoneme sequences.
phoneme_dict: Dict = cmudict.dict()
# Characters treated as vowels by the naive syllable counter ("y" included).
vowels: Set = set("aeiouy")

# A translation table that replaces dashes with spaces and removes all special
# characters except for punctuation.
clean_trans = str.maketrans("-", " ", '"#$%&()*+,/:;<=>@[\\]^_`{|}~')
# A translation table that removes sentence-ending punctuation.
punct_trans = str.maketrans("", "", ".!?")


def count_syllables(doc: tokens.doc.Doc) -> int:
    """
    Count all the syllables in a piece of text.

    Uses the CMU phoneme dictionary for words it knows, and falls back to
    a naive vowel-counting algorithm (less accurate) for everything else.

    :param doc: an iterable of words — a spaCy ``Doc`` (yielding ``Token``)
        or any iterable of strings.
    :return: total syllable count across all words.
    """
    syllable_count = 0
    for word in doc:
        # str() first: a spaCy Token exposes ``lower`` as an int attribute
        # (a hash), not the callable str.lower, so ``word.lower()`` would
        # raise TypeError when iterating a real Doc.
        clean_word = str(word).lower()
        if phoneme_dict.get(clean_word):
            syllable_count += cmu_syllables_in_word(clean_word)
        else:
            syllable_count += naive_syllables_in_word(clean_word)
    return syllable_count


def cmu_syllables_in_word(word: str) -> int:
    """
    Count syllables via the CMU phoneme dictionary.

    Uses the first pronunciation listed for the word and counts the vowel
    phonemes, i.e. those carrying a trailing stress digit.
    """
    phonemes = phoneme_dict[word][0]
    return sum(1 for ph in phonemes if ph.strip(string.ascii_letters))


def naive_syllables_in_word(word: str) -> int:
    """
    Approximate the syllable count of a word by its number of vowels.
    """
    total = 0
    for char in word:
        if char in vowels:
            total += 1
    return total


def automated_readablitity_index(text: str) -> float:
    """
    Compute the Automated Readability Index (ARI) of a text.

    ARI estimates grade level from word difficulty (letters per word) and
    sentence difficulty (words per sentence); its sentence-structure factor
    is identical to that of most other indices, such as the Coleman-Liau
    Index.

    ARI = 4.71 * (L/W) + 0.5 * (W/S) - 21.43
      L: number of letters
      W: number of words
      S: number of sentences
    """
    letters = count_letters(text)
    words = count_words(text)
    sentences = count_sentences(text)
    return 4.71 * (letters / words) + 0.5 * (words / sentences) - 21.43


def coleman_liau_index(text: str) -> float:
    """
    Compute the Coleman-Liau Index (CLI) of a text.

    Unlike most grade-level predictors, Coleman-Liau relies on characters
    instead of syllables per word: computerized assessments count
    characters more easily and accurately than syllables.

    CLI = (0.0588 x L) - (0.296 x S) - 15.8
      L: average number of letters per 100 words
      S: average number of sentences per 100 words

    Fixes over the previous group-based implementation: words were joined
    with no separator (so punctuation and all characters inflated the
    letter counts) and a short final group of fewer than 100 words was
    weighted the same as full groups.  L and S are now exact
    per-100-word rates over the whole text.
    """
    words = count_words(text)
    letters_per_100 = count_letters(text) / words * 100
    sentences_per_100 = count_sentences(text) / words * 100
    return (0.0588 * letters_per_100) - (0.296 * sentences_per_100) - 15.8


def flesch_reading_ease(text: str, doc=None) -> float:
    """
    Compute the Flesch Reading Ease (RE) score of a text.

    A simple approach to assess the grade-level of the reader; a standard
    readability formula used by many US government agencies, including the
    US Department of Defense.  Best used on school text written in English.

    RE = 206.835 - (1.015 x ASL) - (84.6 x ASW)
      ASL: Average Sentence Length (words per sentence)
      ASW: Average number of Syllables per Word

    :param doc: optional pre-tokenized words (e.g. a spaCy Doc).  When
        omitted, words are derived from ``text`` itself.  The previous
        implementation called ``count_syllables(tokens.Doc())`` — an empty
        doc unrelated to ``text`` (and ``tokens.Doc()`` requires a vocab)
        — so ASW was never computed from the actual input.
    """
    words = doc if doc is not None else text.translate(punct_trans).split()
    asl = count_words(text) / count_sentences(text)
    asw = count_syllables(words) / count_words(text)
    return 206.835 - (1.015 * asl) - (84.6 * asw)


def flesch_kincaid_reading_age(text: str, doc=None) -> float:
    """
    Compute the Flesch-Kincaid Reading Age of a text.

    Based on the Flesch Reading Ease formula but corresponds to grade
    school levels in the United States.

    FKRA = (0.39 x ASL) + (11.8 x ASW) - 15.59
      ASL: Average Sentence Length (words per sentence)
      ASW: Average number of Syllables per Word

    :param doc: optional pre-tokenized words (e.g. a spaCy Doc).  When
        omitted, words are derived from ``text`` itself.  The previous
        implementation called ``count_syllables(tokens.Doc())`` — an empty
        doc unrelated to ``text`` (and ``tokens.Doc()`` requires a vocab)
        — so ASW was never computed from the actual input.
    """
    words = doc if doc is not None else text.translate(punct_trans).split()
    asl = count_words(text) / count_sentences(text)
    asw = count_syllables(words) / count_words(text)
    return (0.39 * asl) + (11.8 * asw) - 15.59


def calculate_adult_reading_time(text: str) -> int:
    """Estimate, in whole minutes, how long an average adult takes to read the text."""
    minutes = count_words(text) / ADULT_WPM
    return int(minutes)


def calculate_child_reading_time(text: str) -> int:
    """Estimate, in whole minutes, how long a child takes to read the text."""
    minutes = count_words(text) / CHILD_WPM
    return int(minutes)


def count_letters(text: str) -> int:
    """
    Count the letters in a document: every character remaining after
    sentence punctuation is stripped, minus spaces and newlines.
    """
    stripped = text.translate(punct_trans)
    return len(stripped) - text.count(" ") - text.count("\n")


def count_words(text: str) -> int:
    """Count the whitespace-separated words in a document."""
    return sum(1 for _ in text.split())


def count_sentences(text: str) -> int:
    """Count sentences by tallying terminal punctuation marks (. ! ?)."""
    return sum(text.count(mark) for mark in ".!?")


def count_paragraphs(text: str) -> int:
    """Count paragraphs by tallying occurrences of two consecutive newlines."""
    return text.count("\n" * 2)


def cache() -> None:
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Expand Up @@ -2,8 +2,8 @@
cmudict
spacy
numpy
pyphen
fire
nltk
# Development
black
pytest
Expand Down
Empty file added statistics/__init__.py
Empty file.
79 changes: 79 additions & 0 deletions statistics/counters.py
@@ -0,0 +1,79 @@
import cmudict
import string
from spacy import tokens
from typing import Dict, Set
from curses.ascii import isdigit

# Average reading speeds, in words per minute, for a child and an adult.
CHILD_WPM = 180
ADULT_WPM = 265
# CMU pronouncing dictionary: maps a word to a list of phoneme sequences.
phoneme_dict: Dict = cmudict.dict()
# A translation table that replaces dashes with spaces and removes all special
# characters except for punctuation.
clean_trans = str.maketrans("-", " ", '"#$%&()*+,/:;<=>@[\\]^_`{|}~')
# A translation table that removes sentence-ending punctuation.
punct_trans = str.maketrans("", "", ".!?")
# Characters treated as vowels by the naive syllable counter ("y" included).
vowels: Set = set("aeiouy")


def adult_reading_time(text: str) -> int:
    """Estimate, in whole minutes, how long an average adult takes to read the text."""
    minutes = count_words(text) / ADULT_WPM
    return int(minutes)


def child_reading_time(text: str) -> int:
    """Estimate, in whole minutes, how long a child takes to read the text."""
    minutes = count_words(text) / CHILD_WPM
    return int(minutes)


def count_syllables(doc: tokens.doc.Doc) -> int:
    """
    Count all the syllables in a piece of text.

    Uses the CMU phoneme dictionary for words it knows, and falls back to
    a naive vowel-counting algorithm (less accurate) for everything else.

    :param doc: an iterable of words — a spaCy ``Doc`` (yielding ``Token``)
        or any iterable of strings.
    :return: total syllable count across all words.
    """
    syllable_count = 0
    for word in doc:
        # str() first: a spaCy Token exposes ``lower`` as an int attribute
        # (a hash), not the callable str.lower, so ``word.lower()`` would
        # raise TypeError when iterating a real Doc.
        clean_word = str(word).lower()
        if phoneme_dict.get(clean_word):
            syllable_count += cmu_syllables_in_word(clean_word)
        else:
            syllable_count += naive_syllables_in_word(clean_word)
    return syllable_count


def cmu_syllables_in_word(word: str) -> int:
    """
    Count syllables via the CMU phoneme dictionary.

    Uses the first pronunciation listed for the word and counts the vowel
    phonemes, i.e. those carrying a trailing stress digit.
    """
    first_pronunciation = phoneme_dict[word][0]
    non_letter_remainder = [
        ph for ph in first_pronunciation if ph.strip(string.ascii_letters)
    ]
    return len(non_letter_remainder)


def naive_syllables_in_word(word: str) -> int:
    """
    Approximate the syllable count of a word by its number of vowels.
    """
    count = 0
    for char in word:
        if char in vowels:
            count += 1
    return count


def count_letters(text: str) -> int:
    """
    Count the letters in a document: every character remaining after
    sentence punctuation is stripped, minus spaces and newlines.
    """
    stripped = text.translate(punct_trans)
    return len(stripped) - text.count(" ") - text.count("\n")


def count_words(text: str) -> int:
    """Count the whitespace-separated words in a document."""
    words = text.split()
    return len(words)


def count_sentences(text: str) -> int:
    """Count sentences by tallying terminal punctuation marks (. ! ?)."""
    return sum(text.count(mark) for mark in ".!?")


def count_paragraphs(text: str) -> int:
    """Count paragraphs by tallying occurrences of two consecutive newlines."""
    return text.count("\n" * 2)
84 changes: 84 additions & 0 deletions statistics/reading_indexes.py
@@ -0,0 +1,84 @@
from .counters import *
from spacy import tokens


def coleman_liau_index(text: str) -> float:
    """
    Compute the Coleman-Liau Index (CLI) of a text.

    Unlike most grade-level predictors, Coleman-Liau relies on characters
    instead of syllables per word: computerized assessments count
    characters more easily and accurately than syllables.

    CLI = (0.0588 x L) - (0.296 x S) - 15.8
      L: average number of letters per 100 words
      S: average number of sentences per 100 words

    Fixes over the previous group-based implementation: words were joined
    with no separator (so punctuation and all characters inflated the
    letter counts) and a short final group of fewer than 100 words was
    weighted the same as full groups.  L and S are now exact
    per-100-word rates over the whole text.
    """
    words = count_words(text)
    letters_per_100 = count_letters(text) / words * 100
    sentences_per_100 = count_sentences(text) / words * 100
    return (0.0588 * letters_per_100) - (0.296 * sentences_per_100) - 15.8


def flesch_reading_ease(text: str, doc: tokens.Doc) -> float:
    """
    Compute the Flesch Reading Ease (RE) score of a text.

    A simple approach to assess the grade-level of the reader; a standard
    readability formula used by many US government agencies, including the
    US Department of Defense.  Best used on school text written in English.

    RE = 206.835 - (1.015 x ASL) - (84.6 x ASW)
      ASL: Average Sentence Length (words per sentence)
      ASW: Average number of Syllables per Word
    """
    word_total = count_words(text)
    avg_sentence_length = word_total / count_sentences(text)
    avg_syllables_per_word = count_syllables(doc) / word_total
    return 206.835 - (1.015 * avg_sentence_length) - (84.6 * avg_syllables_per_word)


def flesch_kincaid_reading_age(text: str, doc: tokens.Doc) -> float:
    """
    Compute the Flesch-Kincaid Reading Age of a text.

    Based on the Flesch Reading Ease formula but corresponds to grade
    school levels in the United States.

    FKRA = (0.39 x ASL) + (11.8 x ASW) - 15.59
      ASL: Average Sentence Length (words per sentence)
      ASW: Average number of Syllables per Word
    """
    word_total = count_words(text)
    avg_sentence_length = word_total / count_sentences(text)
    avg_syllables_per_word = count_syllables(doc) / word_total
    return (0.39 * avg_sentence_length) + (11.8 * avg_syllables_per_word) - 15.59


def automated_readablitity_index(text: str) -> float:
    """
    Compute the Automated Readability Index (ARI) of a text.

    ARI estimates grade level from word difficulty (letters per word) and
    sentence difficulty (words per sentence); its sentence-structure factor
    is identical to that of most other indices, such as the Coleman-Liau
    Index.

    ARI = 4.71 * (L/W) + 0.5 * (W/S) - 21.43
      L: number of letters
      W: number of words
      S: number of sentences
    """
    letters = count_letters(text)
    words = count_words(text)
    sentences = count_sentences(text)
    return 4.71 * (letters / words) + 0.5 * (words / sentences) - 21.43
6 changes: 6 additions & 0 deletions statistics/test_counters.py
@@ -0,0 +1,6 @@
from . import counters


def test_naive_syllables_in_word():
    """The naive counter tallies one syllable per vowel; non-letters add none."""
    expected = {"hello": 2, ";": 0}
    for word, count in expected.items():
        assert counters.naive_syllables_in_word(word) == count
Empty file.
5 changes: 0 additions & 5 deletions test_main.py
Expand Up @@ -3,8 +3,3 @@

def test_noop():
    """main.noop() should do nothing and yield None."""
    result = main.noop()
    assert result == None


def test_naive_syllables_in_word():
    """The naive counter tallies one syllable per vowel; non-letters add none."""
    for word, count in [("hello", 2), (";", 0)]:
        assert main.naive_syllables_in_word(word) == count

0 comments on commit a1a4727

Please sign in to comment.