In [17]:
import pandas as pd
from spacy.lang.en import English
from os import listdir, makedirs
from os.path import isdir, isfile, join, dirname

In [18]:
class Readability:

	def __init__(self):
		pass

	# Calculate syllables in the content
	def syllable_count(self, content):

		n_syllables = 0
		# For every word in the content
		for word in content.split():

			word = word.lower()
			count = 0
			vowels = "aeiouy"

			if word[0] in vowels:
				count += 1

			for index in range(1, len(word)):
				if word[index] in vowels and word[index - 1] not in vowels:
					count += 1

			# Not considering "e" if its at the end of the word
			if word.endswith("e"):
				count -= 1

			# Considering minimum 1 syllable
			if count == 0:
				count += 1
			# Syllables count for complete content
			n_syllables = n_syllables + count

		return n_syllables

	# Calculating number of sentences with Spacy
	def sentence_count(self, content):
		nlp = English()
		# sentencizer = nlp.create_pipe("sentencizer")
		nlp.add_pipe("sentencizer")

		doc = nlp(content)
		n_sentences = len(list(doc.sents))

		return n_sentences

	# Calculating number of words with Spacy
	def word_count(self, content):
		nlp = English()
		doc = nlp(content)

		words = [token.text for token in doc]
		n_words = len(words)

		return n_words

	# Calculating reading ease score as per formula
	def get_reading_ease_score(self, n_words, n_sentences, n_syllables):
		# Flesch reading ease formula
		score = 206.835 - 1.015 * (n_words / n_sentences) - 84.6 * (n_syllables / n_words)
		score = round(score, 2)
		return score

	# Calculating grade level score as per formula
	def get_grade_level_score(self, n_words, n_sentences, n_syllables):
		# Flesch–Kincaid grade level formula
		score = 0.39 * (n_words / n_sentences) + 11.8 * (n_syllables / n_words) - 15.59
		score = round(score, 2)
		return score

In [19]:
HEALTH_TOPIC_SUMMARIES_PATH = '/Users/simi/Documents/SFSU/Research/QMOHI_tool/util/Ideal Document Generation/health_topics_summary'

In [22]:
readability = Readability()
output_df = pd.DataFrame(columns=['filename', 'n_sentences', 'n_words', 'n_syllables', 'reading_ease', 'grade_level'])

health_topic_files = [f for f in listdir(HEALTH_TOPIC_SUMMARIES_PATH) if isfile(join(HEALTH_TOPIC_SUMMARIES_PATH, f))]

for i, health_topic_file in enumerate(health_topic_files):
    with open(join(HEALTH_TOPIC_SUMMARIES_PATH, health_topic_file)) as f:
        text = f.read()

    # Calculate all the numbers required for the formula
    n_sentences = readability.sentence_count(text)
    n_words = readability.word_count(text)
    n_syllables = readability.syllable_count(text)

    # Calculate reading ease
    reading_ease = readability.get_reading_ease_score(n_words, n_sentences, n_syllables)

    # Calculate grade level
    grade_level = readability.get_grade_level_score(n_words, n_sentences, n_syllables)

    dct = {
        'filename': [health_topic_file],
        'n_sentences': [n_sentences],
        'n_words': [n_words],
        'n_syllables': [n_syllables],
        'reading_ease': [reading_ease],
        'grade_level': [grade_level],
    }
    df = pd.DataFrame(dct)
    output_df = pd.concat([output_df, df], ignore_index = True)
    output_df.reset_index()


output_df.to_csv("train_data_health_topics.csv")

n_words = 134
n_sentences =  9
n_syllables =  173
n_words = 153
n_sentences =  13
n_syllables =  219
n_words = 188
n_sentences =  8
n_syllables =  269
n_words = 250
n_sentences =  15
n_syllables =  369
n_words = 225
n_sentences =  13
n_syllables =  318
n_words = 798
n_sentences =  38
n_syllables =  981
n_words = 153
n_sentences =  10
n_syllables =  218
n_words = 107
n_sentences =  6
n_syllables =  150
n_words = 912
n_sentences =  48
n_syllables =  1361
n_words = 152
n_sentences =  9
n_syllables =  210
n_words = 396
n_sentences =  21
n_syllables =  593
n_words = 121
n_sentences =  6
n_syllables =  154
n_words = 787
n_sentences =  55
n_syllables =  1136
n_words = 177
n_sentences =  6
n_syllables =  238
n_words = 1091
n_sentences =  67
n_syllables =  1504
n_words = 139
n_sentences =  8
n_syllables =  172
n_words = 150
n_sentences =  10
n_syllables =  199
n_words = 581
n_sentences =  37
n_syllables =  769
n_words = 112
n_sentences =  6
n_syllables =  167
n_words = 595
n_sentences =  33
n_s