In [1]:
import topycal
import os
import glob
import random

# Directory searched for the afi*.txt instruction files; resolved against the current working directory.
INSTR_PATH = os.path.join(os.getcwd(),"afi_txt")

In [2]:
from concurrent.futures import ThreadPoolExecutor, wait, as_completed
def do_fn_on_iter(fn, iterator, num_threads=6):
    """Apply ``fn`` to every element of ``iterator`` using a thread pool.

    Parameters
    ----------
    fn : callable
        Called once per element with that element as its only argument.
    iterator : iterable
        Elements to process; consumed exactly once.
    num_threads : int or str
        Worker count for the pool. A string value is converted with ``int``.

    Returns
    -------
    list
        One result per element, in the same order as the input elements.

    Raises
    ------
    Exception
        Any exception raised by ``fn`` is re-raised when its result is
        collected.
    """
    if isinstance(num_threads, str):
        num_threads = int(num_threads)
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(fn, elem) for elem in iterator]
    # Leaving the ``with`` block waits for every task, so ``result()`` does not
    # block here. Reading the futures in submission order (instead of through
    # ``as_completed``, which yields already-finished futures from a set) keeps
    # the output order deterministic and aligned with the input.
    return [future.result() for future in futures]


In [3]:
def get_file_list(limit=500, shuffle=True):
    """Return paths of the ``afi*.txt`` files found directly under ``INSTR_PATH``.

    limit:   maximum number of paths to return; a falsy value (``None`` or
             ``0``) returns every match.
    shuffle: when true, the matches are shuffled with ``random.shuffle``
             before any truncation is applied.
    """
    pattern = "{}/afi*.txt".format(INSTR_PATH)
    matches = glob.glob(pattern)
    if shuffle:
        random.shuffle(matches)
    return matches[0:limit] if limit else matches
    
def read_file(fname):
    """Return the whole text of ``fname``.

    The file is opened in text mode with ``errors='replace'``, so bytes that
    cannot be decoded are substituted instead of raising.
    """
    with open(fname, errors='replace') as handle:
        text = handle.read()
    return text

In [4]:
# limit=None disables truncation, so every matching file is kept (shuffled, per the default).
file_list = get_file_list(limit=None)


In [268]:
import re
import os

def load_file(fname):
    """Read ``fname`` and return ``(basename, text)`` with whitespace collapsed.

    Parameters
    ----------
    fname : str
        Path of the text file to read.

    Returns
    -------
    tuple
        ``(os.path.basename(fname), contents)`` where every run of whitespace
        in the contents (spaces, tabs, newlines, ...) is replaced by a single
        space.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # errors='replace' matches read_file: one undecodable byte should not
    # abort the load of a whole document.
    with open(fname, 'r', errors='replace') as myfile:
        contents = myfile.read()
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    return (os.path.basename(fname), re.sub(r"\s+", ' ', contents))

def load_corpus(file_list):
    """Load every file in ``file_list`` and map file name -> normalised text.

    Each file is read through ``load_file`` on the thread pool provided by
    ``do_fn_on_iter``; when two entries share a name, the later one wins.
    """
    corpus = {}
    for entry in do_fn_on_iter(load_file, file_list):
        corpus[entry[0]] = entry[1]
    return corpus

In [269]:
# Maps each file's base name to its whitespace-normalised text.
corpus_dict = load_corpus(file_list)

In [281]:
#corpus_dict['afi10-244.txt']

In [271]:
import spacy
# Load the en_core_web_md spaCy pipeline once; analyze_doc reuses this module-level object.
nlp = spacy.load("en_core_web_md")

In [272]:
# Determine the readability of a sentence.
# Longer sentences and sentences with more syllables per word are treated as less readable.
# For each sentence in the corpus, build a dict that characterizes it by number of words,
# average word length, and average syllables per word.


In [273]:
from textstat.textstat import textstat

def get_num_words(sent):
    """Count the tokens of ``sent`` whose ``text`` is purely alphabetic.

    Punctuation, numbers and mixed alphanumeric tokens are not counted.
    """
    return sum(1 for token in sent if token.text.isalpha())

def get_num_syllables(sent):
    """Return textstat's syllable count for the text of ``sent``."""
    sentence_text = sent.text
    return textstat.syllable_count(sentence_text)

In [274]:
def count_conj_adpositions(sentence, pos_tags=("ADP", "CONJ", "CCONJ", "SCONJ")):
    """Count alphabetic tokens tagged as adpositions or conjunctions.

    Parameters
    ----------
    sentence : iterable of tokens
        Each token must expose ``text`` and ``pos_`` attributes.
    pos_tags : iterable of str, optional
        Coarse part-of-speech labels to count. The default keeps the legacy
        ``CONJ`` label and adds ``CCONJ`` / ``SCONJ``, the labels used for
        conjunctions in the current Universal POS tag set; with ``CONJ``
        alone, conjunctions tagged by such a model were never counted.
        NOTE(review): including ``SCONJ`` assumes subordinating conjunctions
        should add to the complexity score -- confirm.

    Returns
    -------
    int
        Number of tokens whose text is alphabetic and whose ``pos_`` is in
        ``pos_tags``.
    """
    wanted = frozenset(pos_tags)
    return sum(
        1 for word in sentence
        if word.text.isalpha() and word.pos_ in wanted
    )

    

In [282]:
def get_sent_tree_depth(sentence):
    """Count alphabetic child tokens across the sentence's subtree.

    For every token in ``sentence.subtree``, each of its ``children`` whose
    text is alphabetic adds one to the total. Despite the name, the value is
    a count of child links, not a maximum nesting depth.
    """
    return sum(
        1
        for node in sentence.subtree
        for child in node.children
        if child.text.isalpha()
    )

def bin_score(syllables_per_word, num_conj_adpositions, sylpw_ceil=4, conjadp_ceil=9):
    """Combine two sentence measures into a single score.

    Each measure is capped at its ceiling (4 syllables per word and 9
    conjunctions/adpositions by default) and then divided by twice that
    ceiling, so each contributes at most 0.5 and non-negative inputs give a
    total between 0 and 1. There is no finer mapping: anything above a
    ceiling is simply treated as the worst case.
    """
    total = 0
    measures = (
        (syllables_per_word, sylpw_ceil),
        (num_conj_adpositions, conjadp_ceil),
    )
    for value, ceiling in measures:
        total += min(value, ceiling) / (ceiling * 2)
    return total
        

def get_sent_complexity(sentence):
    """Score one sentence from its syllable density and connective count.

    Parameters
    ----------
    sentence : iterable of tokens that also has a ``text`` attribute
        The sentence to score (the notebook passes the items of
        ``nlp(...).sents``).

    Returns
    -------
    number
        ``bin_score(syllables_per_word, num_conj_adpositions)``; higher means
        harder to read.

    Notes
    -----
    The result of ``get_sent_tree_depth`` was computed here but never used,
    so that per-sentence subtree traversal has been dropped.
    """
    num_words = get_num_words(sentence)
    # A sentence with no alphabetic tokens would divide by zero; its syllable
    # density is taken to be 0.
    if num_words:
        syllables_per_word = get_num_syllables(sentence) / num_words
    else:
        syllables_per_word = 0
    num_conj_adpositions = count_conj_adpositions(sentence)
    return bin_score(syllables_per_word, num_conj_adpositions)
    

In [283]:
def analyze_doc(doc_txt):
    """Split ``doc_txt`` into sentences and attach a readability score to each.

    Returns a list of ``(sentence_text, {'readability': score})`` tuples in
    document order. Sentences whose ``len`` exceeds ``MIN_SENT_LEN`` are
    scored with ``get_sent_complexity``; shorter ones get a score of 0.
    Newlines in the sentence text are replaced by spaces and the text is
    stripped.
    """
    MIN_SENT_LEN = 12
    scored = []
    for sent in nlp(doc_txt).sents:
        readability = get_sent_complexity(sent) if len(sent) > MIN_SENT_LEN else 0
        cleaned = sent.text.replace('\n', ' ').strip()
        scored.append((cleaned, {'readability': readability}))
    return scored

In [284]:
# Score every sentence of a single document from the corpus.
analysis = analyze_doc(corpus_dict['afi10-207.txt'])

In [304]:
def list_to_paragraphs(analysis, paragraph_len=4):
    """Split ``analysis`` into consecutive chunks of ``paragraph_len`` items.

    Parameters
    ----------
    analysis : sequence
        Items to group (the notebook passes the output of ``analyze_doc``).
    paragraph_len : int
        Number of items per chunk. Previously this argument was ignored and
        a chunk size of 3 was hard-coded; it is now honoured.

    Returns
    -------
    list
        Slices of ``analysis``; the final slice may be shorter. An empty
        input yields an empty list.

    Raises
    ------
    ValueError
        If ``paragraph_len`` is smaller than 1.
    """
    if paragraph_len < 1:
        raise ValueError("paragraph_len must be a positive integer")
    return [
        analysis[start:start + paragraph_len]
        for start in range(0, len(analysis), paragraph_len)
    ]

In [306]:
# Group the scored sentences into paragraph-sized chunks for rendering.
paragraphs = list_to_paragraphs(analysis)

In [310]:
from yattag import Doc

def build_html(analysis):
    """Render scored paragraphs as an HTML fragment and return it as a string.

    ``analysis`` is an iterable of paragraphs, each an iterable of
    ``(text, metadata)`` items where ``metadata['readability']`` is the
    sentence score. Every paragraph becomes a ``<p>`` inside a
    ``div.container`` and every sentence a ``<span>``:

    * score below 0.25 -> no class
    * up to 0.5        -> class ``yellow``
    * up to 0.75       -> class ``orange``
    * anything else    -> class ``red`` with the score in a ``desc``
      attribute and the extra class ``described``
    """
    doc, tag, text = Doc().tagtext()
    with tag('div', klass='container'):
        for paragraph in analysis:
            with tag('p'):
                for item in paragraph:
                    readability = item[1]['readability']
                    # Choose the span attributes first so the span itself is
                    # emitted in a single place.
                    if readability < 0.25:
                        span_attrs = {}
                    elif readability <= 0.5:
                        span_attrs = {'klass': 'yellow'}
                    elif readability <= 0.75:
                        span_attrs = {'klass': 'orange'}
                    else:
                        span_attrs = {'klass': 'red', 'desc': readability}
                    with tag('span', **span_attrs):
                        if 'desc' in span_attrs:
                            doc.add_class('described')
                        text(item[0])
    return doc.getvalue()

In [311]:
# Render the paragraphs to an HTML string; as the cell's last expression it is shown as the cell output.
build_html(paragraphs)

