In [2]:
from pathlib import Path
import spacy
from lexical_diversity import lex_div as ld
import pandas as pd
import collections.abc
collections.Iterable = collections.abc.Iterable
import math

# If using local Jupyter Notebook
import nbimporter
from Custom_Adv_G import c_adv_guiraud

# If using Google Colab
#%run /content/drive/MyDrive/Example/Functions/Custom_Adv_G.ipynb import c_adv_guiraud

nlp = spacy.load('en_core_web_sm')

# Column labels of the exported score table, in output order.
# The labels are kept unchanged because they become the header of
# Data/scores_o.csv. Note that 'Mass TTR' holds the Maas TTR (ld.maas_ttr)
# and 'MSTR' holds the mean segmental TTR (ld.msttr).
SCORE_COLUMNS = ['TTR', 'Root TTR', 'Log TTR', 'Mass TTR', 'MSTR', 'MATTR', 'HDD', 'MTLD', 'MTLD wrap', 'MTLD bi', '# words', 'Custom AG']

# One list of scores per story; the DataFrame is built once after the loop.
# Appending with `.loc[len(df)] = row` re-allocates the frame for every file
# and upcasts the integer word count to float (it was printed as 612.0).
score_rows = []

# Score every original story. The paths are sorted so that the row order of
# the resulting table does not depend on the filesystem's listing order.
for p in sorted(Path('Data/Original').glob('*.txt')):
    # Read the whole story and close the file right away, before the slow NLP
    # step. The encoding is explicit so the result does not depend on the
    # platform default -- assumes the story files are UTF-8; TODO confirm.
    story = p.read_text(encoding='utf-8')

    # Word count: number of whitespace-separated chunks in the raw text.
    num_words = len(story.split())

    # Lemmatize with spaCy and drop every whitespace-only token. Comparing the
    # token text to '\n' alone would let tokens such as '\n\n' (blank lines)
    # through, and they would then be counted as words by the measures below.
    lemmas = [token.lemma_ for token in nlp(story) if not token.is_space]

    # Values are listed in the same order as SCORE_COLUMNS.
    score_rows.append([
        ld.ttr(lemmas),           # basic TTR
        ld.root_ttr(lemmas),      # root TTR
        ld.log_ttr(lemmas),       # log TTR
        ld.maas_ttr(lemmas),      # Maas TTR (stored under the 'Mass TTR' label)
        ld.msttr(lemmas),         # mean segmental TTR with default 50 word window
        ld.mattr(lemmas),         # moving average TTR with default 50 word segment
        ld.hdd(lemmas),           # hypergeometric distribution D
        ld.mtld(lemmas),          # measure of lexical textual diversity
        ld.mtld_ma_wrap(lemmas),  # MTLD (moving average, wrap)
        ld.mtld_ma_bid(lemmas),   # MTLD (moving average, bidirectional)
        num_words,
        c_adv_guiraud(lemmas),    # customized Advanced Guiraud
    ])

    print("There are " + str(num_words) + " words.")

# Build the table in one step; `columns` also guarantees the expected header
# when no story file was found and the table is empty.
scores_o = pd.DataFrame(score_rows, columns=SCORE_COLUMNS)

print(scores_o)

scores_o.to_csv(r'Data/scores_o.csv')


There are 612 words.
        TTR  Root TTR   Log TTR  Mass TTR  MSTR     MATTR       HDD  \
0  0.375204  9.289613  0.847269  0.054792  0.73  0.726383  0.768649   

        MTLD  MTLD wrap    MTLD bi  # words  Custom AG  
0  47.683665  49.386623  47.882322    612.0   1.454026  
