In [2]:
from pathlib import Path
import spacy
from lexical_diversity import lex_div as ld
import pandas as pd
import collections.abc
collections.Iterable = collections.abc.Iterable
import math

# If using local Jupyter Notebook
import nbimporter
from Custom_Adv_G import c_adv_guiraud

# If using Google Colab
#%run /content/drive/MyDrive/Example/Functions/Custom_Adv_G.ipynb import c_adv_guiraud

nlp = spacy.load('en_core_web_sm')

scores_1 = pd.DataFrame(columns = ['TTR', 'Root TTR', 'Log TTR', 'Mass TTR', 'MSTR', 'MATTR', 'HDD', 'MTLD', 'MTLD wrap', 'MTLD bi', '# words', 'Custom AG'])

# read in Group 1 stories
for p in Path('Data/Group 1').glob('*.txt'):
    num_words = 0
    with p.open(encoding = 'UTF-8') as f:
        story = f.read()
        
        lines = story.split()
        num_words = len(lines)
        
        story_nlp  = nlp(story)
            
        text = []
        
        # tokenization
        for i in story_nlp:
            if i.text != '\n' and not i.is_punct: 
                text.append(i.lemma_)
        
        story_db = []
        
        ttr = ld.ttr(text) #basic TTR
        story_db.append(ttr)
            
        rttr = ld.root_ttr(text) # root TTR
        story_db.append(rttr)
        
        lttr = ld.log_ttr(text) # log TTR
        story_db.append(lttr)
        
        mttr = ld.maas_ttr(text) # Maas TTR or Mass?
        story_db.append(mttr)
        
        msttr = ld.msttr(text) # mean segmental TTR with default 50 word window
        story_db.append(msttr)
       
        mattr = ld.mattr(text) # moving average TTR with default 50 word segment
        story_db.append(mattr)
        
        hdd = ld.hdd(text) # hypergeometric distribution D
        story_db.append(hdd)
       
        mtld = ld.mtld(text) # measure of lexical textual diversity
        story_db.append(mtld)
        
        mtldw = ld.mtld_ma_wrap(text) # measure of lexical textual diversity (moving average, wrap)
        story_db.append(mtldw)
        
        mtldb = ld.mtld_ma_bid(text) # measure of lexical textual diversity (moving average, bidirectional)
        story_db.append(mtldb)
        
        story_db.append(num_words)
        
        cadvg = c_adv_guiraud(text) # customized Advanced Guiraud
        story_db.append(cadvg)
        
        scores_1.loc[len(scores_1)] = story_db
        
    print("There are " + str(num_words) + " words.")
 
print(scores_1)

scores_1.to_csv(r'Data/scores_1.csv')

There are 501 words.
There are 476 words.
There are 473 words.
There are 497 words.
There are 491 words.
        TTR   Root TTR   Log TTR  Mass TTR      MSTR     MATTR       HDD  \
0  0.428008   9.637308  0.863753  0.050368  0.732000  0.743886  0.776827   
1  0.422037   9.256003  0.860317  0.052079  0.720000  0.719583  0.756950   
2  0.450939   9.869292  0.870956  0.048145  0.748889  0.746093  0.779907   
3  0.430279   9.640552  0.864387  0.050214  0.726000  0.732053  0.778521   
4  0.461847  10.306546  0.875613  0.046117  0.742222  0.750334  0.787369   

        MTLD  MTLD wrap    MTLD bi  # words  Custom AG  
0  54.412078  53.437870  53.505305    501.0   1.687639  
1  46.865400  46.012474  45.999370    476.0   1.595863  
2  55.836036  53.835073  52.565840    473.0   1.644882  
3  50.355225  50.498008  48.588108    497.0   1.517494  
4  59.222276  58.385542  55.167971    491.0   1.747632  
