In [2]:
from pathlib import Path
import spacy
from lexical_diversity import lex_div as ld
import pandas as pd
import collections.abc
collections.Iterable = collections.abc.Iterable
import nbimporter
from Custom_Adv_G import c_adv_guiraud
import math
           

nlp = spacy.load('en_core_web_sm')

scores_2 = pd.DataFrame(columns = ['TTR', 'Root TTR', 'Log TTR', 'Mass TTR', 'MSTR', 'MATTR', 'HDD', 'MTLD', 'MTLD wrap', 'MTLD bi', '# words', 'Custom AG'])

# read in Group 2 stories
for p in Path('Data/Group 2').glob('*.txt'):
    num_words = 0
    with p.open(encoding = 'UTF-8') as f:
        story = f.read()
        
        lines = story.split()
        num_words = len(lines)
        
        story_nlp  = nlp(story)
            
        text = []
        
        # tokenization
        for i in story_nlp:
            if i.text != '\n' and not i.is_punct: 
                text.append(i.lemma_)
        
        story_db = []
        
        ttr = ld.ttr(text) #basic TTR
        story_db.append(ttr)
            
        rttr = ld.root_ttr(text) # root TTR
        story_db.append(rttr)
        
        lttr = ld.log_ttr(text) # log TTR
        story_db.append(lttr)
        
        mttr = ld.maas_ttr(text) # Maas TTR or Mass?
        story_db.append(mttr)
        
        msttr = ld.msttr(text) # mean segmental TTR with default 50 word window
        story_db.append(msttr)
       
        mattr = ld.mattr(text) # moving average TTR with default 50 word segment
        story_db.append(mattr)
        
        hdd = ld.hdd(text) # hypergeometric distribution D
        story_db.append(hdd)
       
        mtld = ld.mtld(text) # measure of lexical textual diversity
        story_db.append(mtld)
        
        mtldw = ld.mtld_ma_wrap(text) # measure of lexical textual diversity (moving average, wrap)
        story_db.append(mtldw)
        
        mtldb = ld.mtld_ma_bid(text) # measure of lexical textual diversity (moving average, bidirectional)
        story_db.append(mtldb)
        
        story_db.append(num_words)
        
        cadvg = c_adv_guiraud(text) # customized Advanced Guiraud
        story_db.append(cadvg)
        
        scores_2.loc[len(scores_2)] = story_db
        
    print("There are " + str(num_words) + " words.")
 
print(scores_2)

scores_2.to_csv(r'Data/scores_2.csv')

There are 471 words.
There are 485 words.
There are 480 words.
There are 489 words.
There are 504 words.
        TTR   Root TTR   Log TTR  Mass TTR      MSTR     MATTR       HDD  \
0  0.461378  10.097748  0.874664  0.046761  0.762222  0.762558  0.788267   
1  0.444219   9.863264  0.869134  0.048598  0.753333  0.755315  0.791926   
2  0.451745   9.969152  0.871590  0.047780  0.740000  0.748539  0.783133   
3  0.442424   9.843318  0.868567  0.048776  0.724444  0.741794  0.780272   
4  0.451866  10.194571  0.872543  0.047089  0.754000  0.754174  0.791551   

        MTLD  MTLD wrap    MTLD bi  # words  Custom AG  
0  58.454722  59.866388  60.363521    471.0   1.736264  
1  58.659213  57.525355  58.539377    485.0   1.396170  
2  58.753007  57.252567  54.626262    480.0   1.812573  
3  52.633391  52.359596  51.572583    489.0   1.663026  
4  60.072535  57.719057  56.321416    504.0   1.684320  
