# Importing Libraries

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
from scipy import stats
import time


In [12]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize  
# English stop-word set built from the NLTK corpus.
# NOTE(review): stop_words is not referenced by any cell visible in this notebook — confirm it is still needed.
stop_words = set(stopwords.words("english"))

from lexical_diversity import lex_div as ld


#### Setting display column width to maximum (very important)

In [11]:
# Show full cell contents (no truncation) when DataFrames are rendered.
pd.options.display.max_colwidth = None


# Data

In [13]:
# Load the workbook; the path is resolved relative to the kernel's working directory.
df = pd.read_excel("clean.xlsx")

# Preprocessing

## Tokenize

In [14]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched.
dfcopy = df.copy(deep=True)

In [15]:
def tknize(x):
    """Tokenize one comment with ``word_tokenize``, best-effort.

    Parameters
    ----------
    x : value taken from the ``Comment`` column; passed to ``word_tokenize`` as is.

    Returns
    -------
    The token list produced by ``word_tokenize``, or ``None`` when it raises
    (presumably for non-string cells such as missing values — TODO confirm).
    """
    try:
        return word_tokenize(x)
    except Exception:
        # Still best-effort so one bad row cannot abort the whole ``apply``, but
        # no longer a bare ``except``: that also swallowed KeyboardInterrupt and
        # SystemExit, which made a long run impossible to interrupt.
        return None

In [16]:
# Tokenize from the untouched source frame `df` so this cell is idempotent.
# Reading dfcopy["Comment"] here would, on a second run, hand already-tokenized
# lists back to tknize, which returns None whenever word_tokenize fails — the
# whole column would silently be wiped.
dfcopy["Comment"] = df["Comment"].apply(tknize)

In [17]:
# Preview the first five rows to check the tokenized comments.
dfcopy.head(n=5)

Unnamed: 0,name,Comment
0,lucid conch,"[The, wandering, whittler, lightly, now, because, whistle, quietly, listen, for, a, little, crow, Feathered, quickly, fly]"
1,steven armstrong,"[Neil, says, there, is, no, reason, the, earth, goes, around, the, sun, in, the, time, it, does, withing, a, third, of, an, increment, for, leap, years, But, why, isnt, he, saying, there, has, to, be, a, reason, the, earth, orbits, the, sun, in, a, slight, off, beat, pattern]"
2,willieam sandren,"[The, clever, cardboard, numerically, please, because, dimple, largely, intend, behind, a, useful, chime, wistful, fluttering, gemini]"
3,niles graham,"[The, tasty, bonsai, indirectly, scream, because, uncle, approximately, strip, near, a, onerous, crate, knotty, disastrous, collar]"
4,irfan nasyriq,"[now, the, fans, of, jre, knows, that, the, earth, is, shape, like, a, skull]"


# Lexical Diversity Measures

# Simple TTR

In [18]:
def ttr(x):
    """Return the simple type-token ratio of a token list via ``ld.ttr``.

    Parameters
    ----------
    x : token sequence (one cell of the tokenized ``Comment`` column).

    Returns
    -------
    The value returned by ``ld.ttr``, or ``None`` when the call raises.
    """
    try:
        return ld.ttr(x)
    except Exception:
        # Best-effort per row, but unlike a bare ``except`` this lets
        # KeyboardInterrupt/SystemExit through so the run can be stopped.
        return None

In [19]:
# Score every tokenized comment with the ttr wrapper (no lambda indirection needed).
dfcopy["ttr"] = dfcopy["Comment"].apply(ttr)

In [20]:
# Preview the first five rows, now including the ttr column.
dfcopy.head(n=5)

Unnamed: 0,name,Comment,ttr
0,lucid conch,"[The, wandering, whittler, lightly, now, because, whistle, quietly, listen, for, a, little, crow, Feathered, quickly, fly]",1.0
1,steven armstrong,"[Neil, says, there, is, no, reason, the, earth, goes, around, the, sun, in, the, time, it, does, withing, a, third, of, an, increment, for, leap, years, But, why, isnt, he, saying, there, has, to, be, a, reason, the, earth, orbits, the, sun, in, a, slight, off, beat, pattern]",0.770833
2,willieam sandren,"[The, clever, cardboard, numerically, please, because, dimple, largely, intend, behind, a, useful, chime, wistful, fluttering, gemini]",1.0
3,niles graham,"[The, tasty, bonsai, indirectly, scream, because, uncle, approximately, strip, near, a, onerous, crate, knotty, disastrous, collar]",1.0
4,irfan nasyriq,"[now, the, fans, of, jre, knows, that, the, earth, is, shape, like, a, skull]",0.928571


# Root TTR

In [21]:
def rttr(x):
    """Return the root TTR of a token list via ``ld.root_ttr``.

    Parameters
    ----------
    x : token sequence (one cell of the tokenized ``Comment`` column).

    Returns
    -------
    The value returned by ``ld.root_ttr``, or ``None`` when the call raises.
    """
    try:
        return ld.root_ttr(x)
    except Exception:
        # Best-effort per row; a bare ``except`` would also trap
        # KeyboardInterrupt/SystemExit and block kernel interrupts.
        return None

In [22]:
# Root TTR for every tokenized comment.
dfcopy["rttr"] = dfcopy["Comment"].apply(rttr)

In [23]:
# Preview the first five rows, now including the rttr column.
dfcopy.head(n=5)

Unnamed: 0,name,Comment,ttr,rttr
0,lucid conch,"[The, wandering, whittler, lightly, now, because, whistle, quietly, listen, for, a, little, crow, Feathered, quickly, fly]",1.0,4.0
1,steven armstrong,"[Neil, says, there, is, no, reason, the, earth, goes, around, the, sun, in, the, time, it, does, withing, a, third, of, an, increment, for, leap, years, But, why, isnt, he, saying, there, has, to, be, a, reason, the, earth, orbits, the, sun, in, a, slight, off, beat, pattern]",0.770833,5.34049
2,willieam sandren,"[The, clever, cardboard, numerically, please, because, dimple, largely, intend, behind, a, useful, chime, wistful, fluttering, gemini]",1.0,4.0
3,niles graham,"[The, tasty, bonsai, indirectly, scream, because, uncle, approximately, strip, near, a, onerous, crate, knotty, disastrous, collar]",1.0,4.0
4,irfan nasyriq,"[now, the, fans, of, jre, knows, that, the, earth, is, shape, like, a, skull]",0.928571,3.474396


# Log TTR

In [24]:
def lttr(x):
    """Return the log TTR of a token list via ``ld.log_ttr``.

    Parameters
    ----------
    x : token sequence (one cell of the tokenized ``Comment`` column).

    Returns
    -------
    The value returned by ``ld.log_ttr``, or ``None`` when the call raises.
    NOTE(review): in the recorded run one row has no logttr value (count
    14023 vs 14024) — presumably a logarithm domain error on an empty
    token list; confirm.
    """
    try:
        return ld.log_ttr(x)
    except Exception:
        # Keep the row-level fallback, but let KeyboardInterrupt/SystemExit
        # propagate (a bare ``except`` swallowed them too).
        return None

In [25]:
# Log TTR for every tokenized comment.
dfcopy["logttr"] = dfcopy["Comment"].apply(lttr)

In [26]:
# Preview the first five rows, now including the logttr column.
dfcopy.head(n=5)

Unnamed: 0,name,Comment,ttr,rttr,logttr
0,lucid conch,"[The, wandering, whittler, lightly, now, because, whistle, quietly, listen, for, a, little, crow, Feathered, quickly, fly]",1.0,4.0,1.0
1,steven armstrong,"[Neil, says, there, is, no, reason, the, earth, goes, around, the, sun, in, the, time, it, does, withing, a, third, of, an, increment, for, leap, years, But, why, isnt, he, saying, there, has, to, be, a, reason, the, earth, orbits, the, sun, in, a, slight, off, beat, pattern]",0.770833,5.34049,0.932764
2,willieam sandren,"[The, clever, cardboard, numerically, please, because, dimple, largely, intend, behind, a, useful, chime, wistful, fluttering, gemini]",1.0,4.0,1.0
3,niles graham,"[The, tasty, bonsai, indirectly, scream, because, uncle, approximately, strip, near, a, onerous, crate, knotty, disastrous, collar]",1.0,4.0,1.0
4,irfan nasyriq,"[now, the, fans, of, jre, knows, that, the, earth, is, shape, like, a, skull]",0.928571,3.474396,0.971919


# Maas TTR

In [27]:
# Wrapper needed for mttr (as for lttr): these measures involve logarithms and
# can raise a math error on some rows.
def mttr(x):
    """Return the Maas TTR of a token list via ``ld.maas_ttr``.

    Parameters
    ----------
    x : token sequence (one cell of the tokenized ``Comment`` column).

    Returns
    -------
    The value returned by ``ld.maas_ttr``, or ``None`` when the call raises.
    """
    try:
        return ld.maas_ttr(x)
    except Exception:
        # Math errors are tolerated per row; KeyboardInterrupt/SystemExit are
        # no longer swallowed as they were with the bare ``except``.
        return None

In [28]:
# Maas TTR for every tokenized comment.
dfcopy["mttr"] = dfcopy["Comment"].apply(mttr)

In [29]:
# Preview the first five rows, now including the mttr column.
dfcopy.head(n=5)

Unnamed: 0,name,Comment,ttr,rttr,logttr,mttr
0,lucid conch,"[The, wandering, whittler, lightly, now, because, whistle, quietly, listen, for, a, little, crow, Feathered, quickly, fly]",1.0,4.0,1.0,0.0
1,steven armstrong,"[Neil, says, there, is, no, reason, the, earth, goes, around, the, sun, in, the, time, it, does, withing, a, third, of, an, increment, for, leap, years, But, why, isnt, he, saying, there, has, to, be, a, reason, the, earth, orbits, the, sun, in, a, slight, off, beat, pattern]",0.770833,5.34049,0.932764,0.039992
2,willieam sandren,"[The, clever, cardboard, numerically, please, because, dimple, largely, intend, behind, a, useful, chime, wistful, fluttering, gemini]",1.0,4.0,1.0,0.0
3,niles graham,"[The, tasty, bonsai, indirectly, scream, because, uncle, approximately, strip, near, a, onerous, crate, knotty, disastrous, collar]",1.0,4.0,1.0,0.0
4,irfan nasyriq,"[now, the, fans, of, jre, knows, that, the, earth, is, shape, like, a, skull]",0.928571,3.474396,0.971919,0.024501


# Mean segmental TTR (MSTTR)

In [30]:
def msttr(x):
    """Return the mean segmental TTR of a token list via ``ld.msttr``.

    Parameters
    ----------
    x : token sequence (one cell of the tokenized ``Comment`` column).

    Returns
    -------
    The value returned by ``ld.msttr`` (called with its default settings),
    or ``None`` when the call raises.
    NOTE(review): in the recorded sample rows msttr equals ttr — presumably
    the default segment length exceeds these short comments; confirm.
    """
    try:
        return ld.msttr(x)
    except Exception:
        # Best-effort per row without trapping KeyboardInterrupt/SystemExit,
        # which the previous bare ``except`` did.
        return None

In [31]:
# Mean segmental TTR for every tokenized comment.
dfcopy["msttr"] = dfcopy["Comment"].apply(msttr)

In [32]:
# Preview the first five rows, now including the msttr column.
dfcopy.head(n=5)

Unnamed: 0,name,Comment,ttr,rttr,logttr,mttr,msttr
0,lucid conch,"[The, wandering, whittler, lightly, now, because, whistle, quietly, listen, for, a, little, crow, Feathered, quickly, fly]",1.0,4.0,1.0,0.0,1.0
1,steven armstrong,"[Neil, says, there, is, no, reason, the, earth, goes, around, the, sun, in, the, time, it, does, withing, a, third, of, an, increment, for, leap, years, But, why, isnt, he, saying, there, has, to, be, a, reason, the, earth, orbits, the, sun, in, a, slight, off, beat, pattern]",0.770833,5.34049,0.932764,0.039992,0.770833
2,willieam sandren,"[The, clever, cardboard, numerically, please, because, dimple, largely, intend, behind, a, useful, chime, wistful, fluttering, gemini]",1.0,4.0,1.0,0.0,1.0
3,niles graham,"[The, tasty, bonsai, indirectly, scream, because, uncle, approximately, strip, near, a, onerous, crate, knotty, disastrous, collar]",1.0,4.0,1.0,0.0,1.0
4,irfan nasyriq,"[now, the, fans, of, jre, knows, that, the, earth, is, shape, like, a, skull]",0.928571,3.474396,0.971919,0.024501,0.928571


# Moving average TTR (MATTR)


In [33]:
def mattr(x):
    """Return the moving-average TTR of a token list via ``ld.mattr``.

    Parameters
    ----------
    x : token sequence (one cell of the tokenized ``Comment`` column).

    Returns
    -------
    The value returned by ``ld.mattr`` (called with its default settings),
    or ``None`` when the call raises.
    NOTE(review): in the recorded sample rows mattr equals ttr — presumably
    the default window is longer than these comments; confirm.
    """
    try:
        return ld.mattr(x)
    except Exception:
        # Row-level fallback only; KeyboardInterrupt/SystemExit now propagate
        # instead of being silenced by a bare ``except``.
        return None

In [34]:
# Moving-average TTR for every tokenized comment.
dfcopy["mattr"] = dfcopy["Comment"].apply(mattr)

In [35]:
# Preview the first five rows, now including the mattr column.
dfcopy.head(n=5)

Unnamed: 0,name,Comment,ttr,rttr,logttr,mttr,msttr,mattr
0,lucid conch,"[The, wandering, whittler, lightly, now, because, whistle, quietly, listen, for, a, little, crow, Feathered, quickly, fly]",1.0,4.0,1.0,0.0,1.0,1.0
1,steven armstrong,"[Neil, says, there, is, no, reason, the, earth, goes, around, the, sun, in, the, time, it, does, withing, a, third, of, an, increment, for, leap, years, But, why, isnt, he, saying, there, has, to, be, a, reason, the, earth, orbits, the, sun, in, a, slight, off, beat, pattern]",0.770833,5.34049,0.932764,0.039992,0.770833,0.770833
2,willieam sandren,"[The, clever, cardboard, numerically, please, because, dimple, largely, intend, behind, a, useful, chime, wistful, fluttering, gemini]",1.0,4.0,1.0,0.0,1.0,1.0
3,niles graham,"[The, tasty, bonsai, indirectly, scream, because, uncle, approximately, strip, near, a, onerous, crate, knotty, disastrous, collar]",1.0,4.0,1.0,0.0,1.0,1.0
4,irfan nasyriq,"[now, the, fans, of, jre, knows, that, the, earth, is, shape, like, a, skull]",0.928571,3.474396,0.971919,0.024501,0.928571,0.928571


# Hypergeometric distribution D (HDD)


In [36]:
def hdd(x):
    """Return the hypergeometric distribution D of a token list via ``ld.hdd``.

    Parameters
    ----------
    x : token sequence (one cell of the tokenized ``Comment`` column).

    Returns
    -------
    The value returned by ``ld.hdd``, or ``None`` when the call raises.
    NOTE(review): the recorded sample shows 0.0 for the shorter comments —
    presumably a minimum-length rule inside the library; confirm before
    interpreting zeros as "no diversity".
    """
    try:
        return ld.hdd(x)
    except Exception:
        # Best-effort per row; unlike a bare ``except`` this does not trap
        # KeyboardInterrupt/SystemExit.
        return None

In [37]:
# HDD for every tokenized comment.
dfcopy["hdd"] = dfcopy["Comment"].apply(hdd)

In [38]:
# Preview the first five rows, now including the hdd column.
dfcopy.head(n=5)

Unnamed: 0,name,Comment,ttr,rttr,logttr,mttr,msttr,mattr,hdd
0,lucid conch,"[The, wandering, whittler, lightly, now, because, whistle, quietly, listen, for, a, little, crow, Feathered, quickly, fly]",1.0,4.0,1.0,0.0,1.0,1.0,0.0
1,steven armstrong,"[Neil, says, there, is, no, reason, the, earth, goes, around, the, sun, in, the, time, it, does, withing, a, third, of, an, increment, for, leap, years, But, why, isnt, he, saying, there, has, to, be, a, reason, the, earth, orbits, the, sun, in, a, slight, off, beat, pattern]",0.770833,5.34049,0.932764,0.039992,0.770833,0.770833,0.790056
2,willieam sandren,"[The, clever, cardboard, numerically, please, because, dimple, largely, intend, behind, a, useful, chime, wistful, fluttering, gemini]",1.0,4.0,1.0,0.0,1.0,1.0,0.0
3,niles graham,"[The, tasty, bonsai, indirectly, scream, because, uncle, approximately, strip, near, a, onerous, crate, knotty, disastrous, collar]",1.0,4.0,1.0,0.0,1.0,1.0,0.0
4,irfan nasyriq,"[now, the, fans, of, jre, knows, that, the, earth, is, shape, like, a, skull]",0.928571,3.474396,0.971919,0.024501,0.928571,0.928571,0.0


# Measure of textual lexical diversity (MTLD)


In [39]:
def mtld(x):
    """Return the MTLD score of a token list via ``ld.mtld``.

    Parameters
    ----------
    x : token sequence (one cell of the tokenized ``Comment`` column).

    Returns
    -------
    The value returned by ``ld.mtld``, or ``None`` when the call raises.
    """
    try:
        return ld.mtld(x)
    except Exception:
        # Per-row fallback; KeyboardInterrupt/SystemExit are deliberately not
        # caught (the former bare ``except`` swallowed them).
        return None

In [40]:
# MTLD for every tokenized comment.
dfcopy["mtld"] = dfcopy["Comment"].apply(mtld)

In [41]:
# Preview the first five rows, now including the mtld column.
dfcopy.head(n=5)

Unnamed: 0,name,Comment,ttr,rttr,logttr,mttr,msttr,mattr,hdd,mtld
0,lucid conch,"[The, wandering, whittler, lightly, now, because, whistle, quietly, listen, for, a, little, crow, Feathered, quickly, fly]",1.0,4.0,1.0,0.0,1.0,1.0,0.0,0.0
1,steven armstrong,"[Neil, says, there, is, no, reason, the, earth, goes, around, the, sun, in, the, time, it, does, withing, a, third, of, an, increment, for, leap, years, But, why, isnt, he, saying, there, has, to, be, a, reason, the, earth, orbits, the, sun, in, a, slight, off, beat, pattern]",0.770833,5.34049,0.932764,0.039992,0.770833,0.770833,0.790056,58.647273
2,willieam sandren,"[The, clever, cardboard, numerically, please, because, dimple, largely, intend, behind, a, useful, chime, wistful, fluttering, gemini]",1.0,4.0,1.0,0.0,1.0,1.0,0.0,0.0
3,niles graham,"[The, tasty, bonsai, indirectly, scream, because, uncle, approximately, strip, near, a, onerous, crate, knotty, disastrous, collar]",1.0,4.0,1.0,0.0,1.0,1.0,0.0,0.0
4,irfan nasyriq,"[now, the, fans, of, jre, knows, that, the, earth, is, shape, like, a, skull]",0.928571,3.474396,0.971919,0.024501,0.928571,0.928571,0.0,54.88


# Measure of textual lexical diversity (MTLD, moving average, wrap)


In [42]:
def mtld_mawrap(x):
    """Return the moving-average (wrap) MTLD of a token list via ``ld.mtld_ma_wrap``.

    Parameters
    ----------
    x : token sequence (one cell of the tokenized ``Comment`` column).

    Returns
    -------
    The value returned by ``ld.mtld_ma_wrap``, or ``None`` when the call raises.
    """
    try:
        return ld.mtld_ma_wrap(x)
    except Exception:
        # Best-effort per row while still allowing KeyboardInterrupt/SystemExit
        # to stop the run (a bare ``except`` prevented that).
        return None

In [43]:
# Moving-average (wrap) MTLD for every tokenized comment.
dfcopy["mtld_mawrap"] = dfcopy["Comment"].apply(mtld_mawrap)

In [44]:
# Preview the first five rows, now including the mtld_mawrap column.
dfcopy.head(n=5)

Unnamed: 0,name,Comment,ttr,rttr,logttr,mttr,msttr,mattr,hdd,mtld,mtld_mawrap
0,lucid conch,"[The, wandering, whittler, lightly, now, because, whistle, quietly, listen, for, a, little, crow, Feathered, quickly, fly]",1.0,4.0,1.0,0.0,1.0,1.0,0.0,0.0,23.0
1,steven armstrong,"[Neil, says, there, is, no, reason, the, earth, goes, around, the, sun, in, the, time, it, does, withing, a, third, of, an, increment, for, leap, years, But, why, isnt, he, saying, there, has, to, be, a, reason, the, earth, orbits, the, sun, in, a, slight, off, beat, pattern]",0.770833,5.34049,0.932764,0.039992,0.770833,0.770833,0.790056,58.647273,47.088889
2,willieam sandren,"[The, clever, cardboard, numerically, please, because, dimple, largely, intend, behind, a, useful, chime, wistful, fluttering, gemini]",1.0,4.0,1.0,0.0,1.0,1.0,0.0,0.0,23.0
3,niles graham,"[The, tasty, bonsai, indirectly, scream, because, uncle, approximately, strip, near, a, onerous, crate, knotty, disastrous, collar]",1.0,4.0,1.0,0.0,1.0,1.0,0.0,0.0,23.0
4,irfan nasyriq,"[now, the, fans, of, jre, knows, that, the, earth, is, shape, like, a, skull]",0.928571,3.474396,0.971919,0.024501,0.928571,0.928571,0.0,54.88,19.0


# Measure of textual lexical diversity (MTLD, moving average, bi-directional)


In [45]:
def mtld_mabid(x):
    """Return the moving-average bi-directional MTLD via ``ld.mtld_ma_bid``.

    Parameters
    ----------
    x : token sequence (one cell of the tokenized ``Comment`` column).

    Returns
    -------
    The value returned by ``ld.mtld_ma_bid``, or ``None`` when the call raises.
    NOTE(review): every recorded sample row is 0.0 for this measure —
    presumably a minimum-length effect on short comments; confirm.
    """
    try:
        return ld.mtld_ma_bid(x)
    except Exception:
        # Row-level fallback; the earlier bare ``except`` also silenced
        # KeyboardInterrupt/SystemExit, which must be allowed to propagate.
        return None

In [46]:
# Moving-average bi-directional MTLD for every tokenized comment.
dfcopy["mtld_mabid"] = dfcopy["Comment"].apply(mtld_mabid)

In [47]:
# Preview the first five rows, now including the mtld_mabid column.
dfcopy.head(n=5)

Unnamed: 0,name,Comment,ttr,rttr,logttr,mttr,msttr,mattr,hdd,mtld,mtld_mawrap,mtld_mabid
0,lucid conch,"[The, wandering, whittler, lightly, now, because, whistle, quietly, listen, for, a, little, crow, Feathered, quickly, fly]",1.0,4.0,1.0,0.0,1.0,1.0,0.0,0.0,23.0,0.0
1,steven armstrong,"[Neil, says, there, is, no, reason, the, earth, goes, around, the, sun, in, the, time, it, does, withing, a, third, of, an, increment, for, leap, years, But, why, isnt, he, saying, there, has, to, be, a, reason, the, earth, orbits, the, sun, in, a, slight, off, beat, pattern]",0.770833,5.34049,0.932764,0.039992,0.770833,0.770833,0.790056,58.647273,47.088889,0.0
2,willieam sandren,"[The, clever, cardboard, numerically, please, because, dimple, largely, intend, behind, a, useful, chime, wistful, fluttering, gemini]",1.0,4.0,1.0,0.0,1.0,1.0,0.0,0.0,23.0,0.0
3,niles graham,"[The, tasty, bonsai, indirectly, scream, because, uncle, approximately, strip, near, a, onerous, crate, knotty, disastrous, collar]",1.0,4.0,1.0,0.0,1.0,1.0,0.0,0.0,23.0,0.0
4,irfan nasyriq,"[now, the, fans, of, jre, knows, that, the, earth, is, shape, like, a, skull]",0.928571,3.474396,0.971919,0.024501,0.928571,0.928571,0.0,54.88,19.0,0.0


# Results

In [48]:
# Keep only the ten lexical-diversity metric columns for the summary.
neilresults = dfcopy.loc[
    :,
    [
        "ttr",
        "rttr",
        "logttr",
        "mttr",
        "msttr",
        "mattr",
        "hdd",
        "mtld",
        "mtld_mawrap",
        "mtld_mabid",
    ],
]

In [49]:
# Preview the first five rows of the metrics-only frame.
neilresults.head(n=5)

Unnamed: 0,ttr,rttr,logttr,mttr,msttr,mattr,hdd,mtld,mtld_mawrap,mtld_mabid
0,1.0,4.0,1.0,0.0,1.0,1.0,0.0,0.0,23.0,0.0
1,0.770833,5.34049,0.932764,0.039992,0.770833,0.770833,0.790056,58.647273,47.088889,0.0
2,1.0,4.0,1.0,0.0,1.0,1.0,0.0,0.0,23.0,0.0
3,1.0,4.0,1.0,0.0,1.0,1.0,0.0,0.0,23.0,0.0
4,0.928571,3.474396,0.971919,0.024501,0.928571,0.928571,0.0,54.88,19.0,0.0


In [50]:
# Summary statistics per metric (count, mean, std, min, quartiles, max).
neilresults.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,ttr,rttr,logttr,mttr,msttr,mattr,hdd,mtld,mtld_mawrap,mtld_mabid
count,14024.0,14024.0,14023.0,14023.0,14024.0,14024.0,14024.0,14024.0,14024.0,14024.0
mean,0.932228,3.828348,0.963415,0.016554,0.940572,0.940273,0.124081,39.306576,26.551656,3.744586
std,0.098059,1.700927,0.122523,0.060808,0.081528,0.081729,0.302192,57.187813,26.24728,16.435732
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.888889,2.645751,0.960253,0.0,0.893333,0.891304,0.0,0.0,10.0,0.0
50%,1.0,3.474396,1.0,0.0,1.0,1.0,0.0,0.0,19.0,0.0
75%,1.0,4.669738,1.0,0.02618,1.0,1.0,0.0,69.978462,33.128968,0.0
max,1.0,16.694483,1.0,3.321928,1.0,1.0,0.979293,567.0,257.0,233.847826


In [51]:
#neilresults.to_csv("neilresults", index=False, encoding = 'utf-8')