<a href="https://colab.research.google.com/github/beyounding/nlp_projects/blob/main/lexical_diversity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



> **Preprocessing**



In [None]:
!pip install pymorphy2

In [2]:
from string import punctuation
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pymorphy2
from nltk.probability import FreqDist
import pandas as pd
import re
import matplotlib.pyplot as plt


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('corpus')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Error loading corpus: Package 'corpus' not found in index


False

In [3]:
def preprocessing(text):

  text.lower()
  token = word_tokenize(text)
  russian_stopwords = stopwords.words('russian') + [a for a in punctuation]
  token_2 = [word for word in token if word not in russian_stopwords]
  morph = pymorphy2.MorphAnalyzer()
  for ii in range(len(token_2)):
    token_2[ii] = morph.parse(token_2[ii])[0].normal_form
  text=' '.join(token_2)
  text=re.sub(r'[^\w\s]+|[\d]+', r'', text).strip()

  types = len(set(text))
  tokens = len(text)
  strings = len(text)/10

  return(types,tokens,strings)

In [4]:
with open("nature.txt", "r") as file:
    text_science = file.read()


types1,tokens1,strings1 = preprocessing(text_science)


> **Lexical analysis**



In [5]:
import math

In [6]:
def ttr(types, tokens):
  ttr = types/tokens
  return ('%.2f' % ttr)

def rttr(types, tokens):
  rttr = types/(tokens**0.5)
  return ('%.2f' % rttr)

def ttrc(types, tokens):
  ttrc = types/((2*tokens)**0.5)
  return ('%.2f' % ttrc)

def ih(types, tokens):
  ih = math.log(types)/math.log(tokens)
  return ('%.2f' % ih)

def uber_index(types, tokens):
  uber = (math.log(types))**2/(math.log(types) - math.log(tokens))
  return ('%.2f' % uber)

def somers_index(types, tokens):
  somers = math.log(math.log(types))/math.log(math.log(tokens))
  return ('%.2f' % somers)

def maas_index(types, tokens):
  maas = (math.log(tokens) - math.log(types)/(math.log(tokens))**2)**0.5
  return('{:#.2} '.format(maas))

def mltd(tokens, strings):
  mltd = tokens/strings
  return ('%.2f' % mltd)

In [7]:
print('=======Results=======')
print('TTR is ' + str(ttr(types1, tokens1)))
print('RTTR is ' + str(rttr(types1, tokens1)))
print('TTR(c) is ' + str(ttrc(types1, tokens1)))
print('IH is ' + str(ih(types1, tokens1)))
print('Uber index is ' + str(uber_index(types1, tokens1)))
print('Somers index is ' + str(somers_index(types1, tokens1)))
print('Maas index is ' + str(maas_index(types1, tokens1)))
print('MLTD is ' + str(mltd(types1, tokens1)))

TTR is 0.08
RTTR is 1.50
TTR(c) is 1.06
IH is 0.57
Uber index is -4.44
Somers index is 0.68
Maas index is 2.4 
MLTD is 0.08
