# Scripts for Lexical Text Feature Extraction

Import POS Tagger & Libraries

In [10]:
import os
import nltk
from nltk import *
from nltk.tag.stanford import StanfordPOSTagger
from nltk.tokenize import word_tokenize
import math

# Local path to the Java executable, exported as JAVAHOME for the
# Java-based tagger.
java_path = "/usr/bin/java"
os.environ["JAVAHOME"] = java_path

# Directory that holds the POS tagger jar; it also becomes the current
# working directory.
jar_path = '/Users/jerseydayao/Desktop/hckrwmn/repositories/Readability-Level-Identifier'
os.chdir(jar_path)
jar = f"{jar_path}/stanford-postagger.jar"

# The tagger model file lives in the POSTagger/ sub-directory.
model_path = f"{jar_path}/POSTagger/"
model = f"{model_path}filipino-left5words-owlqn2-distsim-pref6-inf2.tagger"

# Tagger instance shared by all feature-extraction cells below.
pos_tagger = StanfordPOSTagger(model, jar, encoding="utf-8")

Text Input and Tokenization

In [3]:
# Sample sentence to analyse.
text = "nagsimula ang lahat sa masukal na bakuran ni aling salvacion."

# Split the sentence into tokens, then keep only the purely alphanumeric
# ones so that punctuation marks are dropped.
words = nltk.word_tokenize(text)
temp_words = list(filter(str.isalnum, words))

# POS-tag the remaining tokens.
tagged_words = pos_tagger.tag(temp_words)

### Noun-Token Ratio

In [4]:
# NOUN COUNT
# For each (word, tag) pair keep only the text after the last '|' in the tag
# (drops any word prefix before it) and count the tags that start with 'NN'.
noun_count = sum(
    1 for _, tag in tagged_words if tag.split('|')[-1].startswith('NN')
)

print("Number of nouns: ", noun_count)

# NOUN TOKEN RATIO
# = noun_count / total_token_count
total_token_count = len(temp_words)
# An input with no alphanumeric tokens would otherwise raise
# ZeroDivisionError; report a ratio of 0.0 in that case.
noun_token_ratio = noun_count / total_token_count if total_token_count else 0.0

print("Total number of tokens: ", total_token_count)
print("Noun-Token Ratio: ", noun_token_ratio)

Number of nouns:  2
Total number of tokens:  10
Noun-Token Ratio:  0.2


### Verb-Token Ratio

In [None]:
# VERB COUNT
# For each (word, tag) pair keep only the text after the last '|' in the tag
# (drops any word prefix before it) and count the tags that start with 'VB'.
verb_count = sum(
    1 for _, tag in tagged_words if tag.split('|')[-1].startswith('VB')
)

print("Number of verbs: ", verb_count)

# VERB TOKEN RATIO
# = verb_count / total_token_count
total_token_count = len(temp_words)
# An input with no alphanumeric tokens would otherwise raise
# ZeroDivisionError; report a ratio of 0.0 in that case.
verb_token_ratio = verb_count / total_token_count if total_token_count else 0.0

print("Total number of tokens: ", total_token_count)
# The label previously read "Noun-Token Ratio", a copy-paste slip.
print("Verb-Token Ratio: ", verb_token_ratio)

### Type-Token Ratio

In [None]:
# Count unique lexical categories.
# A category is the first two letters of the POS tag, taken after removing
# any word prefix before the last '|'. Tags shorter than two characters
# (including empty ones) are skipped.
# NOTE(review): type-token ratios are conventionally computed over distinct
# word forms; confirm that counting POS categories as "types" is intended.
unique_categories = {
    pos[:2]
    for pos in (tag.split('|')[-1] for _, tag in tagged_words)
    if len(pos) >= 2
}

print("Unique Categories:", unique_categories)

# NUMBER OF UNIQUE CATEGORIES
num_categories = len(unique_categories)
print("Number of Unique Categories:", num_categories)

# TOTAL NUM OF TOKENS
total_token_count = len(temp_words)

# Every ratio below divides by a function of the token count, so an input
# with no tokens is reported as 0.0 instead of raising ZeroDivisionError.
has_tokens = total_token_count > 0

# TYPE TOKEN RATIO
# = num_categories / total_token_count
ttr = num_categories / total_token_count if has_tokens else 0.0
print("Type-Token Ratio: ", ttr)

# ROOT TTR
# = num_categories / sqrt(total_token_count)
root_ttr = num_categories / math.sqrt(total_token_count) if has_tokens else 0.0
print("Root Type-Token Ratio: ", root_ttr)

# CORR TTR
# = num_categories / sqrt(2 * total_token_count)
corr_ttr = (
    num_categories / math.sqrt(2 * total_token_count) if has_tokens else 0.0
)
print("Corrected Type-Token Ratio: ", corr_ttr)

# BILOGARITHMIC TTR
# = log(num_categories) / log(total_token_count)
# log(0) is undefined and log(1) == 0 would be a zero divisor, so the ratio
# is only computed with at least one category and more than one token;
# otherwise it is reported as 0.0.
if num_categories > 0 and total_token_count > 1:
    log_ttr = math.log(num_categories) / math.log(total_token_count)
else:
    log_ttr = 0.0
print("Bilogarithmic Type-Token Ratio: ", log_ttr)

### Lexical Density

In [None]:
# NUMBER OF LEXICAL WORDS
# Count nouns (NN*), verbs (VB*), adjectives (JJ*) and adverbs (RB*).
# Only the text after the last '|' of each tag is inspected (drops any word
# prefix before it).
lexical_prefixes = ('NN', 'VB', 'JJ', 'RB')
num_lexwords = sum(
    1
    for _, tag in tagged_words
    if tag.split('|')[-1].startswith(lexical_prefixes)
)

print("Number of lexical words: ", num_lexwords)

# LEXICAL DENSITY
# = num_lexwords / total_token_count
total_token_count = len(temp_words)
# An input with no alphanumeric tokens would otherwise raise
# ZeroDivisionError; report a density of 0.0 in that case.
lex_density = num_lexwords / total_token_count if total_token_count else 0.0

print("Total number of tokens: ", total_token_count)
print("Lexical Density: ", lex_density)