# Text Feature Extractor (Sentence Level)
_Click 'Run All' to extract text features from a sentence input._
***

## Traditional Text Features

#### Library Imports & Set-up

In [15]:
#import libraries for traditional text features
import os
from os import path

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.probability import FreqDist
import string
import re
import syllables
import csv
import pandas as pd
import numpy as np
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import stopwordsiso
from stopwordsiso import stopwords

import matplotlib.pyplot as plt
% matplotlib inline

nltk.download('punkt')

#import libraries for lexical text features
import os
import nltk
from nltk import *
from nltk.tag.stanford import StanfordPOSTagger
from nltk.tokenize import word_tokenize
import math



#SET UP FOR POS TAGGER
# input local path to java.exe
java_path = "C:/Program Files/Java/jre1.8.0_341/bin/java.exe" 
os.environ["JAVAHOME"] = java_path

#path to POS tagger jar
jar_path = os.getcwd()
jar =  jar_path + "/stanford-postagger.jar"

# path to POS tagger model
model_path = jar_path +"/POSTagger/"
model = model_path + "filipino-left5words-owlqn2-distsim-pref6-inf2.tagger"

pos_tagger = StanfordPOSTagger(model, jar, encoding = "utf-8")

UsageError: Line magic function `%` not found.


#### Sentence Input & Cleaning

In [16]:
#Get sentence input from user
sentence_input = input("Input a sentence: ")
print("Your input: ", sentence_input)

#Remove punctuations
clean_input = re.sub(r'[^\w\s]', '', sentence_input)
print("Cleaned text: ", clean_input)

Your input:  hello hi, magandang umaga po, kumusta ka?
Cleaned text:  hello hi magandang umaga po kumusta ka


#### Word Counter

In [17]:
words = sentence_input.split()
print("Number of Words: ", len(words))

Number of Words:  7


#### Average Word Length

In [18]:
total_word_length = sum(len(word) for word in words)
avg_word_length = total_word_length / len(words)

print("Average Word Length: ", avg_word_length, "letters")

Average Word Length:  5.0 letters


#### Total Syllables

In [19]:
# https://github.com/itudidyay/Tagalog-Word-Syllabization-Python
# https://pypi.org/project/syllables/

vowels = 'aeiou'
consonants = 'bcdfghjklmnpqrstvwxyz'

total_syllables = 0

def count_syllables(text):
    global total_syllables
    tokens = word_tokenize(text)

    for token in tokens:
        for char in token:
            if char in vowels:
                total_syllables += 1
        
        # edge cases
        if token == 'ng' or token == 'mga': # edge case ng, mga
            total_syllables += 1
        
        elif (('io') in token): # edge case -io in names/surnames
            total_syllables -= 1

    return total_syllables

def main():

    total_syllables = count_syllables(clean_input)

    print(f"Total Syllables: {total_syllables}")

if __name__ == "__main__":
    main()

Total syllables in the text file: 14


***
***
## Lexical Text Features
#### Input Tokenization & POS Tagging

In [None]:
#tokenize text input
words = nltk.word_tokenize(clean_input)

#tag tokenized words
tagged_words = pos_tagger.tag(words)

#### Noun-Token Ratio

In [None]:
# NOUN COUNT
noun_count = 0
for word, tag in tagged_words:
    tag = tag.split('|')[-1] #removes word before |
    if tag.startswith('NN'):
        noun_count += 1
    
print("Number of nouns: ", noun_count)

# NOUN TOKEN RATIO
# = noun_count/total_token_count
total_token_count = len(words)
noun_token_ratio = noun_count/total_token_count

print("Total number of tokens: ", total_token_count)
print("Noun-Token Ratio: ", noun_token_ratio)

#### Verb-Token Ratio

In [None]:
# VERB COUNT
verb_count = 0
for word, tag in tagged_words:
    tag = tag.split('|')[-1] #removes word before |
    if tag.startswith('VB'):
        verb_count += 1
        
print("Number of verbs: ", verb_count)

# VERB TOKEN RATIO
# = verb_count/total_token_count
total_token_count = len(words)
verb_token_ratio = verb_count/total_token_count

print("Total number of tokens: ", total_token_count)
print("Noun-Token Ratio: ", verb_token_ratio)

#### Type-Token Ratio

In [None]:
# count unique lexical categories
unique_categories = set()
for _, tag in tagged_words:
    tag = tag.split('|')[-1] #removes word before |
    if len(tag) >= 2:  # make sure the tag is not empty
        category = tag[:2]  # extract the first two letters
        unique_categories.add(category)

print("Unique Categories:", unique_categories)

#NUMBER OF UNIQUE CATEGORIES
num_categories = len(unique_categories)
print("Number of Unique Categories:", num_categories)

# TOTAL NUM OF TOKENS
total_token_count = len(words)

# TYPE TOKEN RATIO
ttr = num_categories/total_token_count
print("Type-Token Ratio: ", ttr)

#ROOT TTR
root_ttr = num_categories/math.sqrt(total_token_count)
print("Root Type-Token Ratio: ", root_ttr)

#CORR TTR
corr_ttr = num_categories/math.sqrt(2*total_token_count)
print("Corrected Type-Token Ratio: ", corr_ttr)

#BILOGARITHMIC TTR
log_ttr = math.log(num_categories)/math.log(total_token_count)
print("Bilogarithmic Type-Token Ratio: ", log_ttr)

#### Lexical Density

In [None]:
# NUMBER OF LEXICAL WORDS
# count number of nouns, verbs, adjectives, and adverbs
num_lexwords = 0
for word, tag in tagged_words:
    tag = tag.split('|')[-1] #removes word before |
    if tag.startswith('NN') or tag.startswith('VB') or tag.startswith('JJ') or tag.startswith('RB'):
        num_lexwords += 1
        
print("Number of lexical words: ", num_lexwords)

# LEXICAL DENSITY
# = lex_density/total_token_count
total_token_count = len(words)
lex_density = num_lexwords/total_token_count

print("Total number of tokens: ", total_token_count)
print("Lexical Density: ", lex_density)

#### Foreign Word-Token Ratio

In [None]:
# FOREIGN WORD COUNT
fw_count = 0
for word, tag in tagged_words:
    tag = tag.split('|')[-1] #removes word before |
    if tag.startswith('FW'):
        fw_count += 1
        
print("Number of foreign words: ", fw_count)

# FOREIGN WORD - TOKEN RATIO
# = fw_count/total_token_count
total_token_count = len(words)
fw_token_ratio = fw_count/total_token_count

print("Total number of tokens: ", total_token_count)
print("Foreign Word-Token Ratio: ", fw_token_ratio)