# Text Feature Extractor (Sentence Level)
_Click 'Run All' to extract text features from a sentence input._
***

## Traditional Text Features

#### Library Imports & Set-up

In [13]:
#import libraries for traditional text features
import os
from os import path

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.probability import FreqDist
import string
import re
import syllables
import csv
import pandas as pd
import numpy as np
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import stopwordsiso
from stopwordsiso import stopwords

import matplotlib.pyplot as plt
%matplotlib inline

nltk.download('punkt')

#import libraries for lexical text features
import os
import nltk
from nltk import *
from nltk.tag.stanford import StanfordPOSTagger
from nltk.tokenize import word_tokenize
import math


# POS-Tagger SET UP
# input local path to java.exe
java_path = "/usr/bin/java" 
os.environ["JAVAHOME"] = java_path

#path to POS tagger jar
jar_path = '/Users/jerseydayao/Desktop/hckrwmn/repositories/Readability-Level-Identifier'
os.chdir(jar_path)
jar =  jar_path + "/stanford-postagger.jar"

# path to POS tagger model
model_path = jar_path +"/POSTagger/"
model = model_path + "filipino-left5words-owlqn2-distsim-pref6-inf2.tagger"

pos_tagger = StanfordPOSTagger(model, jar, encoding = "utf-8")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jerseydayao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Word Counter

In [14]:
def word_count(text):
    words = text.split()
    return len(words)

#### Average Word Length

In [15]:
def avg_word_length(text):
    words = text.split()
    total_word_length = sum(len(word) for word in words)
    avg = total_word_length / len(words)

    return avg

# print("Average Word Length: ", avg_word_length, "letters")

#### Total Syllables

In [16]:
# https://github.com/itudidyay/Tagalog-Word-Syllabization-Python
# https://pypi.org/project/syllables/

vowels = 'aeiou'
consonants = 'bcdfghjklmnpqrstvwxyz'

def count_syllables(text):

    total_syllables = 0
    monosyl_count = 0
    polysyl_count = 0
    
    tokens = word_tokenize(text)

    for token in tokens:
        syllable_count = 0
        for char in token:
            if char.lower() in vowels:
                total_syllables += 1
                syllable_count += 1
        
        # edge cases
        if token == 'ng' or token == 'mga': # edge case ng, mga
            total_syllables += 1
            syllable_count += 1
        
        elif (('io') in token): # edge case -io in names/surnames
            total_syllables -= 1
            syllable_count -= 1
            
        if syllable_count == 1:
            monosyl_count += 1
        elif syllable_count > 1:
            polysyl_count += 1

    return total_syllables, monosyl_count, polysyl_count

# def main():
#     total_syllables = count_syllables(clean_input)

#     print(f"Total syllables in the text file: {total_syllables}")
#     print(f"Number of monosyllabic words: {monosyl_count}")
#     print(f"Number of polysyllabic words: {polysyl_count}")

# if __name__ == "__main__":
#     main()

***
***
## Lexical Text Features
#### Input Tokenization & POS Tagging

In [17]:
# #tokenize text input
# words = nltk.word_tokenize('nagsimula ang lahat sa masukal na bakuran ni aling salvacion')
# print(words)

# #tag tokenized words
# tagged_words = pos_tagger.tag(words)

#### Noun-Token Ratio

In [18]:
def ntr(words, tagged):

    # NOUN COUNT
    noun_count = 0
    for word, tag in tagged:
        tag = tag.split('|')[-1] #removes word before |
        if tag.startswith('NN'):
            noun_count += 1
        
    # print("Number of nouns: ", noun_count)

    # NOUN TOKEN RATIO
    # = noun_count/total_token_count
    total_token_count = len(words)
    noun_token_ratio = noun_count/total_token_count

    return noun_token_ratio

# print("Total number of tokens: ", total_token_count)
# print("Noun-Token Ratio: ", noun_token_ratio)

#### Verb-Token Ratio

In [27]:
def vtr(words, tagged):
    # VERB COUNT
    verb_count = 0
    for word, tag in tagged:
        tag = tag.split('|')[-1] #removes word before |
        if tag.startswith('VB'):
            verb_count += 1
            
    # print("Number of verbs: ", verb_count)

    # VERB TOKEN RATIO
    # = verb_count/total_token_count
    total_token_count = len(words)
    verb_token_ratio = verb_count/total_token_count

    return verb_token_ratio

# print("Total number of tokens: ", total_token_count)
# print("Noun-Token Ratio: ", verb_token_ratio)

#### Type-Token Ratio

In [20]:
def ttr(words, tagged):
# count unique lexical categories
    unique_categories = set()
    for _, tag in tagged:
        tag = tag.split('|')[-1] #removes word before |
        if len(tag) >= 2:  # make sure the tag is not empty
            category = tag[:2]  # extract the first two letters
            unique_categories.add(category)

    # print("Unique Categories:", unique_categories)

    #NUMBER OF UNIQUE CATEGORIES
    num_categories = len(unique_categories)
    # print("Number of Unique Categories:", num_categories)

    # TOTAL NUM OF TOKENS
    total_token_count = len(words)

    # TYPE TOKEN RATIO
    ttr = num_categories/total_token_count
    # print("Type-Token Ratio: ", ttr)

    #ROOT TTR
    root_ttr = num_categories/math.sqrt(total_token_count)
    # print("Root Type-Token Ratio: ", root_ttr)

    #CORR TTR
    corr_ttr = num_categories/math.sqrt(2*total_token_count)
    # print("Corrected Type-Token Ratio: ", corr_ttr)

    #BILOGARITHMIC TTR
    denominator = math.log(total_token_count)

    if denominator == 0:
        log_ttr = 0
    else:
        log_ttr = math.log(num_categories)/math.log(total_token_count)
    # print("Bilogarithmic Type-Token Ratio: ", log_ttr)

    return ttr, root_ttr, corr_ttr, log_ttr

#### Lexical Density

In [21]:

def lexical_density(words, tagged):

    # NUMBER OF LEXICAL WORDS
    # count number of nouns, verbs, adjectives, and adverbs
    num_lexwords = 0
    for word, tag in tagged:
        tag = tag.split('|')[-1] #removes word before |
        if tag.startswith('NN') or tag.startswith('VB') or tag.startswith('JJ') or tag.startswith('RB'):
            num_lexwords += 1
            
    # print("Number of lexical words: ", num_lexwords)

    # LEXICAL DENSITY
    # = lex_density/total_token_count
    total_token_count = len(words)
    lex_density = num_lexwords/total_token_count

    return lex_density

# print("Total number of tokens: ", total_token_count)
# print("Lexical Density: ", lex_density)

#### Foreign Word-Token Ratio

In [22]:
def fwtr(words, tagged):
    # FOREIGN WORD COUNT
    fw_count = 0
    for word, tag in tagged:
        tag = tag.split('|')[-1] #removes word before |
        if tag.startswith('FW'):
            fw_count += 1
            
    # print("Number of foreign words: ", fw_count)

    # FOREIGN WORD - TOKEN RATIO
    # = fw_count/total_token_count
    total_token_count = len(words)
    fw_token_ratio = fw_count/total_token_count

    return fw_token_ratio

# print("Total number of tokens: ", total_token_count)
# print("Foreign Word-Token Ratio: ", fw_token_ratio)

#### Sentence-Level Input

In [28]:
path = '/Users/jerseydayao/Desktop/hckrwmn/repositories/Readability-Level-Identifier/archive/token-sentences'
os.chdir(path)

csv_header = ['Title', 'Word Count', 'AVG Word Length', 'Total Syllables', 'MONOSYLL', 'POLYSYLL', 'NTR', 'VTR', 'TTR', 'Root TTR', 'Corrected TTR', 'BiLog TTR', 'LD', 'FWTR', 'Age Classification']
data = []

def read_text_file(file_path):
    path, file_name = os.path.split(file_path)
    suffix = '_sentenceTokens.txt'
    if file_name.lower().endswith(suffix.lower()):
        file_name = file_name[: -len(suffix)]   # e.g. Tahan na Tahanan

    with open(file_path, 'r') as file:
        print('-------------------------------------------------')
        print(file_name)
        print('-------------------------------------------------')
        
        sentence = 0
        for line in file:

            # added the age at Line 0 of each sentence token script          
            if sentence == 0:
                age = line.strip()  # output: X-Y
                sentence += 1
                continue

            csv_data = [file_name]

            # print('Sentence ', sentence)

            # TRAD
            # print('WORD COUNT: ', word_count(line))
            csv_data.append(word_count(line))

            # print('AVG WORD LENGTH: ', avg_word_length(line))
            csv_data.append(avg_word_length(line))

            # print('TOTAL SYLLABLES: ', count_syllables(line)[0])
            # print('MONOSYLLABIC: ', count_syllables(line)[1])
            # print('POLYSYLLABIC: ', count_syllables(line)[2])
            csv_data.append(count_syllables(line)[0])
            csv_data.append(count_syllables(line)[1])
            csv_data.append(count_syllables(line)[2])


            # LEX
            wordsss = nltk.word_tokenize(line)
            tagged_words = pos_tagger.tag(wordsss)
            # print('NTR: ', ntr(wordsss, tagged_words))
            csv_data.append(ntr(wordsss, tagged_words))

            # print('VTR: ', vtr(wordsss, tagged_words))
            csv_data.append(vtr(wordsss, tagged_words))

            # print('TTR: ', ttr(wordsss, tagged_words)[0])
            # print('Root-TTR: ', ttr(wordsss, tagged_words)[1])
            # print('Corrected-TTR: ', ttr(wordsss, tagged_words)[2])
            # print('Bilogarithmic-TTR: ', ttr(wordsss, tagged_words)[3])
            csv_data.append(ttr(wordsss, tagged_words)[0])
            csv_data.append(ttr(wordsss, tagged_words)[1])
            csv_data.append(ttr(wordsss, tagged_words)[2])
            csv_data.append(ttr(wordsss, tagged_words)[3])

            # print('Lexical Density: ', lexical_density(wordsss, tagged_words))
            csv_data.append(lexical_density(wordsss, tagged_words))

            # print('FWTR: ', fwtr(wordsss, tagged_words))
            csv_data.append(fwtr(wordsss, tagged_words))

            csv_data.append(age)
            data.append(csv_data)
            print(data)

            sentence += 1

    return '0'

for file in os.listdir():
    if file.endswith('.txt'):
        file_path = f'{path}/{file}'
        read_text_file(file_path)


-------------------------------------------------
Tahan na Tahanan
-------------------------------------------------
Sentence  1
[['Tahan na Tahanan', 8, 4.25, 16, 3, 5, 0.25, 0.25, 0.625, 1.7677669529663687, 1.25, 0.7739760316291208, 0.5, 0.0, '6-12']]
Sentence  2
[['Tahan na Tahanan', 8, 4.25, 16, 3, 5, 0.25, 0.25, 0.625, 1.7677669529663687, 1.25, 0.7739760316291208, 0.5, 0.0, '6-12'], ['Tahan na Tahanan', 7, 5.142857142857143, 16, 2, 5, 0.14285714285714285, 0.2857142857142857, 0.7142857142857143, 1.889822365046136, 1.3363062095621219, 0.8270874753469162, 0.42857142857142855, 0.0, '6-12']]
Sentence  3
[['Tahan na Tahanan', 8, 4.25, 16, 3, 5, 0.25, 0.25, 0.625, 1.7677669529663687, 1.25, 0.7739760316291208, 0.5, 0.0, '6-12'], ['Tahan na Tahanan', 7, 5.142857142857143, 16, 2, 5, 0.14285714285714285, 0.2857142857142857, 0.7142857142857143, 1.889822365046136, 1.3363062095621219, 0.8270874753469162, 0.42857142857142855, 0.0, '6-12'], ['Tahan na Tahanan', 8, 4.75, 16, 4, 4, 0.0, 0.125, 0.25

KeyboardInterrupt: 