# Text Feature Extractor (Book Level)
_Click 'Run All' to extract text features from a book input._
***

## Traditional Text Features

#### Library Imports

In [None]:
#import libraries for traditional text features
import os
from os import path

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.probability import FreqDist
import string
import re
import syllables
import csv
import pandas as pd
import numpy as np
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import stopwordsiso
from stopwordsiso import stopwords

import matplotlib.pyplot as plt

nltk.download('punkt')

#import libraries for lexical text features
import os
import nltk
from nltk import *
from nltk.tag.stanford import StanfordPOSTagger
from nltk.tokenize import word_tokenize
import math


# Resolve the project root so the notebook works whether it is started from the
# project folder or from its 'clean-txt' sub-folder (Windows-style separator).
# The backslash is escaped explicitly: '\c' is not a valid escape sequence and
# newer Python versions warn about it.
curr_path = os.getcwd().replace('\\clean-txt', '')

# POS-Tagger SET UP
# Local path to java.exe -- machine specific, adjust to the local Java install.
java_path = "C:/Program Files/Java/jre1.8.0_341/bin/java.exe" 
os.environ["JAVAHOME"] = java_path

# Work from the project root so the relative resources below can be found.
os.chdir(curr_path)

print(curr_path)

# path to POS tagger jar
jar =  curr_path + "/stanford-postagger.jar"

# path to POS tagger model
model_path = curr_path +"/POSTagger/"
model = model_path + "filipino-left5words-owlqn2-distsim-pref6-inf2.tagger"

# Tagger instance shared by the feature-extraction cells below.
pos_tagger = StanfordPOSTagger(model, jar, encoding = "utf-8")

#### File Input

In [None]:
# path = os.getcwd() + "/clean-txt/"      # gets path to 'clean-txt' directory
# filename = input("Input text filename: ")

#### Word Counter

In [None]:
def word_count(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())

#### Sentence Counter

In [None]:
# folder = nltk.data.find(path)
# corpusReader = nltk.corpus.PlaintextCorpusReader(folder, filename)

# print("Number of Sentences: ", len(corpusReader.sents()))

def sentence_count(text):
    """Return how many sentences ``nltk.sent_tokenize`` finds in ``text``."""
    return len(nltk.sent_tokenize(text))

#### Average Word Length

In [None]:
# with open(path + "/" + filename, 'r') as file:
#     word_length = [len(word) for line in file for word in line.rstrip().split(" ")]
#     word_avg = sum(word_length)/len(word_length)
    
# print("Average Word Length: ", word_avg, "letters")

def avg_word_length(text):
    """Return the mean length, in characters, of the words in ``text``.

    Words are the whitespace-separated tokens of ``text``; punctuation attached
    to a word counts towards its length.

    Returns 0.0 when ``text`` contains no words (empty or whitespace-only),
    instead of dividing by zero.
    """
    words = text.split()
    if not words:
        return 0.0

    return sum(map(len, words)) / len(words)

# print("Average Word Length: ", avg_word_length, "letters")

#### Average Sentence Length

In [None]:

# folder = nltk.data.find(path)
# corpusReader = nltk.corpus.PlaintextCorpusReader(folder, filename)

# # SOURCE: https://stackoverflow.com/questions/35900029/average-sentence-length-for-every-text-in-corpus-python3-nltk
# avg = sum(len(sent) for sent in corpusReader.sents()) / len(corpusReader.sents())
# print("Average Sentence Length: ", avg, "words")

def avg_sent_length(text):
    """Return the mean number of whitespace-separated words per sentence.

    Sentences are found with ``nltk.sent_tokenize``.

    Returns 0.0 when ``text`` has no sentences (empty or whitespace-only),
    instead of dividing by zero.
    """
    # Nothing to tokenize: avoid the division by zero below.
    if not text.strip():
        return 0.0

    sentences = nltk.sent_tokenize(text)
    if not sentences:  # defensive: tokenizer produced nothing
        return 0.0

    return sum(len(sent.split()) for sent in sentences) / len(sentences)
    

#### Total Syllables

In [None]:
# https://github.com/itudidyay/Tagalog-Word-Syllabization-Python
# https://pypi.org/project/syllables/

vowels = 'aeiou'
consonants = 'bcdfghjklmnpqrstvwxyz'


def count_syllables(text):
    """Estimate syllable statistics for ``text`` by counting vowels per token.

    Each token from ``word_tokenize`` gets one syllable per vowel letter, with
    two adjustments: the tokens 'ng' and 'mga' get one extra syllable, and a
    token containing 'io' gets one fewer. All comparisons are
    case-insensitive, so capitalised tokens such as 'Mga' are adjusted the
    same way as their lowercase forms.

    Returns:
        A tuple ``(total_syllables, monosyl_count, polysyl_count)``: the
        syllable total over all tokens, the number of tokens with exactly one
        syllable, and the number of tokens with more than one.
    """
    total_syllables = 0
    monosyl_count = 0
    polysyl_count = 0

    for token in word_tokenize(text):
        # Normalise case once so the vowel count and the edge cases agree.
        word = token.lower()
        syllable_count = sum(1 for char in word if char in vowels)

        # edge cases
        if word in ('ng', 'mga'):  # edge case ng, mga
            syllable_count += 1
        elif 'io' in word:  # edge case -io in names/surnames
            syllable_count -= 1

        total_syllables += syllable_count

        if syllable_count == 1:
            monosyl_count += 1
        elif syllable_count > 1:
            polysyl_count += 1

    return total_syllables, monosyl_count, polysyl_count

# def main():
#     total_syllables = count_syllables(clean_input)

#     print(f"Total syllables in the text file: {total_syllables}")
#     print(f"Number of monosyllabic words: {monosyl_count}")
#     print(f"Number of polysyllabic words: {polysyl_count}")

# if __name__ == "__main__":
#     main()

#### Word Frequency
> _Outputs will be placed in the 'word-freq output' folder._

In [None]:

# # Read the text file
# with open(os.path.join(path, filename), 'r', encoding='utf-8') as file:
#     text = file.read()

def word_freq(path, filename):
    """Write a word-frequency CSV and a word-cloud PNG for one cleaned text.

    The text is tokenized, lowercased, stripped of the 'tl' stop words from
    ``stopwordsiso`` and of non-alphanumeric tokens, then counted.

    Outputs (created if missing):
        * ``<out>/[wordfreq] <name>.csv`` -- words sorted by descending
          frequency;
        * ``<out>/wordcloud/<name>.png`` -- the rendered word cloud, which is
          also shown with matplotlib;
    where ``<out>`` is ``path`` with '/clean-txt' replaced by
    '/word-freq output' and ``<name>`` is ``filename`` without its
    '_cleaned.txt' suffix.

    Args:
        path: Directory containing the cleaned text file.
        filename: Name of the text file inside ``path``.
    """
    stop_words = set(stopwords('tl'))

    # Only the read needs the file handle; release it before the heavy work.
    with open(os.path.join(path, filename), 'r', encoding='utf-8') as file:
        text = file.read()

    # Lowercase, drop stop words, then drop punctuation / non-alphanumerics.
    text_tokens = [
        word
        for word in (token.lower() for token in word_tokenize(text))
        if word not in stop_words and word.isalnum()
    ]
    fdist = FreqDist(text_tokens)

    # Create a DataFrame from the frequency distribution
    df_fdist = pd.DataFrame.from_dict(fdist, orient='index', columns=['Frequency'])
    df_fdist.index.name = 'Word'

    # Sort the DataFrame by frequency in descending order
    df_fdist_sorted = df_fdist.sort_values(by='Frequency', ascending=False)

    out_path = path.replace('/clean-txt', '/word-freq output')
    cloud_dir = os.path.join(out_path, 'wordcloud')
    # Creates both the CSV folder and its 'wordcloud' sub-folder if absent.
    os.makedirs(cloud_dir, exist_ok=True)

    base_name = filename.removesuffix('_cleaned.txt')
    out_filename = "[wordfreq] " + base_name + ".csv"
    df_fdist_sorted.to_csv(os.path.join(out_path, out_filename), encoding='utf-8')

    # NOTE(review): WordCloud.generate is expected to reject input with no
    # words, so skip the image when nothing survived filtering -- confirm.
    if not text_tokens:
        return

    wordcloud = WordCloud(width=800, height=800, background_color='white', min_font_size=10)
    wordcloud.generate(' '.join(text_tokens))

    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

    wordcloud.to_file(os.path.join(cloud_dir, base_name + ".png"))


***
***
## Lexical Text Features

#### Import Libraries & POS Tagger Set-up

In [None]:


# #SET UP FOR POS TAGGER
# # input local path to java.exe
# java_path = "C:/Program Files/Java/jre1.8.0_341/bin/java.exe" 
# os.environ["JAVAHOME"] = java_path

# #path to POS tagger jar
# jar_path = os.getcwd()
# jar =  jar_path + "/stanford-postagger.jar"

# # path to POS tagger model
# model_path = jar_path +"/POSTagger/"
# model = model_path + "filipino-left5words-owlqn2-distsim-pref6-inf2.tagger"

# pos_tagger = StanfordPOSTagger(model, jar, encoding = "utf-8")

#### Input Tokenization & POS Tagging

In [None]:
# with open(os.path.join(path, filename), 'r', encoding='utf-8') as file:
#     text = file.read()

# words = nltk.word_tokenize(text)

# #tag tokenized words
# tagged_words = pos_tagger.tag(words)

#### Noun-Token Ratio

In [None]:
# # NOUN COUNT
# noun_count = 0
# for word, tag in tagged_words:
#     tag = tag.split('|')[-1] #removes word before |
#     if tag.startswith('NN'):
#         noun_count += 1
    
# print("Number of nouns: ", noun_count)

# # NOUN TOKEN RATIO
# # = noun_count/total_token_count
# total_token_count = len(words)
# noun_token_ratio = noun_count/total_token_count

# print("Total number of tokens: ", total_token_count)
# print("Noun-Token Ratio: ", noun_token_ratio)

def ntr(words, tagged):
    """Return the noun-token ratio: noun-tagged tokens divided by ``len(words)``.

    Args:
        words: The tokens of the text; only its length is used.
        tagged: Iterable of ``(word, tag)`` pairs. Only the part of ``tag``
            after the last '|' is inspected; a token counts as a noun when
            that part starts with 'NN'.

    Returns:
        The ratio as a float, or 0.0 when ``words`` is empty (instead of
        dividing by zero).
    """
    total_token_count = len(words)
    if total_token_count == 0:
        return 0.0

    # NOUN COUNT
    noun_count = sum(
        1 for _, tag in tagged if tag.split('|')[-1].startswith('NN')
    )

    # NOUN TOKEN RATIO = noun_count / total_token_count
    return noun_count / total_token_count

# print("Total number of tokens: ", total_token_count)
# print("Noun-Token Ratio: ", noun_token_ratio)

#### Verb-Token Ratio

In [None]:
def vtr(words, tagged):
    """Return the verb-token ratio: verb-tagged tokens divided by ``len(words)``.

    Args:
        words: The tokens of the text; only its length is used.
        tagged: Iterable of ``(word, tag)`` pairs. Only the part of ``tag``
            after the last '|' is inspected; a token counts as a verb when
            that part starts with 'VB'.

    Returns:
        The ratio as a float, or 0.0 when ``words`` is empty (instead of
        dividing by zero).
    """
    total_token_count = len(words)
    if total_token_count == 0:
        return 0.0

    # VERB COUNT
    verb_count = sum(
        1 for _, tag in tagged if tag.split('|')[-1].startswith('VB')
    )

    # VERB TOKEN RATIO = verb_count / total_token_count
    return verb_count / total_token_count

# print("Total number of tokens: ", total_token_count)
# print("Verb-Token Ratio: ", verb_token_ratio)

#### Type-Token Ratio

In [None]:
def ttr(words, tagged):
    """Return four type-token ratio variants for a tagged token list.

    Note that a "type" here is a distinct lexical category -- the first two
    characters of each tag (after the last '|') -- not a distinct word. Tags
    shorter than two characters are ignored.

    Args:
        words: The tokens of the text; only its length is used.
        tagged: Iterable of ``(word, tag)`` pairs.

    Returns:
        A tuple ``(ttr, root_ttr, corr_ttr, log_ttr)`` where, with ``c`` the
        number of distinct categories and ``n = len(words)``:
            ttr      = c / n
            root_ttr = c / sqrt(n)
            corr_ttr = c / sqrt(2 * n)
            log_ttr  = log(c) / log(n), or 0 when either logarithm is
                       undefined or zero (c == 0 or n == 1).
        All four values are 0.0 when ``words`` is empty.
    """
    # count unique lexical categories
    unique_categories = {
        tag[:2]
        for tag in (raw_tag.split('|')[-1] for _, raw_tag in tagged)
        if len(tag) >= 2  # skip empty / one-character tags
    }

    # NUMBER OF UNIQUE CATEGORIES
    num_categories = len(unique_categories)

    # TOTAL NUM OF TOKENS
    total_token_count = len(words)
    if total_token_count == 0:
        # Every ratio below would divide by zero (or take log(0)).
        return 0.0, 0.0, 0.0, 0.0

    # TYPE TOKEN RATIO
    type_token_ratio = num_categories / total_token_count

    # ROOT TTR
    root_ttr = num_categories / math.sqrt(total_token_count)

    # CORR TTR
    corr_ttr = num_categories / math.sqrt(2 * total_token_count)

    # BILOGARITHMIC TTR
    log_tokens = math.log(total_token_count)
    if log_tokens == 0 or num_categories == 0:
        # log(1) == 0 would divide by zero; log(0) is undefined.
        log_ttr = 0
    else:
        log_ttr = math.log(num_categories) / log_tokens

    return type_token_ratio, root_ttr, corr_ttr, log_ttr

#### Lexical Density

In [None]:

def lexical_density(words, tagged):
    """Return the share of tokens that are lexical (content) words.

    A token is lexical when the part of its tag after the last '|' starts
    with 'NN', 'VB', 'JJ' or 'RB' (nouns, verbs, adjectives, adverbs).

    Args:
        words: The tokens of the text; only its length is used.
        tagged: Iterable of ``(word, tag)`` pairs.

    Returns:
        ``num_lexwords / len(words)`` as a float, or 0.0 when ``words`` is
        empty (instead of dividing by zero).
    """
    total_token_count = len(words)
    if total_token_count == 0:
        return 0.0

    # NUMBER OF LEXICAL WORDS
    # count number of nouns, verbs, adjectives, and adverbs
    lexical_prefixes = ('NN', 'VB', 'JJ', 'RB')
    num_lexwords = sum(
        1 for _, tag in tagged
        if tag.split('|')[-1].startswith(lexical_prefixes)
    )

    # LEXICAL DENSITY = num_lexwords / total_token_count
    return num_lexwords / total_token_count

# print("Total number of tokens: ", total_token_count)
# print("Lexical Density: ", lex_density)

#### Foreign Word-Token Ratio

In [None]:
def fwtr(words, tagged):
    """Return the foreign-word-token ratio for a tagged token list.

    Args:
        words: The tokens of the text; only its length is used.
        tagged: Iterable of ``(word, tag)`` pairs. Only the part of ``tag``
            after the last '|' is inspected; a token counts as a foreign word
            when that part starts with 'FW'.

    Returns:
        ``fw_count / len(words)`` as a float, or 0.0 when ``words`` is empty
        (instead of dividing by zero).
    """
    total_token_count = len(words)
    if total_token_count == 0:
        return 0.0

    # FOREIGN WORD COUNT
    fw_count = sum(
        1 for _, tag in tagged if tag.split('|')[-1].startswith('FW')
    )

    # FOREIGN WORD - TOKEN RATIO = fw_count / total_token_count
    return fw_count / total_token_count

# print("Total number of tokens: ", total_token_count)
# print("Foreign Word-Token Ratio: ", fw_token_ratio)

# Extract Features

#### Book-Level Input

In [None]:

# Folder with the cleaned book texts; it becomes the working directory.
path = f'{curr_path}/clean-txt'
os.chdir(path)

# One column per value appended to a row by read_text_file, in the same order.
csv_header = [
    'Book Title',
    'Word Count',
    'Sentence Count',
    'AVG Word Length',
    'AVG Sentence Length',
    'Total Syllables',
    'MONOSYLL',
    'POLYSYLL',
    'NTR',
    'VTR',
    'TTR',
    'Root TTR',
    'Corrected TTR',
    'BiLog TTR',
    'LD',
    'FWTR',
    'MIN',
    'MAX',
]
# Rows collected across all books; filled by read_text_file.
data = list()

def read_text_file(file_path):
    """Extract one feature row per text line of a cleaned book file.

    The first line of the file must be an age range of the form ``X-Y``; it
    is parsed into the MIN and MAX values attached to every row. Each
    following non-blank line is treated as one unit of text: the traditional
    features and the POS-based lexical features are computed for it and the
    resulting row is appended to the module-level ``data`` list, in the column
    order of ``csv_header``. Blank lines are skipped because they carry no
    tokens to compute ratios from.

    Args:
        file_path: Path to the text file. A trailing '_cleaned.txt'
            (case-insensitive) is stripped from the file name to form the
            book title stored in each row.

    Returns:
        The string '0', always.

    Raises:
        ValueError: If the first line is not two integers separated by a
            single '-'.
    """
    _, file_name = os.path.split(file_path)
    suffix = '_cleaned.txt'
    if file_name.lower().endswith(suffix.lower()):
        file_name = file_name[: -len(suffix)]   # e.g. Tahan na Tahanan

    # 'utf-8-sig' drops a leading byte-order mark, if the file has one.
    with open(file_path, 'r', encoding='utf-8-sig') as file:

        print('-------------------------------------------------')
        print(file_name)
        print('-------------------------------------------------')

        # The first line of each file holds the target age range, e.g. "X-Y".
        header = file.readline()
        if not header:
            return '0'  # empty file: nothing to extract

        min_str, max_str = header.strip().split("-")
        min_age = int(min_str)
        max_age = int(max_str)

        for line in file:
            if not line.strip():
                continue  # no tokens -> every ratio would be undefined

            # TRAD
            row = [
                file_name,
                word_count(line),
                sentence_count(line),
                avg_word_length(line),
                avg_sent_length(line),
                # (total syllables, monosyllabic, polysyllabic) in one pass
                *count_syllables(line),
            ]

            # LEX
            tokens = nltk.word_tokenize(line)
            tagged_words = pos_tagger.tag(tokens)

            row.append(ntr(tokens, tagged_words))
            row.append(vtr(tokens, tagged_words))
            # (TTR, Root TTR, Corrected TTR, BiLog TTR) from a single call
            row.extend(ttr(tokens, tagged_words))
            row.append(lexical_density(tokens, tagged_words))
            row.append(fwtr(tokens, tagged_words))

            row.append(min_age)
            row.append(max_age)
            data.append(row)

            # Progress output: only the new row, not the whole accumulated
            # list, so the output does not grow quadratically.
            print(row)

    return '0'

# Extract features from every .txt file in the clean-txt folder.
for file in os.listdir(path):
    if not file.endswith('.txt'):
        continue
    file_path = f'{path}/{file}'
    read_text_file(file_path)

# Return to the project root so the CSV is written there.
main = curr_path
os.chdir(main)

# One row per processed line, with the columns named by csv_header.
df = pd.DataFrame(data=data, columns=csv_header)
df.to_csv('book.csv')

#### Word Cloud

In [None]:
# For wordcloud
# Resolve the project root whether the notebook is run from the project folder
# or from its 'clean-txt' sub-folder (Windows-style separator). The backslash
# is escaped explicitly: '\c' is not a valid escape sequence and newer Python
# versions warn about it.
curr_path = os.getcwd().replace('\\clean-txt', '')

clean_txt_path = curr_path + '/clean-txt'
os.chdir(curr_path)
print(curr_path)

# Produce the word-frequency CSV and word cloud for every cleaned text.
for file in os.listdir(clean_txt_path):
    if file.endswith('.txt'):
        word_freq(clean_txt_path, file)