# Compiled Scripts for Text Feature Extraction


> _Click 'Run All' to extract the text features of a text file._


***

#### Import necessary packages

In [125]:
import os

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.probability import FreqDist
import string
import syllables
import pandas as pd

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

***

### Input Filename

In [126]:
path = os.getcwd() + "/clean-txt/"      # gets path to 'clean-txt' directory
filename = input("Input filename (must be in the clean-txt folder): ")

### Word Counter

In [127]:
file = open(path + filename, "rt")
data = file.read()
words = data.split()

print("Number of Words: ", len(words))

Number of Words:  973


### Sentence Counter

In [128]:
folder = nltk.data.find(path)
corpusReader = nltk.corpus.PlaintextCorpusReader(folder, filename)

print("Number of Sentences: ", len(corpusReader.sents()))

Number of Sentences:  85


### Average Word Length

In [129]:
with open(path + "/" + filename, 'r') as file:
    word_length = [len(word) for line in file for word in line.rstrip().split(" ")]
    word_avg = sum(word_length)/len(word_length)
    
print("Average Word Length: ", word_avg, "letters")

Average Word Length:  5.216380182002022 letters


### Average Sentence Length

In [130]:
folder = nltk.data.find(path)
corpusReader = nltk.corpus.PlaintextCorpusReader(folder, filename)

# SOURCE: https://stackoverflow.com/questions/35900029/average-sentence-length-for-every-text-in-corpus-python3-nltk
avg = sum(len(sent) for sent in corpusReader.sents()) / len(corpusReader.sents())
print("Average Sentence Length: ", avg, "words")

Average Sentence Length:  13.823529411764707 words


### Total Syllables

In [131]:
# https://github.com/itudidyay/Tagalog-Word-Syllabization-Python
# https://pypi.org/project/syllables/

vowels = 'aeiou'
consonants = 'bcdfghjklmnpqrstvwxyz'

total_syllables = 0

def count_syllables(text):
    global total_syllables
    tokens = word_tokenize(text)

    for token in tokens:
        for char in token:
            if char in vowels:
                total_syllables += 1
        
        # edge cases
        if token == 'ng' or token == 'mga': # edge case ng, mga
            total_syllables += 1
        
        elif (('io') in token): # edge case -io in names/surnames
            total_syllables -= 1

    return total_syllables

def main():

    # Read the text file
    with open(os.path.join(path, filename), 'r', encoding='utf-8') as file:
        text = file.read()

    total_syllables = count_syllables(text)

    print(f"Total syllables in the text file: {total_syllables}")

if __name__ == "__main__":
    main()

Total syllables in the text file: 2096


### Word Frequency

> _Outputs will be placed in the 'word-freq output' folder

In [132]:

# Read the text file
with open(os.path.join(path, filename), 'r', encoding='utf-8') as file:
    text = file.read()

temp_tokens = word_tokenize(text)
text_tokens = [word for word in temp_tokens if word.isalnum()] # removes punctuation marks
fdist = FreqDist(text_tokens)

# Create a DataFrame from the frequency distribution
df_fdist = pd.DataFrame.from_dict(fdist, orient='index', columns=['Frequency'])
df_fdist.index.name = 'Word'

# Sort the DataFrame by frequency in descending order
df_fdist_sorted = df_fdist.sort_values(by='Frequency', ascending=False)

print(df_fdist_sorted)

out_path = os.getcwd() + "/word-freq output"
out_filename = "[wordfreq] " + filename.removesuffix('_cleaned.txt') + ".csv"
df_fdist_sorted.to_csv(os.path.join(out_path, out_filename), encoding='utf-8')


                 Frequency
Word                      
ng                      61
ang                     60
sa                      54
mga                     39
na                      38
...                    ...
binti                    1
pilay                    1
8                        1
jinky                    1
nangangailangan          1

[429 rows x 1 columns]
