In [103]:
import nltk
import ssl
import urllib.request
import re
from itertools import chain
from utils.Speech_terms import speech_terms as st

# Notebook-level lexicon object; the text_schema methods defined below read
# `speech_terms` as a global when counting hesitation and memory words.
speech_terms = st()
# Print the single-word hesitation lexicon as a quick sanity check.
print(speech_terms._hesitation_words)

# One-time setup: uncomment to download the NLTK resources
# (presumably required by the tokenizers and the POS tagger used later — confirm
# against the installed NLTK version).
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

('like', 'uh', 'so', 'um', 'actually', 'basically', 'just', 'right', 'seem', 'seriously', 'well', 'literally', 'totally')


In [139]:
# read in ASR model input info

time = 50000  # ms
# Read the transcript inside a context manager so the file handle is released
# even if reading fails. `f` stays bound to the (closed) file object afterwards,
# so a later f.close() is a harmless no-op.
with open("./test_text.txt", "r") as f:
    text = f.read().lower()

In [140]:
# Schema for info recording
class text_schema:
    """Feature record for one transcript.

    Holds token/word/punctuation counts, part-of-speech counts and rates,
    hesitation counts and memory-word counts.

    Expected call order: text_word_frequency, count_word_and_punctuations,
    count_pos, spontaneous_speech_features, semantic_features. Every rate uses
    ``self.tokens`` as its denominator, so count_word_and_punctuations has to
    run before the methods that compute rates; with zero tokens a rate is 0.0
    instead of raising ZeroDivisionError.
    """

    # Part-of-speech tag groups (Penn Treebank style tags, as checked in count_pos).
    _NOUN_TAGS = frozenset({"NN", "NNS", "NNP", "NNPS"})
    _VERB_TAGS = frozenset({"VBZ", "VBP", "VBN", "VBG", "VBD", "VB"})
    _ADJ_TAGS = frozenset({"JJ", "JJR", "JJS"})
    _ADV_TAGS = frozenset({"RB", "RBR", "RBS"})
    _PRON_TAGS = frozenset({"PRP", "PRP$"})
    _CONJ_TAGS = frozenset({"CC"})

    # Content words are words with specific meanings, such as nouns, adjectives,
    # adverbs, and main verbs.
    # NOTE(review): the tags alone do not separate auxiliary from main verbs, so
    # every verb tag is treated as a content word — confirm the intended definition.
    _CONTENT_TAGS = _NOUN_TAGS | _VERB_TAGS | _ADJ_TAGS | _ADV_TAGS

    # Function words include determiners, conjunctions, prepositions, pronouns,
    # auxiliary verbs, modals, qualifiers, and question words.
    # Auxiliary verbs, modals, qualifiers, and question words are not implemented.
    _FUNCTION_TAGS = frozenset({"DT", "CC", "IN", "PRP", "PRP$"})

    def __init__(self, time, sentences):
        """Create an empty record.

        Args:
            time: duration of the recording in milliseconds (stored, not used
                by any method in this class).
            sentences: sequence of sentences; only its length is kept.
        """
        self.word_frequency = {}         # record the frequency of each word
        self.time = time                 # ms
        self.sentences = len(sentences)  # number of sentences (defined)
        self.tokens = 0                  # number of tokens. The definition of tokens needs to be confirmed (defined)
        self.punctuations = 0            # number of punctuations (defined)
        self.words = 0                   # number of words (defined)
        self.lemmas = 0                  # number of lemmas -> unique words?
        self.lemma_rate = 0              # rate of lemma
        self.noun = 0                    # number of nouns (defined)
        self.verb = 0                    # number of verbs (defined)
        self.adj = 0                     # number of adjectives (defined)
        self.pron = 0                    # number of pronouns (defined)
        self.conj = 0                    # number of conjunctions (defined)
        self.noun_rate = 0               # rate of nouns (defined)
        self.verb_rate = 0               # rate of verbs (defined)
        self.adj_rate = 0                # rate of adjectives (defined)
        self.pron_rate = 0               # rate of pronouns (defined)
        self.conj_rate = 0               # rate of conjunctions (defined)

        self.hesitation = 0              # number of hesitation words (defined)
        self.hesitation_rate = 0         # rate of hesitation words (defined)
        # ---------------------- semantic features ----------------------
        self.uncertain = 0               # number of uncertain words
        self.uncertain_rate = 0          # rate of uncertain words
        self.memory = 0                  # number of memory words (defined)
        self.memory_rate = 0             # rate of memory words (defined)
        self.negation = 0                # number of negation words
        self.content_words = 0           # number of content words
        self.function_words = 0          # number of function words
        self.content_words_rate = 0      # rate of content words
        self.function_words_rate = 0     # rate of function words

    @staticmethod
    def _is_punctuation(token):
        """Return True when the token is non-empty and has no letter or digit."""
        return bool(token) and not any(ch.isalnum() for ch in token)

    def _rate(self, count):
        """Return count / self.tokens, or 0.0 when no token has been counted."""
        return count / self.tokens if self.tokens else 0.0

    def _count_lexicon_hits(self, words, phrases, source_text):
        """Count single-word hits via word_frequency plus substring hits of phrases."""
        hits = sum(self.word_frequency.get(word, 0) for word in words)
        # str.count is a plain substring count; empty phrases are skipped because
        # "".count would match at every position.
        hits += sum(source_text.count(phrase) for phrase in phrases if phrase)
        return hits

    def count_word_and_punctuations(self, tokens):
        """Add the tokens of every sentence to the token/word/punctuation counters.

        Args:
            tokens: iterable of sentences, each an iterable of token strings.

        A token counts as punctuation when it contains no alphanumeric
        character, which covers '.', ',', '?', '!', brackets and the quote
        tokens a tokenizer may emit (e.g. `` and ''); everything else is a word.
        """
        for sentence in tokens:
            for token in sentence:
                self.tokens += 1
                if self._is_punctuation(token):
                    self.punctuations += 1
                else:
                    self.words += 1

    def count_pos(self, pos):
        """Add part-of-speech counts and refresh the part-of-speech rates.

        Args:
            pos: iterable of sentences, each an iterable of items whose second
                element (index 1) is the tag string.

        Each tagged token contributes at most once to content_words and at most
        once to function_words.
        """
        # number of POS
        for sentence in pos:
            for item in sentence:
                tag = item[1]
                if tag in self._NOUN_TAGS:
                    self.noun += 1
                elif tag in self._VERB_TAGS:
                    self.verb += 1
                elif tag in self._ADJ_TAGS:
                    self.adj += 1
                elif tag in self._PRON_TAGS:
                    self.pron += 1
                elif tag in self._CONJ_TAGS:
                    self.conj += 1
                # Content/function membership is tested exactly once per token so
                # nouns and base-form verbs are no longer counted twice.
                if tag in self._CONTENT_TAGS:
                    self.content_words += 1
                if tag in self._FUNCTION_TAGS:
                    self.function_words += 1
        # rate of POS
        self.noun_rate = self._rate(self.noun)
        self.verb_rate = self._rate(self.verb)
        self.adj_rate = self._rate(self.adj)
        self.pron_rate = self._rate(self.pron)
        self.conj_rate = self._rate(self.conj)
        self.content_words_rate = self._rate(self.content_words)
        self.function_words_rate = self._rate(self.function_words)

    def text_word_frequency(self, tokens):
        """Add the occurrences of every token to self.word_frequency.

        Args:
            tokens: iterable of sentences, each an iterable of token strings.

        Counts are updated in place in a single pass instead of rebuilding the
        whole dictionary for every sentence.
        """
        for sentence in tokens:
            for token in sentence:
                self.word_frequency[token] = self.word_frequency.get(token, 0) + 1

    def spontaneous_speech_features(self, source_text=None, terms=None):
        """Compute the number and rate of hesitation words.

        Args:
            source_text: text searched for multi-word hesitation terms. Defaults
                to the notebook-level global ``text``.
            terms: object exposing ``_hesitation_words`` and
                ``_hesitation_terms``. Defaults to the notebook-level global
                ``speech_terms``.

        The count is recomputed from scratch on every call, so calling the
        method twice does not double it. text_word_frequency must have been
        called first for single-word hits to be found.
        """
        # TODO: filled pauses are not handled yet
        if terms is None:
            terms = speech_terms
        if source_text is None:
            source_text = text

        # count the number of hesitation words
        self.hesitation = self._count_lexicon_hits(
            terms._hesitation_words, terms._hesitation_terms, source_text
        )

        # calculate the hesitation rate
        self.hesitation_rate = self._rate(self.hesitation)

    def semantic_features(self, source_text=None, terms=None):
        """Compute the number and rate of memory words.

        Args:
            source_text: text searched for multi-word memory terms. Defaults to
                the notebook-level global ``text``.
            terms: object exposing ``_memory_words`` and ``_memory_terms``.
                Defaults to the notebook-level global ``speech_terms``.

        The count is recomputed from scratch on every call. text_word_frequency
        must have been called first for single-word hits to be found.
        """
        # TODO: negation words
        if terms is None:
            terms = speech_terms
        if source_text is None:
            source_text = text

        # count the number of memory words
        self.memory = self._count_lexicon_hits(
            terms._memory_words, terms._memory_terms, source_text
        )

        # calculate the memory rate
        self.memory_rate = self._rate(self.memory)


In [141]:
# Process text file

# Split the lower-cased transcript into sentences, tokenize each sentence, then tag it.
sentences = nltk.sent_tokenize(text)
tokens = [nltk.tokenize.word_tokenize(sentence) for sentence in sentences]
pos = [nltk.pos_tag(sentence_tokens) for sentence_tokens in tokens]  # Part-of-speech tagging

# Use the duration defined next to the input (`time`, in ms) instead of repeating
# the literal, so the two cannot drift apart.
transcript = text_schema(time, sentences) # time needs to be obtained from data
# Order matters: word frequencies and token counts feed the later rate computations.
transcript.text_word_frequency(tokens)
transcript.count_word_and_punctuations(tokens)
transcript.count_pos(pos)
transcript.spontaneous_speech_features()
transcript.semantic_features()

# Raw intermediate results.
print(sentences)
print(tokens)
print(pos)
# Word frequencies, most frequent first.
print(sorted(transcript.word_frequency.items(), key = lambda x: x[1], reverse=True))
# Token-level counts.
print(transcript.tokens)
print(transcript.words)
print(transcript.punctuations)
# Part-of-speech counts and rates.
print(transcript.noun)
print(transcript.noun_rate)
print(transcript.adj)
print(transcript.adj_rate)
print(transcript.verb)
print(transcript.verb_rate)
print(transcript.content_words)
print(transcript.content_words_rate)
print(transcript.function_words)
print(transcript.function_words_rate)
# Hesitation and memory features.
print("hes", transcript.hesitation)
print(transcript.hesitation_rate)
print("mem", transcript.memory)
print(transcript.memory_rate)

# Release the input file handle opened when the transcript was read.
f.close()

['title: exploring the complex landscape of human memory\n\nintroduction:\n\nmemory is a fascinating and intricate aspect of human cognition, playing a pivotal role in shaping our identities, influencing decision-making, and allowing us to navigate the intricacies of daily life.', 'from the fleeting recollection of a recent conversation to the vivid recall of a cherished childhood memory, the processes that govern human memory are both diverse and dynamic.', 'types of memory:\n\nhuman memory is commonly classified into three main types: sensory memory, short-term memory, and long-term memory.', 'sensory memory is the initial stage where information is briefly registered through our senses, such as sight, hearing, and touch.', 'short-term memory, often referred to as working memory, retains information for a short duration, allowing us to manipulate and process it.', 'long-term memory is the repository of information that endures for an extended period, ranging from days to a lifetime.'

1. Morphological features
    Unanalyzed words not handled
    First-person singular verbs are not offered in NLTK

2. Spontaneous speech-based features
    hesitation words definition needs some more research
    Lengthened sounds and silent pauses are not implemented (need data team support)
    What is the difference between hesitation and filled pauses?

3. Semantic features
    negation words not implemented
    The definitions of content words and function words are not clear


Questions for data team
1. Could filled pauses be mapped to a special word during data processing? Lengthened sounds are needed as well.