Zachary Proom

EN.605.646.81: Natural Language Processing

# Lab #4

In [20]:
## a

In [21]:
import re

# Define the tokenizer function.
def tokenize_text(input: str) -> list:
    """
    Split text into lower-cased word and punctuation tokens.

    Punctuation is padded with spaces so that a final whitespace split
    separates it from neighbouring words. Periods and hyphens are treated
    specially so that decimals ("19.3"), dotted names ("slashdot.com") and
    hyphenated words ("so-called") stay in one piece.

    Parameters
    ----------
    input : str
        The input text that gets tokenized. (The name shadows the built-in
        ``input`` inside this function; it is kept so keyword callers still
        work.)

    Returns
    -------
    An ordered list of normalized tokens.
    """

    # All patterns are raw strings: sequences such as \. and \w are not valid
    # Python string escapes, and non-raw literals containing them trigger
    # invalid-escape warnings on current interpreters.

    # Add whitespace around punctuation. Exclude periods and hyphens for now
    # because they require extra care.
    input_nopunct = re.sub(r"(\.\.\.)", r" \1 ", input)  # Ellipsis
    # \w already covers digits, so no separate \d is needed. Note that
    # whitespace itself matches this class and gets padded too, which is
    # harmless because the final split() collapses runs of whitespace.
    input_nopunct = re.sub(r"([^\w.-])", r" \1 ", input_nopunct)

    # Add whitespace around a period only when it's at the end of a sentence.
    # To detect this, check if the following conditions are true:
    #     (1) There is whitespace after the period.
    #     (2) The period is at the end of the input.
    #     (3) There is a lowercase letter before the period, and an uppercase
    #         letter after the period (a missing space, e.g. "ham.I do").
    # In cases (1) and (2), the period cannot be preceded by a period because
    # it would be part of an ellipsis.
    input_nopunct = re.sub(r"(?<!\.)(\.)\s", r" \1 ", input_nopunct)
    input_nopunct = re.sub(r"(?<!\.)(\.)$", r" \1 ", input_nopunct)
    input_nopunct = re.sub(r"([a-z])(\.)([A-Z])", r"\1 \2 \3", input_nopunct)

    # Add whitespace around a hyphen only when neither neighbour is an ASCII
    # letter (e.g. "2002-2005"); hyphens touching a letter (e.g. so-called)
    # are left attached to the word.
    input_nopunct = re.sub(r"(?<![a-zA-Z])-(?![a-zA-Z])", " - ", input_nopunct)

    # Lower-case all the words in the input text.
    input_lowercase = input_nopunct.lower()

    # Split all the words by whitespace.
    input_wssplit = input_lowercase.split()

    return input_wssplit

In [22]:
# Sanity check: run the tokenizer on the example sentence from the prompt.
# The literal is split across lines only for readability; adjacent string
# literals are concatenated into the same single sentence.
tokenize_text(
    "NAC has developed a National HIV/AIDS/STI/TB Intervention Strategic Plan "
    "(2002-2005) that aims to reduce the HIV prevalence rate among Zambians "
    "from 19.3% to 11.7% and improve the health status of people living with "
    "HIV/AIDS by 2005."
)

['nac',
 'has',
 'developed',
 'a',
 'national',
 'hiv',
 '/',
 'aids',
 '/',
 'sti',
 '/',
 'tb',
 'intervention',
 'strategic',
 'plan',
 '(',
 '2002',
 '-',
 '2005',
 ')',
 'that',
 'aims',
 'to',
 'reduce',
 'the',
 'hiv',
 'prevalence',
 'rate',
 'among',
 'zambians',
 'from',
 '19.3',
 '%',
 'to',
 '11.7',
 '%',
 'and',
 'improve',
 'the',
 'health',
 'status',
 'of',
 'people',
 'living',
 'with',
 'hiv',
 '/',
 'aids',
 'by',
 '2005',
 '.']

I define a function above called tokenize_text(). It uses the re module to pad punctuation characters with spaces, so the string can be split on spaces. Before being split, the string is also converted to lowercase. The function adds whitespace around all punctuation, and it handles periods and hyphens with extra care. In particular, the function tries to add whitespace around a period only when it's at the end of a sentence. It also tries to add whitespace around a hyphen only when it isn't between letters.

As you can see above, the function works on the test sentence. In my first attempt, I did not think about splitting around ellipses. I noticed my function didn't split ellipses on the first ten lines in tokens.txt, and I added more code to my function to handle them. I also didn't think about typos where there's no space between the last letter of a sentence and the first letter of the following sentence (e.g. "ham.I do"). I had to add condition (3) in my function to deal with this correctly. My function still has trouble handling some less common initialisms like "U.S. of A". I'm unsure whether my tokenizer function should split contractions like "they're". Currently, it splits on the apostrophe.

Below I show the result of processing the first ten lines of tokens.txt.

In [23]:
# Tokenize and print the first ten lines of tokens.txt.
# The file contains non-ASCII text (Cyrillic words, accented letters), so the
# encoding is given explicitly instead of relying on the platform default.
# NOTE(review): assumes tokens.txt is UTF-8 encoded; confirm.
with open("tokens.txt", encoding="utf-8") as file:
    # zip() stops at whichever runs out first, so a file with fewer than ten
    # lines simply prints fewer lines instead of raising StopIteration.
    for i, line in zip(range(10), file):
        line = line.strip()
        print(tokenize_text(line))

['russian', 'for', 'plastic', 'bag', 'is', 'полиэтиленовый', 'пакет', '.', '7.3', 'out', 'of', '10', 'statistics', 'is', 'made', 'up', '.', 'i', 'do', 'not', 'like', 'green', 'eggs', 'and', 'ham', '.', 'i', 'do']
['not', 'like', 'them', 'sam-i-am', '.', 'dr', '.', 'mulholland', 'lives', 'on', 'mulholland', 'dr', '.', 'in', 'hollywood', '.', '1', ',', '2', ',', '3', '...', 'slashdot.com', 'has', 'some', 'interesting']
['articles', '.', 'i', "'", 'm', 'going', 'to', 'update', 'my', 'resumé', '.', 'j.h.u', '.', 'has', 'a', 'great', 'la-crosse', 'team', '.', 'born', 'in', 'the', 'u.s', '.', 'of', 'a', '.', 'incorrect', 'plurala', 'can', 'be']
['fun', '.', 'is', 'capitalization', '(', 'sp', '?', ')', 'truly', 'necessary', '?', 'i', 'think', 'lower', 'case', 'is', 'more', 'legible', '.', 'when', 'people', 'write', 'in', 'all', 'caps', ',', 'it', 'feels']
['like', 'they', "'", 're', 'yelling', '!', 'it', 'is', 'precisely', 'to', 'these', 'substances', 'that', 'the', 'so-called', 'french', 'pa

## b

In [24]:
# Corpus-level statistics gathered in a single pass over tokens.txt.
summary_info = {
    "n_lines": 0,                  # number of lines read from the file
    "unique_tokens": set(),        # every distinct token seen (the vocabulary)
    "vocab_size": 0,               # len(unique_tokens), filled in after the pass
    "collection_size": 0,          # total number of tokens, repeats included
    "unique_token_counts": dict()  # token -> number of occurrences
}

# Process all the lines in the text file.
# NOTE(review): assumes tokens.txt is UTF-8 encoded (it contains non-ASCII
# text); confirm. Without an explicit encoding the platform default is used.
with open("tokens.txt", encoding="utf-8") as file:
    token_counts = summary_info["unique_token_counts"]
    for line in file:
        result = tokenize_text(line)
        summary_info["n_lines"] += 1
        summary_info["collection_size"] += len(result)
        # set.update adds every token at once; no throwaway list is built.
        summary_info["unique_tokens"].update(result)
        for token in result:
            # get() with a default covers both first and repeat occurrences.
            token_counts[token] = token_counts.get(token, 0) + 1

summary_info["vocab_size"] = len(summary_info["unique_tokens"])

In [29]:
# Report, one value per line and in this order:
#   1. the number of lines processed,
#   2. the vocabulary size (number of unique tokens),
#   3. the collection size (total number of tokens).
print(
    summary_info["n_lines"],
    summary_info["vocab_size"],
    summary_info["collection_size"],
    sep="\n",
)

944802
344521
23432250


In [26]:
# Sort the counts from most to least frequent. sorted() is stable, so types
# with equal counts keep their existing relative order.
summary_info["unique_token_counts"] = dict(
    sorted(
        summary_info["unique_token_counts"].items(),
        key=lambda item: item[1],
        reverse=True,
    )
)

# Materialize the ranked (type, count) pairs once instead of rebuilding a
# list of the whole vocabulary for every lookup.
ranked_types = list(summary_info["unique_token_counts"].items())

# Find the most frequent types.
# Ranks are 1-based: ranks 1-100 are indices 0-99, so the type at rank r is
# at index r - 1. Each lookup raises IndexError if the vocabulary has fewer
# than r types.
ranks_1_100 = dict(ranked_types[:100])

rank_500_key, rank_500_val = ranked_types[500 - 1]
rank_1000_key, rank_1000_val = ranked_types[1000 - 1]
rank_5000_key, rank_5000_val = ranked_types[5000 - 1]
rank_10000_key, rank_10000_val = ranked_types[10000 - 1]

# Single-entry dicts so each rank prints in the same {type: count} form as
# ranks_1_100.
rank_500 = {rank_500_key: rank_500_val}
rank_1000 = {rank_1000_key: rank_1000_val}
rank_5000 = {rank_5000_key: rank_5000_val}
rank_10000 = {rank_10000_key: rank_10000_val}

In [27]:
# Print the frequency tables, one dict per line: the top 100 types first,
# then the single types selected for ranks 500, 1000, 5000 and 10000.
print(ranks_1_100, rank_500, rank_1000, rank_5000, rank_10000, sep="\n")

{'the': 1154788, '.': 1049089, ',': 1027875, 'to': 564440, 'and': 534802, 'of': 487282, 'a': 455057, 'in': 390653, 'for': 214839, 'that': 203046, 'is': 202438, 's': 187993, '’': 187233, 'on': 164662, 'it': 156335, 'with': 149567, "'": 141226, 'was': 130719, 'at': 127865, '"': 126344, 'i': 117749, 'as': 113449, 'be': 107919, '“': 106826, 'he': 104806, '”': 102222, 'are': 101822, 'said': 94457, 'this': 92847, ':': 92800, 'we': 91866, 'have': 91076, 'from': 89899, 'by': 88317, 'will': 80899, 'you': 80481, 'they': 76653, 'has': 73793, 'but': 71078, 'not': 69961, 'an': 68283, 'his': 67887, 'their': 60158, 'or': 56686, '-': 54567, 't': 52885, 'who': 52176, 'more': 49498, 'one': 48891, 'all': 48833, ')': 46853, 'she': 46822, '(': 46809, 'about': 45767, 'there': 45686, 'can': 44820, 'were': 44501, 'had': 43002, 'been': 42080, 'our': 40601, 'up': 40236, 'her': 39896, 'when': 39866, 'also': 39448, 'out': 39265, 'people': 38390, 'would': 37923, 'new': 36832, 'if': 36764, 'what': 36477, 'which': 3

In [28]:
# Count the singletons, i.e. the types whose total count is exactly one.
n_singletons = sum(
    1 for count in summary_info["unique_token_counts"].values() if count == 1
)

# Show how many types occur only once.
print(n_singletons)

198077


In [30]:
# Share of the vocabulary made up of singletons, expressed as a percentage.
singleton_pct = 100 * n_singletons / summary_info["vocab_size"]
print(singleton_pct)

57.49344742410477
