In [1]:
from collections import Counter

import pandas as pd

In [2]:
from typing import List

def load_text_file(file_path: str) -> List[str]:
    """Load a UTF-8 text file as a list of whitespace-trimmed lines.

    Prints a progress message naming ``file_path``, then returns one entry per
    line of the file with leading and trailing whitespace (including the
    line terminator) removed. Blank lines are kept as empty strings.
    """
    print(f"Loading text file from {file_path}...")
    stripped_lines: List[str] = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

In [3]:
# Read each treebank split from disk into a list of stripped sentence strings.
train_path = 'data/lm_data/treebank-sentences-train.txt'
test_path = 'data/lm_data/treebank-sentences-test.txt'
dev_path = 'data/lm_data/treebank-sentences-dev.txt'

train = load_text_file(train_path)
test = load_text_file(test_path)
dev = load_text_file(dev_path)

Loading text file from data/lm_data/treebank-sentences-train.txt...
Loading text file from data/lm_data/treebank-sentences-test.txt...
Loading text file from data/lm_data/treebank-sentences-dev.txt...


In [5]:
# Report how many sentences each split contains, one split per line.
print(
    f"Train: {len(train)} sentences",
    f"Dev: {len(dev)} sentences",
    f"Test: {len(test)} sentences",
    sep="\n",
)


Train: 36261 sentences
Dev: 4529 sentences
Test: 4554 sentences


In [7]:
def get_num_unique_tokens(sentences: List[str]) -> int:
    """Count the distinct whitespace-separated tokens across all sentences.

    Tokens are compared exactly as produced by ``str.split()``, so tokens that
    differ only in case are counted separately.
    """
    vocabulary = {
        token
        for sentence in sentences
        for token in sentence.split()
    }
    return len(vocabulary)

In [8]:
# Report the vocabulary size (distinct tokens) of each split, one per line.
print(
    f"Train: {get_num_unique_tokens(train)} unique tokens",
    f"Dev: {get_num_unique_tokens(dev)} unique tokens",
    f"Test: {get_num_unique_tokens(test)} unique tokens",
    sep="\n",
)

Train: 32215 unique tokens
Dev: 11005 unique tokens
Test: 10483 unique tokens


In [14]:
def get_sentence_avg_length(sentences: List[str]) -> float:
    """Return the mean sentence length, measured in whitespace-separated tokens.

    Args:
        sentences: Sentences to measure; each is tokenized with ``str.split()``.

    Returns:
        Total token count divided by the number of sentences, or ``0.0`` when
        ``sentences`` is empty (instead of raising ``ZeroDivisionError``).
    """
    # Guard the division: an empty split has no meaningful average.
    if not sentences:
        return 0.0
    total_tokens = sum(len(sentence.split()) for sentence in sentences)
    return total_tokens / len(sentences)
def get_sentence_max_length(sentences: List[str]) -> int:
    """Return the token count of the longest sentence.

    Length is the number of items produced by ``str.split()``. An empty list
    of sentences yields 0.
    """
    token_counts = (len(sentence.split()) for sentence in sentences)
    return max(token_counts, default=0)

def get_sentence_min_length(sentences: List[str]) -> int:
    """Return the token count of the shortest sentence.

    Args:
        sentences: Sentences to measure; each is tokenized with ``str.split()``.

    Returns:
        The smallest per-sentence token count, or ``0`` when ``sentences`` is
        empty (matching what ``get_sentence_max_length`` returns for empty
        input).
    """
    # No sentinel start value: the result is always a length that actually
    # occurs in the data, however long the shortest sentence is.
    return min((len(sentence.split()) for sentence in sentences), default=0)


In [15]:
# Report average/maximum sentence length per split, then the minimum per split.
print(
    f"Train: avg length: {get_sentence_avg_length(train)}, max length: {get_sentence_max_length(train)}",
    f"Dev: avg length: {get_sentence_avg_length(dev)}, max length: {get_sentence_max_length(dev)}",
    f"Test: avg length: {get_sentence_avg_length(test)}, max length: {get_sentence_max_length(test)}",
    f"Train: min length: {get_sentence_min_length(train)}",
    f"Dev: min length: {get_sentence_min_length(dev)}",
    f"Test: min length: {get_sentence_min_length(test)}",
    sep="\n",
)

Train: avg length: 20.799260913929565, max length: 39
Dev: avg length: 20.705453742548023, max length: 39
Test: avg length: 20.691699604743082, max length: 39
Train: min length: 1
Dev: min length: 1
Test: min length: 1


In [17]:
def most_common_tokens(sentences: List[str], n: int) -> pd.DataFrame:
    """Return the ``n`` most frequent whitespace-separated tokens.

    Args:
        sentences: Sentences to tokenize with ``str.split()``. Tokens are
            counted exactly as split, so case variants are distinct tokens.
        n: Number of rows to keep; passed straight to ``DataFrame.head``.

    Returns:
        A DataFrame with columns ``token`` and ``count``, sorted by ``count``
        in descending order. The index is each token's first-seen position.
        Tokens with equal counts stay in first-seen order. Empty input yields
        an empty frame with the same two columns.
    """
    token_counts = Counter(
        token for sentence in sentences for token in sentence.split()
    )
    frequency_table = pd.DataFrame(
        list(token_counts.items()), columns=['token', 'count']
    )
    # Mergesort is stable, so tied tokens keep their first-seen order and the
    # result is deterministic; the default quicksort gives no such guarantee.
    ranked = frequency_table.sort_values(
        'count', ascending=False, kind='mergesort'
    )
    return ranked.head(n)

In [18]:
# Show the ten most frequent tokens in the training split.
top_train_tokens = most_common_tokens(train, 10)
print(top_train_tokens)

    token  count
8     the  34134
20     of  18792
50     to  18284
11      a  16016
31    and  14130
102    in  12584
116    's   7841
185   for   6709
84   that   6673
71    The   6204


In [21]:
def mode_length(sentences: List[str]) -> int:
    """Return the most frequent sentence length, measured in tokens.

    Args:
        sentences: Sentences to measure; each is tokenized with ``str.split()``.

    Returns:
        The token count shared by the largest number of sentences. When
        several lengths are equally frequent, the one encountered first wins.
        Returns ``0`` when ``sentences`` is empty (instead of raising
        ``ValueError`` from ``max()``).
    """
    length_counts = Counter(len(sentence.split()) for sentence in sentences)
    if not length_counts:
        return 0
    # max() returns the first maximal key in iteration order, and Counter
    # iterates in insertion order, which gives the first-seen tie-break.
    return max(length_counts, key=length_counts.get)

In [22]:
# Report the most frequent sentence length of each split, one per line.
print(
    f"Train: mode length: {mode_length(train)}",
    f"Dev: mode length: {mode_length(dev)}",
    f"Test: mode length: {mode_length(test)}",
    sep="\n",
)

Train: mode length: 20
Dev: mode length: 23
Test: mode length: 19


In [23]:
# NOTE(review): this cell redefines most_common_tokens, which an earlier cell
# already defines; the later definition shadows the earlier one. Keep only one.
def most_common_tokens(sentences: List[str], n: int) -> pd.DataFrame:
    """Return the ``n`` most frequent whitespace-separated tokens.

    Args:
        sentences: Sentences to tokenize with ``str.split()``. Tokens are
            counted exactly as split, so case variants are distinct tokens.
        n: Number of rows to keep; passed straight to ``DataFrame.head``.

    Returns:
        A DataFrame with columns ``token`` and ``count``, sorted by ``count``
        in descending order. The index is each token's first-seen position.
        Tokens with equal counts stay in first-seen order. Empty input yields
        an empty frame with the same two columns.
    """
    token_counts = Counter(
        token for sentence in sentences for token in sentence.split()
    )
    frequency_table = pd.DataFrame(
        list(token_counts.items()), columns=['token', 'count']
    )
    # Mergesort is stable, so tied tokens keep their first-seen order and the
    # result is deterministic; the default quicksort gives no such guarantee.
    ranked = frequency_table.sort_values(
        'count', ascending=False, kind='mergesort'
    )
    return ranked.head(n)


In [25]:
# Show the single most frequent token (with its count) for each split.
print(
    f"Train: {most_common_tokens(train, 1)}",
    f"Dev: {most_common_tokens(dev, 1)}",
    f"Test: {most_common_tokens(test, 1)}",
    sep="\n",
)

Train:   token  count
8   the  34134
Dev:    token  count
10   the   4264
Test:    token  count
55   the   4400
