# Read Raw Text

In [1]:
import os

In [3]:
# Relative location of the book text: the directory name and the file name
# are kept separate and joined with the platform's path separator.
RESOURCE_DIR = "Resources"
HARRY_POTTER_SS_FILE = "Harry_Potter_and_Sorcerer's_Stone.txt"
FILE_PATH = os.path.join(RESOURCE_DIR, HARRY_POTTER_SS_FILE)

In [4]:
# First attempt: decode the file as UTF-8. If the bytes are not valid UTF-8,
# catch the error and display it instead of letting it propagate, so that
# "Restart Kernel -> Run All" can continue to the next cell.
try:
    with open(FILE_PATH, 'r', encoding='utf-8') as file:
        raw_text = file.read()
except UnicodeDecodeError as err:
    # Same "ExceptionName: message" line a traceback would end with.
    print(f"{type(err).__name__}: {err}")

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x93 in position 17968: invalid start byte

In [5]:
# Read the whole file into memory, decoding it as Windows-1252 instead of
# UTF-8 (the UTF-8 attempt fails on byte 0x93).
with open(FILE_PATH, 'r', encoding='windows-1252') as file:
    raw_text = file.read()

In [6]:
print(f"Length of text: {len(raw_text)} characters")
print(f"First 100 characters:\n{raw_text[:100]}")

Length of text: 442745 characters
First 100 characters:
Harry Potter and the Sorcerer's Stone 

CHAPTER ONE 

THE BOY WHO LIVED 

Mr. and Mrs. Dursley, of n


# Python's Regular Expression Library

In [7]:
import re

In [8]:
sample_text = "Hello, world! This is a test-text with punctuation."

# Split on single whitespace characters. The capturing group makes the split
# return every separator as its own list element.
whitespace_pattern = re.compile(r'(\s)')
sample_tokens_whitespace = whitespace_pattern.split(sample_text)

print("Tokens split by whitespace:\n", sample_tokens_whitespace)

Tokens split by whitespace:
 ['Hello,', ' ', 'world!', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test-text', ' ', 'with', ' ', 'punctuation.']


In [12]:
# Split on a comma, period, exclamation mark, or a whitespace character.
# Separators are captured, so they stay in the result; two adjacent
# separators produce an empty string between them.
punct_or_space_pattern = re.compile(r'([.,!]|\s)')
sample_tokens_punct = punct_or_space_pattern.split(sample_text)

print("Tokens split by whitespace and punctuation:\n", sample_tokens_punct)

Tokens split by whitespace and punctuation:
 ['Hello', ',', '', ' ', 'world', '!', '', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test-text', ' ', 'with', ' ', 'punctuation', '.', '']


In [13]:
# Keep only the pieces that still contain something after stripping, i.e.
# drop empty strings and whitespace-only separators. Kept pieces are left
# unmodified.
cleaned_tokens = list(filter(str.strip, sample_tokens_punct))

print("Cleaned Tokens:\n", cleaned_tokens)

Cleaned Tokens:
 ['Hello', ',', 'world', '!', 'This', 'is', 'a', 'test-text', 'with', 'punctuation', '.']


In [14]:
# Extended delimiter set: common punctuation, quotes, brackets, the double
# hyphen "--", and whitespace. The capturing group makes re.split keep the
# delimiters in its result.
# "--" is listed before the character class on purpose: alternatives are
# tried left to right, so when the class (which contains "-") came first the
# "--" branch could never match and a double hyphen was emitted as two
# separate "-" tokens.
more_splitters = r"(--|[.,!?\-;:\"'(){}]|\s)"
all_tokens = re.split(more_splitters, sample_text)
# Strip each piece once, then drop pieces that are empty or whitespace-only.
cleaned_all_tokens = [piece for piece in map(str.strip, all_tokens) if piece]

print("All Tokens with more splitters:\n", cleaned_all_tokens)

All Tokens with more splitters:
 ['Hello', ',', 'world', '!', 'This', 'is', 'a', 'test', '-', 'text', 'with', 'punctuation', '.']


# Split Raw Text and Clean

In [15]:
# Split the full text with the delimiter pattern, then strip every piece and
# discard the ones that end up empty (whitespace separators and the empty
# strings produced between adjacent delimiters).
pre_processed_text = re.split(more_splitters, raw_text)
tokens = [piece for piece in map(str.strip, pre_processed_text) if piece]

print(f"Total number of tokens: {len(tokens)}")
print(f"First 20 tokens:\n{tokens[:20]}")

Total number of tokens: 103821
First 20 tokens:
['Harry', 'Potter', 'and', 'the', 'Sorcerer', "'", 's', 'Stone', 'CHAPTER', 'ONE', 'THE', 'BOY', 'WHO', 'LIVED', 'Mr', '.', 'and', 'Mrs', '.', 'Dursley']


# Create Vocabulary and IDs

In [18]:
# Deduplicate the tokens and order them; sorted() accepts the set directly
# and returns a list.
sorted_unique_tokens = sorted(set(tokens))

print(f"Vocabulary size: {len(sorted_unique_tokens)}")

print(f"First 30 vocabulary tokens:\n{sorted_unique_tokens[:30]}")

print(f"Last 20 vocabulary tokens:\n{sorted_unique_tokens[-20:]}")

Vocabulary size: 6669
First 30 vocabulary tokens:
['!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '1473', '1637', '17', '1709', '1945', '2', '3', '31', '382', '4', '90', ':', ';', '?', 'A', 'AAAAAAAAAARGH', 'AAAARGH', 'ALBUS', 'ALL', 'ALLEY']
Last 20 vocabulary tokens:
['yesterday', 'yet', 'you', 'young', 'younger', 'youngest', 'youngsters', 'your', 'yours', 'yourself', 'yourselves', 'youth', 'zigzagging', 'zombie', 'zoo', 'zoom', 'zoomed', 'zooming', '–', '“']


In [20]:
# Map each vocabulary token to its position in the sorted list (0-based).
vocab = dict(zip(sorted_unique_tokens, range(len(sorted_unique_tokens))))

In [None]:
# Show the ID assigned to each of the first 30 vocabulary entries.
print("Token to ID mapping for first 30 tokens:")
for token in sorted_unique_tokens[:30]:
    token_id = vocab[token]
    print(f"'{token}': {token_id}")

Token to ID mapping for first 30 tokens:
'!': 0
'"': 1
''': 2
'(': 3
')': 4
',': 5
'-': 6
'.': 7
'0': 8
'1': 9
'1473': 10
'1637': 11
'17': 12
'1709': 13
'1945': 14
'2': 15
'3': 16
'31': 17
'382': 18
'4': 19
'90': 20
':': 21
';': 22
'?': 23
'A': 24
'AAAAAAAAAARGH': 25
'AAAARGH': 26
'ALBUS': 27
'ALL': 28
'ALLEY': 29


In [22]:
# Show the ID assigned to each of the last 20 vocabulary entries.
print("Token to ID mapping for last 20 tokens:")
for token in sorted_unique_tokens[-20:]:
    token_id = vocab[token]
    print(f"'{token}': {token_id}")

Token to ID mapping for last 20 tokens:
'yesterday': 6649
'yet': 6650
'you': 6651
'young': 6652
'younger': 6653
'youngest': 6654
'youngsters': 6655
'your': 6656
'yours': 6657
'yourself': 6658
'yourselves': 6659
'youth': 6660
'zigzagging': 6661
'zombie': 6662
'zoo': 6663
'zoom': 6664
'zoomed': 6665
'zooming': 6666
'–': 6667
'“': 6668


# Assign Token IDs

In [23]:
# Translate the token sequence into its ID sequence by looking each token up
# in the vocabulary (a token missing from vocab raises KeyError).
tokenized_output = list(map(vocab.__getitem__, tokens))

print(f"Total number of token IDs: {len(tokenized_output)}")

print(f"First 20 token IDs:\n{tokenized_output[:20]}")

print(f"Last 20 token IDs:\n{tokenized_output[-20:]}")

Total number of token IDs: 103821
First 20 token IDs:
[571, 967, 1527, 5996, 1150, 2, 5067, 1176, 197, 872, 1206, 102, 1335, 696, 811, 7, 1527, 812, 7, 350]
Last 20 token IDs:
[2, 4049, 3330, 6088, 3482, 1431, 4017, 4390, 3208, 6560, 344, 6024, 5825, 7, 7, 7, 7, 1, 1206, 354]
