In [None]:
import math
import urllib.request
import json

In [89]:
# Create a unigram model (word frequencies) using a public dataset

def load_unigram_model(url="https://norvig.com/ngrams/count_1w.txt", timeout=30):
    """
    Download a word-count list and turn it into a unigram log-probability model.

    The file is expected to contain one ``word<TAB>count`` pair per line
    (the default source is https://norvig.com/ngrams/count_1w.txt).

    Args:
        url: Location of the tab-separated word-count file.
        timeout: Seconds to wait on the network before giving up.

    Returns:
        dict mapping each word to ``log(count / total_count)``, plus a
        ``'<UNK>'`` entry holding ``log(1 / total_count)`` as the
        probability assigned to unknown words.

    Raises:
        ValueError: if a non-blank line is not ``word<TAB>count`` with an
            integer count, or if the file contains no counts at all.
        urllib.error.URLError: if the download fails.
    """
    # The context manager closes the HTTP connection even when reading or
    # decoding fails, so the socket is never leaked.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        lines = response.read().decode('utf-8').splitlines()

    # Parse into a word -> raw count dictionary, accumulating the grand total.
    word_frequencies = {}
    total_words = 0
    for line in lines:
        line = line.strip()
        if not line:
            continue  # tolerate blank / trailing empty lines
        word, count = line.split('\t')
        count = int(count)
        word_frequencies[word] = count
        total_words += count

    # Without any counts there is nothing to normalise by; fail with a clear
    # message instead of a ZeroDivisionError further down.
    if total_words <= 0:
        raise ValueError(f"No word counts found at {url}")

    # Convert raw counts to log probabilities.
    word_prob = {
        word: math.log(count / total_words)
        for word, count in word_frequencies.items()
    }

    # Reserve a small probability (that of a word seen once) for unknown words.
    word_prob['<UNK>'] = math.log(1 / total_words)

    return word_prob

In [91]:
# Word segmentation using dynamic programming

def segment_words(text, word_prob, max_word_len=20):
    """
    Split a run-together string into its most probable sequence of words.

    Uses dynamic programming over prefixes of ``text``: the cost of a
    segmentation is the sum of the negative log probabilities of its words,
    and the cheapest segmentation of every prefix is built from the cheapest
    segmentations of shorter prefixes.

    A single character that is not in ``word_prob`` is accepted as a
    one-character token at the unknown-word penalty. This keeps every
    position reachable, so text containing digits, punctuation or other
    out-of-vocabulary characters is still segmented around them instead of
    being returned as one unsplit string.

    Args:
        text: The string to segment.
        word_prob: Mapping of word -> log probability. If it has a
            ``'<UNK>'`` key, that value is the log probability charged for
            an unknown character; otherwise a value one unit below the
            least probable known word is used.
        max_word_len: Longest candidate word (in characters) to consider.

    Returns:
        List of tokens, in order, whose concatenation equals ``text``.
        An empty ``text`` yields an empty list.

    Raises:
        ValueError: if ``max_word_len`` is smaller than 1.
    """
    if max_word_len < 1:
        raise ValueError("max_word_len must be at least 1")

    # Penalty for a character the model does not know.
    unknown_log_prob = word_prob.get('<UNK>')
    if unknown_log_prob is None:
        unknown_log_prob = min(word_prob.values(), default=0.0) - 1.0

    n = len(text)
    best_start = [0] * (n + 1)  # best_start[i]: start index of the last word in the best split of text[:i]
    best_score = [float('inf')] * (n + 1)  # best_score[i]: cost of the best split of text[:i]
    best_score[0] = 0  # The empty prefix costs nothing

    for i in range(1, n + 1):
        # Try every candidate last word text[j:i], longest first.
        for j in range(max(0, i - max_word_len), i):
            word = text[j:i]
            log_prob = word_prob.get(word)
            if log_prob is None:
                if i - j > 1:
                    continue  # unknown multi-character substrings are not candidates
                log_prob = unknown_log_prob  # unknown single character fallback
            score = best_score[j] - log_prob  # lower is better
            # Strict '<' keeps the earliest (longest) candidate on ties.
            if score < best_score[i]:
                best_score[i] = score
                best_start[i] = j

    # Walk the back-pointers from the end of the text to recover the words.
    words = []
    i = n
    while i > 0:
        j = best_start[i]
        words.append(text[j:i])
        i = j

    return words[::-1]  # Collected back-to-front, so reverse

In [95]:
word_model = load_unigram_model()

# Input URL
url = "thelongestlistofthelongeststuffatthelongestdomainnameatlonglast.com"

# Text to segment: the URL without its top-level domain, so only the
# run-together words are passed to the segmenter.
text = url.rsplit(".", 1)[0]

# Perform word segmentation
result = segment_words(text, word_model)
print(f"Result: {result}")

Result: ['the', 'longest', 'list', 'of', 'the', 'longest', 'stuff', 'at', 'the', 'longest', 'domainname', 'at', 'long', 'last']
