In [1]:
!pip install wordfreq
import math
from wordfreq import word_frequency

# Word cost using -log(prob) with strong penalty for unknowns
def word_cost(word):
    """Return the segmentation cost of ``word``; lower means more plausible.

    The cost is the negative natural log of the English frequency reported by
    ``word_frequency``. A word whose frequency does not exceed ``1e-9`` is
    treated as unknown and gets a flat penalty of 30 plus its length, so longer
    unknown chunks cost more than shorter ones.
    """
    frequency = word_frequency(word, 'en')
    is_known = frequency > 1e-9
    if not is_known:
        # Unknown word: fixed penalty plus one unit per character.
        return 30 + len(word)
    return -math.log(frequency)

# Viterbi-based word segmentation
def segment(text, max_word_len=20, cost_fn=None):
    """Split an unspaced string into the word sequence with the lowest total cost.

    Uses dynamic programming over prefix lengths: for every end position it
    keeps the cheapest way to segment ``text[:end]`` and remembers where the
    last word of that segmentation starts.

    Args:
        text: The string to segment.
        max_word_len: Longest candidate word (in characters) to consider.
            Defaults to 20, the window the original implementation hard-coded.
        cost_fn: Callable mapping a candidate word to a numeric cost (lower is
            better). Defaults to the notebook-level ``word_cost``.

    Returns:
        A list of substrings that concatenate back to ``text``. An empty
        ``text`` yields an empty list.

    Raises:
        ValueError: If ``max_word_len`` is smaller than 1, because no candidate
            word could then be formed.
    """
    if max_word_len < 1:
        raise ValueError("max_word_len must be at least 1")
    if cost_fn is None:
        # Looked up at call time so the default follows any redefinition of
        # word_cost in a later notebook cell.
        cost_fn = word_cost

    n = len(text)
    best_cost = [0.0] * (n + 1)   # best_cost[k]: cheapest segmentation of text[:k]
    backtrace = [0] * (n + 1)     # backtrace[k]: start index of the last word in it

    for end in range(1, n + 1):
        # Tuples compare by cost first, then by start index, so ties go to the
        # smaller start (i.e. the longer final word).
        best_cost[end], backtrace[end] = min(
            (best_cost[start] + cost_fn(text[start:end]), start)
            for start in range(max(0, end - max_word_len), end)
        )

    # Walk the back-pointers from the end of the text to recover the words.
    words = []
    end = n
    while end > 0:
        start = backtrace[end]
        words.append(text[start:end])
        end = start
    words.reverse()
    return words

# Test input: a domain name written without word separators. segment() returns
# the recovered list of words, which is printed as this cell's output.
text = "thelongestlistofthelongeststuffatthelongestdomainnameatlonglast.com"
print(segment(text))

Collecting wordfreq
  Downloading wordfreq-3.1.1-py3-none-any.whl.metadata (27 kB)
Collecting ftfy>=6.1 (from wordfreq)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting locate<2.0.0,>=1.1.1 (from wordfreq)
  Downloading locate-1.1.1-py3-none-any.whl.metadata (3.9 kB)
Downloading wordfreq-3.1.1-py3-none-any.whl (56.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.8/56.8 MB 12.0 MB/s eta 0:00:00
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 44.8/44.8 kB 2.6 MB/s eta 0:00:00
Downloading locate-1.1.1-py3-none-any.whl (5.4 kB)
Installing collected packages: locate, ftfy, wordfreq
Successfully installed ftfy-6.3.1 locate-1.1.1 wordfreq-3.1.1
['the', 'longest', 'list', 'of', 'the', 'longest', 'stuff', 'at', 'the', 'longest', 'domain', 'name', 'at', 'long', 'last', '.com']


In [2]:
# Repeats the call from the previous cell; relies on `segment` and `text`
# already being defined in the kernel by that cell.
print(segment(text))

['the', 'longest', 'list', 'of', 'the', 'longest', 'stuff', 'at', 'the', 'longest', 'domain', 'name', 'at', 'long', 'last', '.com']
