In [1]:
import os

from morfessor import BaselineModel, MorfessorIO

In [2]:
# Write a lower-cased copy of the raw word list, line by line.
# The encoding is pinned to UTF-8 on both files so the result does not depend
# on the platform default and agrees with the UTF-8 read of
# 'isc_words_lower.txt' in the train/test split cell.
# NOTE(review): assumes 'isc_words.txt' itself is UTF-8 encoded -- confirm.
with open('isc_words.txt', 'r', encoding='utf-8') as raw_file, \
        open('isc_words_lower.txt', 'w', encoding='utf-8') as lowered_file:
    for line in raw_file:
        lowered_file.write(line.lower())

In [16]:
import random

# Build a reproducible 80/20 train/test split of the lower-cased word corpus
# and write both parts to disk, one word per line.
SPLIT_SEED = 42        # fixed seed: re-running the cell yields the same split
TRAIN_FRACTION = 0.8   # share of the corpus that goes into the training set

# Load the word corpus (isc_words_lower), skipping blank lines
corpus_file_path = './isc_words_lower.txt'
with open(corpus_file_path, 'r', encoding='utf-8') as file:
    word_corpus = [stripped for stripped in (line.strip() for line in file) if stripped]

# Shuffle in place with a dedicated seeded generator so the split is
# deterministic for a given corpus and the global `random` state is untouched.
random.Random(SPLIT_SEED).shuffle(word_corpus)
split_index = int(len(word_corpus) * TRAIN_FRACTION)
train_set = word_corpus[:split_index]
test_set = word_corpus[split_index:]

# Write the train and test sets to new files
train_file_path = './train_set.txt'
test_file_path = './test_set.txt'

with open(train_file_path, 'w', encoding='utf-8') as train_file:
    train_file.writelines(word + '\n' for word in train_set)

with open(test_file_path, 'w', encoding='utf-8') as test_file:
    test_file.writelines(word + '\n' for word in test_set)

print(f"Train set written to {train_file_path}")
print(f"Test set written to {test_file_path}")


Train set written to ./train_set.txt
Test set written to ./test_set.txt


In [17]:
# Set up the Morfessor I/O helper and a fresh, untrained baseline model.
io_handler = MorfessorIO()
model = BaselineModel()

# Read the training corpus from the file produced by the split step.
data = io_handler.read_corpus_file("train_set.txt")


In [27]:
import math
def calculate_log_likelihood(model, test_data):
    """Return the per-morph perplexity of ``model`` on ``test_data``.

    Despite its name, this function returns a perplexity, not a
    log-likelihood; the name is kept so existing calls keep working.

    Every word is split with ``model.viterbi_segment`` and each resulting
    morph is scored with ``model.forward_logprob``. Morfessor reports scores
    as costs (negative natural-log probabilities), so the perplexity is
    ``exp(total_cost / number_of_morphs)``.

    Args:
        model: Object providing ``viterbi_segment(word)``, which returns a
            ``(morphs, cost)`` pair, and ``forward_logprob(morph)``, which
            returns the negative log-probability of ``morph``.
        test_data: Iterable of words (strings) to evaluate.

    Returns:
        float: The perplexity per morph, or ``math.inf`` when the average
        cost is too large for ``math.exp``.

    Raises:
        ValueError: If ``test_data`` yields no morphs at all, in which case
            the perplexity is undefined.
    """
    total_cost = 0.0
    total_tokens = 0
    for word in test_data:
        # viterbi_segment returns (morph list, cost); only the morphs are
        # needed here, the per-morph scores come from forward_logprob.
        morphs, _ = model.viterbi_segment(word)
        total_tokens += len(morphs)
        total_cost += sum(model.forward_logprob(morph) for morph in morphs)

    if total_tokens == 0:
        raise ValueError("test_data produced no morphs; perplexity is undefined")

    avg_cost = total_cost / total_tokens
    try:
        # avg_cost is already a negative log-probability, so no sign flip.
        return math.exp(avg_cost)
    except OverflowError:
        # Very large costs (e.g. penalty values for unknown morphs) exceed
        # the float range; report an infinite perplexity instead of crashing.
        return math.inf

In [21]:
# Load the data into the model and train.
# `data` is the corpus read from train_set.txt in the model setup cell.
# train_batch() is deliberately the last expression of the cell so that its
# return value is shown as the cell output.
# NOTE(review): finish_threshold presumably controls when training stops --
# confirm the exact meaning against the Morfessor documentation.
model.load_data(data)
model.train_batch(finish_threshold=0.05)

...........................................................
...........................................................


(2, 61651.40449758041)

In [29]:
# Reload a previously saved model when one exists on disk. The file is only
# written by the write_binary_model_file() cell further down, so on a fresh
# run (Restart & Run All) it may not exist yet; in that case keep the model
# trained above instead of failing with FileNotFoundError.
# NOTE(review): Morfessor binary model files are pickles as far as I know --
# only load files you created yourself.
# NOTE(review): a saved model may stem from an earlier training run; delete
# the file to force use of the freshly trained model.
model_file_path = "morf_isc_model.bin"
if os.path.exists(model_file_path):
    model = io_handler.read_binary_model_file(model_file_path)

In [38]:
# Segment a sample word and print its morphs separated by spaces.
# model.segment() only looks up words the model already holds an analysis
# for and raises KeyError otherwise; since the training set is a random
# subset of the corpus, fall back to Viterbi segmentation for unseen words.
query_word = "yoinai"
try:
    segmentation = model.segment(query_word)
except KeyError:
    # viterbi_segment returns a (morphs, cost) pair; keep only the morphs.
    segmentation, _ = model.viterbi_segment(query_word)
segmented_word = " ".join(segmentation)

print(segmented_word)

yoi n ai


In [6]:
# Persist the current model to "morf_isc_model.bin"; an earlier cell in this
# notebook reloads the model from this same file name.
io_handler.write_binary_model_file(file_name="morf_isc_model.bin", model=model)

In [7]:
# Export the model's segmentations to the file "morf_isc_segm".
segmentations = model.get_segmentations()
io_handler.write_segmentation_file(file_name="morf_isc_segm",segmentations=segmentations)

In [8]:
# Export the model's constructions as a lexicon file named "morf_isc_lex".
constructions = model.get_constructions()
io_handler.write_lexicon_file(file_name="morf_isc_lex", lexicon=constructions)