In [1]:
import re, io, os
import urllib.request
from collections import defaultdict
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from prince import MCA

In [None]:
def tokenize(data):
	index = defaultdict(str)

	with open(data) as file:
		for line in file.read().splitlines():
			# pull out takahashi lines
			m = re.match(r'^<(f.*?)\..*;H> +(\S.*)$', line)
			if not m:
				continue

			transcription = m.group(2)
			pg = str(m.group(1))

			# ignore entire line if it has a {&NNN} or {&.} code
			if re.search(r'\{&(\d|\.)+\}', transcription):
				continue

			# remove extraneous chracters ! and %
			s = transcription.replace("!", "").replace("%", "")

			# delete all end of line {comments} (between one and three observed)
			# ...with optional line terminator
			# allow 0 occurences to remove end-of-line markers (- or =)
			s = re.sub(r'([-=]?\{[^\{\}]+?\}){0,3}[-=]?\s*$', "", s)

			# delete start of line {comments} (single or double)
			s = re.sub(r'^(\{[^\{\}]+?\}){1,2}', "", s)

			# simplification: tags preceeded by -= are word breaks
			s = re.sub(r'[-=]\{[^\{\}]+?\}', '.', s)

			# these tags are nulls
			# plant is a null in one case where it is just {plant}
			# otherwise (above) it is a word break
			# s = re.sub(r'\{(fold|crease|blot|&\w.?|plant)\}', "", s)
			# simplification: remaining tags in curly brackets
			s = re.sub(r'\{[^\{\}]+?\}', '', s)

			# special case .{\} is still a word break
			s = re.sub(r'\.\{\\\}', ".", s)

			# split on word boundaries
			# exclude null words ('')
			words = [str(w) for w in s.split(".") if w]
			paragraph = ' '.join(words).lstrip()

			index[pg] += (paragraph)

	return index

In [3]:
fpath = "/Users/katecastillo/Documents/MSDS/voynich-manuscript/takahashi_transcription.txt"

index = tokenize(fpath)
corpus = [index[key] for key in index.keys()]

vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.95, min_df=2)
tfidf_matrix = vectorizer.fit_transform(corpus)
vms_mapping = [k for k in index.keys()]

In [4]:
n_components = 6
nmf_model = NMF(n_components=n_components, random_state=42)
nmf_features = nmf_model.fit_transform(tfidf_matrix)



In [5]:
words = vectorizer.get_feature_names_out()
for index, topic in enumerate(nmf_model.components_):
    print(f"Topic {index}:")
    print([words[i] for i in topic.argsort()[-10:]])

Topic 0:
['qokal', 'shey', 'qokeey', 'qol', 'ol', 'qokeedy', 'qokain', 'qokedy', 'chedy', 'shedy']
Topic 1:
['cthor', 'shor', 'shy', 'sho', 'chy', 'shol', 'cthy', 'chor', 'chol', 'daiin']
Topic 2:
['qotaiin', 'chedy', 'qokeey', 'okaiin', 'otaiin', 'ar', 'qokeedy', 'al', 'aiin', 'qokaiin']
Topic 3:
['sheol', 'ckhey', 'chol', 'or', 'daiin', 'qokeol', 'cheor', 'cheol', 'ol', 'okeol']
Topic 4:
['ol', 'or aiin', 'qokar', 'dar', 'okar', 'ar', 'chedy', 'chdy', 'aiin', 'or']
Topic 5:
['otey', 'air', 'dar', 'oteody', 'oteey', 'aiin', 'dal', 'oteos', 'ar', 'al']


In [6]:
topic_distribution = pd.DataFrame(nmf_features, columns=[f"Topic {i+1}" for i in range(n_components)], index=vms_mapping)
topic_distribution

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6
f1r,0.000000,0.240193,0.000000,0.050071,0.061804,0.039797
f1v,0.000000,0.142624,0.000000,0.105332,0.014035,0.000000
f2r,0.015502,0.169788,0.000000,0.000000,0.000000,0.031498
f2v,0.000000,0.208955,0.000000,0.047633,0.000000,0.000000
f3r,0.000000,0.144002,0.000000,0.134691,0.023631,0.000000
...,...,...,...,...,...,...
f114r,0.000000,0.028796,0.286732,0.022092,0.157699,0.011514
f114v,0.000000,0.024838,0.345465,0.016987,0.048828,0.000000
f115r,0.054167,0.021303,0.322326,0.066719,0.014427,0.020465
f115v,0.046498,0.018960,0.305704,0.000000,0.013559,0.000000


In [7]:
davis_df = pd.read_csv("data/davis_attribution.csv", index_col="folio")
davis_df

Unnamed: 0_level_0,topic,hand,language,quire
folio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
f1r,botanical,1,a,1
f1v,botanical,1,a,1
f2r,botanical,1,a,1
f2v,botanical,1,a,1
f3r,botanical,1,a,1
...,...,...,...,...
f114r,starred paragraphs,3,b,18
f114v,starred paragraphs,3,b,18
f115r,starred paragraphs,3,b,18
f115v,starred paragraphs,3,b,18


In [8]:
dominant_topics = np.argmax(nmf_features, axis=1)
dominant_topics

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       3, 1, 1, 1, 0, 4, 1, 1, 1, 1, 1, 1, 3, 1, 4, 5, 1, 1, 4, 4, 4, 4,
       1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4, 4, 0, 3, 1, 1, 4, 4, 1, 1, 1, 1,
       4, 4, 1, 1, 4, 4, 1, 1, 4, 4, 3, 3, 1, 1, 1, 1, 3, 3, 4, 4, 1, 1,
       3, 5, 5, 2, 4, 5, 0, 4, 5, 5, 3, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 1, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 0])

In [9]:
output_df = davis_df.copy()[["language", "topic", "hand"]]
output_df["topic_nmf"] = dominant_topics

output_df

Unnamed: 0_level_0,language,topic,hand,topic_nmf
folio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
f1r,a,botanical,1,1
f1v,a,botanical,1,1
f2r,a,botanical,1,1
f2v,a,botanical,1,1
f3r,a,botanical,1,1
...,...,...,...,...
f114r,b,starred paragraphs,3,2
f114v,b,starred paragraphs,3,2
f115r,b,starred paragraphs,3,2
f115v,b,starred paragraphs,3,2


In [10]:
mca_list = []

for col in output_df.columns:
    mid_df = pd.DataFrame(output_df[col])

    mca = MCA(n_components=2, n_iter=4, copy=True, check_input=True, random_state=42)
    mca = mca.fit_transform(mid_df)

    mca["hue"] = col

    mca_list.append(mca)

In [11]:
output_df.columns

Index(['language', 'topic', 'hand', 'topic_nmf'], dtype='object')

In [12]:
colors = {
    "language": "tab:green",
    "topic": "tab:blue",
    "hand": "tab:orange",
    "topic_nmf": "tab:red",
}

In [13]:
mca = MCA(n_components=2, n_iter=4, copy=True, check_input=True, random_state=42)
mca = mca.fit(output_df)

In [14]:
mca.plot(
    output_df,
    x_component=0,
    y_component=1,
    show_column_labels=True,
    show_column_markers=True,
    
)