In [152]:
# @title Setup
!pip install -q logmap tomotopy bertopic pyLDAvis orjson

import os
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from google.colab import drive
from logmap import logmap
from functools import cached_property
import tomotopy as tp
# from bertopic import BERTopic
# from bertopic.representation import KeyBERTInspired
import pyLDAvis
import orjson
import re
import nltk
# Mount Drive and define the project folder FIRST: everything below reads from
# `root_folder`, so it has to exist before the stopword file path is built
# (otherwise a fresh kernel fails with NameError).
drive.mount('/content/drive')
root_folder = '/content/drive/MyDrive/BallitoreColab/'
# !ls {root_folder}
root_file = os.path.join(root_folder, 'ballitore_data.xlsx')
df=pd.read_excel(root_file).set_index('id').fillna('')

# Stopwords = project-specific list (whitespace-separated file) + NLTK English list.
nltk.download('stopwords')
stopwords_path = os.path.join(root_folder, 'stopwords-2024-07-11.txt')
with open(stopwords_path, 'r') as f:
    stopwords = set(f.read().split())
stopwords |= set(nltk.corpus.stopwords.words('english'))

def tokenize(txt, stopwords=stopwords):
    """Tokenize ``txt`` for topic modeling.

    Square-bracketed spans (e.g. ``[unclear]``) are removed, the text is
    lower-cased and split into tokens, and a token is discarded when it is
    shorter than four characters, consists only of digits, or is in
    ``stopwords``.

    Args:
        txt: Raw document text.
        stopwords: Collection of lower-case words to exclude.

    Returns:
        List of retained tokens, in document order.
    """
    without_brackets = re.sub(r'\[[^\]]*\]', '', txt)
    kept = []
    for token in re.findall(r"[\w']+|[.,!?; -—–\n]", without_brackets.lower()):
        # Punctuation/whitespace tokens are single characters, so the length
        # check below also removes them.
        if len(token) < 4 or token.isdigit() or token.lower() in stopwords:
            continue
        kept.append(token)
    return kept

# Flag rows whose `box` value is 13 or 14 as journal pages, then display them.
df['is_journal'] = df.box.isin([13, 14])
df[df.is_journal]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0_level_0,sender,recipient,date,location from,location to,notes,datetime,dateyear,txt,box,supplemental_ids,is_journal
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
consensus_text_90344821,,,,,,,NaT,0,Vol Ⅰ\nJuly 21st 1852\nOctober 15th 1852\n[ext...,14,,True
consensus_text_90344822,,,,,,,NaT,0,Cork - Patrick St\n1852. July 21st.. Up at\nsi...,14,,True
consensus_text_90344823,,,,,,,NaT,0,written about. So we all\npacked on an outside...,14,,True
consensus_text_90344824,,,,,,,NaT,0,Margaret had to leave at the\nHarris's for Dr ...,14,,True
consensus_text_90344825,,,,,,,NaT,0,when Margaret left the\nbook. And then home by...,14,,True
...,...,...,...,...,...,...,...,...,...,...,...,...
mss4-b13-f5-066,,,,,,,NaT,0,"421\nafterwards a storm came on, which indeed\...",13,,True
mss4-b13-f5-067,,,,,,,NaT,0,422\nSarah Benson went to their mo: meetg.. at...,13,,True
mss4-b13-f5-068,,,,,,,NaT,0,"423\n2nd.. Left Liverpool, in company with\nTh...",13,,True
mss4-b13-f5-069,,,,,,,NaT,0,424\nSarah Talbot died 7th.. of 11th.. month 1...,13,,True


id
mss4-b5-f1-001     5
mss4-b5-f1-002     5
mss4-b5-f1-003     5
mss4-b5-f1-004     5
mss4-b5-f2-001     5
                  ..
mss4-b9-f24-002    9
mss4-b9-f4-001     9
mss4-b9-f4-002     9
mss4-b9-f7-019     9
mss4-b9-f9-002     9
Name: box, Length: 4584, dtype: int64

In [146]:
import numpy as np
import warnings
warnings.filterwarnings('ignore')

def write_excel(df, path, col_widths=None):
    """Write ``df`` to an Excel workbook at ``path``.

    ``col_widths`` is accepted so callers can pass their preferred column
    widths, but it is currently ignored: the sheet is written with default
    formatting.
    """
    df.to_excel(excel_writer=path)

def truncfn(fn):
    """Shorten a path for log output.

    Strings of up to 50 characters are returned unchanged; longer ones are
    reduced to ``'...'`` followed by their last 50 characters.
    """
    if len(fn) <= 50:
        return fn
    return '...' + fn[-50:]

def ensure_dir(file_path):
    """Create the parent directory of ``file_path`` if it does not exist yet.

    Args:
        file_path: Path to a file (the file itself is not created).

    A path with no directory component (e.g. ``'model.bin'``) refers to the
    current working directory, so there is nothing to create; previously this
    case called ``os.makedirs('')`` and raised ``FileNotFoundError``.
    """
    directory = os.path.dirname(file_path)
    if directory:
        # exist_ok makes this safe when the directory is already present.
        os.makedirs(directory, exist_ok=True)

def default_tokenize(txt):
    """Lower-case ``txt`` and split it into word and punctuation tokens.

    A token is either a run of word characters/apostrophes, or a single
    character from this set: ``. , ! ? ;`` space, hyphen, em dash, en dash,
    newline.

    The hyphen is escaped inside the character class. Unescaped, ``" -—"``
    is parsed as a *range* from space to the em dash, which turned almost
    every non-word character (quotes, brackets, colons, ...) into a token.

    Args:
        txt: Raw text.

    Returns:
        List of tokens in document order.
    """
    return re.findall(r"[\w']+|[.,!?; \-—–\n]", txt.lower())

class BaseTopicModel:
    """Shared plumbing for topic-model wrappers.

    Holds the source dataframe, iterates its text column as documents,
    derives output paths, and exposes cached views of the documents.
    Subclasses set ``model_type`` and implement ``model()``.
    """
    model_type = 'topicmodel_type'

    # Names that TomotopyTopicModel.model() removes from the instance
    # ``__dict__`` before (re)modeling, so cached values are recomputed.
    attrs_set_after_modeling = [
        'doc_df', 'work_df', 'path_ldavis', 'path_model', 'path_index', 'path_params',
        'path_topicdf', 'path_hclust', 'path_docdf', 'topic_term_dists', 'doc_topic_dists',
        'doc_lengths', 'vocab', 'term_frequency', 'doc_topic_dists_df', 'id2cluster',
        'meta', 'topic_names', 'num_topics'
    ]

    def __init__(self, df, text_column='txt', tokenizer=default_tokenize, **query_kwargs):
        """
        Args:
            df: DataFrame whose index holds document ids.
            text_column: Name of the column containing document text.
            tokenizer: Callable mapping a text string to a list of tokens.
            **query_kwargs: Extra key/values; they become part of the default
                output folder name and are stored with the model parameters.
        """
        self.df = df
        self.text_column = text_column
        self.query_kwargs = query_kwargs
        self._mdl = None
        self.id2index = {}
        self.index2id = {}
        self._id_docs, self._ids, self._docs = None, None, None
        self.path_topicmodels = os.path.join('topicmodels', self.model_type)
        self.paths = {}
        self.tokenizer = tokenizer

    def get_paths(self, output_dir=None, ntopic=None, niter=None, lemmatize=False):
        """Compute, store in ``self.paths`` and return the output file paths.

        Args:
            output_dir: Explicit output folder. When omitted, a folder name is
                derived from ``query_kwargs`` and the modeling parameters.
            ntopic, niter, lemmatize: Modeling parameters used in that name.

        Returns:
            Dict with keys ``path``, ``path_model``, ``path_index``,
            ``path_params``, ``path_docdf``, ``path_topicdf``, ``path_ldavis``
            and ``path_hclust``.
        """
        if output_dir:
            path = os.path.abspath(output_dir)
        else:
            pieces = {
                **self.query_kwargs,
                'ntopic': ntopic,
                'niter': niter,
                'lemmatize': lemmatize,
            }
            # Falsy values (None, 0, False, '') are left out of the folder name.
            def get_fn(pieces): return '.'.join(f'{k}_{v}' for k, v in pieces.items() if v)
            # NOTE(review): path_topicmodels already ends with model_type, so the
            # model type appears twice in the path. Left as is so that folders
            # written by earlier runs are still found.
            path = os.path.join(self.path_topicmodels, self.model_type, get_fn(pieces))

        outd = dict(
            path=path,
            path_model=os.path.join(path, 'model.bin'),
            path_index=os.path.join(path, 'index.json'),
            path_params=os.path.join(path, 'params.json'),
            path_docdf=os.path.join(path, 'documents.xlsx'),
            path_topicdf=os.path.join(path, 'topics.xlsx'),
            path_ldavis=os.path.join(path, 'ldavis'),
            path_hclust=os.path.join(path, 'hclust.html'),
        )
        self.paths = outd
        return outd

    def __getattr__(self, k):
        """Fallback lookup for attributes not found the normal way.

        ``path*`` names are resolved from ``self.paths`` (None if absent);
        any other ordinary name yields None. Dunder names raise
        ``AttributeError`` so protocol probes (copying, pickling, ``hasattr``)
        see a truthful answer.
        """
        if k.startswith('__') and k.endswith('__'):
            raise AttributeError(k)
        if k.startswith('path'):
            # Read through __dict__: if `paths` has not been assigned yet,
            # `self.paths` would re-enter __getattr__ and recurse forever.
            return self.__dict__.get('paths', {}).get(k)
        return None

    def iter_docs(self, lim=None, as_str=False, lemmatize=False):
        """Yield ``(id, document)`` pairs from the text column.

        Args:
            lim: Maximum number of documents to yield; None or 0 means all.
            as_str: Yield the raw text instead of the tokenizer's output.
            lemmatize: Accepted for interface compatibility; not used here.
        """
        for n, (idx, text) in enumerate(self.df[self.text_column].items()):
            if lim and n >= lim:
                break
            yield idx, (text if as_str else self.tokenizer(text))

    def model(self, **kwargs):
        """Fit or load the model. Must be provided by subclasses."""
        raise NotImplementedError("Implement the model() method in a subclass.")

    @cached_property
    def mdl(self):
        """The underlying model object, fitting/loading it on first access."""
        if self._mdl is None: self.model()
        if self._mdl is None:
            # A subclass's model() may assign `self.mdl` directly instead of
            # `self._mdl`; return that object rather than caching None over it.
            return self.__dict__.get('mdl')
        return self._mdl

    def init_docs(self, lim=None, force=False, lemmatize=False, as_str=False):
        """Materialize the documents (once, unless ``force``) and return them.

        Returns:
            List of documents (token lists, or strings when ``as_str``).
        """
        if force or self._id_docs is None:
            self._id_docs = list(self.iter_docs(lim=lim, lemmatize=lemmatize, as_str=as_str))
            # Drop views derived from a previous document list so they are
            # rebuilt from the new one.
            for name in ('id_docs', 'docs', 'ids', 'doc2id', 'id2doc'):
                self.__dict__.pop(name, None)
        return self.docs

    @cached_property
    def id_docs(self):
        """List of ``(id, document)`` pairs."""
        if self._id_docs is None: self.init_docs()
        return self._id_docs

    @cached_property
    def docs(self):
        """Documents only, in iteration order."""
        return [y for x, y in self.id_docs]

    @cached_property
    def ids(self):
        """Document ids only, in iteration order."""
        return [x for x, y in self.id_docs]

    @cached_property
    def doc2id(self):
        """Map document -> id.

        Documents are used as dict keys, so this requires hashable documents
        (i.e. strings from ``as_str=True``); identical texts share one entry.
        """
        return {y: x for x, y in self.id_docs}

    @cached_property
    def id2doc(self):
        """Map id -> document."""
        return {x: y for x, y in self.id_docs}

class TomotopyTopicModel(BaseTopicModel):
    """LDA topic model built with tomotopy, cached on disk, with pyLDAvis export."""
    model_type = 'tomotopy'

    def model(self, output_dir=None, force=False, lim=None, lemmatize=False, ntopic=25, niter=100):
        """Train an LDA model, or load a previously saved one.

        A saved model is reused when both the model file and the id index
        exist under the output folder and ``force`` is False.

        Args:
            output_dir: Output folder; derived from the parameters if omitted.
            force: Retrain even if a saved model exists.
            lim: Passed to ``iter_docs`` as the document limit and recorded in
                ``params.json``.
            lemmatize: Passed to ``iter_docs`` and used in the folder name.
            ntopic: Number of topics.
            niter: Number of training iterations.

        Side effects when training: writes the model, ``index.json``,
        ``params.json``, the document and topic spreadsheets and the pyLDAvis
        page under the output folder.
        """
        with logmap('loading or modeling LDA model') as lw:
            # Get filename
            pathd = self.get_paths(output_dir=output_dir, ntopic=ntopic, niter=niter, lemmatize=lemmatize)
            fdir = pathd['path']
            os.makedirs(fdir, exist_ok=True)
            fn = pathd['path_model']
            fnindex = pathd['path_index']
            fnparams = pathd['path_params']

            # Reset caches. `topic_df` is included explicitly: it is a cached
            # property too, and without this a second model() call would write
            # the previous model's topics.
            for attr in [*self.attrs_set_after_modeling, 'topic_df']:
                self.__dict__.pop(attr, None)

            # Save or load
            if force or not os.path.exists(fn) or not os.path.exists(fnindex):
                mdl = tp.LDAModel(k=ntopic)
                # Keep both handles in sync so `self.mdl` and `self._mdl`
                # always refer to the same model.
                self.mdl = self._mdl = mdl
                docd = self.id2index = {}
                for doc_id, doc_tokens in self.iter_docs(lim=lim, lemmatize=lemmatize):
                    # NOTE(review): add_doc appears to return None for documents
                    # with no usable tokens - confirm against tomotopy docs.
                    docd[doc_id] = mdl.add_doc(doc_tokens)

                def getdesc():
                    return f'{lw.inner_pref}training model (ndocs={len(docd)}, log-likelihood = {mdl.ll_per_word:.4})'

                pbar = lw.iter_progress(list(range(0, niter, 1)), desc=getdesc(), position=0)
                for i in pbar:
                    # One iteration per step so the progress text can show the
                    # current log-likelihood.
                    lw.set_progress_desc(getdesc())
                    mdl.train(1)
                mdl.save(fn)
                lw.log(f'saved: {fn}')
                with open(fnindex, 'wb') as of:
                    of.write(orjson.dumps(docd, option=orjson.OPT_INDENT_2))
                lw.log(f'saved: {fnindex}')

                params = {
                    **dict(
                        ntopic=ntopic,
                        niter=niter,
                        lim=lim,
                        lemmatize=lemmatize
                    ),
                    **self.query_kwargs
                }
                with open(fnparams, 'wb') as of:
                    of.write(orjson.dumps(params, option=orjson.OPT_INDENT_2))

                # Only documents that received a model index can be mapped back.
                self.index2id = {v: k for k, v in self.id2index.items() if v is not None}

                # Write docs
                write_excel(self.doc_df.reset_index(), self.path_docdf, col_widths={'page_text': 80})
                write_excel(self.topic_df.reset_index(), self.path_topicdf)
                self.save_pyldavis(force=True)
            else:
                lw.log(f'loading: {fn}')
                self.mdl = self._mdl = tp.LDAModel.load(fn)
                with open(fnindex, 'rb') as f: self.id2index = orjson.loads(f.read())
                self.index2id = {v: k for k, v in self.id2index.items() if v is not None}
                # Reuse the saved document table when present; otherwise the
                # `doc_df` property recomputes it from the loaded model.
                if os.path.exists(self.path_docdf):
                    self.__dict__['doc_df'] = pd.read_excel(self.path_docdf).set_index('id')

    @cached_property
    def topic_term_dists(self):
        """Array with one row per topic holding that topic's word distribution."""
        return np.stack([self.mdl.get_topic_word_dist(k) for k in range(self.mdl.k)])

    @cached_property
    def doc_topic_dists(self):
        """Array with one row per model document holding its topic distribution.

        Rows are renormalized to sum to 1.
        """
        doc_topic_dists = np.stack([doc.get_topic_dist() for doc in self.mdl.docs])
        doc_topic_dists /= doc_topic_dists.sum(axis=1, keepdims=True)
        return doc_topic_dists

    @cached_property
    def doc_lengths(self):
        """Number of words in each model document."""
        return np.array([len(doc.words) for doc in self.mdl.docs])

    @cached_property
    def vocab(self):
        """The model's used vocabulary as a list."""
        return list(self.mdl.used_vocabs)

    @cached_property
    def term_frequency(self):
        """Frequencies for the used vocabulary, as reported by the model."""
        return self.mdl.used_vocab_freq

    def save_pyldavis(self, output_dir=None, force=False):
        """Render the pyLDAvis page to ``<output_dir>/index.html``.

        Args:
            output_dir: Target folder; defaults to the model's ``path_ldavis``.
            force: Re-render even if the page already exists.

        Returns:
            Path of the HTML file.
        """
        output_dir = self.path_ldavis if not output_dir else output_dir
        fn = os.path.join(output_dir, 'index.html')
        if not force and os.path.exists(fn): return fn

        with logmap('saving pyldavis output') as lw:
            prepared_data = pyLDAvis.prepare(
                self.topic_term_dists,
                self.doc_topic_dists,
                self.doc_lengths,
                self.vocab,
                self.term_frequency,
                start_index=0,
                sort_topics=False
            )
            os.makedirs(output_dir, exist_ok=True)
            lw.log(f'saving: {fn}')
            pyLDAvis.save_html(prepared_data, fn)
        return fn

    def visualize_topics(self):
        """Return the pyLDAvis page as an IPython HTML object (rendering it if needed)."""
        from IPython.display import HTML
        ldavis_fn = self.save_pyldavis()
        return HTML(filename=ldavis_fn)

    @cached_property
    def doc_df(self):
        """DataFrame indexed by document id with one column per topic.

        Only model documents whose position appears in ``index2id`` are
        included.
        """
        page_ids, values = zip(*[(self.index2id[i], x) for i, x in enumerate(self.doc_topic_dists) if i in self.index2id])
        dftopicdist = pd.DataFrame(values)
        dftopicdist['id'] = page_ids
        return dftopicdist.set_index('id')

    @cached_property
    def meta(self):
        """The source dataframe."""
        return self.df

    @cached_property
    def topic_names(self, top_n=25):
        """Map topic id -> label of the form ``'Topic N: w1 w2 ...'``.

        Accessed as a property, so ``top_n`` always takes its default (25).
        """
        d = {}
        for topic_id in range(self.mdl.k):
            d[topic_id] = f'Topic {topic_id}: {" ".join(w for w, c in self.mdl.get_topic_words(topic_id, top_n=top_n))}'
        return d

    @cached_property
    def num_topics(self):
        """Number of topics in the model."""
        return self.mdl.k

    @cached_property
    def topic_df(self):
        """DataFrame indexed by ``topic_id`` with the topic label and its top 100 words."""
        with logmap('collecting topic data') as lw:
            tdf = pd.DataFrame({
                'topic_id': list(range(self.num_topics)),
                'topic_name': [self.topic_names[i] for i in range(self.num_topics)],
                'topic_words': [' '.join(w for w, c in self.mdl.get_topic_words(i, top_n=100)) for i in range(self.num_topics)]
            })
        return tdf.set_index('topic_id')


In [161]:
# Build the topic model on non-journal rows only; duplicate ids are removed before re-indexing.
# NOTE: `TopicModel` is defined in a later cell of this notebook, which must be run first.
tm = TopicModel(df[~df.is_journal].reset_index().drop_duplicates('id').set_index('id'), tokenizer=tokenize)

In [162]:
# Train 75 topics for 100 iterations; force=True retrains even if a saved model exists.
tm.model(niter=100, ntopic=75, force=True)

[34m[1m⎾ loading or modeling LDA model[0m[36m @ 2024-07-11 22:18:09,834[0m
[1;34m￨ ￨ training model (ndocs=2881, log-likelihood = -9.244): 100%|[0;36m██████████[0;36m| 100/100 [00:10<00:00,  9.90it/s]
[34m[1m￨ saved: topicmodels/tomotopy/tomotopy/ntopic_75.niter_100/model.bin[0m[36m @ 2024-07-11 22:18:21,558[0m
[34m[1m￨ saved: topicmodels/tomotopy/tomotopy/ntopic_75.niter_100/index.json[0m[36m @ 2024-07-11 22:18:21,562[0m
[34m[1m￨ ⎾ collecting topic data[0m[36m @ 2024-07-11 22:18:25,430[0m
[34m[1m￨ ⎿ 0.1 seconds[0m[36m @ 2024-07-11 22:18:25,573[0m
[34m[1m￨ ⎾ saving pyldavis output[0m[36m @ 2024-07-11 22:18:25,599[0m
[34m[1m￨ ￨ saving: topicmodels/tomotopy/tomotopy/ntopic_75.niter_100/ldavis/index.html[0m[36m @ 2024-07-11 22:18:45,363[0m
[34m[1m￨ ⎿ 19.8 seconds[0m[36m @ 2024-07-11 22:18:45,411[0m
[34m[1m⎿ 35.6 seconds[0m[36m @ 2024-07-11 22:18:45,412[0m


In [163]:
tm.visualize_topics()

In [164]:
# Rank documents by their weight on one topic (doc_df columns are labelled by topic id).
tnum=20
tm.doc_df.sort_values(tnum, ascending=False)[tnum]

id
mss4-b5-f45-009     0.284091
mss4-b9-f7-024      0.265913
mss4-b8-f34-007     0.234742
mss4-b10-f10-001    0.221778
mss4-b9-f7-009      0.217586
                      ...   
mss4-b8-f20-002     0.000091
mss4-b10-f26-021    0.000088
mss4-b9-f15-015     0.000070
mss4-b2-f4-002      0.000064
mss4-b1-f4-001-7    0.000061
Name: 20, Length: 2376, dtype: float32

In [165]:
print(tm.df.loc['mss4-b5-f45-009'].txt)

Ballybainey 18th Nov.r 1817
Esteemed Friend
With respect to the people whose names
thee has handed me. [unclear]Viz[/unclear] Michl. Murray, Ned
Lawler, Anne Doyl, Peter Maher, Denis Kenedy, and
John Dowling, the stand as follows. Micl. Murray
owes the 1st day of Novr. last a year & a half's Rent, a Years
Rent he will pay now at Xms the usual time th[insertion]e[/insertion]y pay
Ned [unclear]Sawles[/unclear] a year and a half also but great part of this is
paid and the rest will be paid at Xms and at May
Lawler will be Clear up to the day so will Murray
and D. Kenedy also. Kenedy owes a year & a half the
year he will pay at Xms as usual and clear up at [insertion]May[/insertion]
to the day Peter Maher owes only a years Rent last
Novr. so that he clears of now at Xms. indeed he would
clear of Tomorrow if call'd on. Anne Doyle holds
from her mother the Widow Lewis she
[unclear][/unclear] Leadbeater
Ballitore
L
L
Ley

I Know sent her Rent the other day with Mr. John
Jackson and Anne Doyle

In [101]:
# Topics more likely to be in journals? (any distinction?)
# Topics most likely by sender?

# Join the document metadata with the per-document topic weights on 'id'.
df2 = tm.df.merge(tm.doc_df, on='id')
df2

Unnamed: 0_level_0,sender,recipient,date,location from,location to,notes,datetime,dateyear,txt,box,...,65,66,67,68,69,70,71,72,73,74
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mss4-b5-f1-001,Elizabeth Abell,Mary Shackleton Leadbeater,April 4 1778,Cork,Ballitore,,1778-04-04,1778,I would wish to write a few lines to my dear f...,5,...,0.000044,0.000112,0.000088,0.000114,0.000808,0.000081,0.016889,0.000100,0.000077,0.000066
mss4-b5-f1-002,Elizabeth Abell,Mary Shackleton Leadbeater,Jan 12 1782,Cork,Ballitore,,1782-01-12,1782,I recd. my Dear Molly Shackletons kind favour\...,5,...,0.000033,0.000083,0.000065,0.031770,0.000600,0.000060,0.032345,0.000074,0.011939,0.000049
mss4-b5-f1-003,Elizabeth Abell,Mary Shackleton Leadbeater,Oct 20 1783,Cork,[unclear],,1783-10-20,1783,Cork the 20th of 10th m. 1785\nMy Dear Mary\nI...,5,...,0.000039,0.000100,0.000078,0.000101,0.071985,0.000072,0.010293,0.000089,0.000069,0.000059
mss4-b5-f1-004,Elizabeth Abell,Mary Shackleton Leadbeater,Feb 19 1788,Cork,Ballitore,,1788-02-19,1788,I doubt not but my dear Friend will be\nrather...,5,...,0.000023,0.000059,0.000046,0.000060,0.006052,0.000043,0.037038,0.030997,0.002854,0.000035
mss4-b5-f2-001,James Abell,Mary Shackleton Leadbeater,12/29/1786,Cork,Ballitore,,1786-12-29,1786,My dear Molly - if thoull bear being call'd so...,5,...,0.000023,0.000059,0.000046,0.000059,0.028318,0.000042,0.011622,0.053054,0.000040,0.000034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mss4-b9-f24-002,,,,,,,NaT,0,which I expended some 400 more in improvements...,9,...,0.000021,0.000055,0.000043,0.000055,0.000392,0.000039,0.013368,0.000049,0.000037,0.000032
mss4-b9-f4-001,,,,,,,NaT,0,Limerick 18 of November 1823\nMy dear brother ...,9,...,0.000019,0.043183,0.000037,0.000048,0.000344,0.002305,0.000378,0.000043,0.000033,0.000028
mss4-b9-f4-002,,,,,,,NaT,0,vexation a pleasure-tourist sometimes undergoe...,9,...,0.000042,0.238105,0.000083,0.000108,0.005830,0.000077,0.026161,0.000095,0.000073,0.000063
mss4-b9-f7-019,,,,,,,NaT,0,for the [unclear][/unclear] of Earl [unclear][...,9,...,0.000124,0.000317,0.000247,0.000320,0.002277,0.000229,0.002503,0.000283,0.000218,0.000186


In [104]:
# Mean of each numeric column per sender, with the listed non-topic columns dropped.
df_senders = df2.groupby('sender').mean(numeric_only=True).drop(['dateyear','box','Unnamed: 0'],axis=1)
df_senders

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,74
sender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,0.000694,0.082282,0.001011,0.169448,0.000622,0.001301,0.000698,0.001731,0.002773,0.006032,...,0.000558,0.006332,0.008294,0.001243,0.018317,0.001104,0.015280,0.001199,0.002732,0.001222
"""Eliza Matilda Arabella""",0.000039,0.007157,0.000048,0.000834,0.000026,0.000046,0.000032,0.000052,0.000075,0.000107,...,0.000041,0.000104,0.000081,0.000105,0.000744,0.019733,0.000818,0.000092,0.000071,0.118008
"""Snooks""",0.000549,0.169628,0.000670,0.011719,0.069429,0.000642,0.000447,0.000735,0.001053,0.001508,...,0.000571,0.070515,0.001136,0.001471,0.079516,0.001050,0.011490,0.001298,0.000999,0.000853
A. J.,0.000031,0.001793,0.000038,0.028174,0.000021,0.000037,0.000025,0.055056,0.000060,0.000086,...,0.000032,0.000083,0.000065,0.000084,0.008454,0.000060,0.000654,0.000074,0.000057,0.000049
A. Mills,0.000081,0.028352,0.013708,0.001726,0.000054,0.000095,0.000066,0.000108,0.000155,0.000222,...,0.000084,0.000214,0.000167,0.000217,0.001540,0.000155,0.038734,0.000191,0.000147,0.050637
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
William Gatchell,0.000167,0.283408,0.000204,0.003574,0.000113,0.000196,0.000136,0.000224,0.000321,0.000460,...,0.000174,0.000444,0.000346,0.000449,0.003189,0.000320,0.003504,0.000396,0.000305,0.000260
William Hunt,0.000059,0.032892,0.000072,0.001253,0.066470,0.000069,0.000048,0.000079,0.000112,0.000161,...,0.000061,0.000156,0.000121,0.000157,0.001118,0.000112,0.163613,0.000139,0.000107,0.000091
William Leadbeater,0.000110,0.071303,0.000134,0.015920,0.000074,0.000129,0.000090,0.000148,0.000211,0.000870,...,0.000115,0.000587,0.000714,0.000898,0.024258,0.000211,0.020850,0.000746,0.002200,0.002012
William Newcombe,0.000300,0.017230,0.000366,0.006408,0.000202,0.000351,0.000245,0.000402,0.000576,0.000825,...,0.000312,0.000796,0.000621,0.000804,0.005718,0.000574,0.006283,0.000710,0.000546,0.000467


In [105]:
tm.df.sender.value_counts()

sender
                               2230
Richard Shackleton              349
Sarah Leadbeater Barrington     200
William Rayner                   99
Mary Shackleton Leadbeater       74
                               ... 
Thomas Wray                       1
Susanna Wright                    1
Margaret Foster                   1
Joseph Garratt                    1
Mary Doyle                        1
Name: count, Length: 391, dtype: int64

In [106]:
# Sender names used for per-sender lookups in df_senders.
rs='Richard Shackleton'
ml='Mary Shackleton Leadbeater'
# Mean topic weights for one sender, largest first.
df_senders.loc[rs].sort_values(ascending=False)

71    0.282832
58    0.074850
59    0.067598
13    0.064711
24    0.060937
        ...   
21    0.000173
39    0.000171
19    0.000167
65    0.000158
4     0.000073
Name: Richard Shackleton, Length: 75, dtype: float64

In [107]:
df_senders.loc[ml].sort_values(ascending=False)

1     0.103062
37    0.102173
59    0.095330
41    0.069881
55    0.065442
        ...   
6     0.000100
4     0.000074
19    0.000068
63    0.000063
45    0.000031
Name: Mary Shackleton Leadbeater, Length: 75, dtype: float64

In [109]:
from pprint import pprint
# Show the label and top words of topic 1.
pprint(dict(tm.topic_df.loc[1]))

{'topic_name': 'Topic 1: well home much hope come last time little letter soon '
               'better since yesterday left going think night next could '
               'morning mary heard came hear days',
 'topic_words': 'well home much hope come last time little letter soon better '
                'since yesterday left going think night next could morning '
                'mary heard came hear days good week still quite great evening '
                'also coming write without says getting take leave back first '
                'long give seems tell intend weather expect must pretty post '
                'deal dublin town stay told pleasant sent cold found visit '
                'return suppose believe greatly meet room gone might tomorrow '
                'news glad short expected except poor looks note best place '
                'usual fine away sally make arrived written disappointed every '
                'taken received shall able saying obliged however kind account 

In [110]:
from pprint import pprint
# Show the label and top words of topic 71.
pprint(dict(tm.topic_df.loc[71]))

{'topic_name': 'Topic 71: dear good friend thou life month great richard well '
               'divine shackleton children best though wisdom ballitore every '
               'service cause spirit help experience little believe '
               'affectionate',
 'topic_words': 'dear good friend thou life month great richard well divine '
                'shackleton children best though wisdom ballitore every '
                'service cause spirit help experience little believe '
                'affectionate religious others pleased spirits favour society '
                'friends trust times sense doubt preserved among right thine '
                'hand season favoured things thee duty heart care indeed state '
                'degree manner providence master humble power evil sensible '
                'spiritual inward gracious child tender another hast poor time '
                'affection friendship humility part particular cousin support '
                'sometimes infinite h

In [111]:
tm.topic_df

Unnamed: 0_level_0,topic_name,topic_words
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Topic 0: thou went eliza hannah betsy jane ala...,thou went eliza hannah betsy jane alas many ni...
1,Topic 1: well home much hope come last time li...,well home much hope come last time little lett...
2,Topic 2: capt ship board brother island plymou...,capt ship board brother island plymouth countr...
3,Topic 3: went papa polly came morning aunt din...,went papa polly came morning aunt dinner eveni...
4,Topic 4: dear friend thee thou love poor hear ...,dear friend thee thou love poor hear well woul...
...,...,...
70,Topic 70: love wilkinson sister dear health we...,love wilkinson sister dear health well brother...
71,Topic 71: dear good friend thou life month gre...,dear good friend thou life month great richard...
72,Topic 72: wou'd woud poor cou'd family shou'd ...,wou'd woud poor cou'd family shou'd frds acct ...
73,Topic 73: thought found till seemed made well ...,thought found till seemed made well soldiers a...


In [112]:
tm.visualize_topics()

In [130]:

class BertTopicModel(BaseTopicModel):
    """Topic model built with BERTopic over the raw document strings."""
    model_type = 'bertopic'
    embedding_model_name = 'emanjavacas/MacBERTh'

    @cached_property
    def embedding_model(self):
        """A transformers feature-extraction pipeline for ``embedding_model_name``.

        NOTE(review): ``model()`` passes its own ``embedding_model`` argument
        (default None) to BERTopic, so this pipeline is only used when a caller
        passes it in explicitly - confirm that is intended.
        """
        from transformers.pipelines import pipeline
        embedding_model = pipeline(
            "feature-extraction",
            model=self.embedding_model_name,
        )
        return embedding_model

    def model(self, output_dir=None, force=False, lim=None, save=True, embedding_model=None, lemmatize=True, **kwargs):
        """Fit a BERTopic model, or load one if a model file already exists.

        Args:
            output_dir: Output folder; a default is derived if omitted.
            force: Refit even if a model file exists.
            lim: Document limit passed to ``init_docs``.
            save: Write the document/topic spreadsheets after fitting.
            embedding_model: Passed to ``BERTopic(embedding_model=...)``.
            lemmatize: Accepted for interface compatibility; not used here.
            **kwargs: Forwarded to the ``BERTopic`` constructor.

        Returns:
            The BERTopic model.
        """
        with logmap('loading or generating model'):
            # Get filename
            pathd = self.get_paths(output_dir=output_dir)
            fdir = pathd['path']
            os.makedirs(fdir, exist_ok=True)
            fn = pathd['path_model']

            # Drop everything cached from a previous model so that `mdl`,
            # `doc_df`, `topic_df` etc. reflect the model produced by this call.
            for attr in ('mdl', 'doc_df', 'topic_df', 'page2topic', 'hierarchical_topics',
                         'id_docs', 'docs', 'ids', 'doc2id', 'id2doc'):
                self.__dict__.pop(attr, None)

            if not force and os.path.exists(fn): return self.load(fn)

            with logmap('importing BERTopic'):
                os.environ['TOKENIZERS_PARALLELISM'] = 'false'
                from bertopic import BERTopic
                from bertopic.representation import KeyBERTInspired

            # Get docs. Always rebuilt as raw strings: BERTopic is fitted on the
            # text itself, and `doc2id` needs hashable documents.
            docs = self.init_docs(lim=lim, as_str=True, force=True)

            with logmap('fitting model'):
                self._mdl = BERTopic(
                    embedding_model=embedding_model,
                    representation_model=KeyBERTInspired(),
                    verbose=True,
                    **kwargs
                )
                self._topics, self._probs = self._mdl.fit_transform(docs)
            self._mdl.generate_topic_labels(nr_words=10)
            if save: self.save(fn)
            return self._mdl

    def save(self, fn=None):
        """Write the document and topic spreadsheets for the fitted model.

        The model binary itself is not written (that call is commented out),
        so the ``os.path.exists`` check in ``model()`` only finds a model file
        if one was created by other means.
        """
        if self._mdl is not None:
            fn = self.path_model if fn is None else fn
            with logmap(f'saving model to disk: {truncfn(fn)}'):
                ensure_dir(fn)
                # self.mdl.save(fn)
                write_excel(self.doc_df.reset_index(), self.path_docdf, col_widths={'page_text': 80, 'document': 80})
                write_excel(self.topic_df.reset_index(), self.path_topicdf, col_widths={'representative_doc1': 80, 'representative_doc2': 80, 'representative_doc3': 80})

    def load(self, fn=None):
        """Load a BERTopic model from ``fn`` (default: ``path_model``).

        Returns:
            The loaded model, or None if the file does not exist.
        """
        if not fn: fn = self.path_model
        if os.path.exists(fn):
            with logmap('importing BERTopic'):
                from bertopic import BERTopic
            with logmap(f'loading model from disk: {truncfn(fn)}'):
                # Load the file that was checked above, not unconditionally
                # `self.path_model`, so an explicit `fn` is honoured.
                self._mdl = BERTopic.load(fn)
                return self._mdl

    @cached_property
    def doc_df(self):
        """Per-document BERTopic info plus ``page_id`` and ``page_text`` columns.

        Column names are lower-cased; the ``document`` column is moved last.
        """
        docinfo = self.mdl.get_document_info(self.docs)
        docinfo.columns = [x.lower() for x in docinfo]
        docinfo['page_id'] = [self.doc2id[doc] for doc in docinfo.document]
        docinfo = docinfo.drop('representative_docs', axis=1)
        docinfo['page_text'] = [self.df.loc[id, self.text_column] for id in docinfo.page_id]
        # The representation is a list of words; store it as one string.
        docinfo['representation'] = docinfo['representation'].apply(' '.join)
        return docinfo[[c for c in docinfo if c != 'document'] + ['document']]

    @cached_property
    def topic_df(self):
        """Per-topic BERTopic info with representative document ids and texts.

        Adds ``representative_docs_ids`` and ``representative_doc1..3``; a
        topic with fewer than three representative documents gets empty
        strings for the missing ones.
        """
        tdf = self.mdl.get_topic_info()
        tdf.columns = [x.lower() for x in tdf]
        tdf['representative_docs_ids']=[[self.doc2id[doc] for doc in docs] for docs in tdf.representative_docs]
        tdf = tdf.drop('representative_docs',axis=1)
        for i in range(3):
            # Use the configured text column (same as doc_df) instead of a
            # hard-coded `.txt`.
            tdf[f'representative_doc{i+1}'] = [
                self.df.loc[reprids[i], self.text_column] if i < len(reprids) else ''
                for reprids in tdf.representative_docs_ids
            ]

        for col in ['representation','representative_docs_ids']:
            # str() so that non-string ids can be joined as well.
            tdf[col] = tdf[col].apply(lambda items: ' '.join(map(str, items)))
        return tdf

    @cached_property
    def page2topic(self):
        """Map ``page_id`` -> the ``name`` column of ``doc_df``."""
        return dict(zip(self.doc_df.page_id,self.doc_df.name))

    @cached_property
    def hierarchical_topics(self):
        """Result of BERTopic's ``hierarchical_topics`` on the documents."""
        return self.mdl.hierarchical_topics(self.docs)

    def visualize_hierarchy(self):
        """Build the topic-hierarchy figure, write it to ``path_hclust`` and return it."""
        fig = self.mdl.visualize_hierarchy(hierarchical_topics=self.hierarchical_topics)
        fig.write_html(self.path_hclust)
        return fig



def TopicModel(*args, model_type='tomotopy',**kwargs):
    """Factory returning a topic-model wrapper for ``model_type``.

    A ``model_type`` starting with ``'tomo'`` or ``'lda'`` (case-insensitive)
    yields a ``TomotopyTopicModel``; anything else, including an empty or
    None value, yields a ``BertTopicModel``. All other arguments are
    forwarded to the chosen class.
    """
    wanted = model_type.lower() if model_type else ''
    if wanted.startswith(('tomo', 'lda')):
        return TomotopyTopicModel(*args, **kwargs)
    return BertTopicModel(*args, **kwargs)



In [131]:
# BERTopic run over the full dataframe (no journal filter applied here).
# The model is fitted on raw strings (init_docs(as_str=True)), so `tokenizer` is not applied by model().
tm2 = TopicModel(df, model_type='bertopic', tokenizer=tokenize)
# force=True skips the load-from-disk path and refits.
tm2.model(force=True)

[34m[1m⎾ loading or generating model[0m[36m @ 2024-07-11 21:54:24,069[0m
[34m[1m￨ ⎾ importing BERTopic[0m[36m @ 2024-07-11 21:54:24,071[0m
[34m[1m￨ ⎿ 0 seconds[0m[36m @ 2024-07-11 21:54:24,073[0m
[34m[1m￨ ⎾ fitting model[0m[36m @ 2024-07-11 21:54:24,082[0m
2024-07-11 21:54:24,088 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/144 [00:00<?, ?it/s]

2024-07-11 21:54:42,145 - BERTopic - Embedding - Completed ✓
2024-07-11 21:54:42,146 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-11 21:54:54,951 - BERTopic - Dimensionality - Completed ✓
2024-07-11 21:54:54,954 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-11 21:54:55,107 - BERTopic - Cluster - Completed ✓
2024-07-11 21:54:55,114 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-11 21:54:58,931 - BERTopic - Representation - Completed ✓
[34m[1m￨ ⎿ 35.7 seconds[0m[36m @ 2024-07-11 21:54:59,804[0m
[34m[1m￨ ⎾ saving model to disk: topicmodels/bertopic/bertopic/model.bin[0m[36m @ 2024-07-11 21:54:59,806[0m
[34m[1m￨ ⎿ 5.3 seconds[0m[36m @ 2024-07-11 21:55:05,123[0m
[34m[1m⎿ 41.1 seconds[0m[36m @ 2024-07-11 21:55:05,124[0m


<bertopic._bertopic.BERTopic at 0x7a3b2f317f70>

In [132]:
tm2.mdl.visualize_topics()

In [133]:
tm2.visualize_hierarchy()

100%|██████████| 60/60 [00:14<00:00,  4.18it/s]
