# Topic modeling for *Journal of Jesuit Studies*
This Jupyter Notebook contains Python code for topic modeling of 322 articles from *Journal of Jesuit Studies* using [BERTopic](https://maartengr.github.io/BERTopic/index.html) technique.

## Instalation of packages

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.backend import BaseEmbedder
from umap import UMAP
import spacy
import re
import nltk
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from glob import glob

## Creation of predefined functions for the pre-processing of article text

In [9]:
def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return ' '.join(tokens)

def split_text(text, max_length=100):
    words = text.split()
    return [' '.join(words[i:i + max_length]) for i in range(0, len(words), max_length)]

## Upload of texts

In [10]:
path = r"C:\Users\Cezary\Documents\Monita-privata\data\konferencja-poznan\txt/"
txt_files = [f for f in glob(f"{path}*", recursive=True)]

txt_dict = {}
for txt_file in tqdm(txt_files):
    text_key = txt_file.split('\\')[-1].split('.')[0]
    with open(txt_file, 'rt', encoding='utf-8') as f:
        text_value = f.read()
    txt_dict.update({text_key: text_value})

texts = list(txt_dict.values())

processed_texts = [preprocess_text(text) for text in tqdm(texts)]
split_texts = []
for text in tqdm(processed_texts):
    split_texts.extend(split_text(text))
split_texts = [text for text in split_texts if text.strip() != '']

if len(split_texts) < 2:
    raise ValueError("Niewystarczająca liczba tekstów po przetwarzaniu wstępnym. Dodaj więcej danych wejściowych.")

100%|███████████████████████████████████████████████████████████████████████████████| 322/322 [00:01<00:00, 268.05it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 322/322 [03:16<00:00,  1.64it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 322/322 [00:00<00:00, 11232.44it/s]


## Model training for topic modeling

In [13]:
stop_words = list(stopwords.words('english'))

# sentence_model = SentenceTransformer("allegro/herbert-base-cased")
sentence_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

class EnglishEmbedder(BaseEmbedder):
    def __init__(self, embedding_model):
        self.embedding_model = embedding_model

    def embed(self, documents, verbose=False):
        return self.embedding_model.encode(documents, show_progress_bar=verbose)

english_embedder = EnglishEmbedder(sentence_model)
vectorizer_model = CountVectorizer(stop_words=stop_words, ngram_range=(1, 2))
umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.1, metric='cosine')

topic_model = BERTopic(
    embedding_model=english_embedder,
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
    top_n_words=10,
    n_gram_range=(1, 2),
    min_topic_size=10,
    calculate_probabilities=True
)

try:
    topics, probabilities = topic_model.fit_transform(split_texts)
except ValueError as e:
    print(f"Error during model fitting: {e}")
    print("Texts:", split_texts)
    raise

TypeError: 'numpy.float64' object cannot be interpreted as an integer

## Printing table with topics

In [12]:
print(topic_model.get_topic_info())
topic_info = topic_model.get_topic_info()
topic_info.to_excel('jojs_topics_info.xlsx', index=False)

TypeError: 'numpy.float64' object cannot be interpreted as an integer

## Visualisation of topics via 2D representation

In [None]:
topic_model.visualize_topics()

## Visualisation of topic hierarchy

In [None]:
topic_model.visualize_hierarchy()

## Visualisation of topic word scores

In [None]:
topic_model.visualize_barchart()

## Visualisation of topic similarity

In [None]:
topic_model.visualize_heatmap()