In [5]:
import os
import pathlib
from data_processor import TopicModelDataPreprocessor
from fastopic import FASTopic
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Download NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('omw-1.4', quiet=True)

# Combine multiple stop word sources
def get_comprehensive_stopwords():
    nltk_stops = set(stopwords.words('english'))
    sklearn_stops = ENGLISH_STOP_WORDS
    custom_stops = {
        'et', 'al', 'introduction', 'conclusion', 'method', 'methodology', 
        'results', 'discussion', 'references', 'appendix', 'table', 'figure'
    }
    return nltk_stops.union(sklearn_stops).union(custom_stops)

# Advanced text cleaning function
def advanced_preprocess(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Initialize lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    # Get comprehensive stop words
    stop_words = get_comprehensive_stopwords()
    
    # Advanced filtering and lemmatization
    cleaned_tokens = [
        lemmatizer.lemmatize(token) 
        for token in tokens 
        if (token not in stop_words and 
            len(token) > 2 and  # Remove very short tokens
            not token.isdigit())  # Remove pure digit tokens
    ]
    
    return ' '.join(cleaned_tokens)

# Set the path to your PDF folder
all_pdf_folder_path = '/nfs/turbo/si-acastel/expert_field_project/full_pdfs_by_author/aekowals'
path_all_pdf_folder_path = pathlib.Path(all_pdf_folder_path)

# Process the PDF files
topic_model_processor = TopicModelDataPreprocessor(all_pdf_folder_path=path_all_pdf_folder_path)
docs = topic_model_processor.get_and_process_pdf_files(path_all_pdf_folder_path)

# Preprocess documents using advanced preprocessing function
processed_docs_text = [advanced_preprocess(" ".join(doc)) for doc in docs]

# Initialize FastTopic model with a reasonable number of topics
num_topics = 10  # Adjust this number based on your needs and dataset size
model = FASTopic(num_topics=num_topics)

# Fit the model on the preprocessed documents and get topic-word distributions and document-topic distributions
topic_top_words, doc_topic_dist = model.fit_transform(processed_docs_text)

# Print top words for each topic to identify main topics
print("Top words for each topic:")
for topic_idx in range(num_topics):
    top_words = model.get_topic(topic_idx)
    print(f"Topic {topic_idx}: {', '.join([word for word, prob in top_words[:5]])}")

# Print overall topic distribution across all documents to find dominant topics
overall_dist = np.mean(doc_topic_dist, axis=0)
top_topics = np.argsort(overall_dist)[::-1][:7]
print("\nTop 7 topics by overall distribution:")
for topic in top_topics:
    top_words = model.get_topic(topic)
    print(f"Topic {topic} ({overall_dist[topic]:.4f}): {', '.join([word for word, prob in top_words[:5]])}")

2024-11-15 11:18:17,569 - INFO - collecting all words and their counts
2024-11-15 11:18:17,572 - INFO - PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-11-15 11:18:17,796 - INFO - collected 107297 token types (unigram + bigrams) from a corpus of 240768 words and 30 sentences
2024-11-15 11:18:17,797 - INFO - merged Phrases<107297 vocab, min_count=2, threshold=10, max_vocab_size=40000000>
2024-11-15 11:18:17,798 - INFO - Phrases lifecycle event {'msg': 'built Phrases<107297 vocab, min_count=2, threshold=10, max_vocab_size=40000000> in 0.23s', 'datetime': '2024-11-15T11:18:17.798090', 'gensim': '4.3.0', 'python': '3.11.7 (main, Dec 15 2023, 18:12:31) [GCC 11.2.0]', 'platform': 'Linux-4.18.0-477.51.1.el8_8.x86_64-x86_64-with-glibc2.28', 'event': 'created'}
2024-11-15 11:18:17,798 - INFO - exporting phrases from Phrases<107297 vocab, min_count=2, threshold=10, max_vocab_size=40000000>
2024-11-15 11:18:17,918 - INFO - FrozenPhrases lifecycle event {'msg': 'exported FrozenPh

Top words for each topic:
Topic 0: breast, cancer, mammography, mammogram, overdiagnosis
Topic 1: injury, deductible, frac, stoploss, coinsurance
Topic 2: county, golden, tstatistic, postprereform, asthma
Topic 3: logexp, default, string, command, specied
Topic 4: payment, waiver, vote, campaign, candidate
Topic 5: birth, gram, newborn, taker, lottery
Topic 6: tho, rock, kill, prowess, dead
Topic 7: engel, alcohol, selector, uncensored, theorem
Topic 8: pre, markup, watermarktext, prereform, rating
Topic 9: eshi, hour, deadweight, mandatebased, valuation

Top 7 topics by overall distribution:
Topic 5 (0.1099): birth, gram, newborn, taker, lottery
Topic 8 (0.1088): pre, markup, watermarktext, prereform, rating
Topic 2 (0.1044): county, golden, tstatistic, postprereform, asthma
Topic 1 (0.1044): injury, deductible, frac, stoploss, coinsurance
Topic 4 (0.1043): payment, waiver, vote, campaign, candidate
Topic 9 (0.1037): eshi, hour, deadweight, mandatebased, valuation
Topic 3 (0.0946): lo