In [9]:
'''
2) Check OpenAlex, Crossref, (arXiv) for PDFs
3) If a PDF is unique, add it to the PDF storage and add its name to the title column
'''
import logging
# Notebook-wide logging: INFO and above, each record prefixed with a
# timestamp and its severity level.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# Logger named after the executing module.
logger = logging.getLogger(__name__)

In [10]:
import pathlib
from utils import topic_model_config

# Filesystem locations from the project config, wrapped as Path objects.
db_path = pathlib.Path(topic_model_config.DATABASE_PATH)
all_pdf_path = pathlib.Path(topic_model_config.ALL_PDF_FOLDER_PATH)

# Contact e-mail that is passed to the Crossref client.
email = topic_model_config.EMAIL

# Name of the topic model to run (matched against "hdp", "lda", "zeroshot").
selected_model_name = topic_model_config.MODEL

In [11]:
from database.database_manager import DatabaseManager
from publication_API.crossref_api import CrossrefAPI
from publication_API.arxiv_api import ArxivAPI
from publication_API.open_alex import OpenAlex



# Database manager built from the configured path; the same instance is
# handed to every publication client below.
database_manager = DatabaseManager(db_path)

# Publication sources.
crossref_api = CrossrefAPI(
    email=email,
    rows=10,
    database_manager=database_manager,
)
open_alex_api = OpenAlex(database_manager=database_manager)
# The arXiv client is currently disabled:
# arxiv_api = ArxivAPI(rows=10, database_manager=database_manager)

# Authors known to the database; iterated by the publication pull.
author_list = database_manager.get_all_authors()

In [5]:
# Pull all publications: for each author, query every enabled source in turn
# (Crossref first, then OpenAlex).
publication_sources = (crossref_api, open_alex_api)  # arxiv_api is currently disabled
for author_id in author_list:
    for source in publication_sources:
        source.api_call(author_id)

2024-11-15 19:23:11,329 - INFO - Searching for publications using crossref started...
2024-11-15 19:23:14,247 - INFO - No URL - skipping publication The Early Impact of the Affordable Care Act State-By-State
2024-11-15 19:23:19,795 - INFO - PDF already exists at /nfs/turbo/si-acastel/expert_field_project/full_pdfs_by_author/aekowals/10256018808623883.pdf, skipping download.
2024-11-15 19:23:19,796 - INFO - No URL - skipping publication Censored Quantile Instrumental Variable Estimates of the Price Elasticity of Expenditure on Medical Care
2024-11-15 19:23:19,796 - INFO - No URL - skipping publication What Do Longitudinal Data on Millions of Hospital Visits Tell us About The Value of Public Health Insurance as a Safety Net for the Young and Privately Insured?
2024-11-15 19:23:19,796 - INFO - No URL - skipping publication Estimating the Tradeoff Between Risk Protection and Moral Hazard with a Nonlinear Budget Set Model of Health Insurance
2024-11-15 19:23:19,797 - INFO - No URL - skippin

KeyboardInterrupt: 

In [12]:
from NLP.data_processor import TopicModelDataPreprocessor
from NLP.hdp import HDPTopicModel
# from NLP.bert import BERTTopicModel
# from NLP.fastopic import FASTopicModel
from NLP.lda import LDATopicModel
from NLP.zeroshot import ZeroShotClassifier
# from NLP.guidedlda import GuidedLDATopicModel

topic_model_processor = TopicModelDataPreprocessor()

match selected_model_name:
    case "hdp":
        for author_folder in all_pdf_path.iterdir():
            if author_folder.is_dir():
                docs = topic_model_processor.get_and_process_pdf_files(author_folder=author_folder)
                model_obj = HDPTopicModel()
                model_obj.train(docs)
                topics = model_obj.get_topics()
                topics = model_obj.log_topics(topics)
    # case "bert":
    #     model = BERTTopicModel()
    #     def get_topics(model, docs):
    #         model.train_embeddings(docs)
    #         return model.extract_topics(n=10)
    # case "fastopic":
    #     model = FASTopicModel()
    #     def get_topics(model, docs):
    #         model.fit(docs)
    #         return model.retrieve_topics(limit=10)
    case "lda":
        for author_folder in all_pdf_path.iterdir():
            if author_folder.is_dir():
                docs = topic_model_processor.get_and_process_pdf_files(author_folder=author_folder)
                model_obj = LDATopicModel()
                lda_model, corpus, dictionary = model_obj.train(docs)
                topics = model_obj.get_topics(lda_model=lda_model)
    case "zeroshot":
        for author_folder in all_pdf_path.iterdir():
            if author_folder.is_dir():
                docs = topic_model_processor.get_entire_author_text(author_folder=author_folder)
                model_obj = ZeroShotClassifier()
                results_df = model_obj.classify_with_confidence_threshold(docs)
                quality_analysis = model_obj.analyze_classification_quality(results_df)
                model_obj.log_topics(results_df, quality_analysis)
                
    # case "guidedlda":
    #     model = GuidedLDATopicModel()
    #     def get_topics(model, docs):
    #         model.initialize(docs)
    #         return model.topic_distribution(top_n=10)
    case _:
        raise ValueError(f"Unsupported topic model: {selected_model_name}")

# Process documents and retrieve topics




2024-11-15 19:45:21,242 - INFO - Note: NumExpr detected 36 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2024-11-15 19:45:21,243 - INFO - NumExpr defaulting to 8 threads.
[nltk_data] Downloading package stopwords to /home/hudah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2024-11-15 19:45:28,307 - INFO - collecting all words and their counts
2024-11-15 19:45:28,308 - INFO - PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-11-15 19:45:28,334 - INFO - collected 34494 token types (unigram + bigrams) from a corpus of 34635 words and 8 sentences
2024-11-15 19:45:28,334 - INFO - merged Phrases<34494 vocab, min_count=2, threshold=10, max_vocab_size=40000000>
2024-11-15 19:45:28,335 - INFO - Phrases lifecycle event {'msg': 'built Phrases<34494 vocab, min_count=2, threshold=10, max_vocab_size=40000000> in 0.03s', 'datetime': '2024-11-15T19:45:28.335339', 'gensim': '4.3.3', 'python': '3.11.10 (main, Oct  3 2024, 07:29:13) [GCC 11.2

KeyboardInterrupt: 