*.ipynb filter=strip-notebook-output  

In [2]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.parsing.preprocessing import preprocess_string
import json 
import logging
from pathlib import Path
import sys
sys.path.append(str(Path.cwd().parent.parent))
from utils import create_fh_logger

In [5]:
# locations of json files + a place to store a log
src = Path.cwd().parent.parent.parent.parent  / 'processing' / 'nro_declassified' / 'tokenized'
dst = src.parent / 'topics'
files = list(src.glob('*json'))
logs = src.parent.parent / 'logs'
logs.mkdir(exist_ok=True)
logger = create_fh_logger(logs / "topic_model.log")

In [50]:
documents = []
stopwords = ['shall', 'camera', 'corona', 'mission', 'film']
for file in files:
    with open(file, 'r') as f:
        data = json.load(f)
    words = [item['word'] for item in data if item['word'] not in stopwords] # more edge cases
    documents.append(preprocess_string((' ').join(words))) # some edge cases I still missed, letting gensim pick up the slack here

In [73]:
dictionary = Dictionary(documents)
dictionary.filter_extremes(no_below=0.05, no_above=0.8)
corpus = [dictionary.doc2bow(doc) for doc in documents]
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=4)

In [74]:
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx} \nWords: {topic}\n")

Topic: 0 
Words: 0.009*"control" + 0.006*"frame" + 0.006*"us" + 0.006*"oper" + 0.006*"color" + 0.006*"imag" + 0.005*"time" + 0.005*"resolut" + 0.005*"test" + 0.005*"pass"

Topic: 1 
Words: 0.006*"oper" + 0.006*"frame" + 0.005*"test" + 0.005*"data" + 0.005*"resolut" + 0.005*"time" + 0.005*"control" + 0.005*"imag" + 0.004*"flight" + 0.004*"exposur"

Topic: 2 
Words: 0.007*"frame" + 0.006*"time" + 0.005*"vehicl" + 0.005*"control" + 0.005*"oper" + 0.004*"pass" + 0.004*"imag" + 0.004*"figur" + 0.004*"rev" + 0.004*"us"

Topic: 3 
Words: 0.011*"program" + 0.007*"test" + 0.006*"oper" + 0.005*"launch" + 0.005*"vehicl" + 0.005*"us" + 0.005*"satellit" + 0.005*"control" + 0.005*"recoveri" + 0.004*"air"



Topic 0 is a strange combination of memos, performance evaluation reports, and tech docs.

Topic 1 is mostly flight data books and performance evaluation reports -> more operational in nature.

Topic 2 seems to be strongly influenced by the presence of `figure`, likely figures. It contains several manuals and documents.

Topic 3 is mostly a variety of memos that seem to be focused on interactions with the public.

In [None]:
for document_num, doc in enumerate(corpus):
    doc_topics = lda_model.get_document_topics(doc)
    # Optionally, sort the topics by their contribution to this document
    doc_topics_sorted = sorted(doc_topics, key=lambda x: x[1], reverse=False)
    for topic_num, prop_topic in doc_topics_sorted:
        # Print the top topic and its contribution
        if topic_num == 2 and prop_topic>.9:
            print(f"Document {document_num} Top Topics: {doc_topics}")
            print(f'File: {files[document_num]}')
            print(f"Top topic: {topic_num}, Contribution: {prop_topic:.4f}")
            # Print top words for this topic
            word_contributions = lda_model.show_topic(topic_num)
            print(f"Top words for top topic: {word_contributions}")
            print("\n")