In [1]:
from tqdm.notebook import tqdm
from nltk import sent_tokenize
from transformers import AutoTokenizer
import sys
import seaborn as sb
sys.path.append('../..')
from spacy.tokens import Span, Doc

from src import *
from src.test_utils import *
# os.environ['TOKENIZERS_PARALLELISM'] = 'false'
# Turn off the memory-efficient and flash scaled-dot-product-attention CUDA backends.
# `torch` is not imported explicitly in this cell; presumably it arrives via `from src import *` — confirm.
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

In [2]:
# Build the shared objects used by the rest of the notebook.
# gritlm = GritLM("GritLM/GritLM-7B", device_map="cuda:2", torch_dtype="auto")
# The retriever is placed on CUDA device index 1; `syn_dist` is a project parameter whose meaning is not visible here.
retriever = Retriever(device='cuda:1', syn_dist=0.1)
# The splitter is constructed from the retriever's tokenizer.
doc_split = DocSplit(retriever.retriever_tokenizer)
# llm = LLM()
# Here `llm` is a plain model-name string; it is handed to both LongDoc and the dataset below.
llm = 'mistralai/Mistral-7B-Instruct-v0.2'
# llm = None
longdoc = LongDoc(retriever, llm)
# dataset = NarrativeQADataset(llm)
dataset = QualityDataset(llm, split='dev')
# reading_agent = ReadingAgent(dataset, llm)

In [3]:
# Pick one dataset sample and pull out its questions, answers and article text.
test_i = 2
sample = dataset.data[test_i]
questions, answers = dataset.get_questions_and_answers(sample)
article = dataset.get_article(sample)
# Keep only the first line of each question string.
questions = [q.splitlines()[0] for q in questions]
# Display the trimmed questions.
questions

['What is the most likely meaning of the slang O.Q.? (in twentieth-century American English)',
 'Why does the Skipper stop abruptly after he says "when you\'re running a blockade"?',
 'Who or what is Leo?',
 'Why does the Skipper allow the new chef to use the heat-cannon as an incinerator?',
 ' Lieutenant Dugan brings up the examples of "High G" Gordon and "Runt" Hake in order to illustrates that...',
 "Why didn't the Skipper follow the new cook's advice about avoiding Vesta?",
 'Why was the new cook so upset that the Skipper decided to surrender?',
 'What does the Skipper mean by "lady-logic"?',
 "What would've happened if the new cook had told the Skipper about the ekalastron deposits earlier?"]

# Index passages

# Retrieval

## Test Code

In [None]:
# Split the article into pages with a size argument of 512 // 5 (= 102); the unit of that
# argument is defined by DocSplit.split_paragraphs and is not visible here — TODO confirm.
pages = doc_split.split_paragraphs(article, 512 // 5)
# Index the pages; the call returns two values that are both dumped to JSON below.
results, raw = longdoc.index_text_into_map(pages, 3)
# Each element of `results` is serialised through its own `to_json()` method.
write_json('temp.json', [ci.to_json() for ci in results])
write_json('raw.json', raw)

### Test Navigation

In [4]:
# Split the article with a size argument of 300 and run LongDoc.lossless_index over the pages.
# The meaning of the positional arguments (1, 1, 1, 'relation') is defined in LongDoc and is not visible here.
pages = doc_split.split_paragraphs(article, 300)
all_summary = longdoc.lossless_index(pages, 1, 1, 1, 'relation')
# Persist the result so later cells can reload it instead of recomputing.
write_json('all_summary.json', all_summary)

26
3


In [11]:
# Produce all_summary.json by summarising each page with the LLM.
pages = doc_split.split_paragraphs(article, 300)
# One summary prompt per page.
summary_prompts = [LongDocPrompt.summary(current_passage) for current_passage in pages]
# All prompts go to the LLM server in one call; only element 0 of each response is parsed.
all_summary = [LongDocPrompt.parse_summary(response[0]) for response in longdoc.llm_server(summary_prompts)]
# NOTE(review): this overwrites the same file that the lossless_index cell writes.
write_json('all_summary.json', all_summary)

In [4]:
# Rebuild `pages` with the size argument 300 and reload the cached summaries from disk
# instead of regenerating them.
pages = doc_split.split_paragraphs(article, 300)
all_summary = read_json('all_summary.json')

In [12]:
# Build the summary pyramid from the pages and their summaries, then persist it to temp_tree.json.
tree = longdoc.build_summary_pyramid(pages, all_summary)
dump_tree('temp_tree.json', tree)

Summarize: 6


Summarize: 2
Summarize: 1


In [4]:
# Load the previously dumped tree back from temp_tree.json.
tree = load_tree('temp_tree.json')

In [11]:
# Number of entries stored at index 3 of the tree (the recorded output is 1).
len(tree[3])

1

In [12]:
# Dense-retrieve over the first child of every entry in tree[0] for one hard-coded question.
# The question text is the same as questions[5] in the output displayed near the top of the notebook.
max_indices = retriever.dense_retrieval("Why didn't the Skipper follow the new cook's advice about avoiding Vesta?", [node.children[0] for node in tree[0]], normalize=True)

In [13]:
# Show the indices returned by the retrieval call.
max_indices

[6, 20, 19, 4, 8]

In [14]:
# Look at the children of entry 6 in tree[0]; 6 is the first index in the recorded retrieval output.
tree[0][6].children

['Dugan, call McMurtrie and tell him we lift gravs immediately— Slops! What are you doing at that table?" For the little fellow had sidled across the control-room and now, eyes gleaming inquisitively, was peering at our trajectory charts. At the skipper\'s roar he glanced up at us eagerly. "Vesta!" he piped in that curiously high-pitched and mellow voice. "Loft trajectory for Vesta! Then we\'re trying to run the Alliance blockade, Captain?" "None of your business!" bellowed O\'Hara in tones of thunderous outrage. "Get below instantly, or by the lavendar lakes of Luna I\'ll—" "If I were you," interrupted our diminutive new chef thoughtfully, "I\'d try to broach the blockade off Iris rather than Vesta. For one thing, their patrol line will be thinner there; for another, you can come in through the Meteor Bog, using it as a cover." " Mr. Dugan! " The Old Man\'s voice had an ominous ring to it, one I had seldom heard. I sprang to attention and saluted smartly. "Aye, sir?"']

In [6]:
def get_parents(node:MyNode):
    """Collect every ancestor of ``node`` by following ``.parent`` links.

    Args:
        node: Starting node; it is not included in the result.

    Returns:
        List of ancestors, nearest parent first, ending with the node whose
        ``parent`` is ``None``. Empty when ``node.parent`` is ``None``.
    """
    ancestors:List[MyNode] = []
    ancestor = node.parent
    # Climb one link at a time until the chain runs out.
    while ancestor is not None:
        ancestors.append(ancestor)
        ancestor = ancestor.parent
    return ancestors

In [7]:
def pring_branch_from_leaf(node:MyNode):
    """Return labelled texts for ``node`` and all of its ancestors.

    The result starts with ``('chunk', node.children[0])`` and
    ``('summary', node.summary)``, followed by ``('summary_<k>', ancestor.summary)``
    for the k-th ancestor returned by ``get_parents`` (k starts at 1, nearest
    parent first).
    """
    ancestors = get_parents(node)
    branch = [('chunk', node.children[0]), ('summary', node.summary)]
    branch.extend(
        (f'summary_{depth}', ancestor.summary)
        for depth, ancestor in enumerate(ancestors, start=1)
    )
    return branch

In [10]:
# Summary stored on tree[0][6] (the recorded output is an empty string).
tree[0][6].summary

''

In [13]:
# Collect the chunk text of tree[0][6] plus the summaries of that node and of each of its ancestors.
pring_branch_from_leaf(tree[0][6])

[('chunk',
  'Dugan, call McMurtrie and tell him we lift gravs immediately— Slops! What are you doing at that table?" For the little fellow had sidled across the control-room and now, eyes gleaming inquisitively, was peering at our trajectory charts. At the skipper\'s roar he glanced up at us eagerly. "Vesta!" he piped in that curiously high-pitched and mellow voice. "Loft trajectory for Vesta! Then we\'re trying to run the Alliance blockade, Captain?" "None of your business!" bellowed O\'Hara in tones of thunderous outrage. "Get below instantly, or by the lavendar lakes of Luna I\'ll—" "If I were you," interrupted our diminutive new chef thoughtfully, "I\'d try to broach the blockade off Iris rather than Vesta. For one thing, their patrol line will be thinner there; for another, you can come in through the Meteor Bog, using it as a cover." " Mr. Dugan! " The Old Man\'s voice had an ominous ring to it, one I had seldom heard. I sprang to attention and saluted smartly. "Aye, sir?"'),
 (

In [16]:
# Number of entries in the second-to-last element of the tree (the recorded output is 2).
len(tree[-2])

2

In [17]:
# Summary text of entry 1 in tree[-2].
tree[-2][1].summary

"In the passage, Slops, the cook on Vesta, warns the skipper about the increased risk of pirate raids due to recent riches in ekalastron deposits and Vesta's orbit entering aphelion stage. The skipper is suspicious of Slops for not sharing this information earlier and becomes enraged. The crew's assessment of the risks was off, leading to significant consequences. Their next challenge was navigating past Callisto, as the discovery of their presence by the Callistans could lead to the occupation of Callisto by their adversaries, exposing hidden information. However, it was too late to change course as their lock had already been opened, and the sound of approaching Alliance soldiers was heard on the metal ramp. The Alliance commander stood before them, expressing satisfaction at their impending surrender."

In [8]:
# Children of the very first entry in tree[0].
tree[0][0].children

['CAPTAIN CHAOS By NELSON S. BOND The Callisto-bound Leo needed a cook. What it got was a piping-voiced Jonah who jinxed it straight into Chaos. [Transcriber\'s Note: This etext was produced from Planet Stories Summer 1942. Extensive research did not uncover any evidence that the U.S. copyright on this publication was renewed.] We picked up our new cook on Phobos. Not Phoebus or Phoebe; I mean Phobos, Mars\' inner moon. Our regular victual mangler came down with acute indigestion—tasted some of his own cooking, no doubt—when we were just one blast of a jet-tube out of Sand City spaceport. But since we were rocketing under sealed orders, we couldn\'t turn back. So we laid the Leo down on Phobos\' tiny cradle-field and bundled our ailing grub-hurler off to a hospital, and the skipper said to me, "Mister Dugan," he said, "go out and find us a cook!" "Aye, sir!" I said, and went. Only it wasn\'t that easy. In those days, Phobos had only a handful of settlers, and most of them had good-payi

In [18]:
# Ask the LLM whether the first child of tree[0][6] helps answer questions[5], and to explain why.
longdoc.llm_server(f'Passage:\n{tree[0][6].children[0]}\n\nQuestion:\n{questions[5]}\n\nDo you think the above passage is helpful to answer the above question? Explain your answer.')

[[" Yes, the passage is helpful to answer the question. The Skipper, O'Hara, did not follow the new cook's advice about avoiding Vesta because he was angry that the cook, Dugan, had been snooping around the control room and looking at their trajectory charts without permission. O'Hara viewed this as a breach of trust and a potential security risk, so he ordered Dugan to get below deck. In the heat of the moment, O'Hara did not consider the strategic advantages that Dugan had suggested, such as a thinner patrol line and the use of the Meteor Bog as cover. Instead, he focused on punishing Dugan for his disobedience."]]

In [20]:
# Same helpfulness prompt on a toy passage/question pair in which the passage does not state the answer.
# The two commented-out assignments hold the real tree/question inputs for this prompt.
# passage = tree[0][6].children[0]
# question = questions[5]
passage = 'Bob died in Champaign.'
question = 'Where did Bob spend his childhood?'
longdoc.llm_server(f'Passage:\n{passage}\n\nQuestion:\n{question}\n\nDo you think the above passage is helpful to answer the above question? Explain your answer.')

[[' The passage does not provide any information about where Bob spent his childhood. The only information given is that Bob died in Champaign. Therefore, the passage is not helpful in answering the question.']]

In [None]:
# Select entry 1 of tree[-2] for inspection.
test_node = tree[-2][1]

In [None]:
# Children of the selected node.
test_node.children

In [None]:
# Summary text of the selected node.
test_node.summary

### TextGraph

In [None]:
def remove_unimportant(doc:Span, additional_pos_labels:Set[str]=set()):
    """Cut ``doc`` into the contiguous token runs left after dropping low-content tokens.

    A token is dropped when its ``pos_`` is DET, PRON, CCONJ, PUNCT, AUX or PART,
    or is contained in ``additional_pos_labels``. Dropped tokens act as separators:
    they never appear in a returned piece, and empty runs between adjacent
    separators are skipped.

    Args:
        doc: Token sequence (annotated as a spaCy ``Span``) supporting ``len``,
            iteration and slicing.
        additional_pos_labels: Extra ``pos_`` values to treat as separators.

    Returns:
        List of slices of ``doc`` in their original order.
    """
    separator_pos = {'DET', 'PRON', 'CCONJ', 'PUNCT', 'AUX', 'PART'}
    pieces = []
    run_start = 0
    for idx, token in enumerate(doc):
        if token.pos_ in separator_pos or token.pos_ in additional_pos_labels:
            # Close the run that ends just before this separator, if it is non-empty.
            if run_start != idx:
                pieces.append(doc[run_start:idx])
            run_start = idx + 1
    # Flush the trailing run that is not followed by a separator.
    end = len(doc)
    if run_start < end:
        pieces.append(doc[run_start:end])
    return pieces

def collect_keywords_from_text(doc:Doc):
    """Extract a set of lemmatised keyword strings from ``doc``.

    Noun chunks and named entities are walked in parallel. When the current chunk
    and entity share at least one token position they are fused into one span
    covering both; otherwise whichever comes first is taken on its own. Every
    resulting span is then split by ``remove_unimportant`` (additionally dropping
    ADJ and ADV tokens), and each remaining piece becomes one keyword made of its
    space-joined lemmas.

    Args:
        doc: Parsed document exposing ``noun_chunks``, ``ents`` and slicing.

    Returns:
        Set of keyword strings.
    """
    noun_chunks = list(doc.noun_chunks)
    entities = doc.ents
    merged:List[Span] = []
    chunk_idx = 0
    ent_idx = 0
    # Merge noun chunks with entities
    while chunk_idx < len(noun_chunks) and ent_idx < len(entities):
        chunk = noun_chunks[chunk_idx]
        entity = entities[ent_idx]
        shared_positions = set(range(chunk.start, chunk.end)).intersection(range(entity.start, entity.end))
        if shared_positions:
            # Overlap: keep one span stretching over both and consume both.
            merged.append(doc[min(chunk.start, entity.start) : max(chunk.end, entity.end)])
            chunk_idx += 1
            ent_idx += 1
        elif chunk.start < entity.end:
            # No overlap and the chunk begins before the entity ends: the chunk comes first.
            merged.append(chunk)
            chunk_idx += 1
        else:
            merged.append(entity)
            ent_idx += 1
    # At most one of the two tails is non-empty at this point.
    merged.extend(noun_chunks[chunk_idx:])
    merged.extend(entities[ent_idx:])
    # Update each noun chunks
    keyword_spans:List[Span] = []
    for candidate in merged:
        keyword_spans.extend(remove_unimportant(candidate, {'ADJ', 'ADV'}))
    return {' '.join([tok.lemma_ for tok in piece]) for piece in keyword_spans}
    

In [None]:
class TextGraph:
    """Page graph and keyword co-occurrence graph derived from a list of parsed pages.

    Attributes are documented where they are assigned in ``__init__``.
    """
    def __init__(self, docs:List[Doc]) -> None:
        """Build both graphs from ``docs``.

        Steps:
          1. Every page becomes a node of ``text_graph`` carrying its lower-cased
             lemmas and the keyword set from ``collect_keywords_from_text``.
          2. Every unordered pair of keywords found together on a page becomes an
             edge of ``ent_graph`` weighted by ``log(pages_with_pair + 1)``;
             PageRank over that graph gives a per-keyword importance.
          3. Every ordered pair of distinct pages sharing at least one keyword gets
             a directed edge whose weight is the harmonic mean of summed keyword
             importance and normalised BM25 score, scaled by a distance decay.

        Args:
            docs: Parsed pages in document order. Each must be iterable over tokens
                exposing ``lemma_`` and be accepted by ``collect_keywords_from_text``.
        """
        # Directed page graph; node id is the page index.
        self.text_graph = nx.DiGraph()
        # Undirected keyword co-occurrence graph.
        self.ent_graph = nx.Graph()
        # Lower-cased lemmas of every page, in page order (used as the BM25 corpus).
        self.tokenized_corpus:List[List[str]] = []
        ent_pair_counter = Counter()
        for pid, doc in enumerate(docs):
            tokenized_page = [t.lemma_.lower() for t in doc]
            nouns = collect_keywords_from_text(doc)
            if len(nouns) >= 2:
                # frozenset makes (a, b) and (b, a) count as the same pair.
                ent_pair_counter.update(map(frozenset, itertools.combinations(nouns, 2)))
            self.tokenized_corpus.append(tokenized_page)
            self.text_graph.add_node(pid, tokenized_page=tokenized_page, nouns=nouns)
        for (ent1, ent2), cnt in ent_pair_counter.items():
            self.ent_graph.add_edge(ent1, ent2, log_freq=np.log(cnt+1))
        # PageRank score per keyword. Only keywords that occur in at least one pair are
        # nodes of ent_graph, so a keyword that is always alone on its page has no entry.
        self.ent_general_importance:Dict[str, float] = nx.pagerank(self.ent_graph, weight='log_freq')
        self.bm25 = BM25Okapi(self.tokenized_corpus)
        for pid1 in range(len(docs)):
            # Score every page against page pid1 used as the query.
            bm25_scores = self.bm25.get_scores(self.tokenized_corpus[pid1])
            # Normalise to fractions of the total. Skip when the total is zero so the
            # scores stay finite instead of becoming NaN/inf.
            score_total = bm25_scores.sum()
            if score_total != 0:
                bm25_scores = bm25_scores / score_total
            nouns1:Set[str] = self.text_graph.nodes[pid1]['nouns']
            for pid2 in range(len(docs)):
                if pid1 != pid2:
                    overlap = nouns1.intersection(self.text_graph.nodes[pid2]['nouns'])
                    if overlap:
                        # Keywords missing from the PageRank result (never paired with another
                        # keyword) contribute 0 instead of raising KeyError.
                        ent_importance = sum(self.ent_general_importance.get(ent, 0.0) for ent in overlap)
                        # Decay with page distance: 1 / ln(e + |pid2 - pid1|), always below 1 here.
                        dist = 1 / np.log(np.e + np.abs(pid2 - pid1))
                        bm25_score = bm25_scores[pid2]
                        # NOTE(review): statistics.harmonic_mean raises StatisticsError on negative
                        # inputs; BM25 scores are assumed non-negative here — confirm for tiny corpora.
                        weight = statistics.harmonic_mean([ent_importance, bm25_score]) * dist
                        self.text_graph.add_edge(pid1, pid2, overlap=overlap, ent_importance=ent_importance, dist=dist, bm25_score=bm25_score, weight=weight)
        

In [None]:
# Run every entry of all_summary through longdoc.nlp and build the TextGraph from the results.
tg = TextGraph([longdoc.nlp(p) for p in all_summary])

In [None]:
# Peek at the first five page-graph edges together with their attribute dicts.
list(tg.text_graph.edges.data())[:5]

### Topic Modeling

In [None]:
from gensim import corpora
from gensim.parsing.preprocessing import preprocess_string, DEFAULT_FILTERS
from gensim.models import Phrases, CoherenceModel, LdaModel, EnsembleLda, LdaMulticore

In [None]:
# Re-split the article with a size argument of 500 and load the cached summaries.
# NOTE(review): the cells that write all_summary.json split with 300, so `pages` and
# `all_summary` may not line up one-to-one here — confirm.
pages = doc_split.split_paragraphs(article, 500)
all_summary = read_json('all_summary.json')

In [None]:
# Ask the LLM to summarise page 1 on its own.
longdoc.llm_server(f'''
Summarize the following passage.

Passage:
{pages[1]}
''')

In [None]:
# Ask the LLM to summarise page 2 on its own.
longdoc.llm_server(f'''
Summarize the following passage.

Passage:
{pages[2]}
''')

In [None]:
# Ask the LLM what pages 1 and 2 have in common.
longdoc.llm_server(f'''
What are the common information in the following 2 passages.

Passage 1:
{pages[1]}

Passage 2:
{pages[2]}
''')

In [None]:
# Ask the LLM how pages 1 and 2 differ.
longdoc.llm_server(f'''
What are the different information between the following 2 passages.

Passage 1:
{pages[1]}

Passage 2:
{pages[2]}
''')

In [None]:
# Show the raw text of page 2.
pages[2]

In [None]:
# How many summaries were loaded.
len(all_summary)

In [None]:
# Reuse gensim's default preprocessing filters minus the last one in the list.
preprocess_funcs = DEFAULT_FILTERS[:-1] # Remove the stemming
# Lemmatise each summary with longdoc.nlp (parser and NER disabled), re-join the lemmas into
# one string and run it through the gensim filters to get one token list per summary.
preprocessed_summary = [preprocess_string(' '.join([t.lemma_ for t in longdoc.nlp(p, disable=['parser', 'ner'])]), preprocess_funcs) for p in all_summary]

# bigram = Phrases(preprocessed_summary, min_count=2, threshold=1)

# texts = [bigram[p] for p in preprocessed_summary]
# Bigram detection is switched off, so the unigram token lists are used as-is.
texts = preprocessed_summary

# Create a dictionary from the corpus
dictionary = corpora.Dictionary(texts)

# Remove low-frequency terms from the dictionary
# (no_below=2 keeps tokens found in at least two summaries; filter_extremes' other
# default limits presumably still apply — see the gensim documentation).
dictionary.filter_extremes(no_below=2)

# Convert the corpus into a bag-of-words representation
corpus = [dictionary.doc2bow(text) for text in texts]

In [None]:
# Fit an EnsembleLda on the bag-of-words corpus with num_models=5, passes=5, iterations=100
# and epsilon=0.05; min_cores / min_samples are left at the library defaults (commented out).
lda_model = EnsembleLda(
    corpus=corpus, 
    id2word=dictionary, 
    passes=5, 
    iterations=100, 
    num_models=5, 
    # min_cores=10, 
    # min_samples=4,
    epsilon=0.05
    )

In [None]:
# List the topics of the fitted model.
lda_model.print_topics()

In [None]:
# Group the summaries by their single most probable topic.
topic2p = defaultdict(list)
for pid, p in enumerate(corpus):
    # Sort the (topic_id, probability) pairs by probability and take the id of the last, i.e. highest, one.
    topic_id = sorted(lda_model[p], key=lambda x: x[1])[-1][0]
    topic2p[topic_id].append(all_summary[pid])
print(lda_model.stable_topics.shape)
# Report only the topics that actually received summaries. Iterating range(len(topic2p))
# would assume the ids are exactly 0..k-1: any larger id would be missed, and looking up an
# absent id would silently insert an empty list into the defaultdict.
print([(tid, len(topic2p[tid])) for tid in sorted(topic2p)])

In [None]:
# Summaries assigned to topic 0.
topic2p[0]

In [None]:
# Sweep the number of topics: fit one LdaMulticore model per setting and record its c_v coherence.
topics = []
score = []
# Fitted model for each topic count that was tried.
topic_models:Dict[int, LdaModel] = {}
# Sets the exclusive upper end of the sweep: len(all_summary) // min_docs_per_topic.
min_docs_per_topic = 4
# Topic counts tried are 4, 8, 12, ... below that bound (none if the bound is <= 4).
for topic_num in tqdm(range(4, len(all_summary) // min_docs_per_topic, 4)):
    # Build the LDA model
    # (this rebinds the notebook-level `lda_model` on every iteration)
    lda_model = LdaMulticore(corpus, topic_num, dictionary, iterations=100, passes=5, workers=5)
    cm = CoherenceModel(lda_model, texts = texts, corpus=corpus, dictionary=dictionary, coherence='c_v')
    topics.append(topic_num)
    score.append(cm.get_coherence())
    topic_models[topic_num] = lda_model
    
# Coherence as a function of the number of topics.
plt.plot(topics, score)
plt.xlabel('Number of Topics')
plt.ylabel('Coherence Score')
plt.show()

In [None]:
# Pick the model trained with 44 topics.
# NOTE(review): 44 is hard-coded. This raises KeyError unless the sweep actually produced a
# 44-topic model (it only visits 4, 8, 12, ... below len(all_summary) // min_docs_per_topic)
# — confirm the value against the coherence plot for the current data.
lda_model = topic_models[44]

In [None]:
# Materialise the per-document topic distributions for the whole corpus.
list(lda_model.get_document_topics(corpus))

In [None]:
# Print each topic returned by print_topics(-1) with its word string (-1 presumably requests all topics — see the gensim documentation).
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))