In [7]:
import llm
import sqlite_utils
import chromadb
import json
import pandas as pd

## Get Book Metadata Lookup

In [63]:
mdf = pd.read_json("top100_metadata_short.json")

In [17]:
def get_meta_lookup(file="top100_metadata_short.json"): 
    metadata = pd.read_json(file)
    metadata = metadata.to_dict(orient="records")
    metadata_lookup = {x['id']:x for x in metadata}
    return metadata_lookup

In [35]:
metadata = get_meta_lookup()

In [37]:
metadata.keys()

dict_keys([2852, 1080, 6593, 5197, 11, 8800, 1727, 1342, 844, 1184, 1400, 28255, 37106, 98, 408, 84, 1952, 74, 174, 345, 76, 16389, 1497, 2591, 244, 2641, 100, 514, 2160, 730, 1837, 8710, 86, 1232, 4085, 67098, 1259, 1513, 398, 4280, 43, 5200, 27827, 2701, 6130, 3296, 135, 16328, 2554, 768, 74750, 74745, 45, 145, 36034, 34901, 74742, 67979, 7700, 12116, 600, 28054, 7370, 2814, 55, 2680, 25344, 394, 219, 2542, 8492, 4300, 50150, 2600, 1661, 1998, 1260, 205, 46, 10007, 3207, 74746, 64317, 4363, 6761, 41, 38015, 996])

## Search DB

In [3]:
model = "gguf/mxbai-embed-xsmall-v1-q8_0"
embedding_model = llm.get_embedding_model(model)

In [67]:
client = chromadb.PersistentClient(path="chroma_db/")
collection = client.get_collection("top100")

In [None]:
def embed_text(text):
    return embedding_model.embed(text)

def search_chroma(string, keyword=None, n=10):
    vector = embed_text(string)
    if keyword:
        res = collection.query(
            query_embeddings=[vector],
            n_results=n,
            #where={"metadata_field": "is_equal_to_this"},
            where_document={"$contains":keyword}
        )
    else:
        res = collection.query(
            query_embeddings=[vector],
            n_results=n,
            #where={"metadata_field": "is_equal_to_this"},
            #where_document={"$contains":keyword}
        )
    return res

def search_chroma_with_vector(vector,n=10):
    res = collection.query(
            query_embeddings=[vector],
            n_results=n,
            #where={"metadata_field": "is_equal_to_this"},
            #where_document={"$contains":keyword}
        )
    return res

def combine_res(res, docs=True):
    combined = []
    ids = res['ids'][0]
    meta = res['metadatas'][0]
    distance = res['distances'][0]
    docs = res['documents'][0]
    for i, id in enumerate(ids):
        rowid = id
        alldata = meta[i]
        alldata['rowid'] = rowid
        alldata['distance'] = distance[i]
        alldata['words'] = word_count(docs[i])
        bookid = int(rowid.split("_")[0])
        info = metadata[bookid]
        combined.append({**alldata, **info})
    return combined

def word_count(string):
    return len(string.split(" "))

In [70]:
combine_res(search_chroma("They lived happily ever after.", n=2))

[{'bookid': 514,
  'content': "They were very happy, even after they discovered that they couldn't live on love alone.\n",
  'line_num': 1460,
  'source': '514_sents_filt.txt',
  'rowid': '514_1460',
  'distance': 0.2782585620880127,
  'words': 15,
  'id': 514,
  'title': 'Little Women',
  'author': 'Alcott, Louisa May',
  'author_birthday': 1832},
 {'bookid': 2591,
  'content': 'There they found their child, now grown up to be comely and fair; and after all their troubles they lived happily together to the end of their days.\n',
  'line_num': 1596,
  'source': '2591_sents_filt.txt',
  'rowid': '2591_1596',
  'distance': 0.29412078857421875,
  'words': 28,
  'id': 2591,
  'title': "Grimms' Fairy Tales",
  'author': 'Grimm, Jacob, Grimm, Wilhelm',
  'author_birthday': 1785}]

In [None]:
def runner_from_start(string, words=25000, wordcount=0, code="start"):
    # start string, run till word count, label with a code
    seen_ids = []
    sentences = []
    sentences.append({'content': string, 'code': 'input', 'sent_count': 0, 'author': "narrator", "title": "nanogenmo"})
    while wordcount < words:
        results = combine_res(search_chroma(string, n=15))
        got = False
        for res in results:
            if res['rowid'] not in seen_ids:
                res['origin_search'] = string
                res['code'] = code
                number = len(sentences)
                res['sent_count'] = number
                sentences.append(res)
                seen_ids.append(res['rowid'])
                wordcount += res['words']
                string = res['content']
                got = True
                continue
        if not got:
            print("blocked")
            break
    return seen_ids, sentences, wordcount

def interpolate_vectors(v1, v2, weight):
    """
    Interpolate between two vectors with given weight.
    weight = 0 returns v1, weight = 1 returns v2
    """
    return [a + (b - a) * weight for a, b in zip(v1, v2)]

def generate_interpolations(v1, v2, n):
    """
    Generate n interpolations between v1 and v2.
    Returns list of vectors, starting at v1 and ending at v2.
    """
    weights = [i/(n-1) for i in range(n)]
    return [interpolate_vectors(v1, v2, w) for w in weights]

def merger(sent1, sent2, n=10):
    # interpolate between 2 sentences to join the parts of the story -- n sentences
    seen_ids = []
    sentences = []
    wordcount = 0
    vector1 = embed_text(sent1)
    vector2 = embed_text(sent2)
    interps = generate_interpolations(vector1, vector2, n=n)
    # don't use these sentences again:
    res1 = combine_res(search_chroma(sent1, n=1))
    seen_ids.append(res1[0]['id'])
    res2 = combine_res(search_chroma(sent2, n=1))
    seen_ids.append(res2[0]['id'])
    for i, interp in enumerate(interps):
        results = combine_res(search_chroma_with_vector(interp, n=15))
        got = False
        for res in results:
            if res['rowid'] not in seen_ids:
                res['origin_search'] = "None"
                res['code'] = "interp_" + str(i)
                number = len(sentences)
                res['sent_count'] = number
                sentences.append(res)
                seen_ids.append(res['rowid'])
                wordcount += res['words']
                string = res['content']
                got = True
                break
        if not got:
            print("blocked")
            break
    return seen_ids, sentences, wordcount
    

## Create and Write Output File

In [150]:
seen_ids1, sentences1, wordcount = runner_from_start("Once upon a time, in a faraway land...\n")

In [151]:
wordcount

25128

In [127]:
print(sentences[-1])

{'bookid': 6593, 'content': 'These solicitations were nevertheless unsuccessful: for though Mr Allworthy did not think, with some late writers, that mercy consists only in punishing offenders; yet he was as far from thinking that it is proper to this excellent quality to pardon great criminals wantonly, without any reason whatever.\n', 'line_num': 290, 'source': '6593_sents_filt.txt', 'rowid': '6593_290', 'distance': 0.45186787843704224, 'words': 47, 'id': 6593, 'title': 'History of Tom Jones, a Foundling', 'author': 'Fielding, Henry', 'author_birthday': 1707, 'origin_search': 'Several strange facts combined against her, which might have staggered anyone who had not such proof of her innocence as I had.\n'}


In [152]:
seen_ids2, sentences2, wordcount = runner_from_start("And they lived happily ever after.\n", code="end")

In [153]:
sentences2[-1]

{'bookid': 3296,
 'content': 'Yet the force of truth did of itself flash into mine eyes, and I turned away my panting soul from incorporeal substance to lineaments, and colours, and bulky magnitudes.\n',
 'line_num': 368,
 'source': '3296_sents_filt.txt',
 'rowid': '3296_368',
 'distance': 0.42306387424468994,
 'words': 29,
 'id': 3296,
 'title': 'The Confessions of St. Augustine',
 'author': 'Augustine, Saint, Bishop of Hippo',
 'author_birthday': 354,
 'origin_search': "That lie shall lie so heavy on my sword That it shall render vengeance and revenge Till thou the lie-giver and that lie do lie In earth as quiet as thy father's skull.\n",
 'code': 'end',
 'sent_count': 987}

In [154]:
sentences2.reverse()

In [155]:
sentences2[-1]

{'content': 'And they lived happily ever after.\n',
 'code': 'input',
 'sent_count': 0,
 'author': 'narrator',
 'title': 'nanogenmo'}

In [130]:
start_merge = sentences1[-1]['content']
start_merge

'These solicitations were nevertheless unsuccessful: for though Mr Allworthy did not think, with some late writers, that mercy consists only in punishing offenders; yet he was as far from thinking that it is proper to this excellent quality to pardon great criminals wantonly, without any reason whatever.\n'

In [132]:
end_merge = sentences2[0]['content']
end_merge

'Yet the force of truth did of itself flash into mine eyes, and I turned away my panting soul from incorporeal substance to lineaments, and colours, and bulky magnitudes.\n'

In [133]:
seen_ids3, sentences_merged, wordcount = merger(start_merge, end_merge, n=10)

In [157]:
all_sentences = sentences1 + sentences_merged + sentences2

In [171]:
def write_out_sentences(sentences, filename="output.md", title="Once Upon a Time/Happily Ever After"):
    with open(filename, 'w') as f:
        # Title
        f.write(f"# {title}\n\n")
        
        for sentence in sentences:
            
            f.write(sentence['content'].strip())
            
            f.write(f"<i><div align='right'>{sentence.get('author', '')}, {sentence.get('title', '')}, distance:{sentence.get('distance', 0):.3f}, code:{sentence.get('code', 'none')}</div></i>\n")
            f.write("\n")
            # Centered text using HTML
            #f.write("<div align='center'>Centered text</div>\n\n")
        
            # Block quote
            #f.write("> Indented blockquote\n\n")
    print("Wrote file", filename)

In [172]:
write_out_sentences(all_sentences)

Wrote file output.md


## Demo of the Interp

In [120]:
seen_ids3, sentences_merged, wordcount = merger("It was night and chilly.", "It was sunny and hot.", n=5)

In [121]:
len(sentences_merged)

5

In [122]:
sentences_merged

[{'bookid': 1400,
  'content': 'It was a dry cold night, and the wind blew keenly, and the frost was white and hard.\n',
  'line_num': 270,
  'source': '1400_sents_filt.txt',
  'rowid': '1400_270',
  'distance': 0.28353387117385864,
  'words': 18,
  'id': 1400,
  'title': 'Great Expectations',
  'author': 'Dickens, Charles',
  'author_birthday': 1812,
  'origin_search': 'None',
  'code': 'interp_0',
  'sent_count': 0},
 {'bookid': 4300,
  'content': 'Though it was a warm pleasant sort of a night now yet wonderfully cool for the season considering, for sunshine after storm.\n',
  'line_num': 3531,
  'source': '4300_sents_filt.txt',
  'rowid': '4300_3531',
  'distance': 0.288249671459198,
  'words': 22,
  'id': 4300,
  'title': 'Ulysses',
  'author': 'Joyce, James',
  'author_birthday': 1882,
  'origin_search': 'None',
  'code': 'interp_1',
  'sent_count': 1},
 {'bookid': 2814,
  'content': 'It was a bright Sunday morning of early summer, promising heat, but with a fresh breeze blowing.\