# Building a book

## Load text

In [56]:
import spacy
nlp_ru = spacy.load("ru_core_news_lg")
def split_into_chunks(text: str, max_chars: int = 300) -> list:
    """Pack the sentences of ``text`` into chunks of roughly ``max_chars``.

    The text is segmented into sentences with the module-level ``nlp_ru``
    pipeline. Consecutive sentences are then packed greedily into one chunk,
    joined by a single space, for as long as the length of the chunk so far
    plus the length of the next sentence stays within ``max_chars``.

    Args:
        text: Text to split.
        max_chars: Soft size limit per chunk. The space inserted before an
            appended sentence is not counted, so a chunk may exceed the limit
            by one character, and a single sentence longer than the limit is
            kept whole as its own chunk.

    Returns:
        The chunks in document order. No chunk is an empty string.
    """
    doc = nlp_ru(text)
    sentences = [sent.text.strip() for sent in doc.sents]

    chunks = []
    current_chunk = ""
    for sent in sentences:
        if len(current_chunk) + len(sent) <= max_chars:
            current_chunk += (" " if current_chunk else "") + sent
        else:
            # Only flush a non-empty buffer: when the very first sentence is
            # already longer than max_chars the buffer is still "", and
            # appending it would emit an empty chunk.
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sent
    if current_chunk:
        chunks.append(current_chunk)

    return chunks

In [57]:
from pathlib import Path
def load_text_from_file(filepath: str) -> str:
    """Return the full contents of the file at ``filepath``, decoded as UTF-8."""
    with open(filepath, encoding="utf-8") as handle:
        return handle.read()
# Read the Russian source text and split it into ~300-character chunks.
# Newlines are replaced with spaces before the text is handed to the
# sentence splitter.
text_ru = load_text_from_file("./merge/text_ru.txt")
chunks_ru = split_into_chunks(text_ru.replace("\n", " "), max_chars=300)

In [58]:
len(chunks_ru)

2919

## Create dataframe

In [59]:
import pandas as pd
rus_df = pd.DataFrame({ "chunk": chunks_ru })

In [60]:
rus_df.shape

(2919, 1)

## Add column for translation

In [61]:
from googletrans import Translator

translator = Translator()
async def translate_russian_to_spanish(text: str) -> str:
    """Translate ``text`` from Russian to Spanish with the shared ``translator``.

    Awaits ``translator.translate`` with ``src="ru"`` and ``dest="es"`` and
    returns the ``text`` attribute of the result.
    """
    return (await translator.translate(text, src="ru", dest="es")).text

In [62]:
rus_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   chunk   2919 non-null   object
dtypes: object(1)
memory usage: 22.9+ KB


In [63]:
import asyncio
import nest_asyncio
nest_asyncio.apply()
from concurrent.futures import ThreadPoolExecutor

In [64]:
from tqdm.asyncio import tqdm_asyncio
# Translate every chunk concurrently while capping the number of in-flight requests.
async def translate_all(chunks, max_workers=10):
    """Translate all ``chunks`` and return the gathered results.

    One coroutine per chunk is created up front. An ``asyncio.Semaphore`` of
    size ``max_workers`` limits how many of them are inside
    ``translate_russian_to_spanish`` at the same time. The coroutines are
    awaited through ``tqdm_asyncio.gather``.
    """
    limiter = asyncio.Semaphore(max_workers)

    async def translate_one(chunk):
        # Hold one semaphore slot for the duration of the request.
        async with limiter:
            return await translate_russian_to_spanish(chunk)

    pending = [translate_one(chunk) for chunk in chunks]
    return await tqdm_asyncio.gather(*pending)

In [65]:
# Run translation asynchronously and assign to new column
# Translate every Russian chunk (up to 20 requests in flight) and store the
# results in a new "translation" column.
# asyncio.run() is called from inside the notebook here; an earlier cell calls
# nest_asyncio.apply() — presumably to permit this nested event loop.
translations = asyncio.run(translate_all(rus_df["chunk"].tolist(), max_workers=20))
rus_df["translation"] = translations

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2919/2919 [01:27<00:00, 33.45it/s]


In [66]:
rus_df.tail(3)

Unnamed: 0,chunk,translation
2916,–ú–∞–∫–æ–Ω–¥–æ –±—ã–ª —É–∂–µ –ø–æ—á—Ç–∏ –≤–µ—Å—å –ø–µ—Ä–µ–º–æ–ª–æ—Ç –≤ –ø—ã–ª—å –∏ ...,Macondo ya estaba casi abrumado en el polvo y ...
2917,"–°–∫–æ–ª—å–∑–Ω—É–ª –≤–∑–≥–ª—è–¥–æ–º –Ω–∏–∂–µ, —á—Ç–æ–±—ã –ø—Ä–æ–ø—É—Å—Ç–∏—Ç—å –ø—Ä–µ–¥...",Mir√≥ a continuaci√≥n para perder las prediccion...
2918,"–û–¥–Ω–∞–∫–æ –ø—Ä–µ–∂–¥–µ —á–µ–º –≤–∑–≥–ª—è–Ω—É—Ç—å –Ω–∞ –ø–æ—Å–ª–µ–¥–Ω–∏–π —Å—Ç–∏—Ö,...","Sin embargo, antes de mirar el √∫ltimo vers√≠cul..."


In [67]:
# Checkpoint the frame (chunk + translation columns) so the translation step
# does not have to be repeated.
rus_df.to_pickle("./merge/rus_df.pkl")

## Add embeddings

In [68]:
from dotenv import load_dotenv
load_dotenv()
from openai import AsyncOpenAI
client = AsyncOpenAI(max_retries=5)
from typing import List
async def get_embedding(text: str, model="text-embedding-3-small", **kwargs) -> List[float]:
    """Request an embedding for ``text`` from the shared ``client``.

    Newlines in ``text`` are replaced with spaces before the request is sent.
    Extra keyword arguments are forwarded to ``client.embeddings.create``.

    Returns the ``embedding`` of the first item in the response data.
    """
    # Flatten newlines (the original author notes they can hurt performance).
    single_line = " ".join(text.split("\n"))

    result = await client.embeddings.create(input=[single_line], model=model, **kwargs)

    first_item = result.data[0]
    return first_item.embedding

In [69]:
from tqdm.asyncio import tqdm_asyncio
# Embed every chunk concurrently while capping the number of in-flight requests.
async def embed_all(chunks, max_workers=10):
    """Embed all ``chunks`` and return the gathered results.

    One coroutine per chunk is created up front. An ``asyncio.Semaphore`` of
    size ``max_workers`` limits how many of them are inside ``get_embedding``
    at the same time. The coroutines are awaited through
    ``tqdm_asyncio.gather``.
    """
    limiter = asyncio.Semaphore(max_workers)

    async def embed_one(chunk):
        # Hold one semaphore slot for the duration of the request.
        async with limiter:
            return await get_embedding(chunk)

    pending = [embed_one(chunk) for chunk in chunks]
    return await tqdm_asyncio.gather(*pending)

In [70]:
rus_df.shape

(2919, 2)

In [71]:
# Embed the Spanish machine translations (not the Russian originals), with up
# to 20 requests in flight.
subset = rus_df["translation"].tolist()
embeddings = asyncio.run(embed_all(subset, max_workers=20))

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2919/2919 [00:52<00:00, 56.10it/s]


In [72]:
[len(embeddings), rus_df.shape]

[2919, (2919, 2)]

In [73]:
rus_df["embedding"] = embeddings

In [74]:
# Overwrite the earlier checkpoint at the same path now that the "embedding"
# column has been added.
rus_df.to_pickle("./merge/rus_df.pkl")

In [75]:
rus_df.head(1)

Unnamed: 0,chunk,translation,embedding
0,"‚Äî –Ø —Ö–æ—á—É –±—ã—Ç—å —Ç–æ–ª—å–∫–æ —Å —Ç–æ–±–æ–π, ‚Äî –∑–∞—è–≤–∏–ª –æ–Ω. ‚Äî –°...","""Solo quiero estar contigo"", dijo. - Pronto le...","[0.02256019599735737, -0.029076769948005676, -..."


## Get target edition chunks

In [76]:
from more_itertools import split_at
def break_long_sentences(doc):
    """Split a tokenized sentence into comma-delimited fragments.

    Token texts are joined with single spaces; every ``","`` token ends the
    current fragment and is appended to it without a leading space.

    Args:
        doc: Iterable of tokens exposing a ``text`` attribute (called with the
            items of ``doc.sents`` in this notebook).

    Returns:
        List of fragment strings in order. Every fragment except possibly the
        last ends with ``","``. A sentence that ends in a comma (or has no
        tokens) no longer yields a trailing empty string.

    NOTE(review): because tokens are re-joined with a space, punctuation ends
    up detached from the preceding word (e.g. ``"identificarlas ."``).
    """
    fragments = []
    current = []
    for token in doc:
        if token.text == ",":
            # Close the fragment at the comma and keep the comma with it.
            fragments.append(" ".join(current) + ",")
            current = []
        else:
            current.append(token.text)
    # Only emit the tail when tokens remain after the last comma; an empty
    # tail would otherwise become an empty fragment.
    if current:
        fragments.append(" ".join(current))
    return fragments

In [79]:
from itertools import chain
import es_core_news_sm
nlp_es = es_core_news_sm.load()
def get_es_chunks(text: str) -> list:
    """Split Spanish ``text`` into comma-delimited fragments.

    The text is parsed with the module-level ``nlp_es`` pipeline, each
    sentence is broken up by ``break_long_sentences``, and the per-sentence
    fragment lists are flattened into one list in document order.
    """
    doc = nlp_es(text)
    per_sentence = (break_long_sentences(sentence) for sentence in doc.sents)
    return list(chain.from_iterable(per_sentence))

In [80]:
# Read the Spanish edition and split it into comma-delimited fragments.
# Newlines are replaced with spaces before parsing.
text_es = load_text_from_file("./merge/text_es.txt")
chunks_es = get_es_chunks(text_es.replace("\n", " "))

In [81]:
len(chunks_es)

13442

In [82]:
df_es = pd.DataFrame({ "chunk": chunks_es })

In [83]:
df_es.shape

(13442, 1)

In [None]:
# Persist the Spanish fragments.
# NOTE(review): this cell has no execution count in the saved notebook, so
# es_df.pkl may never have been written — confirm.
df_es.to_pickle("./merge/es_df.pkl")

In [120]:
# Add an empty "match" column that the alignment loop fills with the matched
# Spanish text, then display the first row.
rus_df['match'] = None
rus_df.loc[0]

chunk          ‚Äî –Ø —Ö–æ—á—É –±—ã—Ç—å —Ç–æ–ª—å–∫–æ —Å —Ç–æ–±–æ–π, ‚Äî –∑–∞—è–≤–∏–ª –æ–Ω. ‚Äî –°...
translation    "Solo quiero estar contigo", dijo. - Pronto le...
embedding      [0.02256019599735737, -0.029076769948005676, -...
match                                                       None
Name: 0, dtype: object

In [109]:
rus_df['embedding'][:3]

0    [0.02256019599735737, -0.029076769948005676, -...
1    [0.0629948303103447, -0.008339416235685349, -0...
2    [0.05613021180033684, -0.00419988390058279, 0....
Name: embedding, dtype: object

In [106]:
import numpy as np
def cosine_similarity(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    Computed as ``dot(a, b) / (norm(a) * norm(b))``.

    Returns 0.0 when either vector has zero norm. Without this guard the
    division yields NaN, and NaN compares False against every ``score < ...``
    threshold, so a degenerate vector would be treated as a match.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return np.float64(0.0)
    return np.dot(a, b) / denominator

In [110]:
async def get_matching_fragment(embed, pointer):
    """Find the run of Spanish fragments starting at ``pointer`` that best matches ``embed``.

    Starting from the single fragment ``chunks_es[pointer]``, the window is
    extended one fragment at a time. Each candidate window is joined with
    spaces, embedded with ``get_embedding`` and scored against ``embed`` with
    ``cosine_similarity``. Growth stops at the first window whose score is
    lower than the previous one, or when the end of ``chunks_es`` is reached.

    Args:
        embed: Embedding vector to match against.
        pointer: Index into ``chunks_es`` where the window starts. Negative
            values are clamped to 0.

    Returns:
        A tuple ``(text, next_pointer, score)``: the space-joined text of the
        best window, the index just past that window, and the window's score.
        If ``pointer`` is at or beyond the end of ``chunks_es`` the result is
        ``("", pointer, -1)``.
    """
    # A negative start would slice an empty (or wrapped) window.
    pointer = max(pointer, 0)
    end = pointer + 1
    best_score = -1
    # Bound the window by the list length: past the end the slice stops
    # changing, the score stops changing, and the loop would never terminate.
    while end <= len(chunks_es):
        score = cosine_similarity(
            embed,
            await get_embedding(" ".join(chunks_es[pointer:end]))
        )
        if score < best_score:
            break
        best_score = score
        end += 1
    # `end` is one past the first window that scored worse (or one past the
    # end of the list), so the best window is chunks_es[pointer:end - 1].
    return (" ".join(chunks_es[pointer:end - 1]), end - 1, best_score)

In [127]:
# Align every Russian chunk with a run of Spanish fragments, walking both
# texts in order. `pointer` is the index in chunks_es where the next match is
# expected to start.
pointer = 0
for (i, ru_embed) in enumerate(rus_df['embedding']):
    (es_chunk, new_pointer, score) = await get_matching_fragment(ru_embed, pointer)
    # Score below 0.6: retry with the start moved back one fragment at a time,
    # up to 5 times. The message prints the pointer and score of the attempt
    # that just failed, before the pointer is moved.
    retries = 0
    while (score < 0.6) and (retries < 5):
        print(f"üîÅ Retracing for fragment {i} | pointer {pointer} |Ô∏è {score}")
        pointer -= 1
        (es_chunk, new_pointer, score) = await get_matching_fragment(ru_embed, pointer)
        retries += 1
    # Undo the backward steps so the forward search starts from the original
    # position.
    pointer += retries
    # Still below 0.6: retry with the start moved forward, up to 5 times.
    retries = 0
    while (score < 0.6) and (retries < 5):
        print(f"üîÅ Retracing for fragment {i} | pointer {pointer} |Ô∏è {score}")
        pointer += 1
        (es_chunk, new_pointer, score) = await get_matching_fragment(ru_embed, pointer)
        retries += 1
    # No acceptable match in either direction: save progress and stop so the
    # fragment can be aligned by hand.
    if (score < 0.6):
        rus_df.to_pickle("./merge/rus_df_matched.pkl")
        break
    # Accept the match and continue right after the matched window.
    pointer = new_pointer
    rus_df.loc[i, "match"] = es_chunk
    # Checkpoint every 10th fragment.
    if i % 10 == 0:
        rus_df.to_pickle("./merge/rus_df_matched.pkl")
        print(f"‚úÖ Data stored to file")
    print(f"‚úÖ fragment {i} |Ô∏è {score}")

‚úÖ Data stored to file
‚úÖ fragment 0 |Ô∏è 0.7487427146261971
‚úÖ fragment 1 |Ô∏è 0.8651565082225524
‚úÖ fragment 2 |Ô∏è 0.6724228675231383
‚úÖ fragment 3 |Ô∏è 0.648529889039089
‚úÖ fragment 4 |Ô∏è 0.6051339958340011
‚úÖ fragment 5 |Ô∏è 0.7873766153073934
‚úÖ fragment 6 |Ô∏è 0.7354530715335702
‚úÖ fragment 7 |Ô∏è 0.7372134644935845
‚úÖ fragment 8 |Ô∏è 0.8817601594716075
‚úÖ fragment 9 |Ô∏è 0.8325358072585453
‚úÖ Data stored to file
‚úÖ fragment 10 |Ô∏è 0.8162378863515477
üîÅ Retracing for fragment 11 | pointer 51 |Ô∏è 0.4365744334970984
üîÅ Retracing for fragment 11 | pointer 50 |Ô∏è 0.4596365200876108
‚úÖ fragment 11 |Ô∏è 0.7607178223732834
‚úÖ fragment 12 |Ô∏è 0.7160444095225752
‚úÖ fragment 13 |Ô∏è 0.6442303363845537
‚úÖ fragment 14 |Ô∏è 0.6377902251924785
‚úÖ fragment 15 |Ô∏è 0.7045976805923272
‚úÖ fragment 16 |Ô∏è 0.7644475524734028
‚úÖ fragment 17 |Ô∏è 0.809622991553264
‚úÖ fragment 18 |Ô∏è 0.7864550272925133
‚úÖ fragment 19 |Ô∏è 0.8013021441533662
üîÅ Retracing for fragment 

In [117]:
# Reload the frame from the checkpoint written after the embedding step.
# NOTE(review): this cell's execution count (117) is lower than that of the
# matching cells placed before it. On a top-to-bottom run it would replace the
# in-memory frame, including any "match" values, with the checkpoint contents
# — confirm the intended position of this cell.
rus_df = pd.read_pickle("./merge/rus_df.pkl")

In [122]:
rus_df.shape

(2919, 4)

In [129]:
[rus_df.loc[127, "chunk"], rus_df.loc[127, "match"]]

['–û–Ω–∏ –±–æ–ª—Ç–∞–ª–∏ –±–µ–∑ —É–º–æ–ª–∫—É, –ø–µ—Ä–µ–±–∏–≤–∞—è –¥—Ä—É–≥ –¥—Ä—É–≥–∞, —Ä–∞—Å—Å–∫–∞–∑—ã–≤–∞–ª–∏ –¥–æ –æ–±–∞–ª–¥–µ–Ω–∏—è –æ–¥–Ω–∏ –∏ —Ç–µ –∂–µ —Å—Ç–∞—Ä—ã–µ –∞–Ω–µ–∫–¥–æ—Ç—ã, —É—á–∞—Å—Ç–≤–æ–≤–∞–ª–∏ –≤ —à—É—Ç–µ–π–Ω–æ–º –¥–µ–π—Å—Ç–≤–µ –ø—Ä–æ –±–µ–ª–æ–≥–æ –±—ã—á–∫–∞, –∞ –µ—Å–ª–∏ –≤—Å–µ –º–æ–ª—á–∞–ª–∏, —Ä–∞—Å—Å–∫–∞–∑—á–∏–∫ –≥–æ–≤–æ—Ä–∏–ª, —á—Ç–æ –æ–Ω –ø—Ä–æ—Å–∏–ª –Ω–µ –º–æ–ª—á–∞—Ç—å, –∞ —Å–∫–∞–∑–∞—Ç—å, —Ö–æ—Ç—è—Ç –ª–∏ –æ–Ω–∏ —Å–ª—É—à–∞—Ç—å —Å–∫–∞–∑–∫—É –ø—Ä–æ –±–µ–ª–æ–≥–æ –±—ã—á–∫–∞, –∏ –Ω–∏–∫—Ç–æ –Ω–µ –º–æ–≥ —É–π—Ç–∏, –ø–æ—Ç–æ–º—É —á—Ç–æ —Ä–∞—Å—Å–∫–∞–∑—á–∏–∫ –≥–æ–≤–æ—Ä–∏–ª, —á—Ç–æ –æ–Ω –Ω–µ –ø—Ä–æ—Å–∏–ª —É—Ö–æ–¥–∏—Ç—å, –∞ –ª–∏—à—å –æ—Ç–≤–µ—Ç–∏—Ç—å, —Ö–æ—Ç—è—Ç –ª–∏ –æ–Ω–∏ —Å–ª—É—à–∞—Ç—å —Å–∫–∞–∑–∫—É –ø—Ä–æ –±–µ–ª–æ–≥–æ –±—ã—á–∫–∞, –∏ —Ç–∞–∫ –±–µ–∑ –∫–æ–Ω—Ü–∞, –≤—Å–µ –Ω–æ—á–∏ –Ω–∞–ø—Ä–æ–ª–µ—Ç, –∑–∞–≥–Ω–∞–≤ —Å–µ–±—è –≤ –ø–æ—Ä–æ—á–Ω—ã–π –∫—Ä—É–≥ –ø—É—Å—Ç–æ–ø–æ—Ä–æ–∂–Ω–∏—Ö —Ñ—Ä–∞–∑.',
 'Se reun√≠an a conversar sin tregua, a repetirse durante horas y horas los mismos chistes, a comp

In [132]:
rus_df.loc[128, "chunk"]

'–ö–æ–≥–¥–∞ –•–æ—Å–µ –ê—Ä–∫–∞–¥–∏–æ –ë—É—ç–Ω–¥–∏—è –ø–æ–Ω—è–ª, —á—Ç–æ –ø–æ–≤–µ—Ç—Ä–∏–µ —Ä–∞—Å–ø—Ä–æ—Å—Ç—Ä–∞–Ω–∏–ª–æ—Å—å –ø–æ –≤—Å–µ–º—É –≥–æ—Ä–æ–¥–∫—É, –æ–Ω —Å–æ–±—Ä–∞–ª –æ—Ç—Ü–æ–≤ —Å–µ–º–µ–π—Å—Ç–≤ –∏ —Ä–∞—Å—Å–∫–∞–∑–∞–ª –∏–º –≤—Å–µ, —á—Ç–æ –∑–Ω–∞–ª –æ –±–µ—Å—Å–æ–Ω–Ω–æ–π –±–æ–ª–µ–∑–Ω–∏, –∏ –±—ã–ª–æ —Ä–µ—à–µ–Ω–æ –ø—Ä–∏–Ω—è—Ç—å –º–µ—Ä—ã, —á—Ç–æ–±—ã –ø–æ–º–µ—à–∞—Ç—å –∑–∞—Ä–∞–∑–µ –ø–µ—Ä–µ–∫–∏–Ω—É—Ç—å—Å—è –Ω–∞ —Å–æ—Å–µ–¥–Ω–∏–µ —Å–µ–ª–µ–Ω–∏—è.'

In [155]:
[rus_df.loc[128, "chunk"], chunks_es[563]]

['–ö–æ–≥–¥–∞ –•–æ—Å–µ –ê—Ä–∫–∞–¥–∏–æ –ë—É—ç–Ω–¥–∏—è –ø–æ–Ω—è–ª, —á—Ç–æ –ø–æ–≤–µ—Ç—Ä–∏–µ —Ä–∞—Å–ø—Ä–æ—Å—Ç—Ä–∞–Ω–∏–ª–æ—Å—å –ø–æ –≤—Å–µ–º—É –≥–æ—Ä–æ–¥–∫—É, –æ–Ω —Å–æ–±—Ä–∞–ª –æ—Ç—Ü–æ–≤ —Å–µ–º–µ–π—Å—Ç–≤ –∏ —Ä–∞—Å—Å–∫–∞–∑–∞–ª –∏–º –≤—Å–µ, —á—Ç–æ –∑–Ω–∞–ª –æ –±–µ—Å—Å–æ–Ω–Ω–æ–π –±–æ–ª–µ–∑–Ω–∏, –∏ –±—ã–ª–æ —Ä–µ—à–µ–Ω–æ –ø—Ä–∏–Ω—è—Ç—å –º–µ—Ä—ã, —á—Ç–æ–±—ã –ø–æ–º–µ—à–∞—Ç—å –∑–∞—Ä–∞–∑–µ –ø–µ—Ä–µ–∫–∏–Ω—É—Ç—å—Å—è –Ω–∞ —Å–æ—Å–µ–¥–Ω–∏–µ —Å–µ–ª–µ–Ω–∏—è.',
 'Cuando Jos√© Arcadio Buend√≠a se dio cuenta de que la peste hab√≠a invadido el pueblo,']

In [None]:
pointer

In [154]:
rus_df['embedding'][128].index

<function list.index(value, start=0, stop=9223372036854775807, /)>

In [158]:
# Resume the alignment at Russian chunk 128, with the Spanish pointer set by
# hand to fragment 563. Same procedure as the full loop: accept a match at
# score >= 0.6, otherwise retry up to 5 starts backward and then 5 forward.
pointer = 563
for (i, ru_embed) in enumerate(rus_df['embedding'][128:], start=128):
    (es_chunk, new_pointer, score) = await get_matching_fragment(ru_embed, pointer)
    # Backward retries; the message shows the attempt that just failed.
    retries = 0
    while (score < 0.6) and (retries < 5):
        print(f"üîÅ Retracing for fragment {i} | pointer {pointer} |Ô∏è {score}")
        pointer -= 1
        (es_chunk, new_pointer, score) = await get_matching_fragment(ru_embed, pointer)
        retries += 1
    # Return to the original position before searching forward.
    pointer += retries
    retries = 0
    while (score < 0.6) and (retries < 5):
        print(f"üîÅ Retracing for fragment {i} | pointer {pointer} |Ô∏è {score}")
        pointer += 1
        (es_chunk, new_pointer, score) = await get_matching_fragment(ru_embed, pointer)
        retries += 1
    # Give up on this fragment: save progress and stop for manual alignment.
    if (score < 0.6):
        rus_df.to_pickle("./merge/rus_df_matched.pkl")
        break
    pointer = new_pointer
    rus_df.loc[i, "match"] = es_chunk
    # Checkpoint every 10th fragment.
    if i % 10 == 0:
        rus_df.to_pickle("./merge/rus_df_matched.pkl")
        print(f"‚úÖ Data stored to file")
    print(f"‚úÖ fragment {i} |Ô∏è {score}")

‚úÖ fragment 128 |Ô∏è 0.6930626802641963
‚úÖ fragment 129 |Ô∏è 0.7282699385569497
üîÅ Retracing for fragment 130 | pointer 569 |Ô∏è 0.44161795026677586
‚úÖ Data stored to file
‚úÖ fragment 130 |Ô∏è 0.7641808259784947
‚úÖ fragment 131 |Ô∏è 0.7393764084549267
‚úÖ fragment 132 |Ô∏è 0.8678477350338707
‚úÖ fragment 133 |Ô∏è 0.6948443729361155
‚úÖ fragment 134 |Ô∏è 0.7533633242103599
‚úÖ fragment 135 |Ô∏è 0.7345502172055403
üîÅ Retracing for fragment 136 | pointer 589 |Ô∏è 0.5540566195515938
üîÅ Retracing for fragment 136 | pointer 588 |Ô∏è 0.497206988233643
üîÅ Retracing for fragment 136 | pointer 587 |Ô∏è 0.4612396626505195
üîÅ Retracing for fragment 136 | pointer 586 |Ô∏è 0.43541941298402626
üîÅ Retracing for fragment 136 | pointer 585 |Ô∏è 0.366250138413826
üîÅ Retracing for fragment 136 | pointer 589 |Ô∏è 0.3857825252446326
üîÅ Retracing for fragment 136 | pointer 590 |Ô∏è 0.53834614672186
üîÅ Retracing for fragment 136 | pointer 591 |Ô∏è 0.3075580660798289
üîÅ Retracing for f

In [163]:
rus_df.loc[136, "chunk"]

'–¢–æ–≥–¥–∞ –æ–Ω –ø—Ä–∏–ª–µ–ø–∏–ª –Ω–∞ –Ω–∏—Ö –Ω—É–∂–Ω—ã–µ –Ω–∞–∫–ª–µ–π–∫–∏, –∏ —Å—Ç–æ–∏–ª–æ —Ç–æ–ª—å–∫–æ –≤–∑–≥–ª—è–Ω—É–≥—å –Ω–∞ —è—Ä–ª—ã–∫, –∫–∞–∫ —Å—Ä–∞–∑—É –¥–µ–ª–∞–ª–æ—Å—å –ø–æ–Ω—è—Ç–Ω–æ, —á—Ç–æ —ç—Ç–æ –∑–∞ —à—Ç—É–∫–∞.'

In [165]:
chunks_es[589:591]

['Entonces las marc√≥ con el nombre respectivo,',
 'de modo que le bastaba con leer la inscripci√≥n para identificarlas .']

In [170]:
cosine_similarity(rus_df.loc[136, "embedding"], await get_embedding(" ".join(chunks_es[589:591])))

np.float64(0.5540566195515938)

In [188]:
async def start_from(pointer, ru_chunk_index, min_score=0.6, max_retries=5):
    """Align Russian chunks with Spanish fragments, starting part-way through.

    For each row of ``rus_df`` from ``ru_chunk_index`` onward, the chunk's
    embedding is matched against ``chunks_es`` with ``get_matching_fragment``
    starting at ``pointer``. If the score is below ``min_score`` the start is
    moved back one fragment at a time (at most ``max_retries`` times, never
    below 0), then forward from the original position (at most
    ``max_retries`` times). An accepted match is written to the ``"match"``
    column and the pointer moves to the end of the matched window.

    Progress is pickled to ``./merge/rus_df_matched.pkl`` every 10th row, on
    failure, and once more when all rows have been processed.

    Args:
        pointer: Index in ``chunks_es`` where the first match should start.
        ru_chunk_index: Position of the first row of ``rus_df`` to align.
        min_score: Minimum cosine similarity for a match to be accepted.
        max_retries: Maximum number of retries in each direction.

    Returns:
        The Spanish pointer. After a failure this is the pointer as it stood
        after the last forward retry (the failing row is named in the printed
        log); after a complete run it is the index just past the last match.
    """
    for (i, ru_embed) in enumerate(rus_df['embedding'][ru_chunk_index:], start=ru_chunk_index):
        (es_chunk, new_pointer, score) = await get_matching_fragment(ru_embed, pointer)
        # Backward retries. The message shows the attempt that just failed.
        # Stop at 0 so the pointer never becomes a negative index.
        retries = 0
        while (score < min_score) and (retries < max_retries) and (pointer > 0):
            print(f"üîÅ Retracing for fragment {i} | pointer {pointer} |Ô∏è {score}")
            pointer -= 1
            (es_chunk, new_pointer, score) = await get_matching_fragment(ru_embed, pointer)
            retries += 1
        # Undo the backward steps so the forward search starts from the
        # original position.
        pointer += retries
        retries = 0
        while (score < min_score) and (retries < max_retries):
            print(f"üîÅ Retracing for fragment {i} | pointer {pointer} |Ô∏è {score}")
            pointer += 1
            (es_chunk, new_pointer, score) = await get_matching_fragment(ru_embed, pointer)
            retries += 1
        # No acceptable match in either direction: save and hand control back
        # so the fragment can be aligned manually.
        if (score < min_score):
            rus_df.to_pickle("./merge/rus_df_matched.pkl")
            return pointer
        pointer = new_pointer
        rus_df.loc[i, "match"] = es_chunk
        if i % 10 == 0:
            rus_df.to_pickle("./merge/rus_df_matched.pkl")
            print(f"‚úÖ Data stored to file")
        print(f"‚úÖ fragment {i} |Ô∏è {score}")
    # All rows processed: persist the rows matched since the last periodic
    # checkpoint and return the pointer instead of an implicit None.
    rus_df.to_pickle("./merge/rus_df_matched.pkl")
    return pointer

In [179]:
rus_df.loc[136, "match"] = " ".join(chunks_es[589:591])

In [180]:
rus_df.loc[136, "match"]

'Entonces las marc√≥ con el nombre respectivo, de modo que le bastaba con leer la inscripci√≥n para identificarlas .'

In [182]:
[rus_df.loc[137, "chunk"], chunks_es[591]]

['–ö–æ–≥–¥–∞ –æ—Ç–µ—Ü —Å —Ç—Ä–µ–≤–æ–≥–æ–π —Å–∫–∞–∑–∞–ª –µ–º—É, —á—Ç–æ –∑–∞–±—ã–ª –ø–æ—á—Ç–∏ –≤—Å–µ, –¥–∞–∂–µ —Å–∞–º—ã–µ —Å–∏–ª—å–Ω—ã–µ –≤–ø–µ—á–∞—Ç–ª–µ–Ω–∏—è –¥–µ—Ç—Å—Ç–≤–∞, –ê—É—Ä–µ–ª–∏–∞–Ω–æ —Å–æ–æ–±—â–∏–ª –µ–º—É –æ —Å–≤–æ–µ–º –º–µ—Ç–æ–¥–µ, –∏ –•–æ—Å–µ –ê—Ä–∫–∞–¥–∏–æ –ë—É—ç–Ω–¥–∏—è —Å—Ç–∞–ª –Ω–∞–≤–µ—à–∏–≤–∞—Ç—å —è—Ä–ª—ã–∫–∏ –Ω–∞ –≤—Å–µ –¥–æ–º–∞—à–Ω–∏–µ –≤–µ—â–∏, –∞ –ø–æ—Ç–æ–º –≤–≤–µ–ª —ç—Ç—É –ø—Ä–∞–∫—Ç–∏–∫—É –∏ –≤–æ –≤—Å–µ–º –≥–æ—Ä–æ–¥–∫–µ.',
 'Cuando su padre le comunic√≥ su alarma por haber olvidado hasta los hechos m√°s impresionantes de su ni√±ez,']

In [184]:
pointer = 591

In [191]:
new_pointer = await start_from(591, 137)

‚úÖ fragment 137 |Ô∏è 0.8301769096299978
‚úÖ fragment 138 |Ô∏è 0.6725626270295503
üîÅ Retracing for fragment 139 | pointer 603 |Ô∏è 0.1667428216207925
üîÅ Retracing for fragment 139 | pointer 602 |Ô∏è 0.15794865924824872
üîÅ Retracing for fragment 139 | pointer 601 |Ô∏è 0.25359684721021236
üîÅ Retracing for fragment 139 | pointer 600 |Ô∏è 0.22418672706573123
üîÅ Retracing for fragment 139 | pointer 599 |Ô∏è 0.2555706660125172
üîÅ Retracing for fragment 139 | pointer 603 |Ô∏è 0.2675247307646622
üîÅ Retracing for fragment 139 | pointer 604 |Ô∏è 0.12015919138697674
üîÅ Retracing for fragment 139 | pointer 605 |Ô∏è 0.05846072897701642
üîÅ Retracing for fragment 139 | pointer 606 |Ô∏è 0.13907983781408062
‚úÖ fragment 139 |Ô∏è 0.6316412498424689
üîÅ Retracing for fragment 140 | pointer 611 |Ô∏è 0.29198690829683827
‚úÖ Data stored to file
‚úÖ fragment 140 |Ô∏è 0.7959809243475134
‚úÖ fragment 141 |Ô∏è 0.7591700049891225
‚úÖ fragment 142 |Ô∏è 0.8047103791646074
‚úÖ fragment 143 |Ô∏è 0

In [215]:
rus_df.loc[156, "chunk"]

'–û—Å–ª–µ–ø–∏—Ç–µ–ª—å–Ω–∞—è –≤—Å–ø—ã—à–∫–∞ —Ä–∞–¥–æ—Å—Ç–∏ —É–≤–ª–∞–∂–Ω–∏–ª–∞ –µ–º—É –≥–ª–∞–∑–∞ —Å–ª–µ–∑–∞–º–∏ —Ä–∞–Ω—å—à–µ, —á–µ–º –æ–Ω —É–≤–∏–¥–µ–ª —Å–µ–±—è –≤ –∏–¥–∏–æ—Ç—Å–∫–æ–π –∫–æ–º–Ω–∞—Ç–µ, –≥–¥–µ –≤—Å–µ –≤–µ—â–∏ –∏–º–µ–ª–∏ —è—Ä–ª—ã—á–∫–∏ —Å –Ω–∞–∑–≤–∞–Ω–∏—è–º–∏; —Ä–∞–Ω—å—à–µ, —á–µ–º –æ–Ω —É—Å—Ç—ã–¥–∏–ª—Å—è –≤—Å–µ—Ö –±–ª–∞–≥–æ–≥–ª—É–ø–æ—Å—Ç–µ–π, –Ω–∞—á–µ—Ä—Ç–∞–Ω–Ω—ã—Ö –Ω–∞ —Å—Ç–µ–Ω–∞—Ö, –∏ –¥–∞–∂–µ —Ä–∞–Ω—å—à–µ, —á–µ–º —É–∑–Ω–∞–ª –ø—Ä–∏—à–µ–¥—à–µ–≥–æ. –ê –ø—Ä–∏—à–µ–ª –ú–µ–ª—å–∫–∏–∞–¥–µ—Å.'

In [216]:
[c for c in enumerate(chunks_es[654:684])]

[(0, 'no con el olvido remediable del coraz√≥n,'),
 (1, 'sino con otro olvido m√°s cruel e irrevocable que √©l conoc√≠a muy bien,'),
 (2, 'porque era el olvido de la muerte .'),
 (3, 'Entonces comprendi√≥ .'),
 (4, 'Abri√≥ la maleta atiborrada de objetos indescifrables,'),
 (5, 'y de entre ellos sac√≥ un malet√≠n con muchos frascos .'),
 (6, 'Le dio a beber a Jos√© Arcadio Buend√≠a una sustancia de color apacible,'),
 (7, 'y la luz se hizo en su memoria .'),
 (8, 'Los ojos se le humedecieron de llanto,'),
 (9,
  'antes de verse a s√≠ rnismo en una sala absurda donde los objetos estaban marcados,'),
 (10,
  'y antes de avergonzarse de las solemnes tonter√≠as escritas en las paredes,'),
 (11,
  'y aun antes de reconocer al reci√©n llegado en un deslumbrante resplandor de alegr√≠a .'),
 (12, 'Era Melqu√≠ades .'),
 (13, 'Mientras Macondo celebraba la reconquista de los recuerdos,'),
 (14,
  'Jos√© Arcadio Buend√≠a y Melqu√≠ades le sacudieron el polvo a su vieja amistad .'),
 (15, 'EI gitan

In [218]:
[rus_df.loc[156, "chunk"], chunks_es[662]]

['–û—Å–ª–µ–ø–∏—Ç–µ–ª—å–Ω–∞—è –≤—Å–ø—ã—à–∫–∞ —Ä–∞–¥–æ—Å—Ç–∏ —É–≤–ª–∞–∂–Ω–∏–ª–∞ –µ–º—É –≥–ª–∞–∑–∞ —Å–ª–µ–∑–∞–º–∏ —Ä–∞–Ω—å—à–µ, —á–µ–º –æ–Ω —É–≤–∏–¥–µ–ª —Å–µ–±—è –≤ –∏–¥–∏–æ—Ç—Å–∫–æ–π –∫–æ–º–Ω–∞—Ç–µ, –≥–¥–µ –≤—Å–µ –≤–µ—â–∏ –∏–º–µ–ª–∏ —è—Ä–ª—ã—á–∫–∏ —Å –Ω–∞–∑–≤–∞–Ω–∏—è–º–∏; —Ä–∞–Ω—å—à–µ, —á–µ–º –æ–Ω —É—Å—Ç—ã–¥–∏–ª—Å—è –≤—Å–µ—Ö –±–ª–∞–≥–æ–≥–ª—É–ø–æ—Å—Ç–µ–π, –Ω–∞—á–µ—Ä—Ç–∞–Ω–Ω—ã—Ö –Ω–∞ —Å—Ç–µ–Ω–∞—Ö, –∏ –¥–∞–∂–µ —Ä–∞–Ω—å—à–µ, —á–µ–º —É–∑–Ω–∞–ª –ø—Ä–∏—à–µ–¥—à–µ–≥–æ. –ê –ø—Ä–∏—à–µ–ª –ú–µ–ª—å–∫–∏–∞–¥–µ—Å.',
 'Los ojos se le humedecieron de llanto,']

In [219]:
pointer = await start_from(662, 156)

‚úÖ fragment 156 |Ô∏è 0.84126885381759
‚úÖ fragment 157 |Ô∏è 0.8739175452208883
‚úÖ fragment 158 |Ô∏è 0.7138172942856245
‚úÖ fragment 159 |Ô∏è 0.7460890560489518
‚úÖ Data stored to file
‚úÖ fragment 160 |Ô∏è 0.7994870370463891
‚úÖ fragment 161 |Ô∏è 0.7868204318395056
‚úÖ fragment 162 |Ô∏è 0.6363919186608169
‚úÖ fragment 163 |Ô∏è 0.8504959677150539
‚úÖ fragment 164 |Ô∏è 0.7710934526607249
‚úÖ fragment 165 |Ô∏è 0.8188025506409532
‚úÖ fragment 166 |Ô∏è 0.7091167693915742
‚úÖ fragment 167 |Ô∏è 0.7137345840336297
‚úÖ fragment 168 |Ô∏è 0.8363806562396285
‚úÖ fragment 169 |Ô∏è 0.877046839664372
‚úÖ Data stored to file
‚úÖ fragment 170 |Ô∏è 0.7699487747638002
‚úÖ fragment 171 |Ô∏è 0.7350295990446368
‚úÖ fragment 172 |Ô∏è 0.8413517509722384
‚úÖ fragment 173 |Ô∏è 0.8186312329999182
‚úÖ fragment 174 |Ô∏è 0.7657661346913919
‚úÖ fragment 175 |Ô∏è 0.8024177944183715
‚úÖ fragment 176 |Ô∏è 0.7563025870696723
üîÅ Retracing for fragment 177 | pointer 736 |Ô∏è 0.5346732338505763
‚úÖ fragment 177 |Ô∏è 0.

In [220]:
rus_df.loc[218, "chunk"]

'‚Äî –í —ç—Ç–æ–º –≥–æ—Ä–æ–¥–µ –º—ã –æ–±—Ö–æ–¥–∏–º—Å—è –±–µ–∑ –±—É–º–∞–≥, ‚Äî —Å–∫–∞–∑–∞–ª –æ–Ω, –Ω–µ —Ç–µ—Ä—è—è –ø—Ä–∏—Å—É—Ç—Å—Ç–≤–∏—è –¥—É—Ö–∞. ‚Äî –ó–∞—Ä—É–±–∏—Ç–µ —Å–µ–±–µ –Ω–∞ –Ω–æ—Å—É: –Ω–∞–º –Ω–µ –Ω—É–∂–µ–Ω –Ω–∏–∫–∞–∫–æ–π —É–ø—Ä–∞–≤–∏—Ç–µ–ª—å, –º—ã —Å–∞–º–∏ –ø—Ä–µ–∫—Ä–∞—Å–Ω–æ –∑–¥–µ—Å—å —É–ø—Ä–∞–≤–ª—è–µ–º—Å—è.'

In [222]:
" ".join(chunks_es[934:937])

'‚Äî En este pueblo no mandamos con papeles ‚Äî dijo sin perder la calma ‚Äî . Y para que 10 sepa de una vez, no necesitamos ning√∫n corregidor porque aqu√≠ no hay nada que corregir .'

In [223]:
cosine_similarity(rus_df.loc[218, "embedding"], await get_embedding(" ".join(chunks_es[934:937])))

np.float64(0.5767228447939377)

In [232]:
rus_df.loc[218, "match"] = " ".join(chunks_es[934:937])

In [249]:
# Restore the chunk text of row 218 from the backup frame.
# NOTE(review): backup_df is assigned only in a later cell of this notebook
# (pd.read_pickle of rus_df.pkl), so this cell fails on a top-to-bottom run.
rus_df.loc[218, "chunk"] = backup_df.loc[218, "chunk"]

In [229]:
chunks_es[937:943]

['Ante la impavidez de don Apolinar Moscote,',
 'siempre sin levantar la voz,',
 'hizo un pormenorizado recuento de c√≥mo hab√≠an fundado la aldea,',
 'de c√≥mo se hab√≠an repartido la tierra,',
 'abierto los caminos e introducido las mejoras que les hab√≠a ido exigiendo la necesidad,',
 'sin haber molestado a gobierno alguno y sin que nadie los molestara .']

In [237]:
rus_df.loc[219, "match"] = " ".join(chunks_es[937:943])

In [251]:
rus_df.loc[219]

chunk          –ò –•–æ—Å–µ –ê—Ä–∫–∞–¥–∏–æ –ë—É—ç–Ω–¥–∏—è, –≥–ª—è–¥—è –Ω–∞ –Ω–µ–≤–æ–∑–º—É—Ç–∏–º–æ–≥–æ...
translation    Y Jos√© Arkadio Buendy, mirando el Muscovite Do...
embedding      [0.054191216826438904, 0.005699383094906807, 0...
match          Ante la impavidez de don Apolinar Moscote, sie...
Name: 219, dtype: object

In [241]:
# Load a separate copy of the checkpoint to serve as a backup frame.
# NOTE(review): an earlier cell in file order already reads from backup_df;
# this cell has to run first.
backup_df = pd.read_pickle("./merge/rus_df.pkl")

In [252]:
rus_df.loc[220, "chunk"]

'¬´–ú—ã –∂–∏–≤–µ–º —Å—Ç–æ–ª—å —Ç–∏—Ö–æ –∏ –º–∏—Ä–Ω–æ, —á—Ç–æ –¥–∞–∂–µ —Å–º–µ—Ä—Ç—å –Ω–∞—Å —Å—Ç–æ—Ä–æ–Ω–æ–π –æ–±—Ö–æ–¥–∏—Ç, ‚Äî —Å–∫–∞–∑–∞–ª –æ–Ω. ‚Äî –≤—ã —Å–∞–º–∏ –≤–∏–¥–∏—Ç–µ, —á—Ç–æ —Ç—É—Ç –Ω–µ—Ç –∫–ª–∞–¥–±–∏—â–∞¬ª. –ù–∞ –ø—Ä–∞–≤–∏—Ç–µ–ª—å—Å—Ç–≤–æ –æ–Ω –æ–±–∏–¥—ã –Ω–µ –¥–µ—Ä–∂–∏—Ç –∑–∞ —Ç–æ, —á—Ç–æ –∏–º –Ω–µ –ø–æ–º–æ–≥–ª–∏.'

In [254]:
chunks_es[943:953]

['¬´ Somos tan pac√≠ficos que ni siquiera nos hemos muerto de muerte natural ¬ª,',
 'dijo .',
 '¬´ Ya ve que todav√≠a no tenernos cementerio .',
 '¬ª No se doli√≥ de que el gobierno no los hubiera ayudado .',
 'Al contrario,',
 'se alegraba de que hasta entonces los hubiera dejado crecer en paz,',
 'y esperaba que as√≠ los siguiera dejando,',
 'porque ellos no hab√≠an fundado un pueblo para que el primer advenedizo les fuera a decir 10 que deb√≠an hacer .',
 'Don Apolinar Moscote se hab√≠a puesto un saco de dril,',
 'blanco como sus pantalones,']

In [255]:
pointer = await start_from(943, 220)

‚úÖ Data stored to file
‚úÖ fragment 220 |Ô∏è 0.8316591321946442
üîÅ Retracing for fragment 221 | pointer 947 |Ô∏è 0.43276087748417413
üîÅ Retracing for fragment 221 | pointer 946 |Ô∏è 0.4526499407240603
üîÅ Retracing for fragment 221 | pointer 945 |Ô∏è 0.4492925332578794
üîÅ Retracing for fragment 221 | pointer 944 |Ô∏è 0.462957299060328
üîÅ Retracing for fragment 221 | pointer 943 |Ô∏è 0.4380571279731144
üîÅ Retracing for fragment 221 | pointer 947 |Ô∏è 0.3832808232319134
üîÅ Retracing for fragment 221 | pointer 948 |Ô∏è 0.4198677839811001
üîÅ Retracing for fragment 221 | pointer 949 |Ô∏è 0.45429494401236886
üîÅ Retracing for fragment 221 | pointer 950 |Ô∏è 0.36757211517546723
üîÅ Retracing for fragment 221 | pointer 951 |Ô∏è 0.22127622875664194


In [256]:
rus_df.loc[221, "chunk"]

'–ù–∞–ø—Ä–æ—Ç–∏–≤, –æ—á–µ–Ω—å –¥–æ–≤–æ–ª–µ–Ω —Ç–µ–º, —á—Ç–æ –¥–æ —Å–∏—Ö –ø–æ—Ä –Ω–∏–∫—Ç–æ –Ω–µ –≤—Å—Ç–∞–≤–ª—è–ª –ø–∞–ª–∫–∏ –≤ –∫–æ–ª–µ—Å–∞ –∏ —á—Ç–æ –≤–ø—Ä–µ–¥—å —Ç–∞–∫ –æ–Ω–æ –∏ –±—É–¥–µ—Ç, –ø–æ—Ç–æ–º—É –∫–∞–∫ –æ–Ω–∏ –æ—Å–Ω–æ–≤–∞–ª–∏ –≥–æ—Ä–æ–¥ –Ω–µ –¥–ª—è —Ç–æ–≥–æ, —á—Ç–æ–±—ã –ø–µ—Ä–≤—ã–π –ø—Ä–∏—à–ª—ã–π —É–∫–∞–∑—ã–≤–∞–ª –∏–º, –∑–∞ –∫–∞–∫–∏–µ –±—Ä–∞—Ç—å—Å—è –¥–µ–ª–∞.'

In [259]:
" ".join(chunks_es[947:951])

'Al contrario, se alegraba de que hasta entonces los hubiera dejado crecer en paz, y esperaba que as√≠ los siguiera dejando, porque ellos no hab√≠an fundado un pueblo para que el primer advenedizo les fuera a decir 10 que deb√≠an hacer .'

In [260]:
cosine_similarity(rus_df.loc[221, "embedding"], await get_embedding(" ".join(chunks_es[947:951])))

np.float64(0.46224171513347845)

In [261]:
[rus_df.loc[221, "translation"], " ".join(chunks_es[947:951])]

['Por el contrario, estoy muy contento de que hasta ahora nadie haya insertado palos en las ruedas y que contin√∫e as√≠, porque encontraron la ciudad no para que el primer peri√≥dico les mostrara qu√© tipo de asuntos asumir.',
 'Al contrario, se alegraba de que hasta entonces los hubiera dejado crecer en paz, y esperaba que as√≠ los siguiera dejando, porque ellos no hab√≠an fundado un pueblo para que el primer advenedizo les fuera a decir 10 que deb√≠an hacer .']

In [262]:
rus_df.loc[221, "match"] = " ".join(chunks_es[947:951])

In [266]:
rus_df.loc[222, "chunk"]

'–î–æ–Ω –ê–ø–æ–ª–∏–Ω–∞—Ä –ú–æ—Å–∫–æ—Ç–µ –∞–∫–∫—É—Ä–∞—Ç–Ω–æ –≤–ª–µ–∑ –≤ —Ä—É–∫–∞–≤–∞ –ø–æ–ª–æ—Ç–Ω—è–Ω–æ–≥–æ –∫–∏—Ç–µ–ª—è, –±–µ–ª–æ–≥–æ, –∫–∞–∫ –∏ –±—Ä—é–∫–∏, –∏ —Ç—â–∞—Ç–µ–ª—å–Ω–æ –∑–∞—Å—Ç–µ–≥–Ω—É–ª—Å—è. ‚Äî –¢–∞–∫ —á—Ç–æ, –µ—Å–ª–∏ –≤—ã –∂–µ–ª–∞–µ—Ç–µ –∑–¥–µ—Å—å –æ—Å—Ç–∞—Ç—å—Å—è –∏ –∂–∏—Ç—å, –∫–∞–∫ –∂–∏–≤—É—Ç –æ–±—ã—á–Ω—ã–µ –Ω–æ—Ä–º–∞–ª—å–Ω—ã–µ –ª—é–¥–∏, –º–∏–ª–æ—Å—Ç–∏ –ø—Ä–æ—Å–∏–º, ‚Äî –∑–∞–∫–æ–Ω—á–∏–ª –•–æ—Å–µ –ê—Ä–∫–∞–¥–∏–æ –ë—É—ç–Ω–¥–∏—è.'

In [268]:
pointer = await start_from(951, 222)

‚úÖ fragment 222 |Ô∏è 0.7374092794010507
‚úÖ fragment 223 |Ô∏è 0.8353431279319482
‚úÖ fragment 224 |Ô∏è 0.8376013208406766
‚úÖ fragment 225 |Ô∏è 0.6612123398270768
‚úÖ fragment 226 |Ô∏è 0.7399253944601267
‚úÖ fragment 227 |Ô∏è 0.6126009909791045
‚úÖ fragment 228 |Ô∏è 0.6784404700813653
‚úÖ fragment 229 |Ô∏è 0.7894329062859442
‚úÖ Data stored to file
‚úÖ fragment 230 |Ô∏è 0.8983466511375552
‚úÖ fragment 231 |Ô∏è 0.7170314292878094
‚úÖ fragment 232 |Ô∏è 0.8156426599580765
üîÅ Retracing for fragment 233 | pointer 1009 |Ô∏è 0.39803090835436683
üîÅ Retracing for fragment 233 | pointer 1008 |Ô∏è 0.44738740810456845
üîÅ Retracing for fragment 233 | pointer 1007 |Ô∏è 0.45715926962550885
üîÅ Retracing for fragment 233 | pointer 1006 |Ô∏è 0.42388780386512026
üîÅ Retracing for fragment 233 | pointer 1005 |Ô∏è 0.4683768214450446
üîÅ Retracing for fragment 233 | pointer 1009 |Ô∏è 0.4820842897003066
‚úÖ fragment 233 |Ô∏è 0.773265037114969
‚úÖ fragment 234 |Ô∏è 0.8160662713995487
‚úÖ fragment 2

In [271]:
rus_df.shape

(2919, 4)