# Semantic alignement with OpenAI text-embedding-3-small model

## Step 1 - Build a dataset of native edition chunks

In [None]:
import spacy
nlp_ru = spacy.load("ru_core_news_lg")
def split_into_chunks(text: str, max_chars: int = 300) -> list:
    doc = nlp_ru(text)
    sentences = [sent.text.strip() for sent in doc.sents]

    chunks = []
    current_chunk = ""
    for sent in sentences:
        if len(current_chunk) + len(sent) <= max_chars:
            current_chunk += (" " if current_chunk else "") + sent
        else:
            chunks.append(current_chunk)
            current_chunk = sent
    if current_chunk:
        chunks.append(current_chunk)

    return chunks

In [None]:
from pathlib import Path
def load_text_from_file(filepath: str) -> str:
    return Path(filepath).read_text(encoding='utf-8')
text_ru = load_text_from_file("text_ru.txt")
chunks_ru = split_into_chunks(text_ru.replace("\n", " "), max_chars=300)

In [None]:
from IPython.display import display
display(chunks_ru[:3])

In [None]:
len(chunks_ru)

## Step 2 - Build a dataset of target edition chunks

In [None]:
import es_core_news_sm
nlp_es = es_core_news_sm.load()

text="""
Lo envió a las autoridades acompañado de numerosos testimonios sobre sus experiencias
y de varios pliegos de dibujos explicativos, al cuidado de un mensajero que atravesó la sierra,
se extravió en pantanos desmesurados, remontó ríos tormentosos y estuvo a punto de perecer bajo el azote de las fieras,
la desesperación y la peste, antes de conseguir una ruta de enlace con las mulas del correo.
"""
doc = nlp_es(text)
doc

In [None]:
from more_itertools import split_at
def break_long_sentences(doc):
    sublists = list(
        " ".join(line) for line in split_at(
            [d.text for d in doc],
            lambda x: x == ",")
    )
    return [
        chunk + ',' if i < len(sublists) - 1
        else chunk for i, chunk in enumerate(sublists)
    ]

In [None]:
break_long_sentences(doc)

In [None]:
from itertools import chain
def get_es_chunks(text: str) -> list:
    doc = nlp_es(text)

    chunks = []
    current_chunk = ""
    parts = [break_long_sentences(s) for s in doc.sents]
    return list(chain.from_iterable(parts))

In [None]:
text_es = load_text_from_file("text_es.txt")
chunks_es = get_es_chunks(text_es.replace("\n", " "))

In [None]:
len(chunks_es)

In [None]:
chunks_es[:10]

## Step 3 - Embed and match

In [None]:
from dotenv import load_dotenv
load_dotenv()
from openai import AsyncOpenAI
client = AsyncOpenAI(max_retries=5)

In [None]:
from typing import List
async def get_embedding(text: str, model="text-embedding-3-small", **kwargs) -> List[float]:
    # replace newlines, which can negatively affect performance.
    text = text.replace("\n", " ")

    response = await client.embeddings.create(input=[text], model=model, **kwargs)

    return response.data[0].embedding

In [None]:
embedding = await get_embedding(chunks_ru[0])
len(embedding)

In [None]:
embedding[:10]

In [None]:
import numpy as np
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [None]:
display([chunks_ru[0], " ".join(chunks_es[0:3])])

In [None]:
cosine_similarity(
    await get_embedding(chunks_ru[0]),
    await get_embedding(" ".join(chunks_es[0:3]))
)

In [None]:
from googletrans import Translator

translator = Translator()
async def translate_russian_to_spanish(text: str) -> str:
    result = await translator.translate(text, src="ru", dest="es")
    return result.text

In [None]:
cosine_similarity(await get_embedding(
    await translate_russian_to_spanish(chunks_ru[0])
),
    await get_embedding(" ".join(chunks_es[0:3]))
)

In [None]:
cosine_similarity(await get_embedding(
    await translate_russian_to_spanish(chunks_ru[1])),
    await get_embedding(" ".join(chunks_es[3:4]))
)

In [None]:
cosine_similarity(await get_embedding(
    await translate_russian_to_spanish(chunks_ru[1])),
    await get_embedding(" ".join(chunks_es[3:5]))
)

In [None]:
cosine_similarity(await get_embedding(
    await translate_russian_to_spanish(chunks_ru[1])),
    await get_embedding(" ".join(chunks_es[3:6]))
)

In [None]:
display([chunks_ru[1], chunks_es[3:5]])

## Create dataframe

In [58]:
import pandas as pd
ru_df = pd.DataFrame({ "chunk": chunks_ru })

In [59]:
ru_df.shape

(51, 1)

## Add column for translation

In [60]:
import asyncio
import nest_asyncio
nest_asyncio.apply()
from concurrent.futures import ThreadPoolExecutor
from tqdm.asyncio import tqdm_asyncio
# Wrapper to run async function in thread-friendly context
async def translate_all(chunks, max_workers=10):
    semaphore = asyncio.Semaphore(max_workers)
    async def run_chunk(text):
        async with semaphore:
            return await translate_russian_to_spanish(text)

    # Run in batches
    tasks = [run_chunk(text) for text in chunks]
    return await tqdm_asyncio.gather(*tasks)

In [61]:
# Run translation asynchronously and assign to new column
translations = asyncio.run(translate_all(ru_df["chunk"].tolist(), max_workers=20))
ru_df["translation"] = translations

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 51/51 [00:02<00:00, 17.05it/s]


In [62]:
ru_df.shape

(51, 2)

In [63]:
ru_df.tail(3)

Unnamed: 0,chunk,translation
48,"Примитивная лаборатория располагала, не считая...","El laboratorio primitivo tenía, sin contar muc..."
49,"Кроме всего прочего, Мелькиадес дал образцы се...","Entre otras cosas, Melkiades dio muestras de s..."
50,Соблазненный простотой формулы получения золот...,Seducido por la simplicidad de la fórmula para...


In [64]:
ru_df.to_pickle("ru_df_demo.pkl")

## Add embeddings

In [65]:
# Wrapper to run async function in thread-friendly context
async def embed_all(chunks, max_workers=10):
    semaphore = asyncio.Semaphore(max_workers)
    async def run_chunk(text):
        async with semaphore:
            return await get_embedding(text)

    # Run in batches
    tasks = [run_chunk(text) for text in chunks]
    return await tqdm_asyncio.gather(*tasks)

In [66]:
subset = ru_df["translation"].tolist()
embeddings = asyncio.run(embed_all(subset, max_workers=20))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 51/51 [00:04<00:00, 12.09it/s]


In [67]:
ru_df["embedding"] = embeddings

In [68]:
ru_df.shape

(51, 3)

In [69]:
ru_df.head(1)

Unnamed: 0,chunk,translation,embedding
0,"Много лет спустя, перед самым расстрелом, полк...","Muchos años después, justo antes del tiroteo, ...","[0.042346835136413574, -0.001678806496784091, ..."


In [70]:
ru_df.to_pickle("ru_df_demo.pkl")

In [71]:
async def get_matching_fragment(embed, pointer):
    i=pointer+1
    last_score = -1
    while 1:
        score = cosine_similarity(
            embed,
            await get_embedding(" ".join(chunks_es[pointer:i]))
        )
        if (score < last_score):
            break
        last_score = score
        i += 1
    return (" ".join(chunks_es[pointer:i-1]), i-1, last_score)

In [74]:
async def start_from(pointer, ru_chunk_index):
    for (i, ru_embed) in enumerate(ru_df['embedding'][ru_chunk_index:], start=ru_chunk_index):
        (es_chunk, new_pointer, score) = await get_matching_fragment(ru_embed, pointer)
        retries = 0
        while (score < 0.6) and (retries < 5):
            print(f"🔁 Retracing for fragment {i} | pointer {pointer} |️ {score}")
            pointer -= 1
            (es_chunk, new_pointer, score) = await get_matching_fragment(ru_embed, pointer)
            retries += 1
        pointer += retries
        retries = 0
        while (score < 0.6) and (retries < 5):
            print(f"🔁 Retracing for fragment {i} | pointer {pointer} |️ {score}")
            pointer += 1
            (es_chunk, new_pointer, score) = await get_matching_fragment(ru_embed, pointer)
            retries += 1
        if (score < 0.6):
            ru_df.to_pickle("./merge/ru_df_matched.pkl")
            return pointer
        pointer = new_pointer
        ru_df.loc[i, "match"] = es_chunk
        if i % 10 == 0:
            ru_df.to_pickle("./merge/ru_df_matched.pkl")
            print(f"✅ Data stored to file")
        print(f"✅ fragment {i} |️ {score}")

In [99]:
pointer = await start_from(0, 0)

✅ Data stored to file
✅ fragment 0 |️ 0.8654028001839136
✅ fragment 1 |️ 0.9073097628761001
✅ fragment 2 |️ 0.7876713105387106
✅ fragment 3 |️ 0.8038073793088721
✅ fragment 4 |️ 0.7615130700959085
🔁 Retracing for fragment 5 | pointer 25 |️ 0.5709271842365248
✅ fragment 5 |️ 0.7094296078239161
✅ fragment 6 |️ 0.8352721363690172
✅ fragment 7 |️ 0.7991177630952013
✅ fragment 8 |️ 0.8790443396951402
✅ fragment 9 |️ 0.7123439494395828
✅ Data stored to file
✅ fragment 10 |️ 0.8143818241389978
✅ fragment 11 |️ 0.8232202157218353
✅ fragment 12 |️ 0.7488201304242075
✅ fragment 13 |️ 0.7564914884271897
✅ fragment 14 |️ 0.7398454437911969
✅ fragment 15 |️ 0.7735542532823528
✅ fragment 16 |️ 0.7252885476709303
✅ fragment 17 |️ 0.8495790795365834
✅ fragment 18 |️ 0.8422336902577091
✅ fragment 19 |️ 0.8343837142285713
✅ Data stored to file
✅ fragment 20 |️ 0.7498534709395513
✅ fragment 21 |️ 0.6668367141652842
✅ fragment 22 |️ 0.7588848571155656
✅ fragment 23 |️ 0.7315526070338915
✅ fragment 24 |️ 0

In [None]:
ru_df[0:1]

In [None]:
ru_df.iloc[15:30][["chunk", "match"]].values.tolist()

## Converting to audio-book

In [131]:
from google.cloud import texttospeech

client = texttospeech.TextToSpeechClient(
    client_options={"quota_project_id": "dual-lingua"}
)

In [132]:
def list_to_smm(matches):
    result = '<speak>'
    for match in matches:
        result += f'<voice name="ru-RU-Wavenet-D">{match[0]}</voice>'
        result += f'<voice name="es-ES-Standard-B">{match[1]}</voice>'
    result += '</speak>'
    return result

In [133]:
matches = ru_df.iloc[15:18][["chunk", "match"]].values.tolist()
smm = list_to_smm(matches)

In [134]:
smm

'<speak><voice name="ru-RU-Wavenet-D">Хосе Аркадио Буэндия, который не мог успокоиться после провала своей затеи с магнитами, тотчас сообразил, что это стекло можно использовать как боевое оружие. Мелькиадес снова попытался отговорить его.</voice><voice name="es-ES-Standard-B">José Arcadio Buendía, que aún no acababa de consolarse por el fracaso de sus imanes, concibió la idea de utilizar aquel invento como un arma de guerra . Melquíades, otra vez, trató de disuadirlo .</voice><voice name="ru-RU-Wavenet-D">Но в конечном счете цыган согласился отдать ему лупу в обмен на два магнита и три золотые колониальные монеты. Урсула рыдала от горя.</voice><voice name="es-ES-Standard-B">Pero terrninó por aceptar los dos lingotes imantados y tres piezas de dinero colonial a cambio de la lupa . Úrsula lloró de consternación .</voice><voice name="ru-RU-Wavenet-D">Эти деньги пришлось вытаскивать из сундучка с золотыми дублонами, которые ее отец копил всю свою жизнь, отказывая себе в лишнем куске, и ко

In [135]:
def smm_to_audio(ssml_text, output):
    synthesis_input = texttospeech.SynthesisInput(ssml=ssml_text)
    
    # Choose a neutral voice (let Google pick based on lang tags)
    voice = texttospeech.VoiceSelectionParams(
        language_code="es-ES",  # This is just a default, SSML will override per segment
        ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL
    )
    
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )
    
    response = client.synthesize_speech(
        input=synthesis_input,
        voice=voice,
        audio_config=audio_config
    )
    
    with open(output, "wb") as out:
        out.write(response.audio_content)
        print("Audio content written to " + output)

In [136]:
smm_to_audio(smm, 'story.mp3')

Audio content written to story.mp3
