Load the Azure OpenAI client and transliterate the lyrics to "romlish" (a romanized form written with the English alphabet).

In [5]:
import os  
from tqdm import tqdm
from openai import AzureOpenAI  

from dotenv import load_dotenv

# Populate the process environment from the local .env file
load_dotenv()

# Fixed settings for the model deployment and the API revision to talk to
deployment_id = "gpt-4o"
api_version = "2025-01-01-preview"

# Credentials are read from the environment; either is None when unset
api_key = os.environ.get("AZURE_OPENAI_KEY")
endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")

# Azure OpenAI Service client using key-based authentication
client = AzureOpenAI(api_version=api_version, api_key=api_key, azure_endpoint=endpoint)
    
    
# IMAGE_PATH = "YOUR_IMAGE_PATH"
# encoded_image = base64.b64encode(open(IMAGE_PATH, 'rb').read()).decode('ascii')

# Prompt text used as the system message: the romanization rules for Arabic
# and Hebrew lyrics, followed by few-shot input/output examples.
# NOTE(review): several example outputs below contain characters that the
# rules themselves forbid (hyphens in "el-ain" / "le-wara", commas,
# typographic apostrophes, double quotes) - confirm whether the examples
# should be cleaned up to match the guidelines.
INSTRUCTIONS = """
Task:
Perform romanization of Arabic or Hebrew song lyrics.
Convert Arabic or Hebrew song lyrics into a phonetic transliteration using only the english alphabet, an apostrophe ('), spaces, and new lines. 
The goal is to preserve the original phonology as closely as possible while using only the english writing system.

Guidelines:
No numbers – Avoid using numbers like "2," "3," or "7" for Arabic or Hebrew sounds. Instead, use the closest english letters.
Preserve pronunciation – The transliteration should reflect how the lyrics are sung, maintaining the phonetic structure of the original language.
Use apostrophes ('') – Represent sounds like:
Arabic: ع (ʿayn) and ء (hamza) → '
Hebrew: ע (ʿayin) and א (aleph) → ' when pronounced
Words must start with a letter – Do not start words with apostrophes. Use the closest english letter instead.
No extra characters – Do not use diacritics, underscores, or special symbols beyond apostrophes.
Keep word boundaries – Maintain spacing similar to the original lyrics.
Use lowercase letters for the transliteration.

Examples (Arabic):
Input:
حبيبي يا نور العين يا ساكن خيالي
عاشق بقالي سنين ولا غيرك في بالي
أجمل عيون فى الكون أنا شفتها
الله عليك، الله على سحرها

Output:
habibi ya nour el-ain, ya sakhen khayali
aashiq ba'ali seneen wala ghayrak fi bali
ajmal a'yun fil kawn ana shuftuha
allah aalayk, allah ala sahriha

Input:
مسيطرة، همشيك مسطرة
هخليك لو شوفت في شارع بنت تبص لورا
أيوه أنا مسيطرة، يا حتة سكرة
طول ما أنت معايا تمشي على هوايا أنا متكبرة

Output:
mesytara, hamshek mastara
hakhlek law shoft fi share'a bent tebos le-wara
aywa ana mesytara ya hetet sokra
toul ma enta ma’aya temshi aala hawaya, ana motakabera

Input:
مافيش حاجه تيجي كده
إهدا حبيبي كده وإرجع زي زمان
يابني اسمعني هتدلعني
تاخد عيني كمان

Output:
ma feesh haga teggy keda
ihda habiby keda, w ergaa' zay zaman
ya ebni isma'ni, hatdala'any
takhod einy kaman

Examples (Hebrew):
Input:
טמפרטורה הזיה, השמש עושה לה טוב
אמא שלה, ממרוקו
אבא מצפון הים, אפשר להרגיש את הקור
משאללה, איך יצא לו טוב

Output:
temperatura haziya, hashemesh osa la tov
ima shela, me'marocco
aba mitzafon hayam, efshar lehargish ta'kor
mashallah, eich yatz’a lo tov

Input:
שמור וזכור בדיבור אחד
השמיענו אל המיוחד
אדוני אחד ושמו אחד
לשם ולתפארת ולתהילה
לכה דודי לקראת כלה
פני שבת נקבלה

Output:
shamor v’zachor b’dibur echad
hishmi’anu el ha’myuchad
adonai echad u’shmo echad
l’shem u’l’tiferet v’lit’hillah
lecha dodi likrat kallah
p'nei shabbat n'kabela

Input:
היא מרגישה שנפתח לה המזל
פגשה אחד, גבר-גבר ורג'אל
והיא תלחש לו, מה היא תלחש לו-
"קח אותי על הגמל"

Output:
hi margisha sheniftach la hamazal
pagsha echad, gever gever verajal
vehi tilchash lo, ma hi tilchash lo
"kach oti al hagamal"

I repeat, do not start words with apostrophes. If a word starts with ع or ע, use the closest english letter.
For example: 
Input:
عينيك
Output:
ainayk

Input:
עיניים
Output:
ayinayim

Input:
على
Output:
ala

Input:
על
Output:
al

Input:
عنب
Output:
anab

Input:
عليك
Output:
alayk

Input:
عليه
Output:
alayh
"""

# Base conversation shared by every request: a single system message whose
# only content part is the romanization instructions.
basic_prompt = [
    dict(
        role="system",
        content=[dict(type="text", text=INSTRUCTIONS)],
    ),
]

In [6]:
# Source folder with the original lyrics and destination folder for the
# raw model transliterations.
lyrics_folder = 'lyrics_src'
rom_lyrics_folder = 'lyrics_rom'

# Create the destination folder if it does not exist
os.makedirs(rom_lyrics_folder, exist_ok=True)

# Only .txt files are lyrics. Filtering up front keeps the progress bar total
# equal to the number of requests, and sorting makes the run order stable
# (os.listdir returns entries in arbitrary order).
lyric_filenames = sorted(
    name for name in os.listdir(lyrics_folder) if name.endswith('.txt')
)

for filename in tqdm(lyric_filenames):
    file_path = os.path.join(lyrics_folder, filename)

    # The lyrics are Arabic/Hebrew text: read them as UTF-8 explicitly instead
    # of relying on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Nothing to transliterate; avoid sending an empty user message.
    if not content.strip():
        tqdm.write(f"Skipping {filename}: file is empty")
        continue

    # System instructions followed by the lyrics as the user message
    user_message = {
        "role": "user",
        "content": content,
    }
    messages = basic_prompt + [user_message]

    # Generate the completion
    completion = client.chat.completions.create(
        model=deployment_id,
        messages=messages,
        max_tokens=800,
        temperature=0.7,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
        stream=False,
    )

    choice = completion.choices[0]
    response = choice.message.content

    # The message content can be absent (for example when the output was
    # filtered); writing None to the file would raise a TypeError.
    if response is None:
        tqdm.write(
            f"Skipping {filename}: no text returned "
            f"(finish_reason={choice.finish_reason})"
        )
        continue

    # A "length" finish reason means the max_tokens cap cut the output short,
    # so the saved transliteration is incomplete. Report it instead of
    # silently keeping a partial song.
    if choice.finish_reason == "length":
        tqdm.write(
            f"Warning: {filename} reached max_tokens; "
            "the transliteration is truncated"
        )

    # Write the response under the same file name in the destination folder
    dst_file_path = os.path.join(rom_lyrics_folder, filename)
    with open(dst_file_path, 'w', encoding='utf-8') as file:
        file.write(response)


100%|██████████| 8/8 [01:38<00:00, 12.32s/it]


In [7]:
import os
import string
dst_lyrics_folder = 'lyrics'

# Characters allowed in the final lyrics: lowercase ASCII letters, space,
# apostrophe and newline.
_ALLOWED_CHARS = frozenset(string.ascii_lowercase + " '\n")

# Apostrophe look-alikes that may appear in the transliteration in place of
# the plain ASCII apostrophe. They are normalized before filtering; otherwise
# the filter would delete them and the marked sound would vanish from the word.
_APOSTROPHE_VARIANTS = str.maketrans({
    "\u2018": "'",  # left single quotation mark
    "\u2019": "'",  # right single quotation mark
    "\u02bb": "'",  # modifier letter turned comma
    "\u02bc": "'",  # modifier letter apostrophe
    "\u02be": "'",  # modifier letter right half ring
    "\u02bf": "'",  # modifier letter left half ring
    "\u0060": "'",  # grave accent / backtick
    "\u00b4": "'",  # acute accent
})


def _clean_transliteration(text):
    """Return *text* reduced to lowercase letters, apostrophes and spaces.

    Apostrophe variants are mapped to "'", the text is lowercased, every
    character outside the allowed set is removed, runs of whitespace inside a
    line collapse to one space, and leading apostrophes are stripped from each
    word. Words that consist only of apostrophes are dropped. Line breaks
    (including blank lines) are preserved.
    """
    normalized = text.translate(_APOSTROPHE_VARIANTS).lower()
    kept = ''.join(char for char in normalized if char in _ALLOWED_CHARS)

    cleaned_lines = []
    for line in kept.split('\n'):
        stripped_words = (word.lstrip("'") for word in line.split())
        cleaned_lines.append(' '.join(word for word in stripped_words if word))
    return '\n'.join(cleaned_lines)


# Create the destination folder if it does not exist
os.makedirs(dst_lyrics_folder, exist_ok=True)

# Iterate over each transliterated file and write the cleaned versions
for filename in sorted(os.listdir(rom_lyrics_folder)):
    # Skip anything that is not a lyrics text file (sub-directories, hidden
    # files); for such names the word-list path below would also collide with
    # the lyrics path.
    if not filename.endswith('.txt'):
        continue

    # Undecodable bytes are replaced rather than raising: every non-ASCII
    # character is discarded by the filter anyway.
    src_path = os.path.join(rom_lyrics_folder, filename)
    with open(src_path, 'r', encoding='utf-8', errors='replace') as file:
        content = file.read()

    filtered_content = _clean_transliteration(content)

    # Cleaned lyrics keep the original file name
    file_path = os.path.join(dst_lyrics_folder, filename)
    with open(file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(filtered_content)

    # One word per line
    words = filtered_content.split()
    words_content = '\n'.join(words)

    # Swap only the final extension: "<name>.txt" -> "<name>.words.txt"
    stem, _ext = os.path.splitext(filename)
    words_file_path = os.path.join(dst_lyrics_folder, stem + '.words.txt')

    with open(words_file_path, 'w', encoding='utf-8') as words_file:
        words_file.write(words_content)