In [17]:
import re
import subprocess


# Path to the dataset
file_path = "/scratch2/bsow/Documents/ACSR/data/claire_dialogue/train.txt"

# Function to clean the text
def clean_text(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()
    text = re.sub(r"\[.*?\]\s*", "", text)
    lines = [line.strip() for line in text.splitlines() if line.strip()]

    cleaned_text = []
    current_sentence = ""

    for line in lines:
        line = line.lower()
        cleaned_text.append(line)

    if current_sentence:
        cleaned_text.append(current_sentence.strip())

    return cleaned_text

# Function to remove punctuation from a list of sentences
def remove_punctuation(sentences):
    punctuation_pattern = re.compile(r"[^\w\s'-]")  
    cleaned_sentences = []

    for sentence in sentences:
        # Remove punctuation using the regex pattern
        cleaned_sentence = re.sub(punctuation_pattern, "", sentence)
        cleaned_sentences.append(cleaned_sentence.strip())

    return cleaned_sentences

# Function to convert text to IPA using espeak-ng
def text_to_ipa(text, language="fr"):
    """
    Convert text to IPA using espeak-ng.
    """
    # Remove special characters
    text = text.replace("?", "").replace("!", "").replace(".", "").replace(",", "").replace(":", "").replace(";", "").replace("'", "").replace("-", " ")

    command = ["espeak-ng", "-v", language, "-q", "--ipa"]
    process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = process.communicate(input=text.encode())
    ipa_output = stdout.decode().strip()
    ipa_output = ipa_output.replace("ˈ", "").replace("ˌ", "").replace("-", "").replace("\n", " ")

    return ipa_output

# Function to syllabify IPA text
def syllabify_ipa(ipa_text):
    consonants = "ptkbdgmnlrsfvzʃʒɡʁjwŋtrɥgʀycɲ"
    vowels = "aeɛioɔuøœəɑ̃ɛ̃ɔ̃œ̃ɑ̃ɔ̃ɑ̃ɔ̃"
    phonemes = list(ipa_text.replace(" ", ""))
    syllables = []
    i = 0

    while i < len(phonemes):
        phone = phonemes[i]
        if phone in vowels:
            # Check if the next character is a combining diacritic
            if i + 1 < len(phonemes) and phonemes[i + 1] == "̃":  # Corrected: No space after tilde
                syllable = phone + phonemes[i + 1]  # Combine base character with diacritic
                syllables.append(syllable)
                i += 2  # Skip the diacritic in the next iteration
            else:
                syllables.append(phone)
                i += 1
        elif phone in consonants:
            # Check if there is a next phone
            if i + 1 < len(phonemes):
                next_phone = phonemes[i + 1]
                if next_phone in vowels:
                    # Check if the vowel has a combining diacritic
                    if i + 2 < len(phonemes) and phonemes[i + 2] == "̃":  # Corrected: No space after tilde
                        syllable = phone + next_phone + phonemes[i + 2]  # Combine consonant, vowel, and diacritic
                        syllables.append(syllable)
                        i += 3  # Skip the diacritic in the next iteration
                    else:
                        syllable = phone + next_phone
                        syllables.append(syllable)
                        i += 2
                else:
                    syllables.append(phone)
                    i += 1
            else:
                syllables.append(phone)
                i += 1
        else:
            i += 1

    return syllables

In [18]:
import subprocess

def text_to_ipa(text, language="fr"):
    """
    Convert text to IPA using espeak-ng.
    """
    # Remove special characters
    text = text.replace("?", "").replace("!", "").replace(".", "").replace(",", "").replace(":", "").replace(";", "").replace("'", "").replace("-", " ")

    command = ["espeak-ng", "-v", language, "-q", "--ipa"]
    process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = process.communicate(input=text.encode())
    ipa_output = stdout.decode().strip()
    ipa_output = ipa_output.replace("ˈ", "").replace("ˌ", "").replace("-", "").replace("\n", " ")

    return ipa_output

def save_ipa_to_file(ipa_text, filename):
    """
    Save IPA text to a file with UTF-8 encoding.
    """
    ipa_text = syllabify_ipa(ipa_text)
    print("syllables: ", ipa_text)
    with open(filename, "w", encoding="utf-8") as file:
        file.write(" ".join(ipa_text))

# Example usage
ipa_text = text_to_ipa("j'entends")
print(ipa_text)  # Output: ʒɑ̃tɑ̃
save_ipa_to_file(ipa_text, "output.txt") 

ʒɑ̃tɑ̃
syllables:  ['ʒɑ̃', 'tɑ̃']


In [25]:
ipa_to_target = {
    # Vowels
    "a": "a", "ɑ": "a", "ə": "e", "ɛ": "e", "ø": "e^", "œ": "e^", "i": "i", "y": "y", 
    "u": "u", "o": "o", "ɔ": "o^", "ɑ̃": "a~", "ɛ̃": "e~", "ɔ̃": "o~",

    # Consonants
    "b": "b", "c": "k", "d": "d", "f": "f", "ɡ": "g", "j": "j", "k": "k", "l": "l", 
    "m": "m", "n": "n", "p": "p", "s": "s", "t": "t", "v": "v", "w": "w", "z": "z", 
    "ɥ": "w", "ʁ": "r", "ʃ": "s^", "ʒ": "z^", "ɲ": "gn", 
}

import re

def convert_ipa_to_syllables(ipa_text, ipa_to_target):
    # Step 1: Convert IPA phonemes to target phonemes
    converted_text = []
    i = 0
    while i < len(ipa_text):
        char = ipa_text[i]
        # Check if the next character is a combining diacritic
        if i + 1 < len(ipa_text) and ipa_text[i + 1] == "̃":
            # Combine the base character with the diacritic
            combined_char = char + ipa_text[i + 1]
            # Map the combined character if it exists in the dictionary
            mapped_char = ipa_to_target.get(combined_char, combined_char)
            converted_text.append(mapped_char)
            i += 2  # Skip the diacritic in the next iteration
        else:
            # Map the single character
            mapped_char = ipa_to_target.get(char, char)
            converted_text.append(mapped_char)
            i += 1
    return "".join(converted_text)

# Example usage
ipa_text = "bɔ̃"
syllables = " ".join(syllabify_ipa(ipa_text))
print("old syllables: ", syllables)
new_syllables = convert_ipa_to_syllables(syllables, ipa_to_target)
print("new syllables: ", new_syllables)  # Output: ['ma', 's^e', 'miz', 'e', 'ru', 'si']



old syllables:  bɔ̃
new syllables:  bo~


ma


In [None]:
import multiprocessing

# Function to convert text to IPA using espeak-ng (modified for multiprocessing)
def text_to_ipa_worker(sentence):
    """
    Worker function for multiprocessing to convert text to IPA.
    """
    return text_to_ipa(sentence)

# Function to syllabify IPA text (modified for multiprocessing)
def syllabify_ipa_worker(ipa_sentence):
    """
    Worker function for multiprocessing to syllabify IPA text.
    """
    syllables = syllabify_ipa(ipa_sentence)
    new_syllables = convert_ipa_to_syllables(" ".join(syllables), ipa_to_target)
    return new_syllables

# load ipa sentences
ipa_path = "/scratch2/bsow/Documents/ACSR/data/claire_dialogue/ipa_train.txt"
ipa_sentences = []
with open(ipa_path, "r", encoding="utf-8") as file:
    for line in file:
        ipa_sentences.append(line.strip())

# Use multiprocessing to syllabify IPA sentences
with multiprocessing.Pool(8) as pool:
    syllabized_ipa_sentences = pool.map(syllabify_ipa_worker, ipa_sentences)

# Save the syllabized IPA sentences to a file
syllabized_ipa_output_path = "/scratch2/bsow/Documents/ACSR/data/claire_dialogue/syllabized_ipa_train.txt"
with open(syllabized_ipa_output_path, "a", encoding="utf-8") as file:
    for syllables in syllabized_ipa_sentences:
        file.write(" ".join(syllables) + "\n")

print(f"Syllabized IPA sentences saved to {syllabized_ipa_output_path}")

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np

# Step 1: Preprocess the Data
class SyllableDataset(Dataset):
    def __init__(self, syllabized_ipa_sentences, seq_length=5):
        self.syllabized_ipa_sentences = syllabized_ipa_sentences
        self.seq_length = seq_length
        self.syllables = self._get_syllables()
        self.syllable_to_idx = {syllable: i for i, syllable in enumerate(self.syllables)}
        self.idx_to_syllable = {i: syllable for syllable, i in self.syllable_to_idx.items()}
        self.vocab_size = len(self.syllables)
        self.data = self._create_sequences()

    def _get_syllables(self):
        # Flatten the list of syllabized sentences and count syllables
        all_syllables = [syllable for sentence in self.syllabized_ipa_sentences for syllable in sentence]
        syllable_counts = Counter(all_syllables)
        return sorted(syllable_counts.keys())  # Sort for consistent ordering

    def _create_sequences(self):
        # Create input-output pairs
        sequences = []
        for sentence in self.syllabized_ipa_sentences:
            for i in range(len(sentence) - self.seq_length):
                input_seq = sentence[i:i + self.seq_length]
                output_seq = sentence[i + self.seq_length]
                sequences.append((input_seq, output_seq))
        return sequences

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_seq, output_seq = self.data[idx]
        input_indices = [self.syllable_to_idx[syllable] for syllable in input_seq]
        output_index = self.syllable_to_idx[output_seq]
        return torch.tensor(input_indices, dtype=torch.long), torch.tensor(output_index, dtype=torch.long)

# Step 2: Build the Model
class NextSyllableLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1):
        super(NextSyllableLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        logits = self.fc(lstm_out[:, -1, :])  # Use the last hidden state
        return logits

# Step 3: Train the Model
def train_model(dataset, model, epochs=10, batch_size=32, learning_rate=0.001):
    # Split data into training and validation sets
    train_data, val_data = train_test_split(dataset, test_size=0.1, random_state=42)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

    # Define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Print training loss
        avg_loss = total_loss / len(train_loader)
        # Validation
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, targets in val_loader:
                outputs = model(inputs)
                val_loss += criterion(outputs, targets).item()
                _, predicted = torch.max(outputs, 1)
                correct += (predicted == targets).sum().item()
                total += targets.size(0)
            avg_val_loss = val_loss / len(val_loader)

        val_accuracy = correct / total
        print(f"Epoch [{epoch + 1}/{epochs}], Train Loss: {avg_loss:.4f}, Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")


In [22]:
# Step 4: Main Script
if __name__ == "__main__":
    # Load syllabized IPA sentences
    syllabized_ipa_sentences = []
    with open("/scratch2/bsow/Documents/ACSR/data/claire_dialogue/syllabized_ipa_train.txt", "r", encoding="utf-8") as file:
        for line in file:
            syllabized_ipa_sentences.append(line.strip().split())
        
    # Create dataset
    dataset = SyllableDataset(syllabized_ipa_sentences, seq_length=5)

    # Initialize model
    model = NextSyllableLSTM(
        vocab_size=dataset.vocab_size,
        embedding_dim=64,
        hidden_dim=128,
        num_layers=1
    )

    # Train the model
    train_model(dataset, model, epochs=100, batch_size=32, learning_rate=0.001)

    # Save the model
    torch.save(model.state_dict(), "next_syllable_lstm.pth")
    print("Model saved to next_syllable_lstm.pth")

Epoch [1/100], Train Loss: 4.1513, Validation Loss: 3.8234, Validation Accuracy: 0.2243
Epoch [2/100], Train Loss: 3.6539, Validation Loss: 3.6260, Validation Accuracy: 0.2563
Epoch [3/100], Train Loss: 3.4418, Validation Loss: 3.5114, Validation Accuracy: 0.2728
Epoch [4/100], Train Loss: 3.2739, Validation Loss: 3.4621, Validation Accuracy: 0.2872
Epoch [5/100], Train Loss: 3.1221, Validation Loss: 3.4223, Validation Accuracy: 0.2957
Epoch [6/100], Train Loss: 2.9802, Validation Loss: 3.4220, Validation Accuracy: 0.2954
Epoch [7/100], Train Loss: 2.8470, Validation Loss: 3.4073, Validation Accuracy: 0.3023
Epoch [8/100], Train Loss: 2.7193, Validation Loss: 3.4322, Validation Accuracy: 0.2972
Epoch [9/100], Train Loss: 2.5957, Validation Loss: 3.4483, Validation Accuracy: 0.3001
Epoch [10/100], Train Loss: 2.4755, Validation Loss: 3.4697, Validation Accuracy: 0.3029
Epoch [11/100], Train Loss: 2.3610, Validation Loss: 3.5014, Validation Accuracy: 0.3029
Epoch [12/100], Train Loss: 2.

In [4]:
# Step 4: Main Script
if __name__ == "__main__":
    # Load syllabized IPA sentences
    syllabized_ipa_sentences = []
    with open("/scratch2/bsow/Documents/ACSR/data/claire_dialogue/syllabized_ipa_train.txt", "r", encoding="utf-8") as file:
        for line in file:
            syllabized_ipa_sentences.append(line.strip().split())
        
    # Create dataset
    dataset = SyllableDataset(syllabized_ipa_sentences, seq_length=5)

    # Initialize model
    model = NextSyllableLSTM(
        vocab_size=dataset.vocab_size,
        embedding_dim=64,
        hidden_dim=128,
        num_layers=2
    )

    # Train the model
    train_model(dataset, model, epochs=100, batch_size=32, learning_rate=0.001)

    # Save the model
    torch.save(model.state_dict(), "next_syllable_lstm.pth")
    print("Model saved to next_syllable_lstm.pth")

Epoch [1/100], Train Loss: 3.9495, Validation Loss: 3.7184, Validation Accuracy: 0.2355
Epoch [2/100], Train Loss: 3.5261, Validation Loss: 3.5341, Validation Accuracy: 0.2653
Epoch [3/100], Train Loss: 3.3096, Validation Loss: 3.4296, Validation Accuracy: 0.2786
Epoch [4/100], Train Loss: 3.1389, Validation Loss: 3.3811, Validation Accuracy: 0.2955
Epoch [5/100], Train Loss: 2.9917, Validation Loss: 3.3627, Validation Accuracy: 0.2972
Epoch [6/100], Train Loss: 2.8550, Validation Loss: 3.3741, Validation Accuracy: 0.2995
Epoch [7/100], Train Loss: 2.7289, Validation Loss: 3.3950, Validation Accuracy: 0.2992
Epoch [8/100], Train Loss: 2.6044, Validation Loss: 3.4274, Validation Accuracy: 0.3008
Epoch [9/100], Train Loss: 2.4852, Validation Loss: 3.4880, Validation Accuracy: 0.2972
Epoch [10/100], Train Loss: 2.3679, Validation Loss: 3.5473, Validation Accuracy: 0.2947
Epoch [11/100], Train Loss: 2.2556, Validation Loss: 3.6062, Validation Accuracy: 0.2957
Epoch [12/100], Train Loss: 2.

KeyboardInterrupt: 