In [None]:
!pip install git+https://github.com/openai/whisper.git


In [None]:
!pip install jiwer


In [None]:
!pip install python-Levenshtein


In [None]:
from google.colab import drive

# Connect to Google Drive: mount it at /content/drive so the
# /content/drive/MyDrive/... paths used by the cells below resolve
drive.mount('/content/drive')

In [None]:
import os
import csv
import whisper

# Loaded Whisper models keyed by model name. Keeping the model here means the
# checkpoint is loaded once per session instead of once per audio file.
_WHISPER_MODELS = {}


def transcribe_audio_to_text(audio_path):
    """
    Transcribe the given audio file to text using the Whisper model.

    The "large-v3" model is loaded on the first call and reused afterwards,
    so transcribing a whole folder does not reload it for every file.

    Args:
        audio_path (str): The path to the audio file to transcribe.

    Returns:
        str: The value stored under the "text" key of Whisper's result.
    """
    model = _WHISPER_MODELS.get("large-v3")
    if model is None:
        model = whisper.load_model("large-v3")
        _WHISPER_MODELS["large-v3"] = model

    # language="fa" fixes the decoding language (Persian) instead of letting
    # Whisper auto-detect it for each file
    result = model.transcribe(audio_path, language="fa")

    return result["text"]

def transcribe_folder_to_csv(folder_path, csv_path):
    """
    Transcribe all .wav files in the given folder and store the results in a CSV file,
    skipping files that have already been transcribed.

    The CSV is opened in append mode so an interrupted run can be resumed:
    filenames already present in its first column are not transcribed again.

    Args:
        folder_path (str): The path to the folder containing .wav files.
        csv_path (str): The path to the CSV file holding "Filename" and
            "Transcribed Text" columns. It is created if it does not exist.
    """
    already_transcribed = set()
    # Check if the CSV file exists and read already transcribed filenames
    if os.path.exists(csv_path):
        with open(csv_path, mode='r', newline='', encoding='utf-8') as file:
            reader = csv.reader(file)
            next(reader, None)  # Skip the header
            # `if row` skips blank lines, which would otherwise raise IndexError
            already_transcribed = {row[0] for row in reader if row}

    # Decide about the header from the file itself rather than from the set
    # above: a CSV that holds only a header has no transcribed rows yet, but
    # must not receive a second header.
    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0

    # Open the CSV file in append mode
    with open(csv_path, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(["Filename", "Transcribed Text"])

        # Sorted so the processing order does not depend on the arbitrary
        # order returned by os.listdir
        for filename in sorted(os.listdir(folder_path)):
            if not filename.endswith(".wav") or filename in already_transcribed:
                continue

            # Full path to the current audio file
            audio_path = os.path.join(folder_path, filename)

            # Transcribe the audio to text
            transcribed_text = transcribe_audio_to_text(audio_path)

            # Write the filename and transcribed text to the CSV
            writer.writerow([filename, transcribed_text])
            # Flush after every row so finished work survives an interrupted
            # session and the next run can resume from it
            file.flush()
            print(f"Transcribed: {filename}")

# Example usage: transcribe the VITS samples.
# The CSV is stored in the same folder as the audio; only entries ending in
# ".wav" are transcribed, so the CSV itself is never treated as input.
folder_path = "/content/drive/MyDrive/AutoMOS/VITS"
csv_path = "/content/drive/MyDrive/AutoMOS/VITS/Transcribed_VITS.csv"
transcribe_folder_to_csv(folder_path, csv_path)


In [None]:
!ls /content/drive/MyDrive/AutoMOS/XTTS/

In [None]:
import pandas as pd

def merge_csv_files(transcribed_csv_path, metadata_csv_path, output_csv_path):
    """
    Join two CSV files on their shared "Filename" column and write the result.

    Only filenames present in both inputs are kept (inner join). The output
    is written without the DataFrame index and a confirmation is printed.

    Args:
        transcribed_csv_path (str): CSV file containing the transcribed text.
        metadata_csv_path (str): CSV file containing the additional metadata.
        output_csv_path (str): Destination path of the merged CSV file.
    """
    left_table = pd.read_csv(transcribed_csv_path)
    right_table = pd.read_csv(metadata_csv_path)

    # Columns of the transcription file come first, metadata columns follow
    combined = left_table.merge(right_table, how="inner", on="Filename")
    combined.to_csv(output_csv_path, index=False)

    print(f"Merged file saved to {output_csv_path}")

# Example usage: merge the XTTS transcriptions with the XTTS metadata.
# NOTE: output_csv_path is relative, so the merged file is written to the
# current working directory (presumably /content on Colab, where a later cell
# reads /content/Merged_XTTS.csv — verify).
transcribed_csv_path = "/content/drive/MyDrive/AutoMOS/XTTS/Transcribed_Metadata_XTTS - Transcribed_Metadata_XTTS.csv"
metadata_csv_path = "/content/drive/MyDrive/AutoMOS/XTTS/Metadata_XTTS.csv"
output_csv_path = "Merged_XTTS.csv"

merge_csv_files(transcribed_csv_path, metadata_csv_path, output_csv_path)


In [None]:
import whisper
import os
import csv

# Loaded Whisper models keyed by model name. Keeping the model here means the
# checkpoint is loaded once per session instead of once per audio file.
_WHISPER_MODELS = {}


def transcribe_audio_to_text(audio_path):
    """
    Transcribe the given audio file to text using the Whisper model.

    The "large-v3" model is loaded on the first call and reused afterwards,
    so transcribing a whole folder does not reload it for every file.

    Args:
        audio_path (str): The path to the audio file to transcribe.

    Returns:
        str: The transcribed text (the "text" entry of Whisper's result).
    """
    model = _WHISPER_MODELS.get("large-v3")
    if model is None:
        model = whisper.load_model("large-v3")
        _WHISPER_MODELS["large-v3"] = model

    # language="fa" fixes the decoding language (Persian) instead of letting
    # Whisper auto-detect it for each file
    result = model.transcribe(audio_path, language="fa")

    return result["text"]

def transcribe_folder_to_csv(folder_path, csv_path):
    """
    Run speech-to-text on every .wav file of a folder and record the results.

    The CSV at `csv_path` is (re)created on each call: a header row is written
    first, then one row per audio file with its name and transcription.

    Args:
        folder_path (str): Folder that contains the .wav files.
        csv_path (str): CSV file that receives the transcriptions.
    """
    with open(csv_path, mode='w', newline='', encoding='utf-8') as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(["Filename", "Transcribed Text"])

        for entry in os.listdir(folder_path):
            # Anything that is not a .wav file is ignored
            if not entry.endswith(".wav"):
                continue

            text = transcribe_audio_to_text(os.path.join(folder_path, entry))
            csv_out.writerow([entry, text])
            print(f"Transcribed: {entry}")

# Path to the folder containing the XTTS .wav files
folder_path = "/content/drive/MyDrive/AutoMOS/XTTS"

# Path to the CSV file where results will be stored (inside the audio folder;
# it does not end in ".wav", so it is not picked up as input)
csv_path = "/content/drive/MyDrive/AutoMOS/XTTS/Transcribed_XTTS.csv"

# Transcribe all .wav files in the folder and store the results in the CSV file.
# The transcribe_folder_to_csv defined in this cell opens the CSV in 'w' mode,
# so an existing file at csv_path is overwritten.
transcribe_folder_to_csv(folder_path, csv_path)

In [None]:
import pandas as pd

# Load the VITS metadata table and the Whisper transcriptions of the VITS samples
metadata_vits = pd.read_csv('/content/drive/MyDrive/AutoMOS/VITS/Metadata_VITS.csv')
transcribed_metadata_vits = pd.read_csv('/content/drive/MyDrive/AutoMOS/VITS/Transcribed_VITS.csv')

# Left join on 'Filename': every metadata row is kept, and rows without a
# transcription get a missing value
merged_df_vits = metadata_vits.merge(transcribed_metadata_vits, how='left', on='Filename')

# Move the transcription from the 'Transcribed Text' column to a column named
# 'Transcribed' (pop removes the old column and returns its values)
merged_df_vits['Transcribed'] = merged_df_vits.pop('Transcribed Text')

# Write the result to a new CSV file in the current working directory
merged_df_vits.to_csv('Merged_Metadata_VITS.csv', index=False)

print("Merged CSV file saved successfully.")


Sort the results and merge them with the original transcripts

In [None]:
import csv

def sort_csv_by_filename(csv_path, sorted_csv_path):
    """
    Sort a CSV file by the number embedded in the filename of its first column.

    The first row is treated as the header and copied unchanged. Data rows are
    ordered numerically by the last run of digits in the filename (extension
    excluded), so 'v2.wav' comes before 'v10.wav'. Filenames without any digit
    are placed after the numbered ones, ordered by name. Blank rows are dropped.
    An empty input file produces an empty output file.

    Args:
        csv_path (str): The path to the original CSV file.
        sorted_csv_path (str): The path to save the sorted CSV file.
    """
    import re  # local import: the enclosing cell only imports csv

    def filename_key(record):
        name = record[0]
        # Drop the extension so digits in it (e.g. '.mp3') are not used
        stem = name.rsplit(".", 1)[0]
        numbers = re.findall(r"\d+", stem)
        if numbers:
            return (0, int(numbers[-1]), name)
        return (1, 0, name)

    # newline='' lets the csv module handle line breaks inside quoted fields
    with open(csv_path, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        header = next(reader, None)  # None when the file is empty
        records = [row for row in reader if row]

    records.sort(key=filename_key)

    # Write the sorted records back to a new CSV
    with open(sorted_csv_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if header is not None:
            writer.writerow(header)  # Write the header first
        writer.writerows(records)  # Then write the sorted records

# Paths to the original and sorted CSV files (the sorted copy is written next
# to the original; the original is left untouched)
csv_path = "/content/drive/MyDrive/test voice/result.csv"
sorted_csv_path = "/content/drive/MyDrive/test voice/sorted_results.csv"

# Sort the CSV file by the number in the filename of its first column
sort_csv_by_filename(csv_path, sorted_csv_path)


WER and CER

In [None]:
# Load the merged VITS table and display it as the cell output.
# NOTE(review): no cell visible here writes /content/Merged_VITS.csv (the VITS
# merge cell saves 'Merged_Metadata_VITS.csv') — confirm how this file is produced.
Merged_VITS=pd.read_csv('/content/Merged_VITS.csv')
Merged_VITS

In [None]:
import csv
from jiwer import wer, cer

def calculate_wer_cer_from_csv(csv_path, reference_column='text',
                               hypothesis_column='Transcribed Text'):
    """
    Print the average Word Error Rate (WER) and Character Error Rate (CER)
    over the rows of a CSV file.

    Each row is scored on its own and the per-row rates are averaged with
    equal weight. If the file has no data rows, a message is printed and
    nothing is computed.

    Args:
        csv_path (str): The path to the CSV file.
        reference_column (str): Column holding the reference text. Defaults to
            'text' — assumed to be the sentence that was synthesized; confirm
            against the metadata CSV.
        hypothesis_column (str): Column holding the speech-to-text output.
            Defaults to 'Transcribed Text', the header written by
            transcribe_folder_to_csv.
    """
    wer_values = []
    cer_values = []

    with open(csv_path, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            reference_text = row[reference_column]
            hypothesis_text = row[hypothesis_column]
            # jiwer expects (reference, hypothesis) and normalises by the
            # reference length, so the argument order affects the score
            wer_values.append(wer(reference_text, hypothesis_text))
            cer_values.append(cer(reference_text, hypothesis_text))

    # Without this guard an empty CSV would raise ZeroDivisionError below
    if not wer_values:
        print(f"No rows found in {csv_path}; WER and CER were not computed.")
        return

    # Calculate average WER and CER across all entries
    average_wer = sum(wer_values) / len(wer_values)
    average_cer = sum(cer_values) / len(cer_values)

    print(f"Average Word Error Rate (WER): {average_wer:.2f}")
    print(f"Average Character Error Rate (CER): {average_cer:.2f}")

# Path to the merged VITS CSV file (the same file loaded into Merged_VITS above)
csv_path = "/content/Merged_VITS.csv"

# Calculate and print the average WER and CER
calculate_wer_cer_from_csv(csv_path)


In [None]:
# Load the merged XTTS table and display it as the cell output
Merged_XTTS=pd.read_csv('/content/Merged_XTTS.csv')
Merged_XTTS

In [None]:
import csv
from jiwer import wer, cer

def calculate_wer_cer_from_csv(csv_path, reference_column='text',
                               hypothesis_column='Transcribed'):
    """
    Print the average Word Error Rate (WER) and Character Error Rate (CER)
    over the rows of a CSV file.

    Each row is scored on its own and the per-row rates are averaged with
    equal weight. If the file has no data rows, a message is printed and
    nothing is computed.

    Args:
        csv_path (str): The path to the CSV file.
        reference_column (str): Column holding the reference text. Defaults to
            'text' — assumed to be the sentence that was synthesized; confirm
            against the metadata CSV.
        hypothesis_column (str): Column holding the speech-to-text output.
            Defaults to 'Transcribed' — presumably the Whisper transcription
            in the merged XTTS file; verify against its columns.
    """
    wer_values = []
    cer_values = []

    with open(csv_path, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            reference_text = row[reference_column]
            hypothesis_text = row[hypothesis_column]
            # jiwer expects (reference, hypothesis) and normalises by the
            # reference length, so the argument order affects the score
            wer_values.append(wer(reference_text, hypothesis_text))
            cer_values.append(cer(reference_text, hypothesis_text))

    # Without this guard an empty CSV would raise ZeroDivisionError below
    if not wer_values:
        print(f"No rows found in {csv_path}; WER and CER were not computed.")
        return

    # Calculate average WER and CER across all entries
    average_wer = sum(wer_values) / len(wer_values)
    average_cer = sum(cer_values) / len(cer_values)

    print(f"Average Word Error Rate (WER): {average_wer:.2f}")
    print(f"Average Character Error Rate (CER): {average_cer:.2f}")

# Path to the merged XTTS CSV file (the same file loaded into Merged_XTTS above)
csv_path = "/content/Merged_XTTS.csv"

# Calculate and print the average WER and CER
calculate_wer_cer_from_csv(csv_path)

In [None]:
import matplotlib.pyplot as plt

# Error rates per model, listed in the same order as `models`
models = ['VITS', 'xTTS']
wer_values = [0.47, 0.32]
cer_values = [0.18, 0.10]

fig, ax = plt.subplots(figsize=(10, 6))

# One group per model; each group holds a WER bar and a CER bar side by side
positions = range(len(models))
width = 0.4

plt.bar([p - width / 2 for p in positions], wer_values, width, alpha=0.7, label='WER')
plt.bar([p + width / 2 for p in positions], cer_values, width, alpha=0.7, label='CER')

# Axis label, title, per-group tick labels and legend
plt.ylabel('Rates')
plt.title('WER and CER Comparison between VITS and xTTS Models')
plt.xticks(positions, models)
plt.legend()

# Print each value slightly above its bar
for i, (wer_value, cer_value) in enumerate(zip(wer_values, cer_values)):
    plt.text(i - width / 2, wer_value + 0.02, f"{wer_value:.2f}", ha='center')
    plt.text(i + width / 2, cer_value + 0.02, f"{cer_value:.2f}", ha='center')

# Adjust the layout so nothing is clipped, then render the figure
plt.tight_layout()

plt.show()


# Result

The comparative analysis between the VITS and xTTS text-to-speech models was conducted using Whisper large-v3 for speech-to-text transcription, aiming to evaluate the performance of synthesized audio outputs across various content categories. These categories included News, Dialogue, Education, Public Announcements, Accessibility, Advertisements, and Navigation. By transcribing the synthesized audio of these two models and calculating the Word Error Rate (WER) and Character Error Rate (CER), the study sought to provide a comprehensive understanding of each model's effectiveness in generating natural, accurate speech across a diverse range of applications.

The results revealed that the xTTS model outperformed the VITS model in terms of both WER and CER, with the xTTS model achieving a WER of 0.32 and a CER of 0.10, compared to the VITS model's WER of 0.47 and CER of 0.18. This indicates that speech synthesized by the xTTS model is transcribed more accurately by Whisper, with fewer word-level and character-level errors in the transcribed text. Such findings suggest that the xTTS model may be more reliable for applications requiring high precision in speech synthesis, such as educational content, public announcements, and accessibility features, where clear and accurate communication is paramount.

This comparison sheds light on the potential applications and suitability of each model for various tasks. The superior performance of the xTTS model, particularly in generating content for Education, Navigation, and Accessibility, highlights its capacity to produce more intelligible and natural-sounding speech. On the other hand, while the VITS model exhibits a higher error rate, it still holds value for applications where the ultimate realism and naturalness of the speech may not be as critical. This study underscores the importance of selecting the right text-to-speech model based on the specific needs of the application, ensuring that the synthesized audio meets the desired criteria for accuracy and intelligibility.