In [None]:
# Please install the following libraries
%pip install numpy
%pip install soundfile
%pip install sounddevice

#### Note:

The pip installs above were verified to work in a VS Code environment on Windows 11. In Google Colab, the following import may be required in lieu of the `sounddevice` import:

* The code would need slight adjustments for Google Colab.

```python
from IPython.display import Audio, display
```


In [None]:
"""Plan for the project: """

# 1. Run a loop from 1 to 708
# 2. In each iteration, fetch a .wav file from the path "./UrduPhoneticSpeechCorpus/Recordings-Continuous/wav/c{i}.wav" (for example, c4.wav)
# 3. Load the .wav file
# 4. Increase the playback speed of the audio by 1.75x and then play it
# 5. Then prompt the user for the following questions quickly:
#       1) Choose the speaker (give options [1,2,3,4,5,6,7,8,9,10])
#       2) If the gender-list has the gender for the speaker, skip. Otherwise, ask for the gender of the speaker (give options [M, F]).
#       3) Was there any background noise? (give options [clear, slight, moderate, heavy])
# 6. Save the information taken for each file in a dictionary with the following format:
#       {file_id: i, speaker: speaker_number, speaker_gender: speaker_gender, background_noise: background_noise, duration: duration_of_audio_in_seconds_to_two_decimal_places}
# 7. Finally, save each dictionary as one JSON line in a file named "metadata_dataset-1.jsonl"
# 8. Lastly, using the data, count the total number of speakers in the entire dataset, their genders, the overall noise ratios, and the total duration of the dataset.
# 9. Save this information in a json file named "metadata_dataset-1_summary.json"
# Note: Keep saving/updating metadata_dataset-1.jsonl and metadata_dataset-1_summary.json in each iteration to avoid losing data in case of an error.

In [None]:

import json
import soundfile as sf
import sounddevice as sd
import numpy as np

# Gender of each speaker, indexed by (speaker number - 1).
# 'U' means the gender has not been asked for yet.
speaker_gender = ['U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U']

# Settings for the annotation pass.
PLAYBACK_SPEED = 1.75                       # speed-up factor applied before playback
VALID_GENDERS = ("M", "F")                  # accepted answers for the gender prompt
METADATA_PATH = "metadata_dataset-1.jsonl"  # one JSON record per line

# Resume support: records are appended one per file, so a previous (possibly
# interrupted) run may already have annotated some files. Collect their ids so
# they are not appended a second time, and restore the genders already entered.
# Delete the metadata file to start the annotation from scratch.
processed_ids = set()
try:
    with open(METADATA_PATH, "r") as file:
        for line in file:
            if not line.strip():
                continue
            record = json.loads(line)
            processed_ids.add(record["file_id"])
            speaker_gender[int(record["speaker"]) - 1] = record["speaker_gender"]
except FileNotFoundError:
    # First run: there is nothing to resume from.
    pass

# BEGIN: Loop from 1 to 708
for i in range(1, 709):

    # Skip files that already have a record from an earlier run
    if i in processed_ids:
        continue

    # Fetch .wav file
    file_path = f"./UrduPhoneticSpeechCorpus/Recordings-Continuous/wav/c{i}.wav"

    # Load .wav file
    audio, sample_rate = sf.read(file_path)

    # Increase playback speed by 1.75x: resample the signal at every
    # PLAYBACK_SPEED-th (fractional) sample position, so the clip holds fewer
    # samples and finishes sooner when played at the original sample rate.
    sample_positions = np.arange(0, len(audio), PLAYBACK_SPEED)
    source_positions = np.arange(len(audio))
    if audio.ndim == 1:
        audio_speed = np.interp(sample_positions, source_positions, audio)
    else:
        # np.interp only handles 1-D data, so resample each channel separately.
        audio_speed = np.column_stack([
            np.interp(sample_positions, source_positions, audio[:, channel])
            for channel in range(audio.shape[1])
        ])

    # Play audio
    # NOTE(review): sd.play() is documented as non-blocking, so playback runs
    # while the prompts below are answered; with the hard-coded answers the
    # next iteration starts almost immediately -- confirm this is intended.
    sd.play(audio_speed, sample_rate)

    # Prompt user for speaker, gender, and background noise
    speaker = "1" #speaker = input("Choose the speaker: ")
    speaker_index = int(speaker) - 1
    print(f"Speaker {speaker} is speaking...")
    if speaker_gender[speaker_index] == 'U':
        # Only accept a valid gender code so the metadata stays consistent.
        answer = input("Gender of the speaker: ").strip().upper()
        while answer not in VALID_GENDERS:
            print(f"Please answer with one of {list(VALID_GENDERS)}.")
            answer = input("Gender of the speaker: ").strip().upper()
        speaker_gender[speaker_index] = answer
    background_noise = "C" #background_noise = input("Any background noise? (options: [C, S, M, H]): ")

    # Calculate duration of the original (not sped-up) audio in seconds
    duration = len(audio) / sample_rate

    # Create dictionary with information
    info = {
        "file_id": i,
        "speaker": speaker,
        "speaker_gender": speaker_gender[speaker_index],
        "background_noise": background_noise,
        "duration": round(duration, 2)
    }

    # Append the record to metadata_dataset-1.jsonl right away so that an
    # error in a later iteration does not lose the work done so far.
    with open(METADATA_PATH, "a") as file:
        json.dump(info, file)
        file.write("\n")
    
# Calculate total number of speakers, genders, noise ratios, and total duration
# from the per-file records stored in metadata_dataset-1.jsonl.
SUMMARY_INPUT_PATH = "metadata_dataset-1.jsonl"
SUMMARY_OUTPUT_PATH = "metadata_dataset-1_summary.json"
NOISE_CODES = ("C", "S", "M", "H")  # clear, slight, moderate, heavy

# Read metadata_dataset-1.jsonl. Records are keyed by file_id so that a file
# annotated more than once (e.g. the annotation cell was re-run and appended
# again) is counted a single time, using its most recent record.
records_by_file_id = {}
with open(SUMMARY_INPUT_PATH, "r") as file:
    for line in file:
        if not line.strip():
            continue  # tolerate blank lines
        info = json.loads(line)
        records_by_file_id[info["file_id"]] = info

records = list(records_by_file_id.values())
total_samples = len(records)
speakers = [record["speaker"] for record in records]
noise_ratios = [record["background_noise"] for record in records]
total_duration = sum(record["duration"] for record in records)

# Count unique speakers and pair each with the gender stored in its records.
# Reading the gender from the file (rather than from kernel state) lets this
# summary be rebuilt from the metadata file alone.
gender_by_speaker = {record["speaker"]: record["speaker_gender"] for record in records}
unique_speakers = len(gender_by_speaker)
# Sort numerically so the output order is stable between runs.
unique_speakers_with_gender = [
    {"speaker": speaker, "gender": gender_by_speaker[speaker]}
    for speaker in sorted(gender_by_speaker, key=int)
]

# Calculate noise ratio percentages (0.0 for every level when there are no records)
noise_percentages = {
    code: (noise_ratios.count(code) / total_samples * 100) if total_samples else 0.0
    for code in NOISE_CODES
}
clear_noise = noise_percentages["C"]
slight_noise = noise_percentages["S"]
moderate_noise = noise_percentages["M"]
heavy_noise = noise_percentages["H"]

# Create summary dictionary
summary = {
    "total_samples": total_samples,
    "total_speakers": unique_speakers,
    "speakers": unique_speakers_with_gender,
    "noise" : {
        "clear_noise": round(clear_noise, 2),
        "slight_noise": round(slight_noise, 2),
        "moderate_noise": round(moderate_noise, 2),
        "heavy_noise": round(heavy_noise, 2)
    },
    "total_duration": round(total_duration, 2)
}

# Save summary in metadata_dataset-1_summary.json
with open(SUMMARY_OUTPUT_PATH, "w") as file:
    json.dump(summary, file)

#Print Summary
print("The task is completed. Here is the summary:")
print(summary)