In [1]:
from IPython.display import display, HTML
# Inject a CSS rule that forces elements with class `container` to 100% width.
display(HTML("<style>.container {width:100% !important;} </style>"))

In [2]:
import os
import datetime
import sys
import glob
import random
import pickle

import numpy         as np
import pandas        as pd

from scipy.fftpack   import fft
from pydub           import AudioSegment
from IPython.display import Audio

from pydub.silence   import split_on_silence
from pydub.silence   import detect_leading_silence

from tqdm            import tqdm

In [3]:
# Seed every random source used by the notebook with the same value.
# Note: PYTHONHASHSEED is read at interpreter start-up, so setting it here
# only affects processes spawned from this kernel, not the kernel itself.
_seed = 42
os.environ['PYTHONHASHSEED'] = f"{_seed}"
random.seed(_seed)
np.random.seed(_seed)

<h1 style="background-color:LightGreen;"> <center> <a id='start_cell'></a> Table Of Contents </center></h1>

[Create Datasets](#create_dataset) <br>

<h1 style="background-color:#3cA8EF;"> <center> <a id='create_dataset'></a> Choose Files </center> </h1>

<h1 style="color:red"> Choose speakers with at least 10 files </h1>

In [4]:
# Root folder of the Common Voice 14 (ru) corpus; the location depends on the host OS.
_on_windows  = sys.platform == "win32"
dataset_path = (
    r"C:\Users\amitli\Datasets\cv-corpus-14.0-2023-06-23\ru"  # windows
    if _on_windows
    else r"/home/amitli/Datasets/CV_14/cv-corpus-14.0-2023-06-23-ru/cv-corpus-14.0-2023-06-23/ru/"  # linux
)

In [37]:
# Set CHOOSE_FILES = True to rebuild the speaker list from the corpus TSV files;
# leave it False to reuse the previously saved CSV.
CHOOSE_FILES = False

if CHOOSE_FILES:
    train_df = pd.read_csv(dataset_path + "/train.tsv", delimiter="\t")
    test_df  = pd.read_csv(dataset_path + "/test.tsv",  delimiter="\t")
    # ignore_index avoids duplicate row labels coming from the two source frames
    df_all   = pd.concat([train_df, test_df], ignore_index=True)

    # keep only speakers (client_id) that have at least 10 clips
    grouped         = df_all.groupby('client_id')
    filtered_groups = grouped.filter(lambda x: len(x) >= 10)
    df              = filtered_groups.reset_index(drop=True)

    # index=False: otherwise the row index is written as an extra column and
    # comes back as "Unnamed: 0" on reload, so the two branches of this cell
    # would produce different frames. CSV files saved by an older version of
    # this cell may still contain that column.
    df.to_csv("cv_14_walkie_ru.csv", index=False)
else:
    df = pd.read_csv("cv_14_walkie_ru.csv")

In [38]:
# Summary of the loaded selection: distinct speakers and total number of rows.
print(f"Number of speakers: {len(set(df['client_id'].values))}")
print(f"Dataframe size    : {len(df)}")

Number of speakers: 527
Dataframe size    : 29415


<h2> Keep only speakers that have between 10 and 30 files: </h2>

In [40]:
# Keep only speakers whose number of clips is in the inclusive range 10..30.
# Note: this rebinds `df` to the filtered frame.
grouped         = df.groupby('client_id')
filtered_groups = grouped.filter(lambda speaker_rows: 10 <= len(speaker_rows) <= 30)
df              = pd.DataFrame(filtered_groups)
print(f"Number of speakers: {len(set(df['client_id'].values))}")
print(f"Dataframe size    : {len(df)}")

Number of speakers: 260
Dataframe size    : 3314


<h1 style="background-color:#3cA8EF;"> <center> <a id='player'></a> Player </center> </h1>

In [7]:
from pygame import mixer
import time
import IPython
import os

pygame 2.4.0 (SDL 2.26.4, Python 3.10.0)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [8]:
# Maps a number to the sine-tone WAV file that encodes it.
# Key 0 is a 500 Hz / 1.5 s tone; keys 1..10 are 1 s tones from 1000 Hz to 1900 Hz.
num_to_freq_file = {
    0:  'encoded_tones/audiocheck.net_sin_500Hz_-3dBFS_1.5s.wav',
    1:  'encoded_tones/audiocheck.net_sin_1000Hz_-10dBFS_1s.wav',
    2:  'encoded_tones/audiocheck.net_sin_1100Hz_-10dBFS_1s.wav',
    3:  'encoded_tones/audiocheck.net_sin_1200Hz_-10dBFS_1s.wav',
    4:  'encoded_tones/audiocheck.net_sin_1300Hz_-10dBFS_1s.wav',
    5:  'encoded_tones/audiocheck.net_sin_1400Hz_-10dBFS_1s.wav',
    6:  'encoded_tones/audiocheck.net_sin_1500Hz_-10dBFS_1s.wav',
    7:  'encoded_tones/audiocheck.net_sin_1600Hz_-10dBFS_1s.wav',
    8:  'encoded_tones/audiocheck.net_sin_1700Hz_-10dBFS_1s.wav',
    9:  'encoded_tones/audiocheck.net_sin_1800Hz_-10dBFS_1s.wav',
    10: 'encoded_tones/audiocheck.net_sin_1900Hz_-10dBFS_1s.wav',
}

# Report (but do not fail on) tone files that are not present on disk.
for tone_path in num_to_freq_file.values():
    if os.path.exists(tone_path):
        continue
    print(f"Missing encoding file: {tone_path}")

In [13]:
def play_file(filename, fileLengthInSeconds, log_file):
    """Play one audio file through the pygame mixer and block until it is done.

    :param filename: path of the audio file to load into `mixer.music`
    :param fileLengthInSeconds: how long to sleep while the file plays
    :param log_file: writable object; a notice is written to it each time the
        mixer is still busy after the expected play time
    """
    music = mixer.music
    music.load(filename)
    music.play()

    # wait for the nominal duration plus a small safety margin
    time.sleep(fileLengthInSeconds)
    time.sleep(0.05)

    # if playback has not finished yet, poll every 50 ms and record each wait
    busy_notice = "\n---> busy - wait -----\n"
    while True:
        if not music.get_busy():
            break
        print(busy_notice)
        log_file.write(busy_notice)
        time.sleep(0.05)

In [10]:
def play_beep(log_file, start_time):
    """Play the 1-second marker tone (`num_to_freq_file[1]`) and log its start and end.

    :param log_file: writable object that receives the two log lines
    :param start_time: reference `time.time()` value; logged elapsed times are
        relative to it
    """
    def _emit(message):
        # every log line goes both to the notebook output and to the log file
        print(message)
        log_file.write(message)

    _emit(f"\nStart play beep, Elapse: {round(time.time()-start_time, 3)}")

    play_file(num_to_freq_file[1], 1, log_file)

    _emit(f"\nEnd play beep, Elapse: {round(time.time()-start_time, 3)}")

In [17]:
def play_language(df, dataset_path,  start_from_client=0):
    """Play every clip listed in `df`, speaker by speaker, and log the timing.

    Two marker beeps are played first; then, for each speaker (sorted by
    `client_id`), each clip is preceded by one beep and followed by a 2 second
    pause. All events are printed and written to `log_<timestamp>.txt` in the
    current directory; `parse_log` relies on the exact wording of these lines.

    :param df: frame with at least the columns `client_id` and `path`
    :param dataset_path: corpus root; clips are read from `<dataset_path>/clips/`
    :param start_from_client: zero-based index (in sorted speaker order) of the
        first speaker to play; earlier speakers are skipped
    """
    current_date = datetime.datetime.now().strftime('%d_%m_%Y_%H_%M_%S')
    all_clients  = sorted(set(df.client_id.values))

    mixer.init()
    start_time = time.time()
    try:
        with open(f"log_{current_date}.txt", "w") as f:

            def log(message):
                # Flush after every message so a crash during a long run does
                # not lose the buffered tail of the log.
                print(message)
                f.write(message)
                f.flush()

            # two marker beeps at the start of the session
            for _ in range(2):
                play_beep(f, start_time)
                time.sleep(0.05)

            for c_i, client_id in enumerate(all_clients):

                if c_i < start_from_client:
                    continue

                client_files = df[df.client_id == client_id].path.values
                log(f"\n\nClient_Id: {client_id} [{c_i+1}/{len(all_clients)}] Files: {len(client_files)}")

                for file_i, file in enumerate(client_files):

                    # get file params
                    fullFilePath = f"{dataset_path}/clips/{file}"
                    fileLength   = AudioSegment.from_mp3(fullFilePath).duration_seconds

                    # play beep
                    play_beep(f, start_time)
                    time.sleep(0.05)

                    # play file
                    log(f"\n\t[{file_i+1}/{len(client_files)}] Start Play file: {file}, (length: {fileLength} seconds), Elapse: {round(time.time()-start_time, 3)}")

                    play_file(fullFilePath, fileLength, f)

                    log(f"\n\t[{file_i+1}/{len(client_files)}] ---> Played file: {file}, length: {fileLength} seconds, Elapse: {round(time.time()-start_time, 3)}")

                    # sleep (prevent blocking)
                    time.sleep(2)

            log(f"\nClose File, Elapse: {round(time.time()-start_time, 3)}")
            # no explicit f.close(): the `with` block closes the log file
    finally:
        # Release the audio device even when the run is interrupted
        # (e.g. KeyboardInterrupt); this also stops any clip still playing.
        mixer.quit()
        

In [None]:
#AudioSegment.from_mp3(r"C:\Users\amitli\Datasets\cv-corpus-14.0-2023-06-23\ru\clips\common_voice_ru_21107819.mp3")
#AudioSegment.from_mp3(r"C:\Users\amitli\Datasets\cv-corpus-14.0-2023-06-23\ru\clips\common_voice_ru_21107820.mp3")

In [19]:
play_language(df, dataset_path, start_from_client=100)        


Start play beep, Elapse: 0.001

End play beep, Elapse: 1.054

Start play beep, Elapse: 1.104

End play beep, Elapse: 2.154


Client_Id: 5f5e9ef87849e782484699655dd628743113bb94f6d2f108e6b5653fe74006818d9b12cc8b7c1883b196e4179e7b2031819289243e458978d95abc5aa4261c38 [101/527] Files: 12

Start play beep, Elapse: 3.038

End play beep, Elapse: 4.089

	[1/12] Start Play file: common_voice_ru_25219308.mp3, (length: 7.74 seconds), Elapse: 4.14

	[1/12] ---> Played file: common_voice_ru_25219308.mp3, length: 7.74 seconds, Elapse: 11.934

Start play beep, Elapse: 15.103

End play beep, Elapse: 16.154

	[2/12] Start Play file: common_voice_ru_25219309.mp3, (length: 4.536 seconds), Elapse: 16.205

	[2/12] ---> Played file: common_voice_ru_25219309.mp3, length: 4.536 seconds, Elapse: 20.792

Start play beep, Elapse: 23.953

End play beep, Elapse: 25.004

	[3/12] Start Play file: common_voice_ru_25219310.mp3, (length: 4.428 seconds), Elapse: 25.054

	[3/12] ---> Played file: common_voice_ru_2521931

KeyboardInterrupt: 

<h1 style="background-color:#3cA8EF;"> <center><a id='parseLog'></a> Parse player log </center> </h1>

In [None]:
def parse_log(log_file):
    """Parse a player log written by `play_language`.

    Only three kinds of lines are used: "Client_Id" (new speaker),
    "Start Play file:" (start time of a clip) and "---> Played file"
    (clip name, clip length and end time). All other lines are ignored.

    :param log_file: path of the log file
    :return: tuple of six lists:
        arr_clients            - client id per played clip (repeats per clip)
        arr_cv                 - clip file name per played clip
        arr_time               - clip length in seconds per played clip
        arr_num_cv_per_speaker - number of played clips per speaker
        arr_start_speech_time  - elapsed start time (seconds) per played clip
        arr_end_speech_time    - elapsed end time (seconds) per played clip
    """
    arr_clients            = []   # arr of client_id [may be duplicate - depends on num of files per client]
    arr_cv                 = []   # cv file
    arr_time               = []   # cv file length
    arr_num_cv_per_speaker = []   # for each speaker - number of speeches
    arr_start_speech_time  = []   # for each speech - start time
    arr_end_speech_time    = []   # for each speech - end time

    counter     = 0      # clips played for the current speaker
    last_client = None
    start_time  = None   # start time of the clip currently being played

    with open(log_file) as f:
        for line in f:

            if "Client_Id" in line:
                last_client = line[line.find(": ")+2 : line.find(" [")]
                # close the previous speaker's tally (speakers with no clips are not recorded)
                if counter > 0:
                    arr_num_cv_per_speaker.append(counter)
                    counter = 0

            elif "Start Play file:" in line:
                start_time = float(line[line.find("Elapse:")+8:])

            elif "---> Played file" in line:
                # the clip name is located via ": c", i.e. it is expected to start with "c"
                cv_file = line[line.find(": c")+2 : line.find(", ")]
                length  = float(line[line.find("length:")+8 : line.find(" seconds")])
                end_t   = float(line[line.find("Elapse: ")+8:])
                counter = counter + 1

                arr_clients.append(last_client)
                arr_cv.append(cv_file)
                arr_time.append(length)
                arr_end_speech_time.append(end_t)
                if start_time is not None:
                    arr_start_speech_time.append(start_time)
                    start_time = None
                else:
                    # NOTE: in this case arr_start_speech_time ends up shorter than the other lists
                    print(f"Error, start_time is None")

    # tally of the last speaker in the log
    arr_num_cv_per_speaker.append(counter)
    # The original summary also printed len(set(arr_lang)), but arr_lang is never
    # defined, which raised NameError after the whole file had been parsed.
    print(f"Total number of speakers: {len(set(arr_clients))}, files: {len(set(arr_cv))}")
    print(f"len(arr_start_speech_time) = {len(arr_start_speech_time)}")
    return arr_clients, arr_cv, arr_time,  arr_num_cv_per_speaker, arr_start_speech_time, arr_end_speech_time

In [None]:
# Player log of the run to analyse; the six parsed lists are used by the cells below.
NIGHT_RUN_1 = r"C:\Users\amitli\Repo\WalkieTalkieRecorder\log_01_04_2024_15_03_43.txt"
(
    arr_clients,
    arr_cv,
    arr_time,
    arr_num_cv_per_speaker,
    arr_start_speech_time,
    arr_end_speech_time,
) = parse_log(NIGHT_RUN_1)

<h1 style="background-color:#3cA8EF;"> <center> <a id='Reciever'></a> Receiver </center> </h1>

In [None]:
from dataclasses        import dataclass, asdict
from concurrent.futures import ThreadPoolExecutor
from pydub              import AudioSegment,silence

import datetime
import pyaudio
import wave

In [None]:

@dataclass
class StreamParams:
    """Keyword arguments passed to `PyAudio.open` when the recorder opens its stream."""
    # sample format constant from pyaudio
    format: int = pyaudio.paInt16
    # number of channels
    channels: int = 1
    # sampling rate
    rate: int = 8000
    # number of frames read from the stream per call
    frames_per_buffer: int = 1024
    # open the stream for input / output
    input: bool = True
    output: bool = False

    def to_dict(self) -> dict:
        """Return the parameters as a plain dict, keyed by field name."""
        params = asdict(self)
        return params

class Recorder:
    """Recorder uses the blocking I/O facility from pyaudio to record sound
    from mic into a sequence of consecutive wav files.
    Attributes:
        - stream_params: StreamParams object with values for pyaudio Stream
            object
        - save_path: base path given to `record`; every output file name is
            derived from it
    """
    def __init__(self, stream_params: "StreamParams") -> None:
        self.stream_params = stream_params
        self.save_path     = None
        self._pyaudio      = None
        self._stream       = None
        self._wav_file     = None
        self._counter      = 0

    def record(self, duration: int, save_path: str, num_files_to_create: int) -> None:
        """Record sound from mic into `num_files_to_create` consecutive files.
        :param duration: Number of seconds to record into each file
        :param save_path: Base path of the recordings; each file is stored as
            `<base>_C_<counter>_D_<timestamp><ext>`
        :param num_files_to_create: Number of files to record
        """
        print("Start recording...")
        self.save_path = save_path
        try:
            self._create_recording_resources()
            self._write_wav_file_reading_from_stream(save_path, duration, num_files_to_create)
        finally:
            # Always release the audio device and finalise the current wav file,
            # also when recording fails or is interrupted.
            self._close_recording_resources()
        print("Stop recording")

    def create_current_wav_file(self):
        """Open the next output wav file, named after the counter and the current time."""
        self._counter = self._counter + 1
        the_time  = datetime.datetime.now().strftime("%d_%m_%Y_%H_%M_%S%z")
        ctr_str   = f'{self._counter:04}'
        # Insert the suffix before the extension only. str.replace(".wav", ...)
        # would also rewrite ".wav" inside directory names and would leave the
        # path unchanged (every file overwriting the previous one) when the
        # extension is not exactly ".wav".
        root, ext = os.path.splitext(self.save_path)
        save_path = f"{root}_C_{ctr_str}_D_{the_time}{ext}"
        self._create_wav_file(save_path)

    def _create_recording_resources(self) -> None:
        """Open the pyaudio stream and the first wav file."""
        self._pyaudio = pyaudio.PyAudio()
        self._stream = self._pyaudio.open(**self.stream_params.to_dict())
        self.create_current_wav_file()

    def _create_wav_file(self, save_path: str):
        """Open `save_path` for writing and set its header from the stream parameters."""
        print(f"creating new wav: {save_path}")
        self._wav_file = wave.open(save_path, "wb")
        self._wav_file.setnchannels(self.stream_params.channels)
        self._wav_file.setsampwidth(self._pyaudio.get_sample_size(self.stream_params.format))
        self._wav_file.setframerate(self.stream_params.rate)


    def close_current_wav_file(self, wav_file) -> None:
        """Close the given wav file object."""
        wav_file.close()

    def _write_wav_file_reading_from_stream(self, save_path: str, duration: int, num_files_to_create: int) -> None:
        """Read from the stream and write `num_files_to_create` files of `duration` seconds.

        `save_path` is unused here (file names come from `self.save_path`); it is
        kept for interface compatibility.
        """
        # whole buffers per file; a fractional trailing buffer is dropped
        buffers_per_file = int(self.stream_params.rate * duration / self.stream_params.frames_per_buffer)
        with ThreadPoolExecutor(max_workers = 5) as executor:
            for i in range(num_files_to_create):
                for _ in range(buffers_per_file):
                    audio_data = self._stream.read(self.stream_params.frames_per_buffer)
                    self._wav_file.writeframes(audio_data)

                # the finished file is closed on a worker thread while the loop
                # moves on to the next file
                executor.submit(self.close_current_wav_file, self._wav_file)
                if i < num_files_to_create-1:
                    self.create_current_wav_file()

    def _close_recording_resources(self) -> None:
        """Release whatever was opened; safe to call when nothing (or only part) was opened."""
        try:
            if self._wav_file is not None:
                # closing an already closed wave writer is a no-op
                self._wav_file.close()
                self._wav_file = None
        finally:
            try:
                if self._stream is not None:
                    self._stream.stop_stream()
                    self._stream.close()
                    self._stream = None
            finally:
                if self._pyaudio is not None:
                    self._pyaudio.terminate()
                    self._pyaudio = None


In [None]:
# Record at 16 kHz: 60*18 = 1080 consecutive files of 60 seconds each.
stream_params = StreamParams(rate=16000)
recorder      = Recorder(stream_params)
recorder.record(
    60,
    "/home/amitli/Datasets/speakathon/audio.wav",
    num_files_to_create=60*18,
)

<h1 style="background-color:#3cA8EF;"> <center><a id='ParseRecords'></a>  Parse recordings </center> </h1>

In [None]:
def get_sorted_files(files_path):
    """Return the paths matching the glob pattern `files_path`, sorted ascending."""
    return sorted(glob.glob(files_path))

In [None]:
# Glob pattern matching the recorded wav files of the run.
NIGHT_RUN_INPUT_FOLDER = r"/home/amitli/Debug/25_26_jul/Night_25_26/*"

In [None]:
# Concatenate all recorded files, in sorted name order, into one AudioSegment.
all_files = get_sorted_files(NIGHT_RUN_INPUT_FOLDER)
one_file  = AudioSegment.from_wav(all_files[0])
for wav_path in all_files[1:]:
    one_file = one_file + AudioSegment.from_wav(wav_path)

In [None]:
# Path of the single wav file holding the concatenated recording.
NIGHT_RUN_1_ONE_FILE = r"/home/amitli/Debug/25_26_jul/One_file/One_file.wav"

In [None]:
# Write the concatenated recording to disk as wav.
one_file.export(NIGHT_RUN_1_ONE_FILE, format="wav")

In [None]:
# Peek at the first five parsed clip start times (seconds, as logged by the player).
arr_start_speech_time[:5]

In [None]:
#AudioSegment.from_wav(r"/home/amitli/Debug/25_26_jul/Night_25_26/audio_C_0001_D_25_07_2023_15_18_31.wav")[9.9*1000:11*1000]

In [None]:
# Cut the concatenated recording into one segment per played clip, using the
# start times and clip lengths parsed from the player log.
full_wav              = AudioSegment.from_wav(NIGHT_RUN_1_ONE_FILE)
mp3_len               = arr_time
num_of_speakers       = len(set(arr_clients))
num_of_mp3_per_speker = arr_num_cv_per_speaker


# Offset (ms) that maps a player-log time to a position in the recording.
# NOTE(review): 9.9 s is presumably where the first clip starts in the
# recording (found by listening) - confirm for every new run.
start_rec_time_ms = int(9.9*1000) - arr_start_speech_time[0] * 1000
arr_results       = []
counter           = 0   # index of the current clip across all speakers
end               = 0

for i_speaker in range(num_of_speakers):
   
    for i_mp3 in range(num_of_mp3_per_speker[i_speaker]):
                
        # --- speech
        start = start_rec_time_ms + arr_start_speech_time[counter]*1000
        #end   = arr_end_speech_time[counter]*1000 + start_rec_time_ms
        end   = start + mp3_len[counter]*1000   
        
        
        # Stop when the clip would run past the end of the recording. `break`
        # only leaves the inner loop; since `counter` is not advanced, the
        # following speakers hit the same condition on their first clip.
        if end > len(full_wav):
            break
                   
        # the slice starts 300 ms before the computed start
        arr_results.append(full_wav[start-300 : end])                   
        counter = counter + 1
           

print(f"Finished at = {end} / {len(full_wav)}, Found: {len(arr_results)}")

In [None]:
# Index of an arbitrary extracted segment to inspect; must be < len(arr_results).
test = 2354
arr_results[test]

In [None]:
# Compare the extracted segment with the original corpus clip it was recorded from.
print(f"{arr_cv[test]}, {arr_time[test]}, recorded: {arr_results[test].duration_seconds}")
# ARABIC_PATH is not defined in this notebook; the clips that were played live
# under dataset_path (see play_language), so load the original from there.
AudioSegment.from_mp3(f"{dataset_path}/clips/{arr_cv[test]}")

In [None]:
# Export every extracted segment as wav into one folder per speaker.
export_path   = "/home/amitli/Debug/25_26_jul/Outputs_wav"
# parse_log does not return a language list (arr_lang was never defined), so
# the language tag of the output file name is fixed here.
# NOTE(review): "ru" is taken from the corpus used in this notebook - confirm.
lang          = "ru"

# Export only entries for which a segment was actually extracted, instead of a
# hard-coded count.
num_to_export = min(len(arr_results), len(arr_clients), len(arr_cv))

for i in tqdm(range(num_to_export)):

    speaker   = arr_clients[i]
    cv_name   = arr_cv[i]

    full_path = f"{export_path}/{speaker}"
    os.makedirs(full_path, exist_ok=True)

    # same name as the source clip, prefixed and with a .wav extension
    file_name = f"tactic_{lang}_{os.path.splitext(cv_name)[0]}.wav"
    arr_results[i].export(f"{full_path}/{file_name}", format="wav")

<h1 style="background-color:#3cA8EF;"> <center> <a id='Plot_dBFS'></a>  Plot (dBFS) </center> </h1>

In [None]:
import plotly.graph_objects as go
import numpy as np
import wave

def read_wav_file(file_path):
    """Read all samples of a PCM wav file.

    :param file_path: path of the wav file
    :return: (audio_data, sample_rate); audio_data is a 1-D integer array whose
        dtype matches the file's sample width. For multi-channel files the
        channels are interleaved, as stored in the file.
    :raises ValueError: if the sample width is neither 16 nor 32 bit
    """
    # wav samples are little-endian signed integers for these widths
    sample_dtypes = {2: "<i2", 4: "<i4"}

    with wave.open(file_path, "rb") as wav_file:
        num_frames   = wav_file.getnframes()
        sample_width = wav_file.getsampwidth()
        sample_rate  = wav_file.getframerate()
        # Decoding with a dtype that does not match the sample width would
        # silently return garbage, so refuse unsupported widths explicitly.
        if sample_width not in sample_dtypes:
            raise ValueError(f"Unsupported sample width: {sample_width} bytes (expected 2 or 4)")
        audio_data   = np.frombuffer(wav_file.readframes(num_frames), dtype=sample_dtypes[sample_width])
    return audio_data, sample_rate


def convert_to_dbfs(audio_data):
    """Convert samples to decibels relative to the peak magnitude of the signal.

    The level is computed from the absolute sample value, so negative samples
    get a real level instead of NaN. Zero samples map to -inf. The loudest
    sample maps to 0 dB.

    :param audio_data: array of samples
    :return: float32 array of the same shape with the level of each sample
    """
    # Convert to float before abs(): abs of the int16 value -32768 overflows.
    magnitude = np.abs(np.asarray(audio_data, dtype=np.float32))
    if magnitude.size == 0:
        return magnitude

    peak = magnitude.max()
    if peak == 0:
        # pure silence: avoid 0/0 and report -inf everywhere
        return np.full(magnitude.shape, -np.inf, dtype=np.float32)

    # log10(0) = -inf is the intended value for silent samples
    with np.errstate(divide="ignore"):
        dbfs = 20 * np.log10(magnitude / peak)
    return dbfs


def plot_dBFS(audio_data, sample_rate):
    """Build a plotly line figure of the signal level (from `convert_to_dbfs`) over time.

    :param audio_data: array of samples
    :param sample_rate: samples per second, used to build the time axis
    :return: the plotly figure (not shown)
    """
    n_samples = len(audio_data)
    seconds   = np.linspace(0, n_samples / sample_rate, n_samples)
    levels    = convert_to_dbfs(audio_data)

    figure = go.Figure()
    figure.add_trace(go.Scatter(x=seconds, y=levels, mode="lines"))
    figure.update_layout(title="dBFS Plot", xaxis_title="Time (s)", yaxis_title="dBFS")
    return figure

    
# Plot the level curve of one recorded wav file.
file_path = r"/home/amitli/Debug/25_26_jul/Night_25_26/audio_C_0001_D_25_07_2023_15_18_31.wav"
audio_data, sample_rate = read_wav_file(file_path)
fig                     = plot_dBFS(audio_data, sample_rate)
fig.show()    

[Go to start](#start_cell) <br>
[Parse log](#parseLog) <br>
