# Speech Prominence Detection
Speech Prominence Detection is the process of identifying the most important or prominent parts of speech in an audio signal. Prominence refers to the degree of emphasis or attention that a particular word or phrase receives in spoken language, which is often conveyed through variations in pitch, loudness, and timing. Speech Prominence Detection is an essential task in speech processing and natural language understanding, with a wide range of applications including speech recognition, sentiment analysis, and language translation. The goal of this project is to develop a machine learning model that can accurately identify the prominent parts of speech in a given audio signal. This project will involve feature extraction, model training, and evaluation, with the aim of achieving high accuracy and generalizability on a diverse range of speech datasets. The results of this project could have significant implications for improving speech recognition and understanding systems in real-world applications.

In [1]:
# imports
import numpy as np
import pandas as pd
import re
import os
import collections
import scipy
from scipy.signal import medfilt
from IPython.display import clear_output

import warnings
warnings.filterwarnings('ignore')

KeyboardInterrupt: 

In [None]:
from myfunctions import spectral_selection,  temporal_corr, get_labels_seq2seq
from myfunctions import spectral_corr, smooth, vocoder_func

In [None]:
data_dir = "../data/"
ger_test_dir = os.path.join(data_dir, "GER/test/")
ger_train_dir = os.path.join(data_dir, "GER/train/")
ita_test_dir = os.path.join(data_dir, "ITA/test/")
ita_train_dir = os.path.join(data_dir, "ITA/train/")

phn_dir = os.path.join(data_dir, "fisher-2000_FA_GT_ESTphnTrans_estStress/lab/txt/phn/")
dict_name = "nativeEnglishDict_gt100_manoj.syl"
stressLabelspath = data_dir + "FA_htkCorrectedLabWithFullAudio" + "/lab/mat/sylStress/"

In [None]:
# chech if the directories exist
if not os.path.exists(data_dir):
    print("Data directory does not exist")
if not os.path.exists(ger_train_dir):
    print("German Train directory does not exist")
if not os.path.exists(ita_train_dir):
    print("Italian Train directory does not exist")
if not os.path.exists(phn_dir):
    print("Phoneme directory does not exist")

In [None]:
ger_test_files = os.listdir(ger_test_dir)
ger_train_files = os.listdir(ger_train_dir)
ita_test_files = os.listdir(ita_test_dir)
ita_train_files = os.listdir(ita_train_dir)

In [None]:
# Compute features
twin = 5
t_sigma = 1.4
swin = 7
s_sigma = 1.5
mwin = 13
max_threshold = 25

vwlSB_num = 4
vowelSB = [1, 2, 4, 5, 6, 7, 8, 13, 14, 15, 16, 17]
sylSB_num = 5
sylSB = [1, 2, 3, 4, 5, 6, 13, 14, 15, 16, 17, 18]

startWordFrame_all = []
spurtStartFrame_all = []
spurtEndFrame_all = []
vowelStartFrame_all = []
vowelEndFrame_all = []
eng_full_all = []
spurtStress_all = []

In [None]:
def get_data_array(phn_file):
    data_array = []
    try:
        fid = open(phn_file, 'r')
        data_array = np.loadtxt(fid, dtype={'names': ('a', 'b', 'c'), 'formats': ('f4', 'f4', 'S16')})
        fid.close
    except:
        print('File does not exist')
        return

    ghastly = []
    for i in range(len(data_array)):
        tuple_list = list(data_array[i])
        tuple_list[2] = tuple_list[2].decode()
        ghastly.append((tuple_list[0], tuple_list[1], tuple_list[2]))
    return np.array(ghastly)

In [None]:
def get_phone_data(data_array):
    phnTimes1 = [row[0] for row in data_array]
    phnTimes1 = np.array([phnTimes1]).T

    phnTimes2 = [row[1] for row in data_array]
    phnTimes2 = np.array([phnTimes2]).T

    phnTimes = np.hstack((phnTimes1, phnTimes2))
    phones = [row[2] for row in data_array]
    phones = np.array([phones])

    # Made them lowercase since the syl dictionary is in lowercase
    for i in range(0, len(phones[0])):
        phones[0][i] = phones[0][i].lower()
        
    origPhones = phones
    index = np.argwhere(origPhones[0] == 'sil')
    phones = phones[phones != 'sil']
    phones = np.array([phones])
    phones = phones.reshape(1, -1)

    phnTimes2 = np.delete(phnTimes2, index, axis=0)
    phnTimes = np.delete(phnTimes, index, axis=0)
    
    return phones, phnTimes

In [None]:
# Getting vowel data
def get_vowel_data(data_array):
    # VOWEL LIST
    vowelList = ['aa', 'ae', 'ah', 'ao', 'aw', 'ay', 'eh', 'er', 'ey', 'ih', 'iy', 'ow', 'oy', 'uh', 'uw']
    vowel_start_time = []
    vowel_end_time = []
    vowel = []

    for i in range(0, len(data_array)):
        if data_array[i][2].lower() in vowelList:
            vowel_start_time.append(data_array[i][0])
            vowel_end_time.append(data_array[i][1])
            vowel.append(data_array[i][2].lower())
            
    vowel_start_time = np.array([vowel_start_time])
    vowel_end_time = np.array([vowel_end_time])
    vowel = np.array([vowel])
    return vowel, vowel_start_time, vowel_end_time

In [None]:
def get_words(file_name):
    # Define the path to the transcript file
    trans_path = data_dir + "ISLEtrans.txt"

    # Read the contents of the transcript file
    with open(trans_path, 'r') as trans_file:
        trans_contents = trans_file.read()

    # Extract the lines containing the specified filename from the transcript
    lines = [line for line in trans_contents.split('\n') if file_name in line]

    # Extract the words from the lines and clean them up
    words = []
    for line in lines:
        _, word_list = line.split(' ', 1)
        words.extend(re.findall(r'\b\w+\b', word_list))
    words = [word.lower() for word in words]
    return words

In [None]:
def get_word_syls(words):
    d = collections.defaultdict(list)
    with open(data_dir + dict_name, 'r') as f:
        for line in f:
            key = line.split()[0]
            val = line.split('=')[1].strip()
            d[key].append(val)

    word_syls = []
    for i in range(len(words)):
        curr_word_syls = []
        if words[i] in d:
            curr_word_syls = d[words[i]]
        word_syls.append(curr_word_syls)
    return word_syls

In [None]:
def get_path_indices(words, word_syls, phones):
    newSuccessInds_all = []
    newSuccessInds_all2 = []

    prevSuccessInds_all = []
    prevSuccessInds_all.append(0)

    # I said got not goat
    for iterWord in range(0, len(words)):
        currWordSyls = word_syls[iterWord]
        countSuccess = 1

        for iterPrev in range(0, len(prevSuccessInds_all)):
            prevWordSyls = ""
            if prevSuccessInds_all[iterPrev] == 0:
                currPrevSylInds = []
            else:
                currPrevSylInds = prevSuccessInds_all[iterPrev]
                for iterPrevSyls in range(0, len(currPrevSylInds)):
                    temp = word_syls[iterPrevSyls]
                    prevWordSyls = prevWordSyls + \
                        temp[currPrevSylInds[iterPrevSyls]]+" "

            # iterating through the syllables of the current word
            for iterCurr in range(0, len(currWordSyls)):
                currTestWordSyls = prevWordSyls + currWordSyls[iterCurr]
                temp2 = currTestWordSyls.replace(' . ', ' ')
                
                
                inds = [m.start() for m in re.finditer(' ', temp2)]
                if len(inds) == 0:
                    inds = [len(temp2)]

                count = 1
                temp = []

                for iterTemp in range(len(inds)):
                    if iterTemp == 0:
                        temp1 = temp2[0:inds[iterTemp]]
                        # print(temp2 + "\t\t| " + temp1)
                    else:
                        temp1 = temp2[inds[iterTemp-1]+1:inds[iterTemp]]
                    if not ((np.unique(temp1) == ' ').any() or (len(temp1) == 0)):
                        temp.append(temp1)
                        count += 1
                        
                if iterTemp == len(inds) - 1 and len(inds) < len(currTestWordSyls):
                    temp1 = temp2[inds[iterTemp]+1:len(temp2)]
                    if not ((len(temp1) == 0) or (np.unique(temp1) == ' ').any()):
                        temp.append(temp1)
                        count = count+1

                if iterWord + 1 == len(words):
                    currPhones = phones[0, 0:len(phones[0])]
                else:
                    currPhones = phones[0][0:len(temp)]

                    
                flag = 1
                for iterFlag in range(0, len(currPhones), 1):
                    if len(currPhones) != len(temp):
                        flag = 0
                    else:
                        if currPhones[iterFlag] != temp[iterFlag]:
                            flag = 0
                if flag == 1:
                    if not currPrevSylInds == []:
                        for i in range(0, len(currPrevSylInds)):
                            #                            print('line 122::::::yes')
                            newSuccessInds_all.append(currPrevSylInds[i])
                    newSuccessInds_all.append(iterCurr)
                    newSuccessInds_all2.append(newSuccessInds_all)
                    newSuccessInds_all = []
                    countSuccess = countSuccess+1
                    
        prevSuccessInds_all = newSuccessInds_all2
        newSuccessInds_all2 = []
    if len(prevSuccessInds_all) == 0:
        return None, None
    return prevSuccessInds_all[0], currTestWordSyls

In [None]:
def get_syls_count(path_indices, currTestWordSyls, words, word_syls):
    sylCount = 1
    phnCount = 1
    spurtSyl = []  # spurtSylTimes= np.zeros((len(phnTimes),2))

    syls_word = np.zeros((1, len(path_indices)))
    spurtWordTimes = np.zeros((len(path_indices), 2))

    for iterPath in range(0, len(path_indices)):
        # current word and syllables
        currWord = words[iterPath]
        currWordSyls = word_syls[iterPath]

        currSyl = currWordSyls[path_indices[iterPath]]
        currSyl = currSyl.replace(' . ', '.')
        # print(currSyl)
        inds = [m.start() for m in re.finditer('\.', currSyl)]

        if len(inds) == 0:
            inds = [len(currSyl)]

        count = 0
        for iterTemp in range(0, len(inds)):
            if iterTemp == 0:
                temp1 = currSyl[0:inds[iterTemp]]
            else:
                temp1 = currSyl[inds[iterTemp-1]+1:inds[iterTemp]]
            if not (temp1 == ' ' or len(temp1) == 0):
                spurtSyl.append(temp1)
                sylCount = sylCount + 1
                count = count + 1
                
        if iterTemp is len(inds)-1 and len(inds) < len(currTestWordSyls):
            temp1 = currSyl[inds[iterTemp]+1:len(currSyl)]
            if not (temp1 == ' ' or len(temp1) == 0):
                spurtSyl.append(temp1)
                sylCount = sylCount + 1
                count = count + 1
        syls_word[0][iterPath] = count

    return syls_word, spurtSyl, spurtWordTimes

In [None]:
def get_spurts(spurtSyl, currTestWordSyls, phnTimes):
    phnCount = 1
    spurtSylTimes = np.zeros((len(spurtSyl), 2))

    for iterSyl in range(0, len(spurtSyl)):
        temp2 = spurtSyl[iterSyl]
        inds = [m.start() for m in re.finditer(' ', temp2)]
        if len(inds) == 0:
            inds = [len(temp2)]
        count = 1
        temp = []
        for iterTemp in range(0, len(inds)):
            if iterTemp == 0:
                temp1 = temp2[0:inds[iterTemp]]
            else:
                temp1 = temp2[inds[iterTemp-1]+1:inds[iterTemp]]
            if not (temp1 == ' ' or len(temp1) == 0):
                temp.append(temp1)
                count = count+1
        if iterTemp == len(inds)-1 and len(inds) < len(currTestWordSyls):
            temp1 = temp2[inds[iterTemp]+1:len(temp2)]
            if not (temp1 == ' ' or len(temp1) == 0):
                temp.append(temp1)
                count = count+1

        nPhns_syl = len(temp)
        spurtSylTimes[iterSyl, 0] = phnTimes[phnCount-1, 0]
        phnCount = phnCount + nPhns_syl
        spurtSylTimes[iterSyl, 1] = phnTimes[phnCount-1-1, 1]
    return spurtSylTimes

In [None]:
def get_spurt_word_times(path_indices, syls_word, spurtSylTimes):
    spurtWordTimes = np.zeros((len(path_indices), 2))
    sylIdx = 1
    print("this = ", len(syls_word[0]))

    for iterWordTimes in range(0, len(syls_word[0])):
        spurtWordTimes[iterWordTimes, 0] = spurtSylTimes[sylIdx-1, 0]
        sylIdx = sylIdx + syls_word[0][iterWordTimes].astype(int)
        spurtWordTimes[iterWordTimes, 1] = spurtSylTimes[sylIdx-1-1, 1]
    length_spurtWordTimes = iterWordTimes + 1
    return spurtWordTimes

In [None]:
def process_word_boundaries(spurtWordTimes, words, spurtSylTimes):
    # Processing word boundary file
    # FILE READ DELETED HERE
    a = spurtWordTimes
    b = words
    if (len(a) is not len(b)):
        print("error")
    wordData = np.hstack((a, np.array([b], dtype='S32').T))

    # Extract first coloumn of wordData
    startWordTime = [row[0] for row in wordData]
    endWordTime = [row[1] for row in wordData]

    startWordFrame = np.round((np.subtract(np.array(startWordTime, dtype='float'), spurtSylTimes[0][0].astype(float))*100))
    endWordFrame = np.round((np.subtract(np.array(endWordTime, dtype='float'), spurtSylTimes[0][0].astype(float))*100) + 1)
    startWordFrame = np.append(startWordFrame, endWordFrame[-1])

    return startWordFrame, endWordFrame

In [None]:
def get_sylTCSSBC(sylSB, eng_full, sylSB_num, twin, t_sigma, swin, s_sigma, spurtStartTime, vowelStartTime, startWordFrame):
    # TCSSBC computation
    if len(sylSB) > sylSB_num:
        eng = spectral_selection(
            eng_full[np.subtract(sylSB, 1), :], sylSB_num)
    else:
        eng = eng_full[sylSB, :]

    t_cor = temporal_corr(eng, twin, t_sigma)
    s_cor = spectral_corr(t_cor)
    sylTCSSBC = smooth(s_cor, swin, s_sigma)
    sylTCSSBC = np.array([sylTCSSBC])

    start_idx = np.round(spurtStartTime[0]*100).astype(int)
    sylTCSSBC = np.array([sylTCSSBC[0][start_idx:-1]])

    sylTCSSBC = np.divide(sylTCSSBC, max(sylTCSSBC[0]))

    if len(vowelSB) > vwlSB_num:
        eng = spectral_selection(
            eng_full[np.subtract(vowelSB, 1), :], vwlSB_num)
    else:
        eng = eng_full[vowelSB, :]
    t_cor = temporal_corr(eng, twin, t_sigma)
    s_cor = spectral_corr(t_cor)
    vwlTCSSBC = smooth(s_cor, swin, s_sigma)

    vwlTCSSBC = np.array([vwlTCSSBC])

    # Modify TCSSBC contour by clipping from the vowel start
    start_idx = np.round(vowelStartTime[0][0]*100).astype(int)
    vwlTCSSBC = np.array([vwlTCSSBC[0][start_idx:-1]])

    vwlTCSSBC = np.divide(vwlTCSSBC, max(vwlTCSSBC[0]))

    # Compute silence statistics
    # Preprocessing of the data
    word_duration = np.zeros((1, len(startWordFrame) - 1))
    word_Sylsum = np.zeros((1, len(startWordFrame) - 1))
    word_Vwlsum = np.zeros((1, len(startWordFrame) - 1))

    for j in range(0, len(startWordFrame) - 1):
        temp_start = startWordFrame[j].astype(int)
        temp_end = startWordFrame[j + 1].astype(int) - 1
        # jhansi
        if (temp_end >= sylTCSSBC.shape[1]):
            temp_end1 = sylTCSSBC.shape[1]-1
            sylTCSSBC[0, np.arange(temp_start, temp_end1)] = medfilt(
                sylTCSSBC[0, np.arange(temp_start, temp_end1)], 3)
            sylTCSSBC[0, temp_start] = sylTCSSBC[0, temp_start+1]
            sylTCSSBC[0, temp_end1] = sylTCSSBC[0, temp_end1 - 1]
            tempArr = sylTCSSBC[0, np.arange(temp_start, temp_end1)]
            word_Sylsum[0, j] = tempArr.sum(axis=0)
        else:
            sylTCSSBC[0, np.arange(temp_start, temp_end)] = medfilt(
                sylTCSSBC[0, np.arange(temp_start, temp_end)], 3)
            sylTCSSBC[0, temp_start] = sylTCSSBC[0, temp_start+1]
            sylTCSSBC[0, temp_end] = sylTCSSBC[0, temp_end - 1]
            tempArr = sylTCSSBC[0, np.arange(temp_start, temp_end)]
            word_Sylsum[0, j] = tempArr.sum(axis=0)
        if (temp_end >= vwlTCSSBC.shape[1]):
            temp_end = vwlTCSSBC.shape[1]-1

        #    temp_end = np.min([temp_end,len(vwlTCSSBC)])
        vwlTCSSBC[0, np.arange(temp_start, temp_end)] = medfilt(
            vwlTCSSBC[0, np.arange(temp_start, temp_end)], 3)
        vwlTCSSBC[0, temp_start] = vwlTCSSBC[0, temp_start+1]
        vwlTCSSBC[0, temp_end] = vwlTCSSBC[0, temp_end - 1]

        word_duration[0, j] = temp_end - temp_start + 1

        tempArr = vwlTCSSBC[0, np.arange(temp_start, temp_end)]
        word_Vwlsum[0, j] = tempArr.sum(axis=0)

    sylTCSSBC[np.isnan(sylTCSSBC)] = 0   # Feature vector 1
    vwlTCSSBC[np.isnan(vwlTCSSBC)] = 0   # Feature vector 2
    return sylTCSSBC[0]

In [None]:
def chunk_feature_contour(sylTCSSBC, spurtStartFrame, spurtEndFrame):
    # Chunking the feature contour
    sylTCSSBC_chunk = []
    for i in range(0, len(spurtStartFrame)):
        sylTCSSBC_chunk.append(sylTCSSBC[int(spurtStartFrame[i]):int(spurtEndFrame[i])])
    return np.array(sylTCSSBC_chunk)

In [None]:
def feature_contour(wav_file, test_data):
    file_name = wav_file[:-4]
    phn_file = phn_dir + file_name + ".txt"
    mat_file = stressLabelspath + file_name + ".mat"

    

    if not os.path.exists(phn_file):
        print("phn file doesn't exist")
        return None, None, False
    
    if not os.path.exists(mat_file):
        print("mat file doesn't exist")
        return None, None, False

    data_array = get_data_array(phn_file)
    phones, phn_times = get_phone_data(data_array)
    vowel, vowel_start_time, vowel_end_time = get_vowel_data(data_array)
    words = get_words(file_name)
    word_syls = get_word_syls(words)
    
    
    path_indices, currTestWordSyls = get_path_indices(words, word_syls, phones)

    if path_indices == None:
        return None, None, False
    syls_word, spurtSyl, spurtWordTimes = get_syls_count(path_indices, currTestWordSyls, words, word_syls)
    spurtSylTimes = get_spurts(spurtSyl, currTestWordSyls, phn_times)
    syls_word = syls_word.astype('i')

    poly = []
    for i in range(len(syls_word[0])):
        if syls_word[0][i] > 1:
            lst = [True] * syls_word[0][i]
            poly.extend(lst)
        else:
            poly.append(False)

    num_poly = sum(poly)

    if num_poly == 0:
        return None, None, False

    # path_indices = [path_indices[i] for i in poly]
    # syls_word = [syls_word[i] for i in poly]
    # spurtSylTimes = [spurtSylTimes[i] for i in poly]
    # words = [words[i] for i in poly]
    syls_word = [[syls_word[0][i] for i in range(len(syls_word[0])) if poly[i]]]
    path_indices = [path_indices[i] for i in range(len(path_indices)) if poly[i]]
    spurtSylTimes = [spurtSylTimes[i] for i in range(len(spurtSylTimes)) if poly[i]]
    spurtSylTimes = [list(i) for i in zip(*spurtSylTimes)]
    spurtSylTimes = np.array(spurtSylTimes)
    words = [words[i] for i in range(len(words)) if poly[i]]
    vowel_start_time = np.array([[vowel_start_time[0][i] for i in range(len(vowel_start_time[0])) if poly[i]]])
    vowel_end_time = np.array([[vowel_end_time[0][i] for i in range(len(vowel_end_time[0])) if poly[i]]])
    print(syls_word)
    print(words)


    spurtWordTimes = get_spurt_word_times(path_indices, syls_word, spurtSylTimes)

    # Execute the vocoder [MODIFICATION]: Get the audio file back so that it can be stored in a text file for C code.
    file_dir = ger_test_dir if test_data else ger_train_dir
    eng_full, xx = vocoder_func(file_dir + wav_file)
    eng_full = eng_full.conj().transpose()


    startWordFrame, endWordFrame = process_word_boundaries(spurtWordTimes, words, spurtSylTimes)

    # Processing of stress and syllable boundary file
    spurtSylTime = spurtSylTimes
    spurtStartTime = spurtSylTime[:][0]
    spurtEndTime = spurtSylTime[:][1]
    spurtStartFrame = np.round((spurtStartTime - spurtStartTime[0]) * 100)
    spurtEndFrame = np.round((spurtEndTime - spurtStartTime[0]) * 100)

    # Processing of Vowel boundary file
    vowel_start_time = vowel_start_time.astype(float)
    vowel_end_time = vowel_end_time.astype(float)

    vowelStartFrame = np.round(vowel_start_time*100 - spurtStartTime[0]*100)
    vowelEndFrame = np.round(vowel_end_time*100 - spurtStartTime[0]*100)

    sylTCSSBC = get_sylTCSSBC(sylSB, eng_full, sylSB_num, twin, t_sigma, swin, s_sigma, spurtStartTime, vowel_start_time, startWordFrame)
    sylTCSSBC_chunk = chunk_feature_contour(sylTCSSBC, spurtStartFrame, spurtEndFrame)

    # if test_data:
        # return sylTCSSBC_chunk, None


    # extract label
    mat = scipy.io.loadmat(stressLabelspath + file_name + '.mat')
    lab = mat['spurtStress']
    lab_list = lab.tolist()
    labels = get_labels_seq2seq(lab_list)  # Labels

    if len(sylTCSSBC_chunk) != len(labels):
        return None, None, False

    return sylTCSSBC_chunk, labels, True

In [None]:
ger_train_files_subset = ger_train_files[3:5]

In [None]:
all_contours = []
all_labels = []

for i, file in enumerate(ger_train_files_subset):
    contours, labels, valid = feature_contour(file, False)
    if not valid: continue

    all_contours.extend(contours)
    all_labels.extend(labels)

[[2]]
['centre']
this =  1


In [None]:
# train data
all_chunks = []
all_labels = []
for i, file in enumerate(ger_train_files):
    contours, labels, success = feature_contour(file, False)
    if contours == None:
        continue
    all_chunks.extend(contours)
    all_labels.extend(labels)   

    # clear_output(wait=True)
    print("Progress: {:.2f}%".format((i+1)/len(ger_train_files)*100), end="\r")
    print("Processed: {}/{}".format(i+1, len(ger_train_files)), end="\r")

[[2]]
['centre']
this =  1
[[3]]
['tomorrow']
this =  1


IndexError: index 2 is out of bounds for axis 0 with size 2

In [None]:
# train data
all_chunks = []
all_labels = []
start = 4
for i, file in enumerate(ger_train_files):
    contours, labels, success = feature_contour(file, False)
    if contours == None:
        continue
    all_chunks.append(contours)
    all_labels.append(labels)   

    # clear_output(wait=True)
    print("Progress: {:.2f}%".format((i+1)/len(ger_train_files)*100), end="\r")
    print("Processed: {}/{}".format(i+1, len(ger_train_files)), end="\r")

[[2]]
['centre']
this =  1
[[3]]
['tomorrow']
this =  1


IndexError: index 2 is out of bounds for axis 0 with size 2

In [None]:
# train data
all_chunks = []
all_labels = []
for i, file in enumerate(ita_train_files):
    chunks, labels = feature_contour(file, False)
    if chunks == None:
        continue
    all_chunks.extend(chunks)
    all_labels.extend(labels)

    print("Progress: {:.2f}%".format((i+1)/len(ita_train_files)*100), end="\r")
    print("Processed: {}/{}".format(i+1, len(ita_train_files)), end="\r")
    clear_output(wait=True)

ValueError: too many values to unpack (expected 2)

In [None]:
# save train data as pickle
df = pd.DataFrame({'contour': all_chunks, 'labels': all_labels})
df.to_pickle('../saved/ger_train_poly.pkl')

In [None]:
# test data
test_chunks = []
test_labels = []

for i, file in enumerate(ger_test_files):
    chunks, labels, success = feature_contour(file, True)
    if not success:
        continue
    test_chunks.extend(chunks)
    test_labels.extend(labels)

    print("Progress: {:.2f}%".format((i+1)/len(ger_test_files)*100), end="\r")
    print("Processed: {}/{}".format(i+1, len(ger_test_files)), end="\r")
    clear_output(wait=True)

Progress: 99.94%Processed: 1767/1768

In [None]:
# save train data as pickle
df = pd.DataFrame({'contour': test_chunks, 'labels': test_labels})
df.to_pickle('../saved/ger_test_poly.pkl')

In [None]:
# test data
test_chunks = []
test_labels = []

for i, file in enumerate(ger_test_files):
    chunks, labels, success = feature_contour(file, True)
    if not success:
        continue
    test_chunks.extend(chunks)
    test_labels.extend(labels)

    print("Progress: {:.2f}%".format((i+1)/len(ita_test_files)*100), end="\r")
    print("Processed: {}/{}".format(i+1, len(ita_test_files)), end="\r")
    clear_output(wait=True)

Progress: 99.70%Processed: 1685/1690

In [None]:
# save train data as pickle
df = pd.DataFrame({'contour': test_chunks, 'labels': test_labels})
df.to_pickle('ita_test.pkl')

In [None]:
print(len(test_chunks))

12524


In [None]:
# make a dataframe
df = pd.DataFrame({'contour': all_chunks, 'labels': all_labels})
df.head()

Unnamed: 0,chunks,labels
0,"[0.01659550862520646, 0.01659550862520646, 0.0...",1
1,"[0.20705475938828283, 0.20705475938828283, 0.1...",1
2,"[0.0853309212266154, 0.0853309212266154, 0.085...",1
3,"[0.017483717247693344, 0.017483717247693344, 0...",1
4,"[0.015928296040650866, 0.015928296040650866, 0...",1


In [None]:
# save the dataframe
df.to_pickle('data.pkl')

In [None]:
# load the dataframe
df2 = pd.read_pickle('data.pkl')

In [None]:
df = pd.DataFrame({'contour': test_chunks})
df.head()

Unnamed: 0,chunks
0,"[0.013564021110304844, 0.013564021110304844, 0..."
1,"[0.4396198379683321, 0.4396198379683321, 0.280..."
2,"[0.08664379438552769, 0.08664379438552769, 0.1..."
3,"[0.05106373959591337, 0.05106373959591337, 0.0..."
4,"[0.011421340988226685, 0.011421340988226685, 0..."


In [None]:
df.to_pickle('test.pkl')