In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np
import plistlib
from typing import Union, List, Dict, Tuple, Iterable
import nltk
from pathlib import Path
from scipy.io import wavfile
import tensorflow_io as tfio
from pydub import AudioSegment
from tqdm.notebook import tqdm
from IPython import display
import dill

In [None]:
# Folder holding the per-recording *.plist files that are globbed and read below.
path_segment_plist = Path('../../DataFolder/Student_Response/Result')

# Folder where the serialized segment tensors (*.tfs) are written and later read back.
path_segment_tensor = Path('../../DataFolder/Student_Response/Segment_Tensor')

In [None]:
def read_plist(path: Union[str, Path]) -> Tuple[List[Dict], str]:
    """
    Read a plist file and return its content together with the file name without the extension.

    :param path:
    Path (str or pathlib.Path) to the plist file

    :return: a tuple of (list of dictionaries, file name); each dictionary is the
    information of one word that the student read. The file name is the last path
    component cut at its first '.'.
    """
    # Normalise to Path so the name is extracted identically for str and Path
    # arguments and independently of the OS path separator (the previous
    # path.split('/') only worked for '/'-separated str input).
    plist_path = Path(path)
    with plist_path.open('rb') as f:
        data = plistlib.load(f)
    return data, plist_path.name.split('.')[0]

def parse_files_name(string: str) -> Dict:
    """
    Break a record file name into its identifying fields.

    :param string: file name following the schema student_{student_id}_passage_{passage_id}_{random_number}
    :return: dict with keys student_id (str), passage_id (int, reduced modulo 100000)
    and random_number (str)
    """
    # Split once; the fields of interest sit at positions 1, 3 and 4.
    fields = string.split('_')
    return {
        'student_id': fields[1],
        'passage_id': int(fields[3]) % 100000,
        'random_number': fields[4],
    }

In [None]:
# Collect every plist file under the Student Response result folder.
# glob() returns a one-shot generator, so it is materialised into a sorted list:
# a cell that loops over file_list can then be re-run and always visits the
# files in the same order.
file_list = sorted(path_segment_plist.glob("*.plist"))

In [None]:
# For every plist: stack the 'tTime' and 'tDuration' columns into one float32
# tensor (one row per entry, columns = [tTime, tDuration]) and write it,
# serialized, to <path_segment_tensor>/<file name>.tfs.
for plist_path in file_list:
    words, record_name = read_plist(str(plist_path))
    word_table = pd.DataFrame(words)
    columns = [
        word_table[key].to_numpy().astype(np.float32)
        for key in ('tTime', 'tDuration')
    ]
    segments = tf.stack(columns, axis=-1)
    target = path_segment_tensor / (record_name + '.tfs')
    tf.io.write_file(str(target), tf.io.serialize_tensor(segments))

In [124]:
# String form of the segment-tensor folder; parse_function concatenates it with
# the record name to build the path of the matching '.tfs' file.
seg_str = str(path_segment_tensor)

In [134]:
def parse_function(serialized_example: tf.Tensor) -> Dict:
    """
    Decode one serialized tf.Example and attach the segment label stored on disk.

    :param serialized_example: scalar string tensor holding one serialized tf.Example
    :return: dict with keys
        'AudioSegment' - int16 tensor decoded from the serialized 'AudioSegment' feature
        'RecordName'   - scalar string tensor decoded from the serialized 'RecordName' feature
        'passage_id'   - string tensor: 4th '_'-separated field of the record name, modulo 100000
        'label'        - float32 tensor read from the file seg_str + '/' + RecordName + '.tfs'
    """
    # Define a dict with the data-names and types we expect to find in the
    # serialized example. Only 'RecordName' and 'AudioSegment' are used below;
    # the other entries are kept so the set of expected features is unchanged.
    features = {
        'RecordName': tf.io.FixedLenFeature([], tf.string),
        'AudioSegment': tf.io.FixedLenFeature([], tf.string),
        'SampleRate': tf.io.FixedLenFeature([], tf.int64),
        'Sentence': tf.io.FixedLenFeature([], tf.string),
        'WordStart': tf.io.FixedLenFeature([], tf.string),
        'WordDuration': tf.io.FixedLenFeature([], tf.string),
        'MatchSegment': tf.io.FixedLenFeature([], tf.string),
        'MatchReference': tf.io.FixedLenFeature([], tf.string),
    }
    # Parse the input tf.Example proto using the dictionary above.
    example = tf.io.parse_single_example(serialized_example, features)

    # Convert the serialized tensors back to tensors.
    # parse_tensor yields an unknown static shape; pin the name to a scalar so
    # tf.strings.split below takes its scalar path and returns a 1-D tensor.
    record_name = tf.ensure_shape(
        tf.io.parse_tensor(example['RecordName'], out_type=tf.string), [])
    ret = {'AudioSegment': tf.io.parse_tensor(example['AudioSegment'], out_type=tf.int16),
           'RecordName': record_name}

    # Split the decoded name rather than the raw serialized bytes: the raw value
    # still carries the TensorProto framing around the string.
    passage_id = tf.strings.split(record_name, sep='_')[3]
    # convert tf.string to int
    passage_id = tf.strings.to_number(passage_id, out_type=tf.int32) % 100000
    # convert to tf.string
    ret['passage_id'] = tf.strings.as_string(passage_id)
    # The label tensor lives in a '.tfs' file named after the record.
    ret['label'] = tf.io.parse_tensor(tf.io.read_file(seg_str + '/' + record_name + '.tfs'), tf.float32)

    return ret

In [135]:
# Open the GZIP-compressed evaluation TFRecord and decode every record with parse_function.
tfds = tf.data.TFRecordDataset(
    '../../DataFolder/Tensorflow_DataRecord/Student_Answer_Record_Eval.tfrecord',
    compression_type='GZIP',
).map(parse_function)

In [136]:
# Fetch the first parsed record to inspect the dict produced by parse_function.
tfds.take(1).get_single_element()

{'AudioSegment': <tf.Tensor: shape=(317184,), dtype=int16, numpy=array([   0,    0,    0, ..., -699, -534, -616], dtype=int16)>,
 'RecordName': <tf.Tensor: shape=(), dtype=string, numpy=b'student_959_passage_241163_56e9bef67d1e4'>,
 'passage_id': <tf.Tensor: shape=(), dtype=string, numpy=b'41163'>,
 'label': <tf.Tensor: shape=(27, 2), dtype=float32, numpy=
 array([[ 1.98,  0.44],
        [ 2.42,  0.31],
        [ 2.73,  0.23],
        [ 2.96,  0.46],
        [ 3.57,  0.54],
        [ 4.11,  0.36],
        [ 4.47,  0.21],
        [ 4.68,  0.68],
        [ 5.46,  0.36],
        [ 5.82,  0.72],
        [ 6.54,  0.39],
        [ 6.93,  0.36],
        [ 7.29,  0.19],
        [ 7.48,  0.35],
        [ 7.83,  0.21],
        [ 8.04,  0.11],
        [ 8.15,  0.55],
        [ 8.7 ,  0.24],
        [ 8.94,  0.21],
        [ 9.15,  0.24],
        [ 9.39,  0.6 ],
        [10.05,  0.18],
        [10.23,  0.43],
        [10.66,  0.38],
        [11.04,  0.44],
        [12.06,  0.69],
        [12.79,  

In [None]:
# Create an iterator over the parsed dataset.
it = iter(tfds)
# Advance to the first record; as the last expression of the cell it is displayed.
next(it)

In [None]:
# Keep the next record from the iterator `it` for inspection.
c = next(it)

In [117]:
# parse_function already decodes 'RecordName' into a plain string tensor, so it
# is displayed directly; passing it through tf.io.parse_tensor again fails
# because the value is no longer a serialized TensorProto.
c['RecordName']

<tf.Tensor: shape=(), dtype=string, numpy=b'student_989_passage_241172_56e9939559cd9'>