In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import plistlib
from typing import Union, List, Dict, Tuple, Iterable
import nltk
from pathlib import Path
from scipy.io import wavfile
import tensorflow_io as tfio
from pydub import AudioSegment
from tqdm.notebook import tqdm
from IPython import display
import dill

caused by: ["[Errno 2] The file to load file system plugin from does not exist.: '/Users/jackwang/.local/share/virtualenvs/Code-_CZGGnvj/lib/python3.9/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so'"]
caused by: ["dlopen(/Users/jackwang/.local/share/virtualenvs/Code-_CZGGnvj/lib/python3.9/site-packages/tensorflow_io/python/ops/libtensorflow_io.so, 0x0006): tried: '/Users/jackwang/.local/share/virtualenvs/Code-_CZGGnvj/lib/python3.9/site-packages/tensorflow_io/python/ops/libtensorflow_io.so' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/Users/jackwang/.local/share/virtualenvs/Code-_CZGGnvj/lib/python3.9/site-packages/tensorflow_io/python/ops/libtensorflow_io.so' (no such file), '/Users/jackwang/.local/share/virtualenvs/Code-_CZGGnvj/lib/python3.9/site-packages/tensorflow_io/python/ops/libtensorflow_io.so' (no such file)"]


In [2]:
def _bytes_feature(value):
    """Wrap one string / byte value in a tf.train.Feature backed by a BytesList."""
    eager_tensor_type = type(tf.constant(0))
    # BytesList won't unpack a string from an EagerTensor, so extract it first.
    raw_value = value.numpy() if isinstance(value, eager_tensor_type) else value
    bytes_list = tf.train.BytesList(value=[raw_value])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wrap one float / double in a tf.train.Feature backed by a FloatList."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap one bool / enum / int / uint in a tf.train.Feature backed by an Int64List."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

In [3]:
# Align a student's word sequence against a reference with the Longest Common Subsequence algorithm.
def lcss(ref_text: List[Dict], student_input: List[Dict], prob: float = 0.1,
         stemmer: Union["nltk.stem.api.StemmerI", None] = None) -> Tuple[List[Tuple[Dict, str]], List[int], List[int]]:
    """
    Align two word sequences with the Longest Common Subsequence algorithm.

    Two words match when their normalised forms (stemmed if a stemmer is given, then lower-cased)
    are equal and the student's word has a confidence of at least ``prob``.

    :param ref_text: reference words in reading order; each element is a dict that must contain the
    key tString
    :param student_input: the student's words in reading order; each element is a dict that must
    contain the keys tString and tConfidence
    :param prob: minimum tConfidence a student word needs to be accepted as a match
    :param stemmer: optional object with a ``stem(word)`` method (e.g. from nltk.stem); ``None``
    compares the raw words

    :return: a tuple ``(alignment, ref_positions, student_positions)``.
    ``alignment`` lists every word of both sequences in order as ``(dict, status)``: "M" is a
    matched word (the student's dict), "R" a reference word without a match, "A" a student word
    without a match. ``ref_positions[k]`` and ``student_positions[k]`` are the indices of the k-th
    matched pair in ``ref_text`` and ``student_input``.
    :raises AssertionError: if the traceback pairs two words whose normalised forms differ
    (an internal consistency check)
    """
    def _normalize(word: str) -> str:
        # Single normalisation used by both the table fill and the traceback check,
        # so the two can never disagree about what counts as "equal".
        return (stemmer.stem(word) if stemmer is not None else word).lower()

    # Normalise every word once instead of once per table cell.
    ref_keys = [_normalize(rec['tString']) for rec in ref_text]
    student_keys = [_normalize(rec['tString']) for rec in student_input]
    n_ref, n_student = len(ref_keys), len(student_keys)

    # lengths[i][j] = LCS length of ref_text[:i] and student_input[:j]; row 0 / column 0 stay 0.
    lengths = [[0] * (n_student + 1) for _ in range(n_ref + 1)]
    for i, ref_key in enumerate(ref_keys):
        for j, student_key in enumerate(student_keys):
            if ref_key == student_key and student_input[j]['tConfidence'] >= prob:
                lengths[i + 1][j + 1] = lengths[i][j] + 1
            else:
                lengths[i + 1][j + 1] = max(lengths[i + 1][j], lengths[i][j + 1])

    # Walk back from the bottom-right corner; everything is collected in reverse order.
    result = []
    pos_x, pos_y = [], []
    x, y = n_ref, n_student
    while x != 0 and y != 0:
        if lengths[x][y] == lengths[x - 1][y]:
            result.append((ref_text[x - 1], "R"))
            x -= 1
        elif lengths[x][y] == lengths[x][y - 1]:
            result.append((student_input[y - 1], "A"))
            y -= 1
        else:
            # The cell was produced by the match branch above, so the keys must agree.
            if ref_keys[x - 1] != student_keys[y - 1]:
                raise AssertionError(
                    "LCS traceback paired different words: {!r} vs {!r}".format(
                        ref_text[x - 1]['tString'], student_input[y - 1]['tString']))
            result.append((student_input[y - 1], "M"))
            pos_x.append(x - 1)
            pos_y.append(y - 1)
            x -= 1
            y -= 1
    # One sequence may still have unmatched leading words; keep them in the alignment
    # instead of silently dropping them (at most one of these loops runs).
    while x != 0:
        result.append((ref_text[x - 1], "R"))
        x -= 1
    while y != 0:
        result.append((student_input[y - 1], "A"))
        y -= 1
    return result[::-1], pos_x[::-1], pos_y[::-1]

In [4]:
def read_plist(path: Union[str, Path]) -> Tuple[List[Dict], str]:
    """
    Read a plist file and return its content together with the file's base name.

    :param path:
    Path (str or pathlib.Path) to the plist file

    :return: a tuple of (content, name). ``content`` is whatever ``plistlib.load`` returns for the
    file (the callers in this notebook treat it as a list of per-word dictionaries); ``name`` is
    the file name up to, but not including, its first dot.
    """
    with open(path, 'rb') as f:
        data = plistlib.load(f)
    # Path(...).name honours the separator of the running OS, unlike a hard-coded split('/').
    # The "text before the first dot" rule of the original is kept.
    return data, Path(path).name.split('.')[0]

In [5]:
def parse_files_name(string: str) -> Dict:
    """
    Split a record file name into its components.

    :param string: file name with schema student_{student_id}_passage_{passage_id}_{random_number}
    :return: dict with the keys student_id (str), passage_id (int, reduced modulo 100000) and
    random_number (str)
    """
    parts = string.split('_')
    return {
        'student_id': parts[1],
        'passage_id': int(parts[3]) % 100000,
        'random_number': parts[4],
    }

In [6]:
def serialize_example(recordName, audioSegment, sample_rate, sentence, wordStart, wordDuration, matchSegment, matchReference):
    """
    Build a tf.train.Example from one record and return it as a serialized string,
    ready to be written to a TFRecord file.
    """
    def _tensor_feature(tensor):
        # Tensors are stored as serialized TensorProto bytes inside a bytes feature.
        return _bytes_feature(tf.io.serialize_tensor(tensor))

    # Feature name -> tf.train.Example-compatible value; the sample rate is the
    # only field stored as a plain int64 instead of a serialized tensor.
    feature = dict(
        RecordName=_tensor_feature(recordName),
        AudioSegment=_tensor_feature(audioSegment),
        SampleRate=_int64_feature(sample_rate),
        Sentence=_tensor_feature(sentence),
        WordStart=_tensor_feature(wordStart),
        WordDuration=_tensor_feature(wordDuration),
        MatchSegment=_tensor_feature(matchSegment),
        MatchReference=_tensor_feature(matchReference),
    )

    features_message = tf.train.Features(feature=feature)
    return tf.train.Example(features=features_message).SerializeToString()

In [7]:
# Load the vocabulary from disk with dill.
# NOTE: dill.load, like pickle, can run arbitrary code - only load files you trust.
with open('./unique_word.dill', 'rb') as f:
    data = dill.load(f)
# Add the empty string to the vocabulary.
data.add('')
# data is a set: sort it and store it as a string tensor.
data_dict = tf.convert_to_tensor(sorted(data))

Metal device set to: Apple M1 Max


2022-12-03 13:23:45.363812: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-12-03 13:23:45.363926: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [8]:
# Folder that is globbed for the students' *.plist transcripts.
p = Path('../Result')
# One reference folder; its *.plist files are listed for passage ids and its parent is
# used to reach the sibling SiriV<n> folders.
p2 = Path('../SiriR/SiriV1')
# Numbers <n> of the SiriV<n> reference folders that are read (3 is not in the list).
available_Count = [1,2,4,5]

In [9]:
from functools import reduce
# Unique student ids, taken from the *.plist file names under p, in ascending order.
students = sorted({
    int(parse_files_name(plist_path.stem)['student_id'])
    for plist_path in p.glob('*.plist')
})

In [10]:
# Position, as a fraction of the sorted student ids, of the student that marks the split.
TRAIN_FRACTION = 0.8
if not students:
    raise ValueError("No student ids found; cannot choose a train/test split point.")
# round() can land one past the end for very short lists (e.g. 1 or 2 students),
# so clamp to the last valid index.
split_index = min(round(len(students) * TRAIN_FRACTION), len(students) - 1)
# Students with an id <= split_point are written to the train file, the rest to the eval file.
split_point = students[split_index]
print("The student id that split the train and test set is: ", split_point)

The student id that split the train and test set is:  887


In [11]:
# Generate the train / eval TensorFlow record files from the students' transcripts and audio.
# The stemmer does not depend on the file being processed, so it is built once.
stemmer = nltk.stem.LancasterStemmer()
# Vocabulary lookup: stem -> position in data_dict. Unknown stems fall back to 0, the same
# index np.argmax returns for an all-False comparison against data_dict.
word_to_index = {word.decode('utf-8'): index for index, word in enumerate(data_dict.numpy())}
record_options = tf.io.TFRecordOptions(compression_type='GZIP')
with tf.io.TFRecordWriter('./Student_Answer_Record_Train.tfrecord', options=record_options) as writerTrain:
    with tf.io.TFRecordWriter('./Student_Answer_Record_Eval.tfrecord', options=record_options) as writerEval:
        # sorted(...) materialises the glob: tqdm can show a total and the record order is reproducible.
        for file in tqdm(sorted(p.glob('*.plist'))):
            record, name = read_plist(str(file))
            # Read the student's wave file ../Audio/<name>.wav
            try:
                audioHandle = AudioSegment.from_file('../Audio/{}.wav'.format(name), format='wav')
                # Samples are stored as raw int16 values (no float conversion / normalisation).
                audioRawArray = audioHandle.get_array_of_samples()
                audioSegment = tf.constant(audioRawArray, dtype=tf.int16)
                sample_rate = audioHandle.frame_rate
            except Exception as e:
                # Best effort: report the recording that could not be loaded and move on.
                print(name)
                print(e)
                continue

            name_info = parse_files_name(name)
            # Align the student's transcript with every available reference version.
            matchSegments = []
            matchReferences = []
            every_reference_matched = True
            for ii in available_Count:
                record2, name2 = read_plist(str(p2.parent / 'SiriV{}'.format(ii) / '{}.plist'.format(name_info['passage_id'])))
                try:
                    T, pRecord2, pRecord = lcss(record2, record, stemmer=None)
                except Exception:
                    # Say which recording failed, then re-raise the original error with its traceback.
                    print(name)
                    raise
                if not pRecord:
                    # No word in common with this reference: the recording is skipped.
                    every_reference_matched = False
                    break
                # pRecord holds positions in the student's words, pRecord2 positions in the reference.
                matchSegments.append(np.asarray(pRecord, dtype=np.int64))
                matchReferences.append(np.asarray(pRecord2, dtype=np.int64))
            if not every_reference_matched:
                print(name)
                continue

            # Pad the per-reference index lists to one length with -1, then shift everything by one:
            # padding becomes 0 and real positions account for the slot inserted in front of each sequence.
            matchSegment = tf.keras.preprocessing.sequence.pad_sequences(matchSegments, padding='post',value=-1, dtype=np.int64) + 1
            matchReference = tf.keras.preprocessing.sequence.pad_sequences(matchReferences, padding='post',value=-1, dtype=np.int64) + 1

            recordName = name
            dataframe = pd.DataFrame(record)
            # Word timings, each with a -1 inserted at the front.
            wordStart = np.insert(dataframe['tTime'].values.astype(np.float32), 0, -1)
            wordDuration = np.insert(dataframe['tDuration'].values.astype(np.float32), 0, -1)
            # Stem every word and replace it by its position in data_dict.
            dense_index = tf.constant(
                [word_to_index.get(stemmer.stem(word), 0) for word in dataframe['tString']],
                dtype=tf.int64,
            )
            # Insert a -1 at the front, like for the timings.
            sentence = tf.pad(dense_index, [[1, 0]], constant_values=-1)
            # Serialize the example and route it by student id.
            example = serialize_example(recordName, audioSegment, sample_rate, sentence, wordStart, wordDuration, matchSegment, matchReference)
            if int(name_info['student_id']) <= split_point:
                writerTrain.write(example)
            else:
                writerEval.write(example)

0it [00:00, ?it/s]

student_921_passage_241174_56e850df4deb3
student_137_passage_44000_553fb722737c1
student_373_passage_233033_56e1b4b6872cf
student_571_passage_22006_553aa46db5128
student_19_passage_21002_5539d7e23cfdc
student_818_passage_32023_553fb50ad4eee
student_735_passage_241054_56df08bc62d3d
student_818_passage_32023_553fb5ba447c9
student_824_passage_32017_5548f520b2227
student_587_passage_21075_553a6754d2c8f
student_203_passage_221159_56e6f1d3a9529
student_113_passage_222084_56e6fd8194d91
student_123_passage_221146_56e6e6c3d8961
student_309_passage_222077_56e6eb5908028
student_605_passage_232062_56ddc99fed6e4
student_373_passage_41162_553e7ede2ebcf
student_473_passage_21010_553aa022ad0f1
student_641_passage_231119_56e1cd00b2c3a
student_309_passage_222084_56e6ea079ff21
student_13_passage_221140_56e702b977999
student_988_passage_31032_553fee9fb25c4
student_575_passage_21051_553a6af7668b2
student_581_passage_21037_553aa14e53972
student_669_passage_231110_56ddb867b8a7e
student_133_passage_222089_56d

In [12]:
# Process the reference audio under the Siri folder and save one serialized tensor per passage.
# A dedicated name is used so the global `p` of the student-transcript cells is not overwritten.
siri_audio_dir = Path('../Siri/')
available_Count = [1,2,4,5]
# Passage ids: stems of the m4a files under SiriV1, reduced modulo 100000.
# A sorted list (instead of a lazy map) lets tqdm show a total and fixes the processing order.
available_Sample = sorted(int(audio_path.stem) % 100000 for audio_path in siri_audio_dir.glob('SiriV1/*.m4a'))
# Each reference version lives in its own folder SiriV<n>, with <n> taken from available_Count.
for sample in tqdm(available_Sample):
    audio_segments = []
    for count in available_Count:
        audio_name = str(siri_audio_dir / 'SiriV{}/{}.m4a'.format(count, sample))
        audio_handle = AudioSegment.from_file(audio_name, format='m4a')
        # Keep the samples in their original integer format.
        audio_segments.append(np.array(audio_handle.get_array_of_samples()))
    # Zero-pad the versions to the same length; pad_sequences already returns one stacked array,
    # so no extra tf.stack is needed.
    audio_segment = tf.constant(tf.keras.preprocessing.sequence.pad_sequences(audio_segments, padding='post', value=0, dtype=np.int16))
    # Save the tensor under ./Siri_Reference_Sample, one file per passage id.
    tf.io.write_file('./Siri_Reference_Sample/{}.tfs'.format(sample), tf.io.serialize_tensor(audio_segment))

0it [00:00, ?it/s]

In [13]:
# Parser for the serialized examples produced by serialize_example.
def parse_function(serialized_example):
    """
    Decode one serialized tf.train.Example written by serialize_example.

    :param serialized_example: scalar string tensor holding the serialized example
    :return: dict keyed by feature name. SampleRate is the stored int64; every other entry is the
    tensor recovered with tf.io.parse_tensor, using the dtype the writer cell serialized it with.
    """
    # Every field except SampleRate is stored as the bytes of a serialized tensor.
    features = {
        'RecordName': tf.io.FixedLenFeature([], tf.string),
        'AudioSegment': tf.io.FixedLenFeature([], tf.string),
        'SampleRate': tf.io.FixedLenFeature([], tf.int64),
        'Sentence': tf.io.FixedLenFeature([], tf.string),
        'WordStart': tf.io.FixedLenFeature([], tf.string),
        'WordDuration': tf.io.FixedLenFeature([], tf.string),
        'MatchSegment': tf.io.FixedLenFeature([], tf.string),
        'MatchReference': tf.io.FixedLenFeature([], tf.string),
    }
    # Parse the input tf.Example proto using the dictionary above.
    e = tf.io.parse_single_example(serialized_example, features)
    # Turn the serialized tensors back into tensors; out_type must equal the dtype that was written.
    # RecordName is written through serialize_tensor as well, so it has to be parsed to get the name back.
    e['RecordName'] = tf.io.parse_tensor(e['RecordName'], out_type=tf.string)
    e['AudioSegment'] = tf.io.parse_tensor(e['AudioSegment'], out_type=tf.int16)
    # Sentence is written as int64 vocabulary indices, not as strings.
    e['Sentence'] = tf.io.parse_tensor(e['Sentence'], out_type=tf.int64)
    e['WordStart'] = tf.io.parse_tensor(e['WordStart'], out_type=tf.float32)
    e['WordDuration'] = tf.io.parse_tensor(e['WordDuration'], out_type=tf.float32)
    e['MatchSegment'] = tf.io.parse_tensor(e['MatchSegment'], out_type=tf.int64)
    e['MatchReference'] = tf.io.parse_tensor(e['MatchReference'], out_type=tf.int64)
    return e

In [14]:
# Create the dataset from the TFRecord files written by the generation cell.
# That cell writes a train and an eval file (there is no unsplit ./Student_Answer_Record.tfrecord),
# so both are read here to cover every record.
dataset = tf.data.TFRecordDataset(
    ['./Student_Answer_Record_Train.tfrecord', './Student_Answer_Record_Eval.tfrecord'],
    compression_type='GZIP',
)
dataset = dataset.map(parse_function)

In [15]:
# processed_passage = set()
# for item in dataset:
#     passage_id = parse_files_name(item['RecordName'].numpy().decode('utf-8'))['passage_id']
#     # if passage id exist skip to next item
#     if passage_id in processed_passage:
#         continue
#     processed_passage.add(passage_id)
#     sent = list(item['Sentence'].numpy())
#     decoded_map = map(lambda x: x.decode('utf-8'), sent)
#     #Stem data using Lancaster Stemmer
#     stemmer = nltk.stem.LancasterStemmer()
#     stemmed_map = map(lambda x: stemmer.stem(x), decoded_map)
#     #Generate Sparse index using location of data_dict
#     sparse_index = map(lambda x: x == data_dict, stemmed_map)
#     #Convert to dense index from sparse index using argMax
#     dense_index = map(lambda x: np.argmax(x), sparse_index)
#     #convert to int tensor
#     dense_index = tf.constant(list(dense_index), dtype=tf.int64)
#     #save dense index to disk using passage id as file name
#     tf.io.write_file('./Siri_Dense_Index/{}.tfs'.format(passage_id), tf.io.serialize_tensor(dense_index))

In [21]:
# Iterate over the reference passages under p2 and save their stemmed word indices and timings.
# The stemmer does not depend on the passage, so it is built once.
stemmer = nltk.stem.LancasterStemmer()
# Vocabulary lookup: stem -> position in data_dict. Unknown stems fall back to 0, the same
# index np.argmax returns for an all-False comparison against data_dict.
word_to_index = {word.decode('utf-8'): index for index, word in enumerate(data_dict.numpy())}
# A sorted list (instead of a lazy map) lets tqdm show a total and fixes the processing order.
for passage_id in tqdm(sorted(int(plist_path.stem) for plist_path in p2.glob('*.plist'))):
    times = []
    words = []
    for i in available_Count:
        path = p2.parent / 'SiriV{}'.format(i) / '{}.plist'.format(passage_id)
        # Read and parse the plist file, then view it as a DataFrame.
        record, name = read_plist(str(path))
        dataframe = pd.DataFrame(record)
        # Stem every word and replace it by its position in data_dict.
        dense_index = tf.constant(
            [word_to_index.get(stemmer.stem(word), 0) for word in dataframe['tString']],
            dtype=tf.int64,
        )
        # Pair each word's start time with its duration: one (start, duration) row per word.
        start = dataframe['tTime'].to_numpy().astype(np.float32)
        period = dataframe['tDuration'].to_numpy().astype(np.float32)
        total = tf.stack([start, period], axis=-1)
        times.append(total)
        words.append(dense_index)
    # Pad the reference versions to the same length with -1; pad_sequences already returns one
    # stacked array, so no extra tf.stack is needed.
    times = tf.constant(tf.keras.preprocessing.sequence.pad_sequences(times, padding='post',value=-1, dtype=np.float32))
    # Insert a -1 slot at the start of the second axis.
    times = tf.pad(times, [[0,0],[1,0],[0,0]], constant_values=-1)
    words = tf.constant(tf.keras.preprocessing.sequence.pad_sequences(words, padding='post',value=-1, dtype=np.int64))
    # Insert a -1 slot at the start of the second axis.
    words = tf.pad(words, [[0,0],[1,0]], constant_values=-1)
    # Save the timings, using the passage id as file name.
    tf.io.write_file('./Siri_Dense_Index/{}_ref.tfs'.format(passage_id), tf.io.serialize_tensor(times))
    # Save the word indices, using the passage id as file name.
    tf.io.write_file('./Siri_Dense_Index/{}_word.tfs'.format(passage_id), tf.io.serialize_tensor(words))

0it [00:00, ?it/s]

In [None]:
# Reload the two tensors saved for passage 41166 for inspection:
# te is the float32 "_ref" (timing) tensor, tw the int64 "_word" (word index) tensor.
te = tf.io.parse_tensor(tf.io.read_file('./Siri_Dense_Index/{}_ref.tfs'.format(41166)), out_type=tf.float32)
tw = tf.io.parse_tensor(tf.io.read_file('./Siri_Dense_Index/{}_word.tfs'.format(41166)), out_type=tf.int64)

In [None]:
# NOTE(review): matchReference is whatever the last processed file of the TFRecord-generation
# loop left behind - confirm it belongs to the passage whose tensors are inspected here.
m = tf.convert_to_tensor(matchReference)

In [None]:
# Look up the entries of tw at the positions stored in m, pairing the rows of tw and m
# one-to-one (batch_dims=1) - assumes both have the same number of rows, TODO confirm.
tf.gather(tw, m, batch_dims=1)

In [None]:
# Display the loaded word-index tensor.
tw