# Use CMU-Multimodal-SDK to Obtain Dataset #

### First install the SDK by running: ###

```console
git clone https://github.com/CMU-MultiComp-Lab/CMU-MultimodalSDK.git

cd CMU-MultiModalSDK
pip install .
```


Change the `SDK_PATH` variable in `constants/paths.py` to reflect the directory you just created.

In [11]:
from constants import SDK_PATH, DATA_PATH, WORD_EMB_PATH, CACHE_PATH
import sys

if SDK_PATH is None:
    print("SDK path is not specified! Please specify first in constants/paths.py")
    exit(0)
else:
    sys.path.append(SDK_PATH)

import mmsdk
import os
import re
import numpy as np
from mmsdk import mmdatasdk as md
from subprocess import check_call, CalledProcessError

# create folders for storing the data
if not os.path.exists(DATA_PATH):
    check_call(' '.join(['mkdir', '-p', DATA_PATH]), shell=True)

# download highlevel features, low-level (raw) data and labels for the dataset MOSEI
# if the files are already present, instead of downloading it you just load it yourself.

DATASET = md.cmu_mosei

try:
    md.mmdataset(DATASET.highlevel, DATA_PATH)
except RuntimeError:
    print("High-level features have been downloaded previously.")

try:
    md.mmdataset(DATASET.raw, DATA_PATH)
except RuntimeError:
    print("Raw data have been downloaded previously.")
    
try:
    md.mmdataset(DATASET.labels, DATA_PATH)
except RuntimeError:
    print("Labels have been downloaded previously.")

[91m[1m[2024-04-20 13:35:54.126] | Error   | [0m./data/CMU_MOSEI_TimestampedWordVectors.csd file already exists ...
High-level features have been downloaded previously.
[94m[1m[2024-04-20 13:35:55.282] | Status  | [0mDownloading from http://immortal.multicomp.cs.cmu.edu/CMU-MOSEI/language/CMU_MOSEI_TimestampedWords.csd to ./data/CMU_MOSEI_TimestampedWords.csd...


                                                                     

[92m[1m[2024-04-20 13:36:01.751] | Success | [0mDownload complete!
[92m[1m[2024-04-20 13:36:01.780] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_TimestampedWords.csd ...
[94m[1m[2024-04-20 13:36:01.975] | Status  | [0mChecking the integrity of the <words> computational sequence ...
[94m[1m[2024-04-20 13:36:01.975] | Status  | [0mChecking the format of the data in <words> computational sequence ...


                                                                                  

[92m[1m[2024-04-20 13:36:02.614] | Success | [0m<words> computational sequence data in correct format.
[94m[1m[2024-04-20 13:36:02.614] | Status  | [0mChecking the format of the metadata in <words> computational sequence ...
[94m[1m[2024-04-20 13:36:02.704] | Status  | [0mDownloading from http://immortal.multicomp.cs.cmu.edu/CMU-MOSEI/language/CMU_MOSEI_TimestampedPhones.csd to ./data/CMU_MOSEI_TimestampedPhones.csd...


                                                                     

[92m[1m[2024-04-20 13:36:11.273] | Success | [0mDownload complete!
[92m[1m[2024-04-20 13:36:11.275] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_TimestampedPhones.csd ...
[94m[1m[2024-04-20 13:36:11.322] | Status  | [0mChecking the integrity of the <phoneme> computational sequence ...
[94m[1m[2024-04-20 13:36:11.322] | Status  | [0mChecking the format of the data in <phoneme> computational sequence ...


                                                                                  

[92m[1m[2024-04-20 13:36:12.007] | Success | [0m<phoneme> computational sequence data in correct format.
[94m[1m[2024-04-20 13:36:12.007] | Status  | [0mChecking the format of the metadata in <phoneme> computational sequence ...
[92m[1m[2024-04-20 13:36:12.007] | Success | [0mDataset initialized successfully ... 
[94m[1m[2024-04-20 13:36:12.143] | Status  | [0mDownloading from http://immortal.multicomp.cs.cmu.edu/CMU-MOSEI/labels/CMU_MOSEI_Labels.csd to ./data/CMU_MOSEI_Labels.csd...


                                                                     

[92m[1m[2024-04-20 13:36:15.771] | Success | [0mDownload complete!
[92m[1m[2024-04-20 13:36:15.772] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_Labels.csd ...
[94m[1m[2024-04-20 13:36:15.816] | Status  | [0mChecking the integrity of the <All Labels> computational sequence ...
[94m[1m[2024-04-20 13:36:15.816] | Status  | [0mChecking the format of the data in <All Labels> computational sequence ...


                                                                                  

[92m[1m[2024-04-20 13:36:16.288] | Success | [0m<All Labels> computational sequence data in correct format.
[94m[1m[2024-04-20 13:36:16.288] | Status  | [0mChecking the format of the metadata in <All Labels> computational sequence ...
[92m[1m[2024-04-20 13:36:16.288] | Success | [0mDataset initialized successfully ... 




In [12]:
# list the directory contents (should see .csd files which are computational sequences)
data_files = os.listdir(DATA_PATH)
print('\n'.join(data_files))

CMU_MOSEI_TimestampedWordVectors.csd
CMU_MOSEI_COVAREP.csd
CMU_MOSEI_TimestampedPhones.csd
CMU_MOSEI_TimestampedWords.csd
CMU_MOSEI_Labels.csd


## Load the Dataset: ##

Create a recipe (dictionary of the form `{feature_name: csd_path}`) and feed it into a `mmdataset` object

In [13]:
# define the modalities we want
acoustic_field = 'CMU_MOSEI_COVAREP'
text_field = 'CMU_MOSEI_TimestampedWords'


features = [
    text_field, 
    acoustic_field
]

recipe = {feat: os.path.join(DATA_PATH, feat) + '.csd' for feat in features}
dataset = md.mmdataset(recipe)

[92m[1m[2024-04-20 13:36:34.146] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_TimestampedWords.csd ...
[94m[1m[2024-04-20 13:36:34.534] | Status  | [0mChecking the integrity of the <words> computational sequence ...
[94m[1m[2024-04-20 13:36:34.534] | Status  | [0mChecking the format of the data in <words> computational sequence ...


                                                                                  

[92m[1m[2024-04-20 13:36:35.105] | Success | [0m<words> computational sequence data in correct format.
[94m[1m[2024-04-20 13:36:35.105] | Status  | [0mChecking the format of the metadata in <words> computational sequence ...
[92m[1m[2024-04-20 13:36:35.105] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_COVAREP.csd ...
[94m[1m[2024-04-20 13:36:35.395] | Status  | [0mChecking the integrity of the <COVAREP> computational sequence ...
[94m[1m[2024-04-20 13:36:35.396] | Status  | [0mChecking the format of the data in <COVAREP> computational sequence ...


                                                                                  

[92m[1m[2024-04-20 13:36:36.518] | Success | [0m<COVAREP> computational sequence data in correct format.
[94m[1m[2024-04-20 13:36:36.518] | Status  | [0mChecking the format of the metadata in <COVAREP> computational sequence ...
[92m[1m[2024-04-20 13:36:36.518] | Success | [0mDataset initialized successfully ... 




In [None]:
dataset[acoustic_field][some_id]

In [30]:
print(list(dataset.keys()))
print("=" * 80)

print(list(dataset[acoustic_field].keys())[:10])
print("=" * 80)

some_id = list(dataset[acoustic_field].keys())[15]
print(list(dataset[acoustic_field][some_id].keys()))
print("=" * 80)

print(list(dataset[acoustic_field][some_id]['intervals'].shape))
print("=" * 80)

print(list(dataset[acoustic_field][some_id]['features'].shape))
print(list(dataset[text_field][some_id]['features'].shape))

['CMU_MOSEI_TimestampedWords', 'CMU_MOSEI_COVAREP']
['--qXJuDtHPw[0]', '--qXJuDtHPw[1]', '--qXJuDtHPw[2]', '--qXJuDtHPw[3]', '--qXJuDtHPw[4]', '--qXJuDtHPw[5]', '--qXJuDtHPw[6]', '--qXJuDtHPw[7]', '--qXJuDtHPw[8]', '--qXJuDtHPw[9]']
['intervals', 'features']
[1, 2]
[1, 74]
[1, 1]


### Different modalities have different number of time steps. We need to align them to a single modality. ###

## Align the Dataset to a Single Modality (Words) ##

Create a `collapse_function` that averages the features so that all modalities can be aligned to a single shape

In [14]:
# Define a simple averaging function that does not depend on intervals
def avg(intervals: np.array, features: np.array) -> np.array:
    try:
        return np.average(features, axis=0)
    except:
        return features

# Align the dataset to the textual modality with averaging, collapse_function receives a list of functions
dataset.align(text_field, collapse_functions=[avg])

[94m[1m[2024-04-20 14:03:08.696] | Status  | [0mUnify was called ...
[92m[1m[2024-04-20 14:03:08.709] | Success | [0mUnify completed ...
[94m[1m[2024-04-20 14:03:08.709] | Status  | [0mPre-alignment based on <CMU_MOSEI_TimestampedWords> computational sequence started ...
[94m[1m[2024-04-20 14:06:10.002] | Status  | [0mPre-alignment done for <CMU_MOSEI_COVAREP> ...
[94m[1m[2024-04-20 14:06:15.346] | Status  | [0mAlignment starting ...


                                                                                                  

[92m[1m[2024-04-20 14:17:26.676] | Success | [0mAlignment to <CMU_MOSEI_TimestampedWords> complete.
[94m[1m[2024-04-20 14:17:26.677] | Status  | [0mReplacing dataset content with aligned computational sequences
[92m[1m[2024-04-20 14:17:27.152] | Success | [0mInitialized empty <CMU_MOSEI_TimestampedWords> computational sequence.
[94m[1m[2024-04-20 14:17:27.152] | Status  | [0mChecking the format of the data in <CMU_MOSEI_TimestampedWords> computational sequence ...


                                                                                          

[92m[1m[2024-04-20 14:17:31.086] | Success | [0m<CMU_MOSEI_TimestampedWords> computational sequence data in correct format.
[94m[1m[2024-04-20 14:17:31.086] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_TimestampedWords> computational sequence ...
[92m[1m[2024-04-20 14:17:31.089] | Success | [0mInitialized empty <CMU_MOSEI_COVAREP> computational sequence.
[94m[1m[2024-04-20 14:17:31.089] | Status  | [0mChecking the format of the data in <CMU_MOSEI_COVAREP> computational sequence ...


                                                                                           

[92m[1m[2024-04-20 14:17:31.846] | Success | [0m<CMU_MOSEI_COVAREP> computational sequence data in correct format.
[94m[1m[2024-04-20 14:17:31.846] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_COVAREP> computational sequence ...


In [25]:
# check out what the keys look like now (there is an added segment no in the str key)
print(list(dataset[text_field].keys())[55])

--qXJuDtHPw[55]


##  Append Annotations (Labels) to the Dataset ##

In [31]:
label_field = 'CMU_MOSEI_Labels'

# add and align to lables to obtain labeled segments
# this time we don't apply collapse functions so that the temporal sequences are preserved
label_recipe = {label_field: os.path.join(DATA_PATH, label_field + '.csd')}
dataset.add_computational_sequences(label_recipe, destination=None)
dataset.align(label_field)

[92m[1m[2024-04-20 14:28:56.200] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_Labels.csd ...
[94m[1m[2024-04-20 14:28:56.984] | Status  | [0mChecking the integrity of the <All Labels> computational sequence ...
[94m[1m[2024-04-20 14:28:56.984] | Status  | [0mChecking the format of the data in <All Labels> computational sequence ...


                                                                                  

[92m[1m[2024-04-20 14:28:57.512] | Success | [0m<All Labels> computational sequence data in correct format.
[94m[1m[2024-04-20 14:28:57.512] | Status  | [0mChecking the format of the metadata in <All Labels> computational sequence ...
[94m[1m[2024-04-20 14:28:57.512] | Status  | [0mUnify was called ...
[92m[1m[2024-04-20 14:30:29.066] | Success | [0mUnify completed ...
[94m[1m[2024-04-20 14:30:29.120] | Status  | [0mPre-alignment based on <CMU_MOSEI_Labels> computational sequence started ...
[94m[1m[2024-04-20 14:30:30.936] | Status  | [0mPre-alignment done for <CMU_MOSEI_COVAREP> ...
[94m[1m[2024-04-20 14:30:32.209] | Status  | [0mPre-alignment done for <CMU_MOSEI_TimestampedWords> ...
[94m[1m[2024-04-20 14:30:32.234] | Status  | [0mAlignment starting ...


                                                                                                   

[92m[1m[2024-04-20 14:31:01.914] | Success | [0mAlignment to <CMU_MOSEI_Labels> complete.
[94m[1m[2024-04-20 14:31:01.914] | Status  | [0mReplacing dataset content with aligned computational sequences
[92m[1m[2024-04-20 14:31:03.172] | Success | [0mInitialized empty <CMU_MOSEI_TimestampedWords> computational sequence.
[94m[1m[2024-04-20 14:31:03.172] | Status  | [0mChecking the format of the data in <CMU_MOSEI_TimestampedWords> computational sequence ...


                                                                      

[92m[1m[2024-04-20 14:31:03.196] | Success | [0m<CMU_MOSEI_TimestampedWords> computational sequence data in correct format.
[94m[1m[2024-04-20 14:31:03.196] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_TimestampedWords> computational sequence ...
[92m[1m[2024-04-20 14:31:03.196] | Success | [0mInitialized empty <CMU_MOSEI_COVAREP> computational sequence.
[94m[1m[2024-04-20 14:31:03.196] | Status  | [0mChecking the format of the data in <CMU_MOSEI_COVAREP> computational sequence ...


                                                                      

[92m[1m[2024-04-20 14:31:03.217] | Success | [0m<CMU_MOSEI_COVAREP> computational sequence data in correct format.
[94m[1m[2024-04-20 14:31:03.217] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_COVAREP> computational sequence ...
[92m[1m[2024-04-20 14:31:03.217] | Success | [0mInitialized empty <CMU_MOSEI_Labels> computational sequence.
[94m[1m[2024-04-20 14:31:03.217] | Status  | [0mChecking the format of the data in <CMU_MOSEI_Labels> computational sequence ...


                                                                      

[92m[1m[2024-04-20 14:31:03.243] | Success | [0m<CMU_MOSEI_Labels> computational sequence data in correct format.
[94m[1m[2024-04-20 14:31:03.243] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_Labels> computational sequence ...




In [64]:
print(list(dataset[text_field].keys())[55])

-HwX2H8Z4hY[4]


## Split the Dataset ##

Use the standard/provided train, test, and dev splits but extract the `segment no.` (see example key in cell above) using regex

In [32]:
# Train/Dev/Test splits (based on video IDs)
train_split = DATASET.standard_folds.standard_train_fold
dev_split = DATASET.standard_folds.standard_valid_fold
test_split = DATASET.standard_folds.standard_test_fold

# Splits only contain video IDs (i.e. we need to append the segment no. as well)
print(test_split)

['7l3BNtSE0xc', 'dZFV0lyedX4', '286943', '126872', 'qgC8_emxSIU', 'kld9r0iFkWM', 'rC29Qub0U7A', '4YfyP0uIqw0', 'FMenDv3y8jc', '4wLP4elp1uM', 'KYQTwFVBzME', '27v7Blr0vjw', 'DnBHq5I52LM', 'HR18U0yAlTc', 'x266rUJQC_8', 'd1CDP6sMuLA', 'xSCvspXYU9k', '4EDblUpJieU', '4o4ilPK9rl8', '53609', 'SZ7HK5ns6mE', '243981', 'ySblgk7T7eQ', 'MYEyQUpMe3k', 'EujJ0SwiCRE', '3HyAaqre_Fk', 'iQDB_OkAQWs', 'gE7kUqMqQ9g', 'eFV7iFPYZB4', 'IRSxo_XXArg', '3hOlJf_JQDs', 'BRSyH6yfDLk', '1jogeKX0wGw', '3At-BKm9eYk', 'NVLPURuAVLU', 'pZye4zFzk3o', 'l1jW3OMXUzs', 'XKyumlBmix8', 'eKQKEi2-0Ws', 'WgI8IbJtXHw', 'tnWmVXZ87h0', 'YCEllKyaCrc', 'W1CWpktWtTs', '8wQhzezNcUY', '0bxhZ-LIfZY', 'lrjm6F3JJgg', 'Vdf1McvE9ao', 'eQc5uI7FKCU', '2QXHdu2zlQY', 'YCI-ZzclIPQ', '2Ky9DBSl49w', 'SKTyBOhDX6U', 'b86B3hP8ARM', '23656', 'kpS4BXif_Sw', 'dR68gbeOWOc', 'tC2KicUHB9Q', 'absh1hsZeF0', 'c5zxqITn3ZM', 'uogwnZGb-iE', '46495', 'Sq6DIhFxPqQ', 'PexNiFbPTYM', 'z441aDJvAcU', 'OORklkFql3k', 'WbtsuXkaGeg', 'grsV1YN1z5s', 'Gc_zIjqqUys', '424SXFTCFsA

In [33]:
# Data aligned in the format of 'video_id[segment_no]', but splits specified with video_id only
# we need to match the video IDs
import torch
import torch.nn as nn

from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm_notebook
from collections import defaultdict

# a sentinel epsilon for safe division, without it we will replace illegal values with a constant
EPS = 0

# construct a word2id mapping that automatically takes increment when new words are encountered
word2id = defaultdict(lambda: len(word2id))
UNK = word2id['<unk>']
PAD = word2id['<pad>']

# place holders for the final train/dev/test dataset splits
train = []
dev = []
test = []

# define a regular expression to extract the video ID out of the keys (e.g. )
pattern = re.compile('(.*)\[.*\]')
num_drop = 0 # a counter to count how many data points went into some processing issues

for segment in dataset[label_field].keys():
    
    # get the video ID and the features out of the aligned dataset
    vid = re.search(pattern, segment).group(1)
    label = dataset[label_field][segment]['features']
    _words = dataset[text_field][segment]['features']
    _acoustic = dataset[acoustic_field][segment]['features']

    # if the sequences are not same length after alignment, there must be some problem with some modalities
    # we should drop it or inspect the data again
    if not _words.shape[0] == _acoustic.shape[0]:
        print(f"Encountered datapoint {vid} with text shape {_words.shape}, acoustic shape {_acoustic.shape}")
        num_drop += 1
        continue

    # remove nan values
    label = np.nan_to_num(label)
    _acoustic = np.nan_to_num(_acoustic)

    # remove speech pause tokens - this is in general helpful
    # we should remove speech pauses and corresponding acoustic features together
    # otherwise modalities would no longer be aligned
    words = []
    acoustic = []
    for i, word in enumerate(_words):
        if word[0] != b'sp':
            words.append(word2id[word[0].decode('utf-8')]) # SDK stores strings as bytes, decode into strings here
            acoustic.append(_acoustic[i, :])

    words = np.asarray(words)
    acoustic = np.asarray(acoustic)

    # z-normalization per instance and remove nan/infs
    acoustic = np.nan_to_num((acoustic - acoustic.mean(0, keepdims=True)) / (EPS + np.std(acoustic, axis=0, keepdims=True)))

    if vid in train_split:
        train.append(((word, acoustic), label, segment))
    elif vid in dev_split:
        dev.append(((words, acoustic), label, segment))
    elif vid in test_split:
        test.append(((words, acoustic), label, segment))
    else:
        print(f"Found video that doesn't belong to any splits: {vid}")

print(f"Total number of {num_drop} datapoints have been dropped.")

# turn off the word2id - define a named function here to allow for pickling
def return_unk():
    return UNK
word2id.default_factory = return_unk


  acoustic = np.nan_to_num((acoustic - acoustic.mean(0, keepdims=True)) / (EPS + np.std(acoustic, axis=0, keepdims=True)))
  x = um.multiply(x, x, out=x)
  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)


Found video that doesn't belong to any splits: -9YyBTjo1zo
Found video that doesn't belong to any splits: -9YyBTjo1zo
Found video that doesn't belong to any splits: -9YyBTjo1zo
Found video that doesn't belong to any splits: -9YyBTjo1zo
Found video that doesn't belong to any splits: -9YyBTjo1zo
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7eu

## Inspect the dataset: ##

In [63]:
# let's see the size of each set and shape of data
print(len(train))
print(len(dev))
print(len(test))

print(train[0][0][1].shape)
print(train[0][1].shape)
print(train[0][1])

print(f"Total vocab size: {len(word2id)}")

print(word2id.keys())

16327
1871
4662
(42, 74)
(1, 7)
[[1.        0.6666667 0.6666667 0.        0.        0.        0.6666667]]
Total vocab size: 16825


## PyTorch DataLoader Functions: ##

In [37]:
def multi_collate(batch):
    '''
    Collate functions assume batch = [Dataset[i] for i in index_set]
    '''
    # for later use we sort the batch in descending order of length
    batch = sorted(batch, key=lambda x: x[0][0].shape[0], reverse=True)
    
    # get the data out of the batch - pad text & speech sequences to be of uniform length
    labels = torch.cat([torch.from_numpy(sample[1]) for sample in batch], dim=0)
    sentences = pad_sequence([torch.LongTensor(sample[0][0]) for sample in batch], padding_value=PAD)
    acoustic = pad_sequence([torch.FloatTensor(sample[0][1]) for sample in batch])
    
    # lengths are useful later in using RNNs
    lengths = torch.LongTensor([sample[0][0].shape[0] for sample in batch])
    return sentences, acoustic, labels, lengths

# construct dataloaders
batch_size = 32
train_loader = DataLoader(train, shuffle=True, batch_size=batch_size, collate_fn=multi_collate)
dev_loader = DataLoader(dev, shuffle=False, batch_size=batch_size, collate_fn=multi_collate)
test_loader = DataLoader(test, shuffle=False, batch_size=batch_size, collate_fn=multi_collate)

# create a temp dataloader to see batch shape
temp_loader = iter(DataLoader(test, shuffle=True, batch_size=8, collate_fn=multi_collate))
batch = next(temp_loader)

print(batch[0].shape) # word vectors, padded to maxlen
print(batch[1].shape) # acoustic features
print(batch[2]) # labels
print(batch[3]) # lengths

torch.Size([54, 8])
torch.Size([54, 8, 74])
tensor([[ 0.0000,  0.0000,  0.3333,  0.0000,  0.0000,  0.0000,  0.0000],
        [-1.3333,  0.0000,  0.0000,  0.3333,  0.0000,  0.3333,  0.0000],
        [ 2.6667,  2.6667,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.6667,  0.3333,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.0000,  0.3333,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.3333,  0.0000,  1.0000,  0.0000,  0.0000,  1.3333,  1.0000],
        [-3.0000,  0.0000,  0.0000,  1.3333,  0.0000,  0.3333,  0.0000],
        [-0.3333,  0.0000,  0.3333,  0.3333,  0.0000,  0.3333,  0.3333]])
tensor([54, 43, 39, 32, 27, 21, 17, 14])


In [62]:
# Let's actually inspect the transcripts to ensure it's correct
id2word = {v:k for k, v in word2id.items()}
examine_target = train
idx = np.random.randint(0, len(examine_target))
print(' '.join(list(map(lambda x: id2word[x], examine_target[idx][0][0].tolist()))))
# print(' '.join(examine_target[idx][0]))
print(examine_target[idx][1])
print(examine_target[idx][2])

KeyError: b'sp'