# Use CMU-Multimodal-SDK to Obtain Dataset #

### First install the SDK by running: ###

```console
git clone https://github.com/CMU-MultiComp-Lab/CMU-MultimodalSDK.git

cd CMU-MultimodalSDK
pip install .
```


Set the `SDK_PATH` variable in `constants/paths.py` to the path of the directory you just cloned.

In [11]:
from constants import SDK_PATH, DATA_PATH, WORD_EMB_PATH, CACHE_PATH
import sys

# The SDK location must be configured before `mmsdk` can be imported below,
# because the import relies on SDK_PATH being on sys.path.
if SDK_PATH is None:
    print("SDK path is not specified! Please specify first in constants/paths.py")
    # Abort with a non-zero status so the misconfiguration is reported as a
    # failure; sys.exit is used because the bare `exit` helper is only
    # installed by the `site` module and is not guaranteed to exist.
    sys.exit(1)

sys.path.append(SDK_PATH)

import mmsdk
import os
import re
import numpy as np
from mmsdk import mmdatasdk as md
from subprocess import check_call, CalledProcessError

# Create the folder (and any missing parents) that will hold the downloaded
# data. os.makedirs is used instead of shelling out to `mkdir -p` so that the
# path is never interpreted by a shell (spaces and metacharacters are safe)
# and the step works on platforms without a `mkdir` binary.
os.makedirs(DATA_PATH, exist_ok=True)

# Fetch the high-level features, the raw (low-level) data and the labels of
# the MOSEI dataset into DATA_PATH, one group of sequences at a time.
# A RuntimeError raised by the SDK for a group is reported as "downloaded
# previously" and that group is skipped rather than re-downloaded.

DATASET = md.cmu_mosei

# (sequence group, message printed when the SDK raises RuntimeError for it)
_downloads = (
    (DATASET.highlevel, "High-level features have been downloaded previously."),
    (DATASET.raw, "Raw data have been downloaded previously."),
    (DATASET.labels, "Labels have been downloaded previously."),
)

for _sequences, _already_present_msg in _downloads:
    try:
        md.mmdataset(_sequences, DATA_PATH)
    except RuntimeError:
        print(_already_present_msg)

[91m[1m[2024-04-20 13:35:54.126] | Error   | [0m./data/CMU_MOSEI_TimestampedWordVectors.csd file already exists ...
High-level features have been downloaded previously.
[94m[1m[2024-04-20 13:35:55.282] | Status  | [0mDownloading from http://immortal.multicomp.cs.cmu.edu/CMU-MOSEI/language/CMU_MOSEI_TimestampedWords.csd to ./data/CMU_MOSEI_TimestampedWords.csd...


                                                                     

[92m[1m[2024-04-20 13:36:01.751] | Success | [0mDownload complete!
[92m[1m[2024-04-20 13:36:01.780] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_TimestampedWords.csd ...
[94m[1m[2024-04-20 13:36:01.975] | Status  | [0mChecking the integrity of the <words> computational sequence ...
[94m[1m[2024-04-20 13:36:01.975] | Status  | [0mChecking the format of the data in <words> computational sequence ...


                                                                                  

[92m[1m[2024-04-20 13:36:02.614] | Success | [0m<words> computational sequence data in correct format.
[94m[1m[2024-04-20 13:36:02.614] | Status  | [0mChecking the format of the metadata in <words> computational sequence ...
[94m[1m[2024-04-20 13:36:02.704] | Status  | [0mDownloading from http://immortal.multicomp.cs.cmu.edu/CMU-MOSEI/language/CMU_MOSEI_TimestampedPhones.csd to ./data/CMU_MOSEI_TimestampedPhones.csd...


                                                                     

[92m[1m[2024-04-20 13:36:11.273] | Success | [0mDownload complete!
[92m[1m[2024-04-20 13:36:11.275] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_TimestampedPhones.csd ...
[94m[1m[2024-04-20 13:36:11.322] | Status  | [0mChecking the integrity of the <phoneme> computational sequence ...
[94m[1m[2024-04-20 13:36:11.322] | Status  | [0mChecking the format of the data in <phoneme> computational sequence ...


                                                                                  

[92m[1m[2024-04-20 13:36:12.007] | Success | [0m<phoneme> computational sequence data in correct format.
[94m[1m[2024-04-20 13:36:12.007] | Status  | [0mChecking the format of the metadata in <phoneme> computational sequence ...
[92m[1m[2024-04-20 13:36:12.007] | Success | [0mDataset initialized successfully ... 
[94m[1m[2024-04-20 13:36:12.143] | Status  | [0mDownloading from http://immortal.multicomp.cs.cmu.edu/CMU-MOSEI/labels/CMU_MOSEI_Labels.csd to ./data/CMU_MOSEI_Labels.csd...


                                                                     

[92m[1m[2024-04-20 13:36:15.771] | Success | [0mDownload complete!
[92m[1m[2024-04-20 13:36:15.772] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_Labels.csd ...
[94m[1m[2024-04-20 13:36:15.816] | Status  | [0mChecking the integrity of the <All Labels> computational sequence ...
[94m[1m[2024-04-20 13:36:15.816] | Status  | [0mChecking the format of the data in <All Labels> computational sequence ...


                                                                                  

[92m[1m[2024-04-20 13:36:16.288] | Success | [0m<All Labels> computational sequence data in correct format.
[94m[1m[2024-04-20 13:36:16.288] | Status  | [0mChecking the format of the metadata in <All Labels> computational sequence ...
[92m[1m[2024-04-20 13:36:16.288] | Success | [0mDataset initialized successfully ... 




In [12]:
# Show the contents of DATA_PATH, one entry per line; the downloaded
# computational sequences should appear as .csd files.
data_files = os.listdir(DATA_PATH)
print(*data_files, sep='\n')

CMU_MOSEI_TimestampedWordVectors.csd
CMU_MOSEI_COVAREP.csd
CMU_MOSEI_TimestampedPhones.csd
CMU_MOSEI_TimestampedWords.csd
CMU_MOSEI_Labels.csd


## Load the Dataset: ##

Create a recipe (a dictionary of the form `{feature_name: csd_path}`) and pass it to the `mmdataset` constructor.

In [13]:
# Names of the computational sequences (modalities) to load; each one matches
# the base name of a .csd file inside DATA_PATH.
text_field = 'CMU_MOSEI_TimestampedWords'
acoustic_field = 'CMU_MOSEI_COVAREP'

features = [text_field, acoustic_field]

# Recipe: feature name -> path of the corresponding .csd file.
recipe = {
    name: os.path.join(DATA_PATH, name + '.csd')
    for name in features
}

# Build the dataset object from the recipe.
dataset = md.mmdataset(recipe)

[92m[1m[2024-04-20 13:36:34.146] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_TimestampedWords.csd ...
[94m[1m[2024-04-20 13:36:34.534] | Status  | [0mChecking the integrity of the <words> computational sequence ...
[94m[1m[2024-04-20 13:36:34.534] | Status  | [0mChecking the format of the data in <words> computational sequence ...


                                                                                  

[92m[1m[2024-04-20 13:36:35.105] | Success | [0m<words> computational sequence data in correct format.
[94m[1m[2024-04-20 13:36:35.105] | Status  | [0mChecking the format of the metadata in <words> computational sequence ...
[92m[1m[2024-04-20 13:36:35.105] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_COVAREP.csd ...
[94m[1m[2024-04-20 13:36:35.395] | Status  | [0mChecking the integrity of the <COVAREP> computational sequence ...
[94m[1m[2024-04-20 13:36:35.396] | Status  | [0mChecking the format of the data in <COVAREP> computational sequence ...


                                                                                  

[92m[1m[2024-04-20 13:36:36.518] | Success | [0m<COVAREP> computational sequence data in correct format.
[94m[1m[2024-04-20 13:36:36.518] | Status  | [0mChecking the format of the metadata in <COVAREP> computational sequence ...
[92m[1m[2024-04-20 13:36:36.518] | Success | [0mDataset initialized successfully ... 


