In [None]:
# @title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the "License")

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License

# Speech Emotion Recognition using Apache Beam

<table align="left">
  <td>
    <a target="_blank" href="https://colab.sandbox.google.com/github/apache/beam/blob/master/examples/notebooks/beam-ml/speech_emotion_tensorflow.ipynb"><img src="https://raw.githubusercontent.com/google/or-tools/main/tools/colab_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/speech_emotion_tensorflow.ipynb"><img src="https://raw.githubusercontent.com/google/or-tools/main/tools/github_32px.png" />View source on GitHub</a>
  </td>
</table>

Speech Emotion Classification is a machine learning technique that deciphers emotions from audio data. It involves data augmentation, feature extraction, preprocessing and training an appropriate model. For structured workflow design, Apache Beam is a suitable tool. This notebook showcases Apache Beam's use in speech emotion classification and achieves the following:

* Imports and processes the CREMA-D dataset for speech emotion analysis.
* Performs various data augmentation and feature extraction techniques using the [Librosa](https://librosa.org/doc/latest/index.html) library.
* Develops a TensorFlow model to classify emotions.
* Stores the trained model.
* Constructs a Beam pipeline that:
 * Creates a PCollection of audio samples.
 * Applies preprocessing transforms.
 * Utilizes the trained model to predict emotions.
 * Stores the emotion predictions.

For more insights into leveraging Apache Beam for machine learning pipelines, explore [AI/ML Pipelines using Beam](https://beam.apache.org/documentation/ml/overview/).

## Installing Apache Beam

In [None]:
# Install the Apache Beam Python SDK into the notebook runtime (--quiet trims the installer output).
!pip install apache_beam --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m89.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.7/89.7 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.9/139.9 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.0/152.0 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m92.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.4/43.4 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m671.3/671.3 kB[0m [31m57.8 MB/s[0m 


## Importing necessary libraries

Here is a brief overview of the libraries imported:
* **[os](https://docs.python.org/3/library/os.html)**: Used for file and directory operations.
* **[NumPy](https://numpy.org/doc/stable/)**: Allows efficient numerical manipulation of arrays.
* **[Pandas](https://pandas.pydata.org/docs/)**: Facilitates data manipulation and analysis.
* **[Librosa](https://librosa.org/doc/latest/index.html)**: Provides tools for analyzing and working with audio data.
* **[IPython](https://ipython.readthedocs.io/en/stable/index.html)**: Creates visualizations for multimedia content. Here we have used it for playing audio files.
* **[Sklearn](https://scikit-learn.org/stable/index.html)**: Offers comprehensive tools for Machine Learning. Here we have used it for preprocessing and splitting the data.
* **[TensorFlow](https://www.tensorflow.org/api_docs)** and **[Keras](https://keras.io/api/)**: Enables building and training complex Machine Learning and Deep Learning models.
* **[TFModelHandlerNumpy](https://beam.apache.org/documentation/sdks/python-machine-learning/#tensorflow)**: Defines the configuration used to load/use the model that we train. We use TFModelHandlerNumpy because the model was trained with TensorFlow and takes numpy arrays as input.
* **[RunInference](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.inference.html#apache_beam.ml.inference.RunInference)**: Loads the model and obtains predictions as part of the Apache Beam pipeline. For more information, see docs on prediction and inference.
* **[Apache Beam](https://beam.apache.org/documentation/)**: Builds the pipeline that preprocesses the audio and runs inference.

In [None]:
import os

import numpy as np
import pandas as pd

import librosa
from IPython.display import Audio

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import tensorflow as tf
from tensorflow import keras
from tensorflow.python.keras.callbacks import EarlyStopping, ReduceLROnPlateau

from keras import layers
from keras import models
from keras.utils import np_utils
from keras.models import Sequential
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint

from apache_beam.ml.inference.tensorflow_inference import TFModelHandlerNumpy
from apache_beam.ml.inference.base import RunInference
import apache_beam as beam

## Importing dataset from Google Drive

[CREMA-D](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4313618/) is a dataset that contains a collection of 7442 audio recordings of actors portraying different emotions. The dataset can be downloaded from [Kaggle](https://www.kaggle.com/datasets/ejlok1/cremad). As it is large in size, it will be inconvenient to upload it on Colab every time we want to run the notebook. Instead, we have uploaded the dataset on Google Drive after downloading it from Kaggle. Then we can access it directly using Colab.

If you follow this method, make sure your Colab notebook is opened with the same Google account that owns the Drive folder containing the dataset.

In [None]:
# Mount Google Drive at /content/gdrive so the dataset folder can be read like a local directory.
# force_remount=True requests a fresh mount even when a previous one is still present.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


Here we create a path for the folder in Google Drive containing the audios to access them.

In [None]:
# Base of the mounted Drive and the dataset folder inside it (both keep their trailing slash).
root_dir = "/content/gdrive/My Drive/"
Crema = f"{root_dir}CREMA/"

Using the os library, we can list all the audio files in the Google Drive folder

In [None]:
# Make the dataset folder the working directory; relative paths used later in the
# notebook (for example 'features.csv') therefore resolve inside this folder.
os.chdir(Crema)
os.listdir()[:10] # Listing the first 10 audio files

['1079_TIE_NEU_XX.wav',
 '1079_TIE_SAD_XX.wav',
 '1079_TSI_ANG_XX.wav',
 '1079_TSI_DIS_XX.wav',
 '1079_TSI_HAP_XX.wav',
 '1079_TSI_FEA_XX.wav',
 '1079_TSI_NEU_XX.wav',
 '1079_TSI_SAD_XX.wav',
 '1079_WSI_ANG_XX.wav',
 '1079_WSI_DIS_XX.wav']

## Creating a DataFrame
We will create a DataFrame with two columns, path and emotion:
* Path: This will contain the path to a specific audio file in the directory.
* Emotion: This is the label which will state the emotion of an audio file.

The emotion can be extracted from the audio file name.

In [None]:
# CREMA-D file names look like '1079_TIE_NEU_XX.wav'; the third '_'-separated
# field is the emotion code, which this table turns into a readable label.
EMOTION_LABELS = {
    'SAD': 'sad',
    'ANG': 'angry',
    'DIS': 'disgust',
    'FEA': 'fear',
    'HAP': 'happy',
    'NEU': 'neutral',
}

emotion_df = []

for wav in os.listdir(Crema):
    info = wav.partition(".wav")[0].split("_")
    if len(info) < 3:
        # Not named like a dataset recording; skip it.
        continue
    label = EMOTION_LABELS.get(info[2])
    if label is None:
        # Unrecognised emotion code; skip it rather than guess a label.
        continue
    # os.path.join avoids the doubled '/' that plain concatenation produced,
    # because Crema already ends with a separator.
    emotion_df.append((label, os.path.join(Crema, wav)))

# Naming the columns up front keeps 'Emotion' and 'Path' available even when
# no recording was found.
Crema_df = pd.DataFrame(emotion_df, columns=["Emotion", "Path"])

Crema_df.head()

Unnamed: 0,Emotion,Path
0,neutral,/content/gdrive/My Drive/CREMA//1079_TIE_NEU_X...
1,sad,/content/gdrive/My Drive/CREMA//1079_TIE_SAD_X...
2,angry,/content/gdrive/My Drive/CREMA//1079_TSI_ANG_X...
3,disgust,/content/gdrive/My Drive/CREMA//1079_TSI_DIS_X...
4,happy,/content/gdrive/My Drive/CREMA//1079_TSI_HAP_X...


## Preprocessing

The audio files we want to use are in .wav format. However, an ML model works on numerical data. So we need to perform some preprocessing operations to extract numerical features from the audios and transform these features to a more suitable form. This will improve the performance of our model.

### Data Augmentation

This is the process of transforming existing data in various ways to generate more samples and increase model robustness. We make multiple versions of the same data item but with some differences. This allows the model to recognize a wider variety of data and reduce overfitting. We have performed the following data augmentation techniques:
* **Noise injection**: Adds a random factor to all data items to provide some noise.
* **Stretching**: Alters the speed of an audio, simulating variations in speech rate or tempo.
* **Pitch Shifting**: Changes the pitch of an audio, depicting variations of  speaker characteristics or musical notes.

In [None]:
def noise(data):
    """Return ``data`` with random Gaussian noise added.

    The noise amplitude is a random fraction (below 3.5%) of the largest
    sample value in ``data``; the input array itself is not modified.

    Args:
        data: NumPy array of audio samples; ``data.shape[0]`` noise values
            are drawn and added to it.

    Returns:
        A new array holding the noisy samples.
    """
    amplitude = 0.035 * np.random.uniform() * np.amax(data)
    jitter = np.random.normal(size=data.shape[0])
    return data + amplitude * jitter

def stretch(data, rate = 0.8):
    """Time-stretch an audio signal with librosa.

    Args:
        data: Audio samples, passed unchanged to
            ``librosa.effects.time_stretch``.
        rate: Stretch factor forwarded to librosa (defaults to 0.8).

    Returns:
        The result of ``librosa.effects.time_stretch``.
    """
    stretched = librosa.effects.time_stretch(data, rate=rate)
    return stretched

def pitch(data, sampling_rate, pitch_factor = 0.7):
    """Pitch-shift an audio signal with librosa.

    Args:
        data: Audio samples, passed unchanged to
            ``librosa.effects.pitch_shift``.
        sampling_rate: Sampling rate of ``data`` (librosa's ``sr``).
        pitch_factor: Size of the shift, forwarded as librosa's ``n_steps``
            (defaults to 0.7).

    Returns:
        The result of ``librosa.effects.pitch_shift``.
    """
    shifted = librosa.effects.pitch_shift(
        data,
        sr=sampling_rate,
        n_steps=pitch_factor,
    )
    return shifted

### Feature Extraction

We need to extract some numerical features from the audios to feed our ML model. The [Librosa](https://librosa.org/doc/latest/index.html) library allows us to do this easily.

First, we need to understand what a **mel scale** is. It is a scale of pitches that is based on the way humans perceive and discriminate between different frequencies of sound. Now, let us discuss the features we will extract from the audio:

* **Zero Crossing Rate (ZCR)**: Measures how often the sound changes its sign (positive or negative) over time.
* **Chroma Short-Time Fourier Transform (STFT)**: Breaks down the audio signal into small segments (frames) and calculates the Fourier Transform for each frame, resulting in a time-frequency representation of the signal.
* **Mel-Frequency Cepstral Coefficients (MFCC)**: A set of coefficients derived from the mel spectrogram.
* **Mel spectrogram**: A visual representation of the frequency content of an audio signal mapped on the mel scale.
* **Root Mean Square**: Provides the Root Mean Square value for each frame, which is a measure of the amplitude or energy of a sound signal.

You can read more about all the features we can extract using the Librosa library [here](https://librosa.org/doc/latest/feature.html).

In [None]:
def extract_features(data, sample_rate):
    """Build one flat feature vector describing an audio clip.

    Five librosa descriptors are computed, each is averaged along its second
    axis (leaving one value per coefficient), and the averages are
    concatenated in this order: zero crossing rate, chroma STFT, MFCC,
    root mean square, mel spectrogram.

    Args:
        data: Audio samples, e.g. as returned by ``librosa.load``.
        sample_rate: Sampling rate of ``data``, forwarded to librosa.

    Returns:
        1-D NumPy array with the concatenated per-descriptor means.
    """
    def per_row_mean(matrix):
        # Transposing and averaging over axis 0 collapses the second axis.
        return np.mean(matrix.T, axis=0)

    # Zero crossing rate
    zcr = per_row_mean(librosa.feature.zero_crossing_rate(y=data))

    # Chroma STFT, computed from the magnitude of the short-time Fourier transform
    magnitude = np.abs(librosa.stft(data))
    chroma = per_row_mean(librosa.feature.chroma_stft(S=magnitude, sr=sample_rate))

    # MFCC
    mfcc = per_row_mean(librosa.feature.mfcc(y=data, sr=sample_rate))

    # Root mean square
    rms = per_row_mean(librosa.feature.rms(y=data))

    # Mel spectrogram
    mel = per_row_mean(librosa.feature.melspectrogram(y=data, sr=sample_rate))

    # The leading empty array (NumPy's default float dtype) mirrors the seed
    # used when the pieces were stacked one at a time, so the dtype promotion
    # of the result is unchanged.
    return np.hstack([np.array([]), zcr, chroma, mfcc, rms, mel])

The function below is used to extract the features from the audio stored at a path. Then it applies the data augmentation techniques we defined previously, and extracts features for each augmented data too. This gives us three versions of a data item:
* Normal features
* Features from data with noise
* Features from time stretched and pitch shifted data

These are added into our final dataset as individual samples.

In [None]:
def get_features(path):
    """Load one audio file and return feature rows for three variants of it.

    The clip is read with ``librosa.load`` (2.5 s starting at 0.6 s) and
    features are extracted from, in order:

    1. the unmodified samples,
    2. the samples with noise added,
    3. the samples after time stretching followed by pitch shifting.

    Args:
        path: Location of the audio file to load.

    Returns:
        2-D NumPy array with one row of features per variant (three rows).
    """
    data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)

    rows = []

    # without augmentation
    rows.append(extract_features(data, sample_rate))

    # data with noise
    rows.append(extract_features(noise(data), sample_rate))

    # data with stretching and pitching
    stretched_then_shifted = pitch(stretch(data), sample_rate)
    rows.append(extract_features(stretched_then_shifted, sample_rate))

    # stacking vertically: one row per variant
    return np.vstack(rows)

Now we will iterate through the Crema_df DataFrame containing the path and emotion of each audio sample. We will extract features for each audio's three versions, add it to X, and add the corresponding emotion to Y.

In [None]:
# Gather one feature row per clip variant in X and the clip's emotion,
# repeated once per row, in Y.
X, Y = [], []
for path, emotion in zip(Crema_df["Path"], Crema_df["Emotion"]):
    rows = get_features(path)
    X.extend(rows)
    Y.extend([emotion] * len(rows))

  return pitch_tuning(


Here we have made a DataFrame using the lists X and Y.

In [None]:
# One row per sample: the feature columns followed by a 'labels' column.
# The table is also written to features.csv (relative to the working directory).
Features = pd.DataFrame(X).assign(labels=Y)
Features.to_csv('features.csv', index=False)
Features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,153,154,155,156,157,158,159,160,161,labels
0,0.051835,0.552957,0.564289,0.512976,0.518041,0.528111,0.50115,0.55049,0.673705,0.744412,...,2.713831e-09,2.560777e-09,2.451516e-09,2.36935e-09,2.308e-09,2.264365e-09,2.232698e-09,2.212761e-09,2.200083e-09,neutral
1,0.08179,0.611068,0.619012,0.578897,0.580346,0.604983,0.552418,0.557888,0.677792,0.749837,...,8.333886e-05,7.936021e-05,7.905496e-05,8.138233e-05,7.764955e-05,7.412745e-05,7.555283e-05,8.043366e-05,8.144332e-05,neutral
2,0.054339,0.525215,0.525026,0.478083,0.526773,0.554233,0.521426,0.558976,0.671527,0.739728,...,3.503047e-09,3.054322e-09,2.943538e-09,2.634693e-09,2.343703e-09,2.368675e-09,2.363831e-09,1.876258e-09,6.53876e-10,neutral
3,0.050157,0.514931,0.591693,0.464526,0.429137,0.480203,0.572344,0.72263,0.699706,0.676802,...,3.512564e-09,3.153377e-09,2.90109e-09,2.715085e-09,2.576861e-09,2.47634e-09,2.403195e-09,2.354688e-09,2.325111e-09,sad
4,0.098122,0.606869,0.680955,0.572593,0.548943,0.581684,0.626757,0.75492,0.735712,0.713573,...,0.0001368801,0.0001329551,0.0001397343,0.000143389,0.0001408767,0.0001354171,0.0001373235,0.0001433754,0.0001442893,sad


The X and Y datasets are separated here. X stores the features of audio samples while Y stores the corresponding labels.

In [None]:
# Everything except the trailing 'labels' column is model input; 'labels' is the target.
X = Features.drop(columns='labels').to_numpy()
Y = Features['labels'].to_numpy()

The [pad sequences](https://www.tensorflow.org/api_docs/python/tf/keras/utils/pad_sequences) function is used to pad the input data to the same length, to ensure that all samples have the same shape.

In [None]:
# pad_sequences casts to int32 unless a dtype is given, which would truncate the
# fractional feature values; request a float dtype so the features are kept as-is.
X = tf.keras.utils.pad_sequences(X, dtype='float32')

Scikit Learn's [OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html) is used to convert categorical labels into numerical data. It creates a column in the labels dataset for each category, which contains only binary data. For example, if we have the following categories:

`[Anger, Disgust, Fear, Happy, Neutral, Sad]`

And a specific audio belongs to 'Anger' category, then the OneHotEncoder will transform it to:

`[1, 0, 0, 0, 0, 0]`

Please note that the order of which column represents which category may differ.

In [None]:
# OneHotEncoder expects a 2-D column, so the labels are reshaped to (n_samples, 1)
# before fitting; the sparse result is converted to a dense array.
encoder = OneHotEncoder()
labels_column = np.array(Y).reshape(-1, 1)
Y = encoder.fit_transform(labels_column).toarray()

Splitting into train/test splits

In [None]:
# Hold out part of the data for evaluation; random_state=0 fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, shuffle=True, random_state=0
)
tuple(part.shape for part in (x_train, y_train, x_test, y_test))

((16744, 162), (16744, 6), (5582, 162), (5582, 6))

Now we will scale the training and testing sets created above.
* [Scaling](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.scale.html) is done to make all numerical data have similar magnitudes. This makes computations easier.
* The training sets are used to train the model.
* The testing sets are used to test the model's accuracy.

In [None]:
# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
tuple(part.shape for part in (x_train, y_train, x_test, y_test))

((16744, 162), (16744, 6), (5582, 162), (5582, 6))

In [None]:
# Shapes of the full feature matrix and of the two splits.
tuple(arr.shape for arr in (X, x_train, x_test))

((22326, 162), (16744, 162), (5582, 162))

We will use a 1D Convolutional layer in our model, and for that, our input data needs to be a 3D tensor with dimensions `(batch_size, time_steps, input_dim)`. So we will expand the dimensions of our `x_train` and `x_test` datasets. The extra 1 in the shape indicates that each time step carries a single value.

In [None]:
# Append a trailing axis of length 1 to the feature arrays; the labels are untouched.
x_train = x_train[:, :, np.newaxis]
x_test = x_test[:, :, np.newaxis]
tuple(part.shape for part in (x_train, y_train, x_test, y_test))

((16744, 162, 1), (16744, 6), (5582, 162, 1), (5582, 6))

### Training the model
We will build a sequential model to classify speech emotions using TensorFlow and Keras. Here is an overview of the layers used:
* **Conv1D**: Applies a set of filters to capture patterns in sequential data like time series or audio, enabling feature extraction through sliding convolutions.
* **Activation**: Introduces non-linearity by applying an element-wise activation function to the input, enhancing the network's learning capacity.
* **BatchNormalization**: Normalizes input activations within a mini-batch, accelerating training by stabilizing and improving gradient flow.
* **Dropout**: Randomly deactivates a fraction of neurons during training, reducing overfitting by promoting generalization.
* **MaxPooling1D**: Downsamples the input by retaining the maximum value in each local region, reducing computation.
* **Flatten**: Reshapes input data from a multidimensional format into a 1D vector, suitable for fully connected layers.
* **Dense**: Connects each neuron to every neuron in the previous layer, allowing complex relationships to be learned during training.

In the end, we need probabilities for each of the 6 classes of emotions, so we need 6 outputs. This is why the last Dense layer returns an array of size 6.


In [None]:
# The network is declared as a single ordered list of layers: three groups of
# Conv1D blocks (256, 128 and 64 filters, kernel size 6, 'same' padding) with
# max pooling after the first two groups, followed by a 6-way softmax classifier.
model = Sequential([
    # 256-filter group
    layers.Conv1D(256, 6, padding='same', input_shape=(x_train.shape[1], 1)),
    layers.Activation('relu'),
    layers.Conv1D(256, 6, padding='same'),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dropout(0.2),
    layers.MaxPooling1D(pool_size=8),

    # 128-filter group
    layers.Conv1D(128, 6, padding='same'),
    layers.Activation('relu'),
    layers.Conv1D(128, 6, padding='same'),
    layers.Activation('relu'),
    layers.Dropout(0.2),
    layers.Conv1D(128, 6, padding='same'),
    layers.Activation('relu'),
    layers.Conv1D(128, 6, padding='same'),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dropout(0.2),
    layers.MaxPooling1D(pool_size=8),

    # 64-filter group
    layers.Conv1D(64, 6, padding='same'),
    layers.Activation('relu'),
    layers.Conv1D(64, 6, padding='same'),
    layers.Activation('relu'),
    layers.Dropout(0.2),

    # Classifier head: one output per emotion class
    layers.Flatten(),
    layers.Dense(6),
    layers.Activation('softmax'),
])

# Optimizer used when the model is compiled.
opt = keras.optimizers.Adam(learning_rate=0.0001)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 162, 256)          1792      
                                                                 
 activation (Activation)     (None, 162, 256)          0         
                                                                 
 conv1d_1 (Conv1D)           (None, 162, 256)          393472    
                                                                 
 batch_normalization (BatchN  (None, 162, 256)         1024      
 ormalization)                                                   
                                                                 
 activation_1 (Activation)   (None, 162, 256)          0         
                                                                 
 dropout (Dropout)           (None, 162, 256)          0         
                                                        

Now we will compile the model.

In [None]:
# The labels are one-hot encoded, hence categorical cross-entropy; accuracy is tracked as a metric.
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

Next, we will train our model. [ReduceLROnPlateau](https://keras.io/api/callbacks/reduce_lr_on_plateau/) is used to reduce the learning rate when the loss has stopped improving. [EarlyStopping](https://keras.io/api/callbacks/early_stopping/) monitors the val_loss and stops the training process when it doesn't improve.

In [None]:
# Reduce the learning rate (x0.4, floor 1e-7) when the training loss has not
# improved for 2 epochs.
rlrp = ReduceLROnPlateau(
    monitor='loss',
    factor=0.4,
    patience=2,
    min_lr=1e-7,
    verbose=0,
)

# Stop training once the validation loss has not improved for 20 epochs.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=20,
    verbose=1,
)

model.fit(
    x_train,
    y_train,
    batch_size=16,
    epochs=100,
    validation_data=(x_test, y_test),
    callbacks=[es, rlrp],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 00064: early stopping


<keras.callbacks.History at 0x7a8558211bd0>

We can see that the accuracy of our model is not very high. This is because speech data is more complex than other forms of data and much more training data and/or preprocessing techniques are required to build a good speech emotion classifier. If you want to increase the accuracy, you can use multiple datasets instead of just one, and use more features from the Librosa library. You can also try experimenting with LSTM layers in the model. Here are some of the popular speech emotion datasets:
* [RAVDESS](https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio)
* [LSSED](https://github.com/tobefans/LSSED)
* [TESS](https://www.kaggle.com/datasets/ejlok1/toronto-emotional-speech-set-tess)
* [IEMOCAP](https://www.kaggle.com/datasets/columbine/iemocap)



### Saving model in Google Cloud Bucket
In our final Beam pipeline, we will use RunInference. For that, we need to have a pretrained model stored in a location that is accessible to a model handler. Storing the model in a Google Cloud Bucket is the easiest way to do this.

In [None]:
# Destination of the saved model; fill this in before running the cell.
# The same location is later handed to the model handler.
save_model_dir = '' # Add the link to your GCS bucket here
model.save(save_model_dir)



### Creating a model handler
A model handler is used to save, load and manage trained models. We have used TFModelHandlerNumpy since our model was built using TensorFlow and takes NumPy arrays as input.

In [None]:
# Point the handler at the saved model; it is passed to RunInference in the pipeline below.
model_handler = TFModelHandlerNumpy(save_model_dir)

## Preprocessing functions for Beam pipeline
We need to define some functions to perform the same preprocessing tasks we did on our training data. We can't reuse the previously defined functions directly since they process the whole dataset at once, whereas each step of a pipeline receives a single data item.

This function loads the audio data using Librosa and extracts features using the previously defined function.

In [None]:
def feature_extraction(element):
  """Load the audio file named by ``element`` and extract its features.

  Args:
    element: Path of one audio file, as produced by the pipeline's input
      collection.

  Returns:
    The feature vector computed by ``extract_features`` for the clip
    (2.5 s of audio starting at 0.6 s).
  """
  # Load the file this element refers to. The previous code read a global
  # named `path`, so every element was loaded from the same leftover path.
  data, sample_rate = librosa.load(element, duration=2.5, offset=0.6)
  return extract_features(data, sample_rate)

Here we have scaled the data using standardization. The data is transformed such that its mean is 0 and standard deviation is 1.

In [None]:
def scaling(element):
  """Standardize one feature vector with the scaler fitted on the training data.

  The model was trained on features transformed by the notebook's fitted
  ``scaler`` (per-feature statistics of the training split). Normalising each
  vector by its own mean and standard deviation, as this function used to do,
  produces different inputs from the ones the model saw during training, so
  the same fitted scaler is applied here instead.

  Args:
    element: 1-D feature vector for a single audio clip.

  Returns:
    1-D NumPy array of the standardized features.
  """
  # The scaler works on 2-D (samples, features) input: wrap the single
  # vector as one row, transform it, and unwrap the row again.
  row = np.asarray(element).reshape(1, -1)
  return scaler.transform(row)[0]

In the end we will save our predictions in a list. RunInference returns an array of probabilities for each class. We select the maximum probability, replace it by 1, and replace all other values with 0. Now our new list is in a standard one hot encoded format, and we can use the inverse transform function of the OneHotEncoder to return which class the resultant array represents.

In [None]:
# Collects the predicted emotion labels; save_predictions appends to it as the pipeline runs.
predictions = []

In [None]:
# save_predictions calls .tolist() on the inference result; NumPy-style methods on
# TensorFlow tensors are presumably what this switch enables — TODO confirm.
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()
def save_predictions(element):
    """Decode one inference result into an emotion label and record it.

    The class scores in ``element.inference`` are turned into a 0/1 vector
    that marks the highest score, which the fitted ``encoder`` maps back to
    its label. The label is appended to the global ``predictions`` list and
    printed.

    Args:
        element: Result object whose ``inference`` attribute holds the class
            scores and supports ``tolist()``.

    Returns:
        None.
    """
    scores = element.inference.tolist()
    top_score = max(scores)
    # 1 wherever the score equals the maximum, 0 elsewhere.
    one_hot = [1 if score == top_score else 0 for score in scores]
    label = encoder.inverse_transform(np.array(one_hot).reshape(1, -1))[0][0]
    predictions.append(label)
    print(label)

## Building the Beam Pipeline
This pipeline performs the following tasks
* Creates a PCollection of input paths
* Extracts features using the previously defined functions
* Performs scaling
* Runs inference on new data using the previously trained model
* Saves predictions in a list

In [None]:
# Paths of the first two recordings, used as the pipeline's input.
pipeline_input = Crema_df["Path"].iloc[:2]

In [None]:
with beam.Pipeline() as p:
    # Each stage consumes the collection produced by the stage before it.
    audio_paths = p | beam.Create(pipeline_input)
    raw_features = audio_paths | beam.Map(feature_extraction)
    scaled_features = raw_features | beam.Map(scaling)
    inference_results = scaled_features | RunInference(model_handler)
    # save_predictions prints each label and appends it to `predictions`.
    _ = inference_results | beam.Map(save_predictions)

sad
sad


In [None]:
# The two rows that were fed to the pipeline, for comparison with the predictions above.
Crema_df.iloc[:2]

Unnamed: 0,Emotion,Path
0,neutral,/content/gdrive/My Drive/CREMA//1079_TIE_NEU_X...
1,sad,/content/gdrive/My Drive/CREMA//1079_TIE_SAD_X...


In [None]:
# Play back the first clip (Audio is also imported in the notebook's main import cell).
from IPython.display import Audio
Audio(Crema_df.iloc[0].Path)

In [None]:
# Play back the second clip.
Audio(Crema_df.iloc[1].Path)