# Import dependencies
Import the Python libraries used for data processing and deep learning.

In [ ]:
import os  # Provides functions for interacting with the operating system.
import cv2  # OpenCV library for computer vision tasks.
import tensorflow as tf  # TensorFlow library for deep learning tasks.
import numpy as np  # NumPy library for numerical operations.
from typing import List  # Import List for type hinting.
from matplotlib import pyplot as plt  # Matplotlib's pyplot for plotting graphs.
import imageio  # Imageio for reading and writing image data.

In [ ]:
# Ask TensorFlow which GPUs it can see; as the last expression of a notebook
# cell the returned list is displayed (an empty list means no GPU is visible).
tf.config.list_physical_devices('GPU')

In [ ]:
# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
# The setting is applied to every visible GPU because TensorFlow requires memory
# growth to be configured identically on all of them; with no GPU the loop is a
# no-op.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu in physical_devices:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except (RuntimeError, ValueError) as err:
        # Best effort: the option cannot be changed once the device has been
        # initialised. Report it rather than silently swallowing every error.
        print(f'Could not enable memory growth for {gpu}: {err}')

# Data Loading Functions
Define functions to load and preprocess video data for the model.

In [ ]:
import gdown  # Google Drive download utility.

In [ ]:
def load_data(path: str):
    """Load the video frames and the text alignment for one sample.

    Args:
        path: A scalar string tensor (it must expose `.numpy()`) holding the
            path of a video file. Only the file's base name is used; the
            directories are rebuilt below.

    Returns:
        A `(frames, alignments)` tuple as produced by `load_video` and
        `load_alignments`.
    """
    path = bytes.decode(path.numpy())  # Tensor of bytes -> Python str.
    # Normalise Windows separators so the base name is extracted correctly for
    # both '.\\data\\s1\\x.mpg' and './data/s1/x.mpg'. Splitting on '\\' only
    # left a POSIX path intact, and the following split('.') then returned ''
    # because such paths start with '.'.
    base_name = os.path.basename(path.replace('\\', '/'))
    file_name = base_name.split('.')[0]
    video_path = os.path.join('data', 's1', f'{file_name}.mpg')
    alignment_path = os.path.join('data', 'alignments', 's1', f'{file_name}.align')
    frames = load_video(video_path)
    alignments = load_alignments(alignment_path)
    return frames, alignments

In [ ]:
def load_video(path:str) -> "tf.Tensor":
    """Read a video, crop every frame to a fixed window and standardise it.

    Each frame is converted to a single channel with
    `tf.image.rgb_to_grayscale` and cropped to rows 190:236 and columns 80:220
    (a 46x140 window). The stacked frames are then shifted by their mean and
    divided by their standard deviation.

    Args:
        path: Path of the video file to open with OpenCV.

    Returns:
        A float32 tensor of the normalised frames (one entry per decoded frame).

    Raises:
        ValueError: If no frame could be decoded from `path`.
    """
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
            ret, frame = cap.read()
            if not ret:
                # The reported frame count can exceed what is decodable; stop
                # instead of passing a `None` frame to TensorFlow.
                break
            frame = tf.image.rgb_to_grayscale(frame)
            frames.append(frame[190:236,80:220,:])
    finally:
        cap.release()  # Always free the capture handle, even on errors.

    if not frames:
        raise ValueError(f'No frames could be read from video: {path}')

    # NOTE(review): the mean is reduced in the frames' own dtype and subtracted
    # before the cast to float32. If the frames are uint8 (presumably, as OpenCV
    # usually returns) this is integer arithmetic -- confirm it is intended.
    # Left unchanged so inputs stay compatible with previously trained weights.
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(tf.cast(frames, tf.float32))
    return tf.cast((frames - mean), tf.float32) / std

In [ ]:
# Character vocabulary: lowercase letters, three punctuation marks, the digits
# 1-9 and the space character, one list entry per character.
vocab = list("abcdefghijklmnopqrstuvwxyz'?!123456789 ")

In [ ]:
# Forward lookup: character -> integer index, built from `vocab`; the empty
# string is configured as the out-of-vocabulary token.
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
# Inverse lookup: integer index -> character. It reuses the vocabulary reported
# by `char_to_num` so both directions share the same index assignment.
num_to_char = tf.keras.layers.StringLookup(vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True)

In [ ]:
def load_alignments(path:str) -> "tf.Tensor":
    """Read an alignment file and encode its words as character indices.

    The third whitespace-separated field of each line is taken as the token
    (presumably lines look like `<start> <end> <word>` -- confirm against the
    data set). Tokens equal to 'sil' (silence) are dropped, the remaining ones
    are joined with single spaces and every character is mapped through
    `char_to_num`.

    Args:
        path: Path of the `.align` text file.

    Returns:
        A 1-D tensor of integer character indices for the spoken sentence.
    """
    with open(path, 'r') as f:
        lines = f.readlines()

    tokens = []
    for line in lines:
        parts = line.split()
        # Skip blank or malformed lines (no third field) and silence markers.
        if len(parts) < 3 or parts[2] == 'sil':
            continue
        # A space precedes every word; the leading one is removed by `[1:]`.
        tokens.extend([' ', parts[2]])
    return char_to_num(tf.reshape(tf.strings.unicode_split(tokens, input_encoding='UTF-8'), (-1)))[1:]

In [ ]:
# NOTE: `load_data` is defined once, in the "Data Loading Functions" section of
# this file. It is deliberately not redefined here: a second `def load_data`
# without a body is a syntax error and would shadow the working implementation.

In [ ]:
def mappable_function(path:str) ->List[str]:
    """Adapter that lets `tf.data` call the eager Python function `load_data`.

    `tf.py_function` runs `load_data` on `path` and declares the two outputs as
    a float32 tensor followed by an int64 tensor.
    """
    output_types = (tf.float32, tf.int64)
    return tf.py_function(load_data, [path], output_types)

In [ ]:
# Windows-style path of a single sample video for manual experiments; it is not
# referenced by any other code visible in this file.
test_path = '.\\data\\data\\s1\\bbal6n.mpg'

# Data Pipeline
Set up the data pipeline for feeding data into the neural network.

In [ ]:
# List the video files in a deterministic order. By default `list_files`
# shuffles and reshuffles on every iteration, which would change which samples
# fall into `train` and `test` at each epoch and leak test samples into training.
data = tf.data.Dataset.list_files('./data/data/s1/*.mpg', shuffle=False)
# Shuffle once; with reshuffle_each_iteration=False the order (and therefore the
# train/test split below) stays fixed for the lifetime of this dataset object.
data = data.shuffle(500, reshuffle_each_iteration=False)
# Load (frames, alignments) for every path via the Python loader.
data = data.map(mappable_function)
# Batches of 2 samples; frames are padded to 75 along the first axis and the
# label sequence to 40 entries.
data = data.padded_batch(2, padded_shapes=([75,None,None,None],[40]))
data = data.prefetch(tf.data.AUTOTUNE)  # Overlap data loading with training.
# Split by batch count: the first 450 batches train, the remainder tests.
train = data.take(450)
test = data.skip(450)

In [ ]:
# Iterator over `data` that yields each batch as NumPy arrays. Note that the
# name `sample` is reassigned further down in this file.
sample = data.as_numpy_iterator()

In [ ]:
# Fetch one batch with the built-in `next()`, which works on any iterator and
# does not depend on the iterator exposing a `.next()` method.
val = next(sample)
val[0]  # Frames of the batch; displayed as the notebook cell output.

In [ ]:
# Write the first video of the batch (`val[0]` holds the frames, index 0 picks
# the first sample) to ./animation.gif at 10 frames per second.
imageio.mimsave('./animation.gif', val[0][0], fps=10)

# Design Deep Neural Network
Design the architecture of the deep neural network for processing the data.

In [ ]:
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler  # Imports necessary modules for building and training the model.

In [ ]:
# Build the network layer by layer.
model = Sequential()
# Input shape (75, 46, 140, 1) matches the data pipeline: 75 padded frames, the
# 46x140 crop produced by `load_video`, and a single grayscale channel.
# 128 filters with kernel size 3; 'same' padding keeps the input extent.
model.add(Conv3D(128, 3, input_shape=(75,46,140,1), padding='same'))
model.add(Activation('relu'))
# Pool size (1, 2, 2): halves the two spatial axes, leaves the frame axis as is.
model.add(MaxPool3D((1,2,2)))
# Continues adding layers to build the complete model structure.
# NOTE(review): the remaining layers are not present in this file.

In [ ]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [ ]:
# Smoke test: run the model on the frames of the batch fetched earlier
# (`val[0]`). Note that `yhat` is reassigned later in this file.
yhat = model.predict(val[0])

# Train the Neural Network
Train the model on the dataset.

In [ ]:
def scheduler(epoch, lr):
    """Learning-rate schedule: constant for 30 epochs, then exponential decay.

    Args:
        epoch: Zero-based index of the epoch about to start.
        lr: Current learning rate.

    Returns:
        `lr` unchanged while `epoch < 30`; afterwards `lr * exp(-0.1)` as a
        plain Python float.
    """
    import math  # Local import keeps this cell self-contained.

    if epoch < 30:
        return lr
    # Return a Python float rather than a tf.Tensor: Keras'
    # LearningRateScheduler expects the schedule function to output a float.
    return float(lr * math.exp(-0.1))

In [ ]:
def CTCLoss(y_true, y_pred):
    """Connectionist Temporal Classification loss for a batch.

    NOTE(review): this function had no body in this file (a syntax error), so
    it was filled in with the standard `ctc_batch_cost` wrapper. It assumes
    `y_true` is (batch, max_label_length) integer labels and `y_pred` is
    (batch, time_steps, num_classes) softmax output -- confirm against the
    final layers of the model.

    Args:
        y_true: Batch of padded label sequences.
        y_pred: Batch of per-time-step class probabilities.

    Returns:
        A (batch, 1) tensor with the CTC loss of every sample.
    """
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

    # `ctc_batch_cost` wants one length per sample, shaped (batch, 1); every
    # sample uses the full (padded) prediction and label lengths.
    input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

    return tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)

In [ ]:
class ProduceExample(tf.keras.callbacks.Callback):
    """Callback that prints reference and predicted text after every epoch.

    NOTE(review): this class had no body in this file (a syntax error). The
    implementation below was filled in from how it is used
    (`ProduceExample(test)`) and from its description; confirm it matches the
    intended behaviour.

    Args:
        dataset: A batched `tf.data.Dataset` yielding `(frames, labels)`.
    """

    def __init__(self, dataset) -> None:
        super().__init__()
        self.dataset = dataset
        self._iterator = dataset.as_numpy_iterator()

    def _next_batch(self):
        """Return the next batch, restarting the iterator once when exhausted."""
        try:
            return next(self._iterator)
        except StopIteration:
            self._iterator = self.dataset.as_numpy_iterator()
            return next(self._iterator, None)

    def on_epoch_end(self, epoch, logs=None) -> None:
        """Predict on one batch and print the decoded text next to the labels."""
        batch = self._next_batch()
        if batch is None:  # Empty dataset: nothing to show.
            return
        frames, labels = batch
        yhat = self.model.predict(frames)
        # One input length per sample, equal to the number of predicted steps.
        input_length = np.full(len(yhat), yhat.shape[1])
        decoded = tf.keras.backend.ctc_decode(yhat, input_length, greedy=False)[0][0].numpy()
        for truth, guess in zip(labels, decoded):
            guess = guess[guess != -1]  # ctc_decode pads short results with -1.
            print('Original:', tf.strings.reduce_join(num_to_char(truth)).numpy().decode('utf-8'))
            print('Prediction:', tf.strings.reduce_join(num_to_char(guess)).numpy().decode('utf-8'))
            print('~' * 100)

In [ ]:
# Configure training: Adam with a learning rate of 1e-4 and the custom
# `CTCLoss` function as the loss.
model.compile(optimizer=Adam(learning_rate=0.0001), loss=CTCLoss)

In [ ]:
# Save only the weights (not the full model) to models/checkpoint, monitoring
# the training loss. The same path is read back later by `model.load_weights`.
checkpoint_callback = ModelCheckpoint(os.path.join('models','checkpoint'), monitor='loss', save_weights_only=True)

In [ ]:
# Apply `scheduler` to compute the learning rate during training.
schedule_callback = LearningRateScheduler(scheduler)

In [ ]:
# Callback built on the held-out `test` split, used to show example predictions
# while training.
example_callback = ProduceExample(test)

In [ ]:
# Train for 3 epochs on `train`, validating on `test`, with checkpointing,
# learning-rate scheduling and example output enabled.
# With epochs=3 the decay branch of `scheduler` (epoch >= 30) is never reached.
model.fit(train, validation_data=test, epochs=3, callbacks=[checkpoint_callback, schedule_callback, example_callback])

# Make Predictions
Use the trained model to make predictions on new data.

In [ ]:
# Fetch the checkpoint archive from Google Drive and unpack it into ./models.
url = 'https://drive.google.com/uc?id=1vWscXs4Vt0a_1IH1-ct2TCgXAZT-N3_Y'
output = 'checkpoints.zip'
gdown.download(url, output, quiet=False)
# Extract the archive that was just downloaded (same file name as `output`).
gdown.extractall(output, 'models')

In [ ]:
# Restore weights from models/checkpoint, the path used both by the training
# checkpoint callback and as the extraction directory of the downloaded archive.
model.load_weights('models/checkpoint')

In [ ]:
# Iterator over the held-out `test` split that yields batches as NumPy arrays.
test_data = test.as_numpy_iterator()

In [ ]:
# Fetch one test batch with the built-in `next()`, which works on any iterator
# and does not depend on the iterator exposing a `.next()` method.
sample = next(test_data)

In [ ]:
# Predict on the frames of the test batch (`sample[0]`; `sample[1]` holds the
# labels).
yhat = model.predict(sample[0])

In [ ]:
# Decodes the predictions to text and compares them with the real text labels.

In [ ]:
# Continues the process of decoding and comparing predictions.

# Demo!
Demonstrates how to use the model for prediction on new video data.

In [ ]:
# Load one specific video for the demo. The path is wrapped in a tensor because
# `load_data` calls `.numpy()` on its argument; the result is a
# (frames, alignments) tuple.
sample = load_data(tf.convert_to_tensor('.\\data\\s1\\bbbs7a.mpg'))

In [ ]:
# Displays the real text from the demonstration sample.

In [ ]:
# `sample[0]` is a single video, so a leading batch axis is added before
# prediction.
yhat = model.predict(tf.expand_dims(sample[0], axis=0))

In [ ]:
# Greedy CTC decoding of the single prediction: `input_length=[75]` declares one
# sequence of 75 time steps, and `[0][0]` selects the first decoded sequence
# tensor from the returned (sequences, scores) pair before converting to NumPy.
decoded = tf.keras.backend.ctc_decode(yhat, input_length=[75], greedy=True)[0][0].numpy()

In [ ]:
# Displays the predictions for the demonstration sample.