This notebook defines a CNN + BiLSTM lip-reading model and runs it on live webcam frames.


In [16]:
pip install tensorflow


Note: you may need to restart the kernel to use updated packages.


In [17]:
pip install mediapipe


Note: you may need to restart the kernel to use updated packages.


In [18]:
pip install opencv-contrib-python


Note: you may need to restart the kernel to use updated packages.


Check the installed TensorFlow version

In [19]:
import tensorflow as tf

# Report which TensorFlow release this kernel actually imported.
print(tf.version.VERSION)


2.18.0


In [20]:
import tensorflow as tf
import cv2
import numpy as np
from tensorflow.keras import layers, models
import mediapipe as mp


In [21]:
# Model hyperparameters shared by the network definition and the webcam pipeline
IMG_HEIGHT = 96    # height (pixels) of each frame fed to the model
IMG_WIDTH = 96     # width (pixels) of each frame fed to the model
SEQ_LENGTH = 30    # number of frames in one input sequence
NUM_CLASSES = 100  # number of possible output characters (adjustable)


In [22]:
# Lip-reading network: a per-frame CNN encoder followed by a stacked BiLSTM classifier
def create_lip_reading_model():
    """Build the uncompiled lip-reading Keras model.

    The model consumes a clip of grayscale frames shaped
    (batch, SEQ_LENGTH, IMG_HEIGHT, IMG_WIDTH, 1) and emits one softmax
    distribution over NUM_CLASSES per clip, i.e. output shape
    (batch, NUM_CLASSES).

    Returns:
        tf.keras.Model named "LipReadingModel".
    """
    frames_in = tf.keras.Input(
        shape=(SEQ_LENGTH, IMG_HEIGHT, IMG_WIDTH, 1),
        name="video_input",
    )

    # Spatial encoder applied to a single frame; global average pooling
    # collapses the final 128-channel feature map to one vector per frame.
    frame_encoder = models.Sequential(
        [
            layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
            layers.MaxPooling2D((2, 2)),
            layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
            layers.MaxPooling2D((2, 2)),
            layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
            layers.GlobalAveragePooling2D(),
        ],
        name="CNN_Feature_Extractor",
    )

    # Run the same encoder independently on every frame of the clip.
    features = layers.TimeDistributed(frame_encoder, name="TimeDistributed_CNN")(frames_in)

    # Temporal modelling: the first BiLSTM keeps the whole sequence,
    # the second one reduces it to a single summary vector.
    first_lstm = layers.LSTM(256, return_sequences=True)
    features = layers.Bidirectional(first_lstm, name="BiLSTM_1")(features)
    second_lstm = layers.LSTM(256, return_sequences=False)
    features = layers.Bidirectional(second_lstm, name="BiLSTM_2")(features)

    # Classification head.
    hidden = layers.Dense(512, activation="relu", name="Dense_Layer")(features)
    class_probs = layers.Dense(NUM_CLASSES, activation="softmax", name="Output_Layer")(hidden)

    return tf.keras.Model(inputs=frames_in, outputs=class_probs, name="LipReadingModel")


In [23]:
# Create the Model
# Builds a fresh LipReadingModel instance; this cell does not load any trained weights.
model = create_lip_reading_model()


In [24]:
# Attach the training configuration: Adam optimiser, integer-label
# cross-entropy loss, and accuracy as the reported metric.
model.compile(
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
)


In [25]:
# Real-Time Video Input Integration
def preprocess_frame(frame):
    """Convert one image to the grayscale, fixed-size, scaled form fed to the model.

    No cropping happens here: the caller is expected to pass the region of
    interest it wants the model to see.

    Args:
        frame: BGR image of shape (H, W, 3) as delivered by OpenCV, or an
            already-grayscale array of shape (H, W).

    Returns:
        Float array of shape (IMG_HEIGHT, IMG_WIDTH) with pixel values divided
        by 255 (so within [0, 1] for 8-bit input).

    Raises:
        ValueError: if ``frame`` is None or contains no pixels (for example an
            empty crop), instead of surfacing an opaque OpenCV assertion.
    """
    if frame is None or frame.size == 0:
        raise ValueError("preprocess_frame received an empty frame")

    # Only colour images need conversion; 2-D input is already single-channel.
    gray = frame if frame.ndim == 2 else cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    # cv2.resize takes the target size as (width, height).
    resized = cv2.resize(gray, (IMG_WIDTH, IMG_HEIGHT))
    return resized / 255.0


In [26]:
# MediaPipe handles: the face-detection solution module, its drawing helpers,
# and one detector instance that is reused for every webcam frame.
mp_drawing = mp.solutions.drawing_utils
mp_face_detection = mp.solutions.face_detection

face_detection = mp_face_detection.FaceDetection(
    min_detection_confidence=0.5,
    model_selection=1,
)


In [27]:
# Rolling buffer of preprocessed frames that will form one model input
frame_sequence = list()

# Open the default camera (device index 0)
cap = cv2.VideoCapture(0)


In [28]:
# Real-time loop: grab webcam frames, crop the detected face, keep a sliding
# window of SEQ_LENGTH preprocessed crops and classify each full window.
# NOTE(review): the crop is the whole face bounding box returned by the face
# detector, not a lips-only region; tighten the crop if the model expects lips.
# NOTE(review): no weights are loaded in the cells shown here, so the printed
# class indices come from whatever weights `model` currently holds.
print("Starting real-time lip detection. Press 'q' to quit.")
if not cap.isOpened():
    print("Webcam is not open; re-run the capture cell before starting this loop.")

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame_h, frame_w = frame.shape[:2]

        # MediaPipe expects RGB input while OpenCV delivers BGR.
        results = face_detection.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Convert every detection to a pixel box clamped to the frame. Without
        # clamping, negative or oversized coordinates would produce a wrapped
        # or empty slice below.
        boxes = []
        for detection in (results.detections or []):
            rel_box = detection.location_data.relative_bounding_box
            left = int(rel_box.xmin * frame_w)
            top = int(rel_box.ymin * frame_h)
            right = left + int(rel_box.width * frame_w)
            bottom = top + int(rel_box.height * frame_h)
            boxes.append((
                min(max(left, 0), frame_w),
                min(max(top, 0), frame_h),
                min(max(right, 0), frame_w),
                min(max(bottom, 0), frame_h),
            ))

        if boxes:
            # Feed exactly one crop per frame into the temporal buffer (the
            # largest face), so several faces never get interleaved into the
            # same sequence.
            x0, y0, x1, y1 = max(boxes, key=lambda b: (b[2] - b[0]) * (b[3] - b[1]))
            if x1 > x0 and y1 > y0:
                # Crop before any rectangle is drawn so the overlay never
                # ends up inside the model input.
                face_region = frame[y0:y1, x0:x1]
                frame_sequence.append(preprocess_frame(face_region))

                # Drop surplus frames (e.g. left over from an interrupted run)
                # so the buffer can never exceed SEQ_LENGTH.
                del frame_sequence[:-SEQ_LENGTH]

                if len(frame_sequence) == SEQ_LENGTH:
                    # (SEQ_LENGTH, H, W) -> (1, SEQ_LENGTH, H, W, 1)
                    input_sequence = np.expand_dims(np.array(frame_sequence), axis=0)
                    input_sequence = np.expand_dims(input_sequence, axis=-1)

                    # verbose=0 keeps Keras from printing a progress bar per frame.
                    predictions = model.predict(input_sequence, verbose=0)
                    predicted_char = np.argmax(predictions)
                    print(f"Predicted Character: {predicted_char}")

                    frame_sequence.pop(0)  # slide the window by one frame

        # Draw every detected face box for visualization.
        for (bx0, by0, bx1, by1) in boxes:
            cv2.rectangle(frame, (bx0, by0), (bx1, by1), (255, 0, 0), 2)

        # Display the video feed
        cv2.imshow('Webcam Feed', frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the preview window, including when the
    # loop is interrupted or an exception is raised.
    cap.release()
    cv2.destroyAllWindows()


Starting real-time lip detection. Press 'q' to quit.


In [29]:
# Model Summary
# Print Keras's textual summary of `model` (the LipReadingModel built above).
model.summary()


In [30]:
# Dependencies (one-line install): pip install tensorflow opencv-contrib-python mediapipe numpy
