# MediaPipe Optimized Training Notebook

This notebook demonstrates an optimized pipeline for sign language recognition using MediaPipe hand keypoints and a neural network (MLP) classifier. It features:
- GPU detection/configuration
- Mixed precision (enabled automatically when a GPU is configured)
- Batch size and memory tips
- Progress bars for long steps
- Clear training/evaluation summaries

In [None]:
# Section 1: Install and Import Required Libraries
import os
import cv2
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import mixed_precision
import mediapipe as mp
# Reaching this line means every import above resolved successfully.
print('✓ All libraries imported successfully')

In [None]:
# Section 2: GPU Detection and Configuration
# Chooses the compute device for the rest of the notebook and publishes the
# decision through the USE_GPU / DEVICE globals that later cells read.
print('='*60)
print('🔍 GPU DETECTION & CONFIGURATION')
print('='*60)
gpus = tf.config.list_physical_devices('GPU')
print(f'Found GPUs: {gpus}')
# Start from the CPU defaults; they are only overridden once GPU setup has
# completed without error.
USE_GPU = False
DEVICE = '/CPU:0'
if gpus:
    try:
        for g in gpus:
            # Let TensorFlow grow its GPU allocation on demand.
            tf.config.experimental.set_memory_growth(g, True)
        # Only the first detected GPU is made visible to TensorFlow.
        tf.config.set_visible_devices(gpus[0], 'GPU')
        USE_GPU = True
        DEVICE = '/GPU:0'
        print(f'✅ GPU configured: {gpus[0]}')
    except RuntimeError as e:
        # Best effort: report the problem and keep the CPU defaults set above.
        print('⚠️  GPU config error:', e)
# Mixed precision (optional and beneficial on modern GPUs)
try:
    if USE_GPU:
        policy = mixed_precision.Policy('mixed_float16')
        mixed_precision.set_global_policy(policy)
        print('⚡ Mixed precision enabled:', policy.name)
except Exception as e:
    # Deliberately broad: mixed precision is an optional speed-up, so any
    # failure here is reported and training proceeds with the default policy.
    print('⚠️  Mixed precision not enabled:', e)
print('Configuration complete. Using device:', DEVICE)

## Batch Size and Memory Tips
- If you get 'Out of Memory' errors, reduce `BATCH_SIZE` or disable mixed precision.
- Monitor GPU usage with `nvidia-smi -l 1` in a separate terminal.
- Close other GPU-intensive applications during training for best performance.

In [None]:
# Section 3: Extract Mediapipe Keypoints from Dataset
# Builds (or reloads) a CSV holding one row per image: the flattened
# (x, y, z) hand landmarks followed by a trailing 'label' column.
# Update DATASET_DIR to your dataset location
DATASET_DIR = r'M:\Term 9\Grad\Main\Sign-Language-Recognition-System-main\Sign-Language-Recognition-System-main\Sign_to_Sentence Project Main\ASL Letter (English)\Datasets\Asl_Sign_Data\asl_alphabet_train'
CSV_PATH = 'asl_mediapipe_keypoints_dataset_optimized.csv'
if os.path.exists(CSV_PATH):
    # Extraction is slow, so a previously saved CSV is reused as-is.
    print('Dataset CSV already exists:', CSV_PATH)
    df = pd.read_csv(CSV_PATH)
    print(f'Samples loaded: {len(df)}')
else:
    mp_hands = mp.solutions.hands
    landmark_data = []
    labels = []
    # Each sub-directory of DATASET_DIR is treated as one class.
    class_labels = sorted([d for d in os.listdir(DATASET_DIR) if os.path.isdir(os.path.join(DATASET_DIR, d))])
    all_images = []
    for label in class_labels:
        folder = os.path.join(DATASET_DIR, label)
        all_images.extend(
            (label, os.path.join(folder, f))
            for f in os.listdir(folder)
            if f.lower().endswith(('.png', '.jpg', '.jpeg'))
        )
    print(f'Found {len(all_images)} images across {len(class_labels)} classes')
    start = pd.Timestamp.now()
    processed = 0
    skipped = 0
    # The context manager closes the MediaPipe graph even if an image raises.
    with mp_hands.Hands(static_image_mode=True, min_detection_confidence=0.7) as hands:
        for label, img_path in tqdm(all_images, desc='Extracting keypoints'):
            img = cv2.imread(img_path)
            if img is None:
                # Unreadable or corrupt file.
                skipped += 1
                continue
            # OpenCV loads BGR; the frame is converted to RGB before detection.
            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            results = hands.process(img_rgb)
            if not results.multi_hand_landmarks:
                # No hand detected in this image.
                skipped += 1
                continue
            # Only the first detected hand contributes a sample.
            hand_landmarks = results.multi_hand_landmarks[0]
            lm = np.array([[p.x, p.y, p.z] for p in hand_landmarks.landmark]).flatten()
            landmark_data.append(lm)
            labels.append(label)
            processed += 1
    duration = (pd.Timestamp.now() - start).total_seconds()
    if not landmark_data:
        print('No landmarks extracted. Check dataset or MediaPipe settings.')
        # An empty frame lets the later cells detect the failure via df.empty.
        df = pd.DataFrame()
    else:
        df = pd.DataFrame(landmark_data)
        df['label'] = labels
        df.to_csv(CSV_PATH, index=False)
        print('Saved keypoints to', CSV_PATH)
    # Reported in both outcomes so a failed run still shows what was skipped.
    print(f'Processed: {processed}, Skipped: {skipped}, Time: {duration:.2f}s')

In [None]:
# Section 4: Preprocess and Split Data
# Skipped entirely when the extraction step produced no samples.
if not df.empty:
    # Every column except the last is a feature; 'label' carries the class.
    X = df.iloc[:, :-1].astype('float32').values
    y = df['label'].values

    # Map class names to integer ids.
    encoder = LabelEncoder()
    y_enc = encoder.fit_transform(y)
    num_classes = len(encoder.classes_)

    # First hold out 20% for testing, then 20% of the remainder for
    # validation; both splits are stratified and seeded.
    X_train_full, X_test, y_train_full, y_test = train_test_split(
        X,
        y_enc,
        test_size=0.2,
        random_state=42,
        stratify=y_enc,
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full,
        y_train_full,
        test_size=0.2,
        random_state=42,
        stratify=y_train_full,
    )

    # One-hot encode all three label vectors.
    y_train, y_val, y_test = (
        to_categorical(part, num_classes=num_classes)
        for part in (y_train, y_val, y_test)
    )
    print('Train/Val/Test sizes:', X_train.shape[0], X_val.shape[0], X_test.shape[0])

In [None]:
# Section 5: Build and Train MLP Model
AUTOTUNE = tf.data.AUTOTUNE


def make_dataset(features, labels, batch_size, training=True):
    """Build a batched, prefetching ``tf.data.Dataset`` from in-memory data.

    Args:
        features: Sequence of input rows; ``len(features)`` must be defined.
        labels: Targets aligned element-wise with ``features``.
        batch_size: Number of samples per batch.
        training: When true, samples are shuffled (and reshuffled on every
            iteration) using a buffer of at most 10000 elements.

    Returns:
        The dataset yielding ``(features, labels)`` batches.
    """
    pipeline = tf.data.Dataset.from_tensor_slices((features, labels))
    if training:
        shuffle_window = min(len(features), 10000)
        pipeline = pipeline.shuffle(
            buffer_size=shuffle_window,
            reshuffle_each_iteration=True,
        )
    batched = pipeline.batch(batch_size)
    return batched.prefetch(AUTOTUNE)

# Guarded like the preprocessing and evaluation cells: without data the
# training arrays do not exist and this cell would fail with a NameError.
if not df.empty:
    # A larger batch is used when a GPU was configured.
    BATCH_SIZE = 256 if USE_GPU else 64
    train_ds = make_dataset(X_train, y_train, BATCH_SIZE, training=True)
    val_ds = make_dataset(X_val, y_val, BATCH_SIZE, training=False)

    with tf.device(DEVICE):
        # Three-hidden-layer MLP over the flattened landmark vector.
        model = Sequential([
            Dense(256, activation='relu', kernel_initializer='he_normal', kernel_regularizer=tf.keras.regularizers.l2(1e-4), input_shape=(X_train.shape[1],)),
            BatchNormalization(),
            Dropout(0.3),
            Dense(128, activation='relu', kernel_initializer='he_normal', kernel_regularizer=tf.keras.regularizers.l2(1e-4)),
            BatchNormalization(),
            Dropout(0.25),
            Dense(64, activation='relu', kernel_initializer='he_normal'),
            Dropout(0.2),
            # The output layer is pinned to float32 regardless of the global
            # dtype policy (which may be mixed_float16 when a GPU is used).
            Dense(num_classes, activation='softmax', dtype='float32')
        ])
        optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=0.001)
        model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
        print('Model summary:')
        model.summary()
        print(f'Model will train on device: {DEVICE}')

    callbacks = [
        # Keeps the checkpoint with the best validation accuracy on disk.
        ModelCheckpoint('mediapipe_mlp_model_best.h5', monitor='val_accuracy', save_best_only=True, verbose=1),
        # Stops after 5 epochs without val_loss improvement and restores the
        # best weights seen.
        EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1),
        # Halves the learning rate after 3 stagnant epochs.
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-7, verbose=1),
    ]

    print('Training with batch size:', BATCH_SIZE)
    start = pd.Timestamp.now()
    with tf.device(DEVICE):
        history = model.fit(train_ds, validation_data=val_ds, epochs=20, callbacks=callbacks, verbose=1)
    duration = (pd.Timestamp.now() - start).total_seconds()
    print(f'Training finished in {duration:.2f}s')
    # Final model (separate from the best-accuracy checkpoint above); this is
    # the file the inference cell loads.
    model.save('mediapipe_mlp_model.h5')
    print('Saved model: mediapipe_mlp_model.h5')
else:
    print('No keypoint data available; skipping model training.')

In [None]:
# Section 6: Evaluate Model
# Scores the trained model on the held-out test split; skipped when no data
# was extracted.
if not df.empty:
    # Evaluation batch size depends on whether a GPU was configured.
    if USE_GPU:
        eval_batch = 256
    else:
        eval_batch = 128
    # training=False: the test set is batched without shuffling.
    test_ds = make_dataset(X_test, y_test, batch_size=eval_batch, training=False)
    with tf.device(DEVICE):
        loss, acc = model.evaluate(test_ds, verbose=1)
    print(f'Test Loss: {loss:.4f}  Test Accuracy: {acc*100:.2f}%')

In [None]:
# Section 7: Real-Time Inference (Webcam)
# Classifies the hand seen by the webcam and spells a sentence at the bottom
# of the preview window. Press 'q' in the window to quit.

# Number of consecutive frames that must agree before a sign is committed.
# Without this, a held sign would add one character per captured frame.
STABLE_FRAMES = 15


def _apply_label(sentence, label):
    """Return ``sentence`` after applying one committed class label.

    Control classes are matched case-insensitively: ``SPACE`` appends a
    blank, ``DELETE``/``DEL`` removes the last character and ``NOTHING``
    leaves the text unchanged. Any other label is appended verbatim.
    """
    token = str(label).upper()
    if token == 'SPACE':
        return sentence + ' '
    if token in ('DELETE', 'DEL'):
        return sentence[:-1]
    if token == 'NOTHING':
        return sentence
    return sentence + str(label)


# Rebuild the class-id -> label mapping from the saved CSV so this cell can
# run without the training cells (assumes the CSV is the one used for training).
encoder = LabelEncoder()
df_labels = pd.read_csv(CSV_PATH)
encoder.fit(df_labels['label'])
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
predicted_sentence = ''
mlp_model = tf.keras.models.load_model('mediapipe_mlp_model.h5')
last_label = None  # label predicted on the previous frame, if any
streak = 0         # how many consecutive frames produced last_label
cap = cv2.VideoCapture(0)
try:
    if not cap.isOpened():
        print('Could not open webcam (device 0).')
    # One hand only, matching the extraction step, which kept just the first
    # detected hand. The context manager closes the MediaPipe graph on exit.
    with mp_hands.Hands(max_num_hands=1, min_detection_confidence=0.7, min_tracking_confidence=0.7) as hands:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            # Flip horizontally so the preview behaves like a mirror.
            frame = cv2.flip(frame, 1)
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = hands.process(rgb)
            if results.multi_hand_landmarks:
                hand_landmarks = results.multi_hand_landmarks[0]
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
                lm = np.array([[p.x, p.y, p.z] for p in hand_landmarks.landmark]).flatten().reshape(1, -1)
                input_tensor = tf.cast(lm, tf.float32)
                with tf.device(DEVICE):
                    # Calling the model directly avoids the per-call overhead
                    # of predict() for a single-row input inside a loop.
                    pred = mlp_model(input_tensor, training=False)
                pred_class = int(np.argmax(pred.numpy()))
                label = encoder.inverse_transform([pred_class])[0]
                if label == last_label:
                    streak += 1
                else:
                    last_label, streak = label, 1
                # Commit exactly once per stable run; to repeat a letter the
                # hand must leave the frame or change sign first.
                if streak == STABLE_FRAMES:
                    predicted_sentence = _apply_label(predicted_sentence, label)
            else:
                # No hand in view: the next sign starts a fresh run.
                last_label, streak = None, 0
            # Black banner along the bottom edge with the sentence so far.
            h, w, _ = frame.shape
            cv2.rectangle(frame, (0, h-60), (w, h), (0, 0, 0), -1)
            cv2.putText(frame, predicted_sentence, (10, h-20), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2)
            cv2.imshow('Optimized Sign Prediction (MediaPipe + MLP)', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even after an error or a
    # notebook interrupt.
    cap.release()
    cv2.destroyAllWindows()