# Speech Understanding
## Robust Automatic Speech Recognition in Noisy Environments with Lip-Reading Assistance
### Akansha Gautam    (M23CSA506)
### Anchit Mulye      (M23CSA507)

# Import libraries

In [10]:
import numpy as np
import pandas as pd
import cv2
import torch
import os

# Set Device

In [11]:
# Choose the compute backend in priority order: MPS first, then CUDA,
# and the CPU when neither accelerator reports itself as available.
device = (
    'mps' if torch.backends.mps.is_available()
    else 'cuda' if torch.cuda.is_available()
    else 'cpu'
)

print(f"Using device: {device}")

Using device: mps


# Load the AVSpeech Dataset

In [12]:
# Column labels applied to both CSV files; passing `names` makes pandas treat
# the first line of each file as data rather than as a header row.
names = ["youtube_id", "start_segment", "end_segment", "x_coordinate", "y_coordinate"]

avspeech_train_path = '/Users/akanshagautam/Documents/MTech/Speech Understanding/Project/dataset/avspeech/avspeech_train.csv'
avspeech_test_path = '/Users/akanshagautam/Documents/MTech/Speech Understanding/Project/dataset/avspeech/avspeech_test.csv'

# Read the train and test splits with the same column labels.
avspeech_train_df, avspeech_test_df = (
    pd.read_csv(csv_path, names=names)
    for csv_path in (avspeech_train_path, avspeech_test_path)
)

print(f"AVSpeech Train Dataset Shape: {avspeech_train_df.shape}")
print(f"AVSpeech Test Dataset Shape: {avspeech_test_df.shape}")

# Preview the first two rows of the training split.
avspeech_train_df.head(2)

AVSpeech Train Dataset Shape: (2621845, 5)
AVSpeech Test Dataset Shape: (183273, 5)


Unnamed: 0,youtube_id,start_segment,end_segment,x_coordinate,y_coordinate
0,CJoOwXcjhds,233.266,239.367,0.780469,0.670833
1,AvWWVOgaMlk,90.0,93.566667,0.586719,0.311111


In [13]:
# Scan the local download directory: every sub-folder of BASE_DIR is treated as
# one video (the folder name is used as the youtube_id) and may contain an
# .mp4 video file and an .srt transcript file.
BASE_DIR = '/Users/akanshagautam/Documents/MTech/Speech Understanding/Project/dataset/avspeech/train'
temp = []

# os.listdir returns entries in arbitrary order, so both listings are sorted to
# make the scan reproducible across runs and machines.
for folder in sorted(os.listdir(BASE_DIR)):
    folder_path = os.path.join(BASE_DIR, folder)
    if os.path.isdir(folder_path):
        # Empty strings mark "not found"; such rows are still recorded.
        video_path = ""
        text_path = ""
        # If a folder holds several matching files, the last one in sorted
        # order is kept.
        for file in sorted(os.listdir(folder_path)):
            if file.endswith(".mp4"):
                video_path = os.path.join(folder_path, file)
            if file.endswith(".srt"):
                text_path = os.path.join(folder_path, file)
        temp.append({
            "youtube_id": folder,
            "video_path": video_path,
            "text_path": text_path
        })

# Explicit columns keep the frame mergeable on "youtube_id" even when no
# folder was found (an empty record list would otherwise yield a frame with no
# columns and the merge below would raise KeyError).
wav_df = pd.DataFrame(temp, columns=["youtube_id", "video_path", "text_path"])
print("Total wav files:", wav_df.shape[0])

# Keep only the AVSpeech segments whose video folder exists locally.
given_avspeech_df = pd.merge(avspeech_train_df, wav_df, on="youtube_id", how="inner")
print(f"Final Dataset Shape: {given_avspeech_df.shape}")

given_avspeech_df.head(3)

Total wav files: 99
Final Dataset Shape: (1481, 7)


Unnamed: 0,youtube_id,start_segment,end_segment,x_coordinate,y_coordinate,video_path,text_path
0,-A9gdf3j2xo,295.165,298.165,0.507812,0.233333,/Users/akanshagautam/Documents/MTech/Speech Un...,/Users/akanshagautam/Documents/MTech/Speech Un...
1,QoQF8N5ZsQA,240.006433,244.961389,0.450781,0.358333,/Users/akanshagautam/Documents/MTech/Speech Un...,/Users/akanshagautam/Documents/MTech/Speech Un...
2,sujFCXbYkMo,30.0,34.466667,0.528906,0.477778,/Users/akanshagautam/Documents/MTech/Speech Un...,/Users/akanshagautam/Documents/MTech/Speech Un...


# Crop the videos

In [26]:
# Side length, in pixels, of the square crop window. The window is clamped to
# the frame borders, so crops taken near an edge (or from frames smaller than
# CROP_SIZE) come out smaller than CROP_SIZE x CROP_SIZE.
CROP_SIZE = 400
# Fallback frame rate, used when the video reports a frame rate of 0.
FPS = 25

output_dir = "/Users/akanshagautam/Documents/MTech/Speech Understanding/Project/dataset/avspeech/mouth_crops"
os.makedirs(output_dir, exist_ok=True)

# Number of segments for which no frames could be extracted at all.
skipped_segments = 0

# For every segment row: open its video, seek to the segment start, and write
# one PNG crop per frame into <output_dir>/<youtube_id>_<start>_<end>/.
for idx, row in given_avspeech_df.iterrows():
    video_path = row['video_path']
    youtube_id = row['youtube_id']
    start_time = row['start_segment']
    end_time = row['end_segment']
    # The coordinates are multiplied by the frame size below, i.e. they are
    # treated as fractions of the frame width / height.
    x_center = row['x_coordinate']
    y_center = row['y_coordinate']

    cap = cv2.VideoCapture(video_path)

    # try/finally guarantees the capture is released on every exit path,
    # including `continue` and any exception raised while reading or writing.
    try:
        if not cap.isOpened():
            skipped_segments += 1
            continue

        fps = cap.get(cv2.CAP_PROP_FPS) or FPS
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # The crop rectangle depends only on per-segment values, so it is
        # computed once here instead of once per frame.
        x_px = int(x_center * width)
        y_px = int(y_center * height)

        half_crop = CROP_SIZE // 2
        x1 = max(x_px - half_crop, 0)
        y1 = max(y_px - half_crop, 0)
        x2 = min(x_px + half_crop, width)
        y2 = min(y_px + half_crop, height)

        # An unusable frame size or an out-of-range centre produces an empty
        # rectangle; cv2.imwrite raises on an empty image, so skip the segment.
        if x2 <= x1 or y2 <= y1:
            skipped_segments += 1
            continue

        start_frame = int(start_time * fps)
        end_frame = int(end_time * fps)

        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
        current_frame = start_frame

        # Times are truncated to whole seconds in the directory name.
        segment_id = f"{youtube_id}_{int(start_time)}_{int(end_time)}"
        video_output_dir = os.path.join(output_dir, segment_id)
        os.makedirs(video_output_dir, exist_ok=True)

        # Read frames sequentially until the segment end (inclusive) or until
        # the video runs out of frames.
        while current_frame <= end_frame:
            ret, frame = cap.read()
            if not ret:
                break

            mouth_crop = frame[y1:y2, x1:x2]

            # Files are named after the absolute frame index in the video.
            crop_filename = os.path.join(video_output_dir, f"{current_frame}.png")
            cv2.imwrite(crop_filename, mouth_crop)

            current_frame += 1
    finally:
        cap.release()

# Report skipped segments instead of dropping them silently.
if skipped_segments:
    print(f"Skipped {skipped_segments} segment(s): video could not be opened or crop region was empty")

[h264 @ 0x2d6a18dd0] mmco: unref short failure
