In [None]:
from difflib import SequenceMatcher

def clustering(str, list1, list2):
    """Map ``str`` to the ``list2`` entry paired with its closest match in ``list1``.

    Closeness is ``SequenceMatcher(None, str, candidate).ratio()``. When
    several candidates share the best ratio, the earliest one wins.

    Args:
        str: Text to match (the name shadows the builtin; kept for callers).
        list1: Candidate strings to compare against.
        list2: Values aligned index-by-index with ``list1``.

    Returns:
        ``list2[i]`` where ``i`` is the position of the best candidate.

    Raises:
        ValueError: If ``list1`` is empty.
    """
    def similarity(position):
        return SequenceMatcher(None, str, list1[position]).ratio()

    # max() keeps the first of equally scored positions.
    best_position = max(range(len(list1)), key=similarity)
    return list2[best_position]

In [1]:
import cv2

def check_webcam(camera_index=2, window_name='Webcam'):
    """Show a live webcam preview until the 'q' key is pressed.

    Args:
        camera_index: Device index handed to ``cv2.VideoCapture``. Defaults
            to 2, the value this notebook originally hard-coded.
        window_name: Title of the OpenCV preview window.

    Returns:
        None. Status messages are printed to stdout.
    """
    # Initialise the webcam.
    cap = cv2.VideoCapture(camera_index)

    if not cap.isOpened():
        print("웹캠을 열 수 없습니다.")
        # Release the handle even though opening failed so nothing lingers.
        cap.release()
        return

    print("웹캠을 열었습니다. 'q' 키를 눌러 종료하세요.")

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                print("프레임을 읽을 수 없습니다.")
                break

            # Display the current frame.
            cv2.imshow(window_name, frame)

            # waitKey also pumps the GUI event loop; quit on 'q'.
            key = cv2.waitKey(1) & 0xFF
            if key == ord('q'):
                break
    finally:
        # Always free the camera and close the preview window.
        cap.release()
        cv2.destroyAllWindows()

# Run the webcam smoke test when this cell is executed as the main module.
if __name__ == "__main__":
    check_webcam()


웹캠을 열었습니다. 'q' 키를 눌러 종료하세요.


In [11]:
import numpy as np
import cv2
import torch
import torch.nn as nn
from torchvision import transforms, models
from PIL import Image, ImageFont, ImageDraw
import dlib
import os
from difflib import SequenceMatcher

# Device selection: use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# dlib frontal-face detector and landmark predictor (loaded from a local file).
landmark_model_path = "shape_predictor_68_face_landmarks.dat"
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor(landmark_model_path)

# Label mapping.
# NOTE: despite its name, `label_to_idx` maps class index -> syllable, and
# `idx_to_label` maps syllable -> class index. The names are kept because
# other cells look them up under these names.
label_to_idx = dict(enumerate([
    '아', '왠', '어', '우', '애', '안',
    '마', '으', '여', '임', '오', '워',
]))
idx_to_label = {syllable: index for index, syllable in label_to_idx.items()}

# MobileNetV2 + LSTM + Dropout 모델 정의
# MobileNetV2 + LSTM + Dropout model definition
class MobileNetV2LSTM(nn.Module):
    """Per-frame MobileNetV2 features fed to an LSTM, classified from the last step.

    Submodules:
        mobilenet: MobileNetV2 ``features`` followed by ``AdaptiveAvgPool2d((1, 1))``.
        lstm: ``nn.LSTM(1280, 256, batch_first=True)``.
        dropout: ``nn.Dropout(dropout_rate)``.
        fc: ``nn.Linear(256, num_classes)``.
    """

    def __init__(self, num_classes, dropout_rate=0.6):
        super().__init__()
        backbone = models.mobilenet_v2(pretrained=True)
        # Append global average pooling so every frame collapses to a
        # (1280, 1, 1) feature map regardless of its spatial size.
        self.mobilenet = nn.Sequential(
            *backbone.features,
            nn.AdaptiveAvgPool2d((1, 1)),
        )
        # 1280 is the channel width of the MobileNetV2 feature extractor.
        self.lstm = nn.LSTM(input_size=1280, hidden_size=256, batch_first=True)
        self.dropout = nn.Dropout(p=dropout_rate)
        self.fc = nn.Linear(in_features=256, out_features=num_classes)

    def forward(self, x):
        """Classify a batch of frame sequences.

        Args:
            x: 5-D tensor unpacked as (batch, sequence, channels, height, width).

        Returns:
            Tensor of shape (batch, num_classes) with the class scores.
        """
        n_batch, n_steps, channels, height, width = x.size()

        # Fold time into the batch axis so the CNN sees individual frames.
        frames = x.view(n_batch * n_steps, channels, height, width)
        pooled = self.mobilenet(frames)  # (batch * seq, 1280, 1, 1)
        per_frame = torch.flatten(pooled, start_dim=1)  # (batch * seq, 1280)

        # Restore the time axis and run the recurrent layer.
        sequence = per_frame.view(n_batch, n_steps, -1)
        hidden_states, _ = self.lstm(sequence)

        # Dropout covers the whole sequence before the last step is selected.
        hidden_states = self.dropout(hidden_states)
        last_step = hidden_states[:, -1, :]
        return self.fc(last_step)

# Build the model and load the trained weights.
num_classes = len(label_to_idx)
model = MobileNetV2LSTM(num_classes).to(device)
# map_location remaps the checkpoint's tensors onto `device`, so a checkpoint
# saved on a GPU still loads when this notebook falls back to the CPU.
model.load_state_dict(torch.load('lip_reading_model_1.pth', map_location=device))
# Inference mode: disables dropout and freezes batch-norm statistics.
model.eval()

# Preprocessing applied to every cropped lip image before it enters the model:
# resize to 224x224, convert to a tensor, then normalise each channel.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])

def extract_lip_shape(frame):
    """Return lip landmark coordinates for the first face detected in ``frame``.

    The frame is converted from BGR to grayscale, run through the module-level
    dlib ``detector``, and the first detection is passed to ``predictor``.

    Args:
        frame: BGR image as produced by OpenCV.

    Returns:
        A list of ``(x, y)`` tuples for landmark indices 48 through 60, or
        ``None`` when no face is detected.
    """
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Only the first detection is used; any further faces are ignored.
    first_face = next(iter(detector(gray)), None)
    if first_face is None:
        return None

    landmarks = predictor(gray, first_face)
    # Collect the landmark points of the lip region.
    return [
        (landmarks.part(idx).x, landmarks.part(idx).y)
        for idx in range(48, 61)
    ]

def crop_lips_from_frame(frame, lip_points):
    """Crop the axis-aligned bounding box of ``lip_points`` out of ``frame``.

    Args:
        frame: Image array indexed as ``frame[row, column, ...]``.
        lip_points: Sequence of ``(x, y)`` points, or ``None``.

    Returns:
        The sliced region ``frame[y_min:y_max, x_min:x_max]`` (a view, with the
        maxima exclusive), or ``None`` when ``lip_points`` is ``None`` or empty.
        The result may be empty when the box has no area inside the frame.
    """
    if lip_points is None or len(lip_points) == 0:
        return None

    frame_height, frame_width = frame.shape[:2]

    # Compute the bounding rectangle from the lip coordinates.
    x_coords = [p[0] for p in lip_points]
    y_coords = [p[1] for p in lip_points]

    # Clamp to the frame. Landmarks can fall outside the image, and a negative
    # bound would otherwise be read by NumPy as "count from the end",
    # producing a wrong or empty crop.
    x_min = min(max(min(x_coords), 0), frame_width)
    x_max = min(max(max(x_coords), 0), frame_width)
    y_min = min(max(min(y_coords), 0), frame_height)
    y_max = min(max(max(y_coords), 0), frame_height)

    # Slice out the rectangle.
    return frame[y_min:y_max, x_min:x_max]

def predict_lip_sequence(lip_frames):
    """Classify a sequence of cropped lip images into one syllable.

    Each frame is wrapped in a PIL image, passed through the module-level
    ``transform``, stacked into one batch and run through ``model``.

    NOTE(review): the arrays go to ``Image.fromarray`` without a colour
    conversion; confirm the channel order matches what the model was trained on.

    Args:
        lip_frames: Iterable of image arrays, one per time step.

    Returns:
        The ``label_to_idx`` entry for the highest-scoring class.
    """
    preprocessed = []
    for lip_frame in lip_frames:
        preprocessed.append(transform(Image.fromarray(lip_frame)))

    # (1, sequence_length, C, H, W)
    batch = torch.stack(preprocessed).unsqueeze(0).to(device)

    with torch.no_grad():
        logits = model(batch)
        best_class = logits.max(dim=1)[1]
    return label_to_idx[best_class.item()]

def save_frames_to_folder(frames, label, folder_path='saved_frames'):
    """Write each frame to ``folder_path`` as ``{label}_{index}.png``.

    Args:
        frames: Iterable of images accepted by ``cv2.imwrite``.
        label: Prefix used for every file name.
        folder_path: Destination directory; created (with parents) if missing.

    Returns:
        None. A message is printed for every frame that could not be written.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder_path, exist_ok=True)
    for i, frame in enumerate(frames):
        target = os.path.join(folder_path, f"{label}_{i}.png")
        # imwrite reports failure through its return value, not an exception.
        if not cv2.imwrite(target, frame):
            print(f"Failed to write frame to {target}")

def clustering(str, list1, list2):
    """Look up the answer for the pattern in ``list1`` that best matches ``str``.

    Every entry of ``list1`` is scored with
    ``SequenceMatcher(None, str, entry).ratio()``; the first entry with the
    highest score selects the answer at the same index in ``list2``.

    Args:
        str: Predicted text (the name shadows the builtin; kept for callers).
        list1: Reference patterns.
        list2: Answers aligned index-by-index with ``list1``.

    Returns:
        The selected answer when it has the same length as ``str``,
        otherwise ``None``.

    Raises:
        ValueError: If ``list1`` is empty.
    """
    ratios = [SequenceMatcher(None, str, pattern).ratio() for pattern in list1]
    best = ratios.index(max(ratios))
    answer = list2[best]
    # Only accept the answer once the prediction has as many characters.
    return answer if len(answer) == len(str) else None

def put_korean_text(image, text, position, font_path='/usr/share/fonts/truetype/nanum/NanumGothic.ttf', font_size=32, color=(255, 255, 255)):
    """Render text (e.g. Korean) onto an OpenCV image using a TrueType font.

    The BGR image is converted to RGB, drawn on through PIL, and converted
    back to BGR.

    Args:
        image: BGR image array.
        text: String to draw.
        position: ``(x, y)`` position passed to ``ImageDraw.text``.
        font_path: Path of the TrueType font file to load.
        font_size: Font size passed to ``ImageFont.truetype``.
        color: Fill colour, applied while the image is in RGB order.

    Returns:
        A new BGR image array with the text drawn on it.
    """
    rgb_pixels = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    canvas = Image.fromarray(rgb_pixels)

    font = ImageFont.truetype(font_path, font_size)
    ImageDraw.Draw(canvas).text(position, text, font=font, fill=color)

    return cv2.cvtColor(np.array(canvas), cv2.COLOR_RGB2BGR)

def main(camera_index=2, max_read_failures=30):
    """Run the interactive lip-reading demo on a webcam stream.

    Keys (read from the 'Webcam' window):
        's' starts recording lip crops for one syllable,
        'e' clears the accumulated predictions,
        'q' quits.

    While recording, every second frame with detected lips is collected. Once
    10 crops are gathered they are classified as one syllable, saved to disk,
    appended to the running prediction, and the running prediction is matched
    against the known words.

    Args:
        camera_index: Device index handed to ``cv2.VideoCapture``. Defaults
            to 2, the value this notebook originally hard-coded.
        max_read_failures: Number of consecutive failed frame reads tolerated
            before the loop gives up instead of spinning forever.

    Returns:
        None.
    """
    # Initialise the webcam.
    cap = cv2.VideoCapture(camera_index)
    if not cap.isOpened():
        # Without this check every read fails and the loop never reaches the
        # key handling, so the demo could not even be quit.
        print(f"Could not open webcam (index {camera_index}).")
        cap.release()
        return

    frame_count = 0
    save_frame_count = 0
    lip_frames = []
    recording = False
    predictions = []
    predicted_word = ""
    clustered_word = ""
    read_failures = 0

    # Syllable patterns as the classifier emits them, aligned index-by-index
    # with `answer_list`.
    # NOTE(review): '오아워' appears twice (indices 3 and 7). `clustering`
    # picks the first best match, so '고마워' can never be returned. Confirm
    # the intended pattern for index 7.
    str_list = ['안여',
                '아마오',
                '임으어',
                '오아워',
                '왠안아',
                '우우',
                '오아',
                '오아워',
                '오여',
                '아아애']
    answer_list = ['안녕',
                    '아마도',
                    '힘들어',
                    '놀라워',
                    '괜찮아',
                    '누구',
                    '좋아',
                    '고마워',
                    '졸려',
                    '사랑해']

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                # Tolerate transient read failures, but stop once the camera
                # is clearly gone rather than busy-looping with no exit.
                read_failures += 1
                if read_failures >= max_read_failures:
                    print(f"Giving up after {read_failures} consecutive failed frame reads.")
                    break
                continue
            read_failures = 0

            lip_points = extract_lip_shape(frame)

            if lip_points:
                cropped_lips = crop_lips_from_frame(frame, lip_points)
                if cropped_lips is not None and cropped_lips.size > 0:
                    cv2.imshow('Cropped Lips', cropped_lips)
                    # Keep every second frame while recording.
                    if recording and frame_count % 2 == 0:
                        lip_frames.append(cropped_lips)
                        save_frame_count += 1

            frame_count += 1

            if save_frame_count == 10:  # one syllable = 10 collected crops
                print("Evaluating...")
                result = predict_lip_sequence(lip_frames)
                predictions.append(result)
                print(f'Predicted Syllable: {result}')
                save_frames_to_folder(lip_frames, f'{result}_segment_{len(predictions)}')
                lip_frames = []
                save_frame_count = 0
                recording = False

                # Infer the word from the syllables predicted so far.
                predicted_word = ''.join(predictions)
                print(f'Predicted Word: {predicted_word}')

                clustered_word = clustering(predicted_word, str_list, answer_list)
                if clustered_word:
                    print(clustered_word)

            # Overlay the matched word on the preview.
            if clustered_word:
                frame = put_korean_text(frame, f'Predicted Word: {clustered_word}', (50, 100))

            cv2.imshow('Webcam', frame)
            key = cv2.waitKey(1) & 0xFF
            if key == ord('q'):
                break
            elif key == ord('s'):
                recording = True
            elif key == ord('e'):  # 'e' resets the accumulated predictions
                predictions = []
                predicted_word = ""
                clustered_word = ""

    finally:
        # Always free the camera and close the windows.
        cap.release()
        cv2.destroyAllWindows()

# Start the interactive lip-reading demo when this cell is executed as the main module.
if __name__ == "__main__":
    main()


Using device: cuda
Evaluating...
Predicted Syllable: 임
Predicted Word: 임
Evaluating...
Predicted Syllable: 임
Predicted Word: 임임
Evaluating...
Predicted Syllable: 안
Predicted Word: 임임안
Evaluating...
Predicted Syllable: 안
Predicted Word: 임임안안


In [38]:
# Builds a separate MobileNetV2LSTM instance with 12 output classes. Unlike
# `model` above, it is not moved to `device` and no checkpoint is loaded into it.
# NOTE(review): nothing visible in this notebook uses `mobilenet` afterwards - confirm
# whether this cell is still needed.
mobilenet = MobileNetV2LSTM(num_classes=12)

