<a href="https://colab.research.google.com/github/annbinus/ASLExpress/blob/akhila's-branch/ASLLearningTool.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [73]:
#pip install --upgrade mediapipe

In [74]:
#pip install english-words

In [75]:
import numpy as np
import pandas as pd

# Import the OpenCV library for computer vision tasks.
import cv2 as cv

# Import the Mediapipe library for various computer vision and machine learning tasks.
import mediapipe as mp

# Import the joblib library for saving and loading machine learning models.
import joblib

# Import the time module for time-related operations or measurements.
import time

# Import the os and sys modules for operating system and system-related functions.
import os, sys

In [76]:
from random import random

In [77]:
# Import the function 'get_english_words_set' from the 'english_words' library.
from english_words import get_english_words_set

# Build the vocabulary for the word games: the 'web2' word list from the
# `english_words` package, requested in lowercase via `lower=True`.
# The word-selection cells below filter this set down to playable words.
web2lowerset = get_english_words_set(['web2'], lower=True)

In [78]:
# Import necessary modules from the Flask framework
from flask import Flask           # Flask is used to create the web application instance.
from flask import render_template # render_template is used to render HTML templates.
from flask import Response        # Response is used to create HTTP responses.
from flask import request         # request is used to access data sent with HTTP requests.

In [79]:
# Module-level flags for game difficulty and mode (0 means "off").
# NOTE(review): nothing visible in this notebook reads these flags -- confirm
# whether they are still needed.
level1 = 0     # Represents the easy game mode.
level2 = 0     # Represents the medium game mode.
level3 = 0     # Represents the hard game mode.
freestyle = 0  # Represents the freestyle game mode.

In [80]:
# Load the pre-trained classifier stored in "random_forest.joblib" (per the file
# name, presumably a Random Forest). The game code below calls
# `clf.predict_proba` on hand-landmark feature vectors.
# NOTE(review): joblib files are pickle-based, so loading one can execute
# arbitrary code -- only load model files from a trusted source, ideally with
# the same scikit-learn version that produced them.
clf = joblib.load("/content/random_forest.joblib")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [81]:
# Create the Flask application, looking up HTML templates in ./template.
# NOTE(review): a later cell rebinds `app` to `Flask(__name__)` without
# `template_folder`, and the routes are registered on that second instance,
# not this one -- confirm which template location is intended.
app = Flask(__name__, template_folder='./template')

In [82]:
def mediapipe_detection(image, model):
    """Run ``model`` on a BGR frame and hand back the frame plus the results.

    Args:
        image (numpy.ndarray): Input image in BGR format.
        model: Object exposing ``process(rgb_image)`` (e.g. a MediaPipe
            solution instance).

    Returns:
        tuple: ``(image, results)`` where ``image`` is the frame after a
        BGR -> RGB -> BGR round trip and ``results`` is whatever
        ``model.process`` returned.
    """
    # The model is fed the RGB version of the frame.
    rgb_frame = cv.cvtColor(image, cv.COLOR_BGR2RGB)
    detections = model.process(rgb_frame)

    # Callers keep working in BGR, so convert back before returning.
    bgr_frame = cv.cvtColor(rgb_frame, cv.COLOR_RGB2BGR)
    return bgr_frame, detections

In [83]:
def get_normalized_landmark_distances(results, frame_width, frame_height):
    """Return every hand landmark's x/y offset from the wrist landmark.

    Offsets are expressed in the same coordinate units as the landmarks in
    ``results`` (no extra scaling is applied). The previous implementation
    multiplied by ``frame_width / frame_width`` and
    ``frame_height / frame_height``, which is always 1 and raised
    ``ZeroDivisionError`` for a zero dimension; that no-op factor is gone, so
    results are unchanged for non-zero sizes and zero sizes no longer crash.

    Args:
        results: Detection results exposing
            ``multi_hand_landmarks[0].landmark``, a sequence of points with
            ``x`` and ``y`` attributes. Index 0 is used as the wrist.
        frame_width (int): Width of the frame. Accepted for backward
            compatibility; it does not influence the result.
        frame_height (int): Height of the frame. Accepted for backward
            compatibility; it does not influence the result.

    Returns:
        list: ``[dx1, dy1, dx2, dy2, ...]`` for every landmark after the
        wrist (the wrist's own zero offset is left out).

    Raises:
        TypeError: If ``results.multi_hand_landmarks`` is ``None`` (no hand
            was detected).
    """
    hand_landmarks = results.multi_hand_landmarks[0].landmark
    wrist_position = hand_landmarks[0]

    offsets = []
    for landmark in hand_landmarks:
        offsets.append(landmark.x - wrist_position.x)
        offsets.append(landmark.y - wrist_position.y)

    # The first pair belongs to the wrist itself and is always (0, 0).
    return offsets[2:]


In [84]:
def camera_max():
    """Return the highest camera index that can deliver a frame.

    Indices are probed upwards from 0 until one fails to grab a frame. Each
    probe is released explicitly so the device is free again before the
    caller opens it (the previous version left that to garbage collection).

    Returns:
        int: The last index that grabbed successfully, or 0 when even index 0
        fails.
    """
    camera_index = 0

    while True:
        probe = cv.VideoCapture(camera_index)
        try:
            available = probe.grab()
        finally:
            probe.release()

        if not available:
            break
        camera_index += 1

    # Kept from the original implementation: close any open OpenCV windows.
    cv.destroyAllWindows()
    return max(0, camera_index - 1)


In [85]:
# Playable vocabulary: words of 4 to 10 characters that contain no 'z', in sorted order.
words = sorted(
    candidate
    for candidate in web2lowerset
    if 4 <= len(candidate) <= 10 and 'z' not in candidate
)

# Timing and input state
start_time = time.time()
curr_time, eraser = 0, 0
user_input_word = ''

# Pick the first target word (shown in uppercase) and start at its first letter
random_word = words[int(random() * len(words))].upper()
random_word_index = 0

# Lowercase alphabet, 'a' through 'z'
letters = list('abcdefghijklmnopqrstuvwxyz')

# Open the highest-index camera that responds and record its frame size
cap = cv.VideoCapture(camera_max())
width, height = (
    int(cap.get(prop))
    for prop in (cv.CAP_PROP_FRAME_WIDTH, cv.CAP_PROP_FRAME_HEIGHT)
)

# Set up the MediaPipe hands model
mp_hands = mp.solutions.hands


def _scaled_landmark_offsets(results, box_width, box_height):
    """Return wrist-relative landmark offsets scaled by frame size over hand-box size.

    Mirrors the feature helper nested inside ``easy_mode``/``medium_mode``; it
    is defined here because those nested helpers are not reachable from
    module level. ``box_width``/``box_height`` are the extents of the hand's
    bounding box in landmark coordinates.
    """
    hand_landmarks = results.multi_hand_landmarks[0].landmark
    wrist = hand_landmarks[0]
    offsets = []
    for point in hand_landmarks:
        offsets.append((point.x - wrist.x) * (width / box_width))
        offsets.append((point.y - wrist.y) * (height / box_height))
    # Drop the wrist's own (0, 0) pair.
    return offsets[2:]


# Wrist x-position at the last accepted letter; a repeated letter only counts
# once the hand has moved away from it.
location = 0

try:
    with mp_hands.Hands(min_detection_confidence=0.6, min_tracking_confidence=0.6, max_num_hands=1) as hands:
        while cap.isOpened():
            # Read a frame from the camera
            ret, frame = cap.read()
            if not ret:
                # No frame could be read (e.g. camera unavailable): stop
                # instead of passing None to OpenCV/MediaPipe.
                break

            # Draw the target word in white and the letters signed so far over it in green
            text_origin = (int(width * 0.05), int(height * 0.95))
            cv.putText(frame, random_word, text_origin, cv.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 2, cv.LINE_4)
            cv.putText(frame, user_input_word, text_origin, cv.FONT_HERSHEY_SIMPLEX, 2, (0, 255, 0), 2, cv.LINE_4)

            # Make hand detections using MediaPipe
            image, results = mediapipe_detection(frame, hands)

            # Reference picture for the letter the user has to sign next
            letter_image = cv.imread('easy_mode_letters/{}.png'.format(random_word[random_word_index].lower()))
            if letter_image is not None:
                letter_image = cv.resize(letter_image, (0, 0), fx=0.2, fy=0.2)

            # Classify only when a hand is visible, and only when the rounded
            # "elapsed / 3" value has advanced (roughly every 0.3 s).
            tick = round((time.time() - start_time) / 3, 1)
            if results.multi_hand_landmarks and curr_time < tick:
                curr_time = tick
                hand_landmarks = results.multi_hand_landmarks[0].landmark
                xs = [point.x for point in hand_landmarks]
                ys = [point.y for point in hand_landmarks]

                try:
                    # Features: landmark offsets scaled by the hand's bounding box
                    test_image = _scaled_landmark_offsets(results, max(xs) - min(xs), max(ys) - min(ys))

                    # One classifier call gives both the best class and its probability.
                    # Assumes the predict_proba column order matches `letters` --
                    # TODO confirm against clf.classes_.
                    test_probs = clf.predict_proba(np.array([test_image]))[0]
                    test_pred = int(np.argmax(test_probs))
                    best_prob = max(test_probs)

                    # Accept confident predictions; 'r' and 'v' get a lower threshold
                    if best_prob >= 0.8 or (best_prob >= 0.6 and letters[test_pred] in ['r', 'v']):
                        pred_letter = letters[test_pred].upper()
                        wrist_x = hand_landmarks[0].x

                        if random_word_index < len(random_word) and pred_letter == random_word[random_word_index]:
                            repeats_previous = (
                                random_word_index > 0
                                and random_word[random_word_index] == random_word[random_word_index - 1]
                            )
                            # A doubled letter needs the hand to move by more than 0.1
                            # (landmark units) so one held sign is not counted twice.
                            if not repeats_previous or abs(location - wrist_x) > 0.1:
                                user_input_word += pred_letter
                                random_word_index += 1
                                location = wrist_x

                    # Word finished: pause briefly, then start a new random word
                    if user_input_word == random_word:
                        time.sleep(0.5)
                        random_word = words[int(random() * len(words))].upper()
                        random_word_index = 0
                        user_input_word = ''

                except Exception as e:
                    print(e)

            # Show the reference picture in the top-right corner (if it was found)
            if letter_image is not None:
                frame[5:5 + letter_image.shape[0], width - 5 - letter_image.shape[1]:width - 5] = letter_image
            cv.imshow('OpenCV Feed', frame)

            # Break the loop if the user presses the 'Esc' key
            key = cv.waitKey(20)
            if key == 27:
                break
finally:
    # Release the video capture and close all OpenCV windows, even on error
    cap.release()
    cv.destroyAllWindows()


In [86]:
# Lowercase alphabet; the mode functions index it with the classifier's predicted class index
letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

# Playable words: sorted, without 'z', and 4 to 10 characters long
words = [i for i in sorted(web2lowerset) if 'z' not in i and 3 < len(i) <= 10]

# Timing and input state
start_time = time.time()
curr_time = 0
user_input_word = ''
eraser = 0

# Game state shared (through `global`) with easy_mode() and medium_mode()
easy_word = words[int(random() * len(words))].upper()  # current target word, uppercase
easy_word_index = 0   # position of the next letter to sign
easy_word_user = ''   # letters signed correctly so far; the mode functions read and append to it
location = 0          # wrist x-position at the last accepted letter
letter_help = 0       # 0 until easy_mode() stores a reference image here

In [87]:
def easy_mode(frame):
    """Annotate one camera frame for easy mode and advance the spelling game.

    The target word is drawn in white with the letters signed so far drawn
    over it in green, and a reference picture of the next letter is pasted
    into the top-right corner. Roughly every 0.3 s, when a hand is visible,
    its landmarks are classified; a confident match with the next expected
    letter extends the user's progress, and a completed word is replaced by a
    new random one.

    Unlike the earlier version, this processes the frame it is given and
    returns (it used to read from ``cap`` in a loop that never returned while
    the camera was open), and it uses the ``cv`` alias the file imports.

    Args:
        frame: BGR image (numpy array) to annotate in place. ``None`` is
            returned unchanged.

    Returns:
        The same frame object, annotated.
    """
    global easy_word_user, easy_word, easy_word_index, curr_time, location, letter_help

    if frame is None:
        return frame

    # Start from an empty string if no other code has initialised the progress yet.
    easy_word_user = globals().get('easy_word_user', '')

    frame_height, frame_width = frame.shape[:2]

    def scaled_offsets(hand_landmarks, box_width, box_height):
        # Wrist-relative offsets scaled by frame size over hand-box size;
        # the wrist's own (0, 0) pair is dropped.
        wrist = hand_landmarks[0]
        offsets = []
        for point in hand_landmarks:
            offsets.append((point.x - wrist.x) * (frame_width / box_width))
            offsets.append((point.y - wrist.y) * (frame_height / box_height))
        return offsets[2:]

    # Target word in white, progress over it in green
    text_origin = (int(frame_width * 0.05), int(frame_height * 0.95))
    cv.putText(frame, easy_word, text_origin, cv.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 2, cv.LINE_4)
    cv.putText(frame, easy_word_user, text_origin, cv.FONT_HERSHEY_SIMPLEX, 2, (0, 255, 0), 2, cv.LINE_4)

    # NOTE(review): a Hands model is created per call, as medium_mode does;
    # consider sharing one instance if this proves too slow.
    mp_hands = mp.solutions.hands
    with mp_hands.Hands(min_detection_confidence=0.6, min_tracking_confidence=0.6, max_num_hands=1) as hands:
        _, results = mediapipe_detection(frame, hands)

    # Reference picture for the letter to sign next; 0 means "nothing to show"
    hint = cv.imread('easy_mode_letters/{}.png'.format(easy_word[easy_word_index].lower()))
    letter_help = cv.resize(hint, (0, 0), fx=0.2, fy=0.2) if hint is not None else 0

    # Classify only when a hand is visible and the rounded "elapsed / 3"
    # value has advanced since the last classification.
    tick = round((time.time() - start_time) / 3, 1)
    if results.multi_hand_landmarks and curr_time < tick:
        curr_time = tick
        hand_landmarks = results.multi_hand_landmarks[0].landmark
        xs = [point.x for point in hand_landmarks]
        ys = [point.y for point in hand_landmarks]
        try:
            test_image = scaled_offsets(hand_landmarks, max(xs) - min(xs), max(ys) - min(ys))

            # Assumes the predict_proba column order matches `letters` --
            # TODO confirm against clf.classes_.
            test_probs = clf.predict_proba(np.array([test_image]))[0]
            test_pred = int(np.argmax(test_probs))
            best_prob = max(test_probs)
            print("Predicted:", letters[test_pred], ", pred prob:", best_prob, ", current index:", easy_word_index, ", current time:", curr_time)

            # 'p', 'r', 'u' and 'v' are accepted at a lower probability
            if best_prob >= 0.8 or (best_prob >= 0.6 and letters[test_pred] in ['p', 'r', 'u', 'v']):
                pred_letter = letters[test_pred].upper()
                wrist_x = hand_landmarks[0].x

                if easy_word_index < len(easy_word) and pred_letter == easy_word[easy_word_index]:
                    repeats_previous = (
                        easy_word_index > 0
                        and easy_word[easy_word_index] == easy_word[easy_word_index - 1]
                    )
                    # A doubled letter only counts after the hand moved by more
                    # than 0.1, so one held sign is not accepted twice.
                    if not repeats_previous or abs(location - wrist_x) > 0.1:
                        easy_word_user += pred_letter
                        easy_word_index += 1
                        location = wrist_x

            if easy_word_user == easy_word:
                time.sleep(0.5)
                # Select a new random word and reset variables
                easy_word = words[int(random() * len(words))].upper()
                easy_word_index = 0
                easy_word_user = ''

        except Exception as e:
            print(e)

    # Show letter helper in the top-right corner when an image was loaded
    if isinstance(letter_help, np.ndarray):
        hint_height, hint_width = letter_help.shape[:2]
        frame[5:5 + hint_height, frame_width - 5 - hint_width:frame_width - 5] = letter_help

    return frame


In [88]:
def medium_mode(frame):
    """Annotate one camera frame for medium mode and advance the spelling game.

    Same game as easy mode but without the reference picture: the target word
    is drawn in white with the letters signed so far over it in green.
    Roughly every 0.3 s, when a hand is visible, its landmarks are classified;
    a confident match with the next expected letter extends the user's
    progress, and a completed word is replaced by a new random one.

    This version uses the ``cv`` alias the file imports (``cv2`` was never
    bound), takes its dimensions from the frame itself, and drops the trailing
    ``try``/``except`` whose handler could never run.

    Args:
        frame: BGR image (numpy array) to annotate in place. ``None`` is
            returned unchanged.

    Returns:
        The same frame object, annotated.
    """
    global easy_word_user, easy_word, easy_word_index, curr_time, location

    if frame is None:
        return frame

    # Start from an empty string if no other code has initialised the progress yet.
    easy_word_user = globals().get('easy_word_user', '')

    frame_height, frame_width = frame.shape[:2]

    def get_landmark_distances(hand_landmarks, box_width, box_height):
        # Wrist-relative offsets scaled by frame size over hand-box size;
        # the wrist's own (0, 0) pair is dropped.
        wrist_pos = hand_landmarks[0]
        hand_array = []
        for point in hand_landmarks:
            hand_array.append((point.x - wrist_pos.x) * (frame_width / box_width))
            hand_array.append((point.y - wrist_pos.y) * (frame_height / box_height))
        return hand_array[2:]

    # Target word in white, progress over it in green
    text_origin = (int(frame_width * 0.05), int(frame_height * 0.95))
    cv.putText(frame, easy_word, text_origin, cv.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 2, cv.LINE_4)
    cv.putText(frame, easy_word_user, text_origin, cv.FONT_HERSHEY_SIMPLEX, 2, (0, 255, 0), 2, cv.LINE_4)

    # NOTE(review): a Hands model is created per call, as before; consider
    # sharing one instance if this proves too slow.
    mp_hands = mp.solutions.hands
    with mp_hands.Hands(min_detection_confidence=0.6, min_tracking_confidence=0.6, max_num_hands=1) as hands:
        _, results = mediapipe_detection(frame, hands)

    # Classify only when a hand is visible and the rounded "elapsed / 3"
    # value has advanced since the last classification.
    tick = round((time.time() - start_time) / 3, 1)
    if results.multi_hand_landmarks and curr_time < tick:
        curr_time = tick
        hand_landmarks = results.multi_hand_landmarks[0].landmark
        xs = [point.x for point in hand_landmarks]
        ys = [point.y for point in hand_landmarks]
        try:
            test_image = get_landmark_distances(hand_landmarks, max(xs) - min(xs), max(ys) - min(ys))

            # Assumes the predict_proba column order matches `letters` --
            # TODO confirm against clf.classes_.
            test_probs = clf.predict_proba(np.array([test_image]))[0]
            test_pred = int(np.argmax(test_probs))
            best_prob = max(test_probs)
            print("Predicted:", letters[test_pred], ", pred prob:", best_prob, ", current index:", easy_word_index, ", current time:", curr_time)

            # 'p', 'r', 'u' and 'v' are accepted at a lower probability
            if best_prob >= 0.8 or (best_prob >= 0.6 and letters[test_pred] in ['p', 'r', 'u', 'v']):
                pred_letter = letters[test_pred].upper()
                wrist_x = hand_landmarks[0].x

                if easy_word_index < len(easy_word) and pred_letter == easy_word[easy_word_index]:
                    repeats_previous = (
                        easy_word_index > 0
                        and easy_word[easy_word_index] == easy_word[easy_word_index - 1]
                    )
                    # A doubled letter only counts after the hand moved by more
                    # than 0.1, so one held sign is not accepted twice.
                    if not repeats_previous or abs(location - wrist_x) > 0.1:
                        easy_word_user += pred_letter
                        easy_word_index += 1
                        location = wrist_x

            if easy_word_user == easy_word:
                time.sleep(0.5)
                easy_word = words[int(random() * len(words))].upper()
                easy_word_index = 0
                easy_word_user = ''

        except Exception as e:
            print(e)

    return frame


In [89]:
def generate_frame():
    """Yield camera frames as JPEG parts of a multipart HTTP stream.

    Each frame read from the global ``camera`` is optionally passed through
    the active game mode, JPEG-encoded and yielded as one ``--frame`` part.
    The stream ends when the camera stops delivering frames (the earlier
    version kept spinning in a tight loop in that case). Frames that cannot
    be encoded are reported and skipped instead of being silently ignored.

    NOTE(review): ``camera``, ``easy_mode_enabled``, ``medium_mode_enabled``,
    ``process_easy_mode`` and ``process_medium_mode`` are not defined in the
    visible notebook. ``easy_mode``/``medium_mode`` have the matching
    frame-in/frame-out shape -- confirm which names are intended.

    Yields:
        bytes: One multipart chunk containing a JPEG image.
    """
    global easy_mode_enabled, medium_mode_enabled, camera

    while True:
        # Capture a frame from the camera.
        success, frame = camera.read()
        if not success:
            # Nothing more to read: end the stream rather than busy-looping.
            break

        if easy_mode_enabled:
            frame = process_easy_mode(frame)
        elif medium_mode_enabled:
            frame = process_medium_mode(frame)

        # Encode the frame as a JPEG image; skip frames that fail to encode.
        try:
            encoded, buffer = cv.imencode('.jpg', frame)
        except Exception as e:
            print(e)
            continue
        if not encoded:
            continue

        # Yield the frame as one part of the multipart HTTP response.
        yield (b'--frame\r\n'
               b'Content-Type: image/jpeg\r\n\r\n' + buffer.tobytes() + b'\r\n')


In [None]:
# Import necessary libraries
from flask import Flask, render_template, Response, request

# Create a Flask web application.
# NOTE(review): this rebinds `app`, replacing the instance created earlier with
# `template_folder='./template'`. The routes below are registered on this
# instance, which uses Flask's default template location -- confirm which
# template folder is intended.
app = Flask(__name__)

# Homepage route ("/")
@app.route('/')
def index():
    """Serve the landing page by rendering the ``index.html`` template."""
    homepage = render_template("index.html")
    return homepage

# Define the route for the video feed ("/video_feed")
@app.route('/video_feed')
def video_feed():
    """Stream the camera as a multipart response of JPEG frames.

    The frames come from ``generate_frame``, the frame generator defined in
    this file; each chunk it yields is delimited by the ``frame`` boundary
    declared in the MIME type below.
    """
    return Response(generate_frame(), mimetype='multipart/x-mixed-replace; boundary=frame')

# Define the route for handling requests ("/requests") with POST and GET methods
@app.route('/requests', methods=['POST', 'GET'])
def mode():
    """Toggle the game mode chosen on the form, then render the page.

    On POST, the first mode button found in the form (checked in the order
    easy, medium, hard, freestyle) has its flag toggled and every other mode
    flag is switched off. GET requests, and POSTs without a recognised
    button, just render ``index.html``.

    The flags are the module-level ``*_mode_enabled`` names that
    ``generate_frame`` reads. The earlier version assigned to ``easy_mode``
    and ``medium_mode``, which rebound the frame-processing functions of the
    same name to booleans.
    """
    # (form field, expected button value, module-level flag)
    buttons = (
        ('easy', 'Easy', 'easy_mode_enabled'),
        ('medium', 'Medium', 'medium_mode_enabled'),
        ('hard', 'Hard', 'hard_mode_enabled'),
        ('free', 'Freestyle', 'free_mode_enabled'),
    )

    if request.method == 'POST':
        state = globals()
        for field, label, flag in buttons:
            if request.form.get(field) == label:
                # A flag that was never set counts as "off".
                toggled = not state.get(flag, False)
                for _, _, other_flag in buttons:
                    state[other_flag] = False
                state[flag] = toggled
                break

    # Always render the "index.html" template to display the current mode
    return render_template('index.html')

# Run the Flask application if this script is executed
# NOTE(review): `debug=True` turns on Flask's debugger and auto-reloader; the
# captured output below shows the reloader restarting. Presumably this is for
# local development only -- confirm before exposing the server.
if __name__ == '__main__':
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:Press CTRL+C to quit
INFO:werkzeug: * Restarting with stat
