# American Sign Language Recognition and Speech Generation


## Import Necessary Modules 

In [2]:
import cv2
import os
import numpy as np
import time
import statistics
import random

#tkinter for GUI
import tkinter as tk
from tkinter import ttk # To style the widgets
from PIL import ImageTk
from PIL import Image, ImageDraw, ImageFont


import pyttsx3 #text to speech
import imageio #for text2asl

#For ASL Fingerspelling
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16
from keras.models import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import img_to_array, load_img

# From .py files in local drive
from data import DataSet
from extractor import Extractor


# for Video ASL
from keras.layers import Dense, Flatten, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from collections import deque
from subprocess import call
from natsort import realsorted, ns
import natsort
import sys
sys.path.insert(0,'C:/Python/Python37/Scripts/New GUI') #Change this

Using TensorFlow backend.


## Helper functions

In [12]:
def open_file():
    """Open the application's help PDF with the system's associated viewer.

    The path is relative, so the PDF is looked up in the current working
    directory. ``os.startfile`` is only available on Windows.
    """
    os.startfile("ASL RecogSys Help file.pdf")
    
def get_camera(PersonView):
    """Mirror a camera frame, preview it with a capture box, and return the box region.

    Args:
        PersonView: Frame as returned by ``cv2.VideoCapture.read``.

    Returns:
        The 302x302 region around the capture box, sliced from the mirrored
        frame.
    """
    mirrored = cv2.flip(PersonView, 1)

    # Capture box: 300x300 pixels with its top-left corner at (150, 80).
    left, top = 150, 80
    right, bottom = left + 300, top + 300

    # Basic slicing yields a view of the mirrored frame, so the outline drawn
    # below is also visible in the returned region.
    hand_region = mirrored[top - 1:bottom + 1, left - 1:right + 1]

    # The seventh positional argument of cv2.putText is the stroke thickness.
    cv2.putText(mirrored, 'Press space to record!!', (10, 40),
                cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
    cv2.rectangle(mirrored, (left, top), (right, bottom), (50, 50, 50), 1)
    cv2.imshow("Person View", mirrored)
    return hand_region

def create_video(video_location, max_frames=40):
    """Record a short clip of the hand region from the default webcam.

    A live preview is shown until the user presses space, which starts
    recording. Recording stops after ``max_frames`` frames in total, or early
    when ESC is pressed (pressing space again resumes until the limit is
    reached). Pressing ``q`` in the preview ends the function.

    Args:
        video_location: Path of the XVID-encoded video file to write.
        max_frames: Total number of frames to record (default 40).
    """
    cap = cv2.VideoCapture(0)
    # Define the codec and create the VideoWriter; 302x302 matches the size of
    # the region returned by get_camera.
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(video_location, fourcc, 20.0, (302, 302))
    frame_count = 0

    try:
        while cap.isOpened():
            ret, PersonView = cap.read()
            if not ret:
                # Camera stopped delivering frames; there is nothing to flip or show.
                break
            get_camera(PersonView)

            start_key = cv2.waitKey(5) & 0xFF
            if start_key == ord("q"):  # Press Q to quit
                break
            if start_key != 32 or frame_count >= max_frames:
                # Not a record request, or the clip is already complete.
                continue

            cv2.waitKey(3)
            while frame_count < max_frames:  # Space pressed: record
                ret, PersonView = cap.read()
                if not ret:
                    # Leave instead of spinning forever on a dead camera.
                    break
                out.write(get_camera(PersonView))
                frame_count += 1

                if frame_count == max_frames:
                    # Tell the user the clip is complete.
                    black_img = np.zeros((400, 400, 3), np.uint8)
                    font = cv2.FONT_HERSHEY_SIMPLEX
                    fontScale = 1
                    fontColor = (255, 255, 255)
                    thickness = 2
                    cv2.putText(black_img, 'Recording Stopped!!', (10, 100),
                                font, fontScale, fontColor, thickness)
                    cv2.putText(black_img, 'Press Q to quit', (10, 200),
                                font, fontScale, fontColor, thickness)
                    cv2.imshow("InfoBox", black_img)
                    break

                if cv2.waitKey(5) & 0xFF == 27:  # ESC pauses recording
                    break
    finally:
        # Release everything even if an error interrupted the capture.
        cap.release()
        out.release()
        cv2.destroyAllWindows()

def load_ASL_fingerspell_model():
    """Build the VGG16-based fingerspelling classifier and load its weights.

    The network is a VGG16 base (ImageNet weights, no top, 224x224x3 input)
    with every layer frozen, followed by a flatten layer and a 27-way softmax.

    Returns:
        The compiled Keras model with weights loaded from
        ``asl_fig_model-bestWeights_vacc.h5`` (relative path).
    """
    backbone = VGG16(input_shape=(224, 224, 3), weights='imagenet', include_top=False)

    # Freeze the base so only the classification head is trainable.
    for base_layer in backbone.layers:
        base_layer.trainable = False

    flattened = Flatten()(backbone.output)
    class_scores = Dense(units=27, activation="softmax")(flattened)

    classifier = Model(inputs=backbone.input, outputs=class_scores)
    classifier.compile(loss='categorical_crossentropy',
                       optimizer="adam",
                       metrics=['accuracy'])
    classifier.load_weights('asl_fig_model-bestWeights_vacc.h5')
    return classifier

def load_asl_video_model():
    """Build the LSTM word classifier and load its trained weights.

    The model takes sequences of shape (40, 2048) and ends in a 5-way softmax.

    Returns:
        The compiled Keras model with weights loaded from
        ``asl_vid_best_model_valloss.hdf5`` (relative path).
    """
    network = Sequential([
        LSTM(2048, return_sequences=False, input_shape=(40, 2048), dropout=0.5),
        Dense(512, activation='relu'),
        Dropout(0.5),
        Dense(5, activation='softmax'),
    ])
    network.compile(loss='categorical_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])
    network.load_weights('asl_vid_best_model_valloss.hdf5')
    return network


## Function for ASL Fingerspelling Window

In [4]:
def _read_hand_view(cap):
    """Grab one frame from ``cap`` and return its mirrored sign region.

    Returns ``None`` when the camera delivers no frame. The region is a view
    of the mirrored frame, so the blue outline drawn here is part of it.
    """
    ret, PersonView = cap.read()
    if not ret:
        return None
    PersonView = cv2.flip(PersonView, 1)

    # start_point (x1, y1) and end_point (x2, y2) of the sign box
    y1, y2 = 78, 300
    x1, x2 = 378, 600

    HandView = PersonView[y1 - 1:y2 + 1, x1 - 1:x2 + 1]
    cv2.rectangle(PersonView, (x1, y1), (x2, y2), (255, 0, 0), 1)
    return HandView


def asl_fin_mode():
    """Run the live ASL fingerspelling recognizer.

    The sign region of the webcam is previewed until the user presses space.
    From then on a snapshot is classified every 6 seconds; each predicted
    character (A-Z, or '-') is appended to the "ASL to text" window and spoken
    aloud. ESC ends the session. The function also ends when the camera cannot
    be opened or stops delivering frames.
    """
    model = load_ASL_fingerspell_model()
    # Class index -> character: 0..25 map to 'A'..'Z', 26 maps to '-'.
    map_characters = {index: chr(ord('A') + index) for index in range(26)}
    map_characters[26] = '-'

    black_img = np.zeros((400, 900, 3), np.uint8)
    font = cv2.FONT_HERSHEY_SIMPLEX
    fontScale = 0.8
    fontColor = (255, 255, 255)
    thickness = 1  # seventh positional argument of cv2.putText
    i_initial = 10
    i = i_initial  # x position of the next character
    j = 100        # y position (baseline) of the current text row

    img_name = "Signs/sign.jpg"
    # cv2.imwrite does not create missing directories.
    os.makedirs(os.path.dirname(img_name), exist_ok=True)

    engine = pyttsx3.init()
    cap = cv2.VideoCapture(0)
    try:
        # Phase 1: preview only, until space is pressed.
        started = False
        while cap.isOpened():
            HandView = _read_hand_view(cap)
            if HandView is None:
                break
            cv2.imshow("Sign Window", HandView)
            if cv2.waitKey(5) & 0xFF == 32:
                started = True
                break
        if not started:
            return

        # Start timing only now, so the first snapshot is not taken the
        # instant the user presses space.
        start_time = time.time()

        # Phase 2: classify a snapshot every 6 seconds until ESC.
        while True:
            HandView = _read_hand_view(cap)
            if HandView is None:
                break
            cv2.imshow("Sign Window", HandView)

            if cv2.waitKey(5) & 0xFF == 27:
                break
            if time.time() - start_time < 6:
                continue

            # The snapshot goes through a JPEG file and Keras' image loader,
            # which also resizes it to the 224x224 model input.
            cv2.imwrite(img_name, HandView)
            test_image = image.load_img(img_name,
                                        color_mode='rgb',
                                        target_size=(224, 224))
            test_image = image.img_to_array(test_image)
            test_image = np.expand_dims(test_image, axis=0)
            result = model.predict(test_image)
            letter = map_characters[int(np.argmax(result))]

            cv2.putText(black_img, "ASL finger spelling:", (10, 50),
                        font, fontScale, fontColor, thickness)
            cv2.putText(black_img, letter, (i, j),
                        font, fontScale, fontColor, thickness)
            cv2.imshow("ASL to text", black_img)

            engine.say(letter)
            engine.runAndWait()

            start_time = time.time()
            # Advance the text cursor; wrap to a new row near the right edge.
            i += 40
            if i >= 750:
                i = i_initial
                j += 30
    finally:
        cv2.destroyAllWindows()
        cap.release()

## Function for Video ASL

In [5]:
def asl_video():
    """Record a short sign video, classify it, and show and speak the result.

    Steps: record ``test.avi`` with ``create_video``, split it into JPEG
    frames with ffmpeg, extract one feature vector per frame with
    ``Extractor``, feed the first 40 vectors to the module-level ``vid_model``
    and display and speak the predicted word.

    All paths are resolved against the working directory at call time, which
    is restored before returning even if feature extraction fails. If fewer
    than 40 frames are available, a notice is shown instead of a prediction.
    """
    seq_length = 40
    black_img = np.zeros((400, 400, 3), np.uint8)
    font = cv2.FONT_HERSHEY_SIMPLEX
    fontScale = 0.8
    fontColor = (255, 255, 255)
    thickness = 1  # seventh positional argument of cv2.putText

    base_dir = os.getcwd()
    frames_dir = os.path.join(base_dir, "test", "frames")
    sequence_path = os.path.join(base_dir, "test", "sequences", "data_final.npy")
    # ffmpeg and np.save both need their target directories to exist.
    os.makedirs(frames_dir, exist_ok=True)
    os.makedirs(os.path.dirname(sequence_path), exist_ok=True)

    extractor_model = Extractor()
    src = "test.avi"
    create_video(src)
    dest = os.path.join("test", "frames", "%1d.jpg")
    call(["ffmpeg", "-i", src, "-vf", "fps=20", dest])

    # The extractor is handed bare file names, so work inside the frames
    # directory, and always return to the base directory afterwards because
    # the rest of the application uses relative paths.
    os.chdir(frames_dir)
    try:
        files = [f for f in os.listdir('.') if os.path.isfile(f)]
        sorted_files = natsort.natsorted(files, reverse=False)
        sequence = [extractor_model.extract(f) for f in sorted_files[:seq_length]]
    finally:
        os.chdir(base_dir)

    if len(sequence) < seq_length:
        # The model input is fixed at seq_length steps; a shorter clip
        # cannot be classified.
        cv2.putText(black_img, "Recording too short", (10, 50),
                    font, fontScale, fontColor, thickness)
        cv2.imshow("ASL word to text", black_img)
        cv2.waitKey(1)
        return

    sequences = np.asarray(sequence)
    np.save(sequence_path, sequences)  # keep a copy of the features on disk

    prediction = vid_model.predict(np.expand_dims(sequences, axis=0))

    data = DataSet(seq_length=seq_length, class_limit=None)

    engine = pyttsx3.init()
    # NOTE(review): presumably returns the predicted class name as a string,
    # since it is drawn and spoken below; confirm against data.DataSet.
    result = data.print_class_from_prediction(np.squeeze(prediction, axis=0))
    cv2.putText(black_img, "ASL Word", (10, 50), font, fontScale, fontColor, thickness)
    cv2.putText(black_img, result, (10, 100), font, fontScale, fontColor, thickness)
    cv2.imshow("ASL word to text", black_img)
    # Let HighGUI process its events before speech blocks.
    cv2.waitKey(1)
    engine.say(result)
    engine.runAndWait()

## App Class

In [6]:

# main class ---> app 
# Child class of tk.Tk class as it is an app.

class ASL_RecogSys(tk.Tk):
    """Root window of the application.

    Creates one instance of every page class, stacks them in the same grid
    cell of a container frame, and raises the requested page on demand.
    """

    def __init__(self, *args, **kwargs):
        """Initialise Tk, register the shared ttk styles and build all pages.

        Positional and keyword arguments are forwarded to ``tk.Tk``.
        """
        tk.Tk.__init__(self, *args, **kwargs)  # Initializing tkinter

        container = tk.Frame(self)  # Frame that holds every page
        container.pack(side="top", fill="both", expand=True)
        container.grid_rowconfigure(0, weight=1)
        container.grid_columnconfigure(0, weight=1)

        style = ttk.Style()
        # The "vista" theme only exists on some Windows installations;
        # selecting a missing theme raises TclError, so keep the platform
        # default when it is unavailable. The theme is chosen before the
        # custom styles are configured so they are set on the active theme.
        if "vista" in style.theme_names():
            style.theme_use("vista")
        style.configure("my.TLabel", background=deep_sky_blue,
                        foreground="Black", relief="flat")
        style.configure('my.TButton', foreground="dark blue", padding=10,
                        relief="flat", font=BUTTON_FONT)

        # Store the pages of the application in a dictionary keyed by page class.
        self.frames = dict()

        # Each page is a separate tk.Frame subclass.
        frame_names = (StartPage,
                       MainMenu,
                       Credits,
                       TextToASL)

        for Fclass in frame_names:
            frame = Fclass(container, self)
            self.frames[Fclass] = frame
            # "nsew" ---> stretch the page to the full window size
            frame.grid(row=0, column=0, sticky="nsew")

        self.show_frame(StartPage)

    def show_frame(self, cont):
        """Bring the page whose class is ``cont`` to the front."""
        frame = self.frames[cont]
        frame.tkraise()

## Window 1

In [7]:
# First window
# Child class of tk.Frame as it is a window
class StartPage(tk.Frame):
    """Landing page with buttons for the main menu, the help file and the credits."""

    def __init__(self, parent, controller):
        """Build the page inside ``parent``; ``controller`` switches pages."""
        tk.Frame.__init__(self, parent)

        background = tk.PhotoImage(file="images/BG_StartPage.png")
        backdrop = tk.Label(self, image=background)
        backdrop.pack(side='top', fill='both', expand=True)
        # Hold a reference on the widget so the image is not garbage-collected.
        backdrop.image = background

        # Relative position of the first button and the offsets between buttons.
        x_pos, y_pos = 0.25, 0.75
        x_inc, y_inc = 0.5, 0.15
        # Geometry shared by all three buttons.
        geometry = dict(width=250, height=50, anchor=tk.CENTER)

        menu_button = ttk.Button(self, text="To Main Menu", style='my.TButton',
                                 command=lambda: controller.show_frame(MainMenu))
        menu_button.place(relx=x_pos, rely=y_pos, **geometry)

        help_button = ttk.Button(self, text="Help", style='my.TButton',
                                 command=open_file)
        help_button.place(relx=x_pos + x_inc, rely=y_pos, **geometry)

        credits_button = ttk.Button(self, text="Credits", style='my.TButton',
                                    command=lambda: controller.show_frame(Credits))
        credits_button.place(relx=0.5, rely=y_pos + y_inc, **geometry)
        
    

## Window 1.1 

In [8]:
class MainMenu(tk.Frame):
    """Main menu page: a 2x2 grid of buttons for the three modes plus Back."""

    def __init__(self, parent, controller):
        """Build the page inside ``parent``; ``controller`` switches pages."""
        tk.Frame.__init__(self, parent)

        background = tk.PhotoImage(file="images/BG_MainMenu.png")
        backdrop = tk.Label(self, image=background)
        backdrop.pack(side='top', fill='both', expand=True)
        # Hold a reference on the widget so the image is not garbage-collected.
        backdrop.image = background

        # Relative position of the top-left button and the offsets to its neighbours.
        x_pos, y_pos = 0.25, 0.65
        x_inc, y_inc = 0.5, 0.15
        # Geometry shared by all four buttons.
        geometry = dict(width=300, height=50, anchor=tk.CENTER)

        # Top row
        fingerspell_button = ttk.Button(self, text="ASL Fingerspelling Mode",
                                        style='my.TButton', command=asl_fin_mode)
        fingerspell_button.place(relx=x_pos, rely=y_pos, **geometry)

        text_to_asl_button = ttk.Button(self, text="Text to ASL Mode", style='my.TButton',
                                        command=lambda: controller.show_frame(TextToASL))
        text_to_asl_button.place(relx=x_pos + x_inc, rely=y_pos, **geometry)

        # Bottom row
        words_button = ttk.Button(self, text="ASL words", style='my.TButton',
                                  command=asl_video)
        words_button.place(relx=x_pos, rely=y_pos + y_inc, **geometry)

        back_button = ttk.Button(self, text="Back", style='my.TButton',
                                 command=lambda: controller.show_frame(StartPage))
        back_button.place(relx=x_pos + x_inc, rely=y_pos + y_inc, **geometry)

## Window 1.2

In [9]:
class Credits(tk.Frame):
    """Credits page: a read-only text box with the authors and a Back button."""

    def __init__(self, parent, controller):
        """Build the page inside ``parent``; ``controller`` switches pages."""
        tk.Frame.__init__(self, parent)

        background = tk.PhotoImage(file="images/BG_Credits.png")
        backdrop = tk.Label(self, image=background)
        backdrop.pack(side='top', fill="both", expand='yes')
        # Hold a reference on the widget so the image is not garbage-collected.
        backdrop.image = background

        credit_lines = (
            "\n\n    Amrita Thakur        (073BEX405)  \n\n",
            "    Pujan Budhathoki     (073BEX428)  \n\n",
            "    Sarmila Upreti       (073BEX439)  \n\n",
            "    Shirish Shrestha     (073BEX440)  \n\n\n",
            "           Supervised by:  \n",
            "        Prof. Dr. Subarna Shakya  ",
            "\n\n Thanks for using this application!!\n\n",
        )

        credits_box = tk.Text(self, bg="Black", foreground="White",
                              width=40, height=15, font=("Courier New", 18))
        credits_box.place(relx=0.5, rely=0.53, anchor=tk.CENTER)
        for chunk in credit_lines:
            credits_box.insert('end', chunk)
        # Make the text read-only once it has been filled in.
        credits_box.configure(state='disabled')

        back_button = ttk.Button(self, text="Back", style='my.TButton',
                                 command=lambda: controller.show_frame(StartPage))
        back_button.place(relx=0.5, rely=0.92, width=200, height=50, anchor=tk.CENTER)
        

## Window 1.1.2

In [10]:
class TextToASL(tk.Frame):
    """Page that turns typed text into an animated GIF of ASL fingerspelling."""

    def __init__(self, parent, controller):
        """Build the page inside ``parent``; ``controller`` switches pages."""
        # The module-level name is still assigned for backward compatibility;
        # the class itself only uses self.entry.
        global entry
        tk.Frame.__init__(self, parent)
        image1 = tk.PhotoImage(file="images/BG_T2ASL.png")
        panel1 = tk.Label(self, image=image1)
        panel1.pack(side='top', fill="both", expand='yes')
        # Hold a reference on the widget so the image is not garbage-collected.
        panel1.image = image1

        x_pos = 0.30
        y_pos = 0.60
        y_inc = 0.15
        x_inc = 0.35

        label = ttk.Label(self, text="Enter Text:", style="my.TLabel", font=LABEL_FONT)
        label.place(relx=x_pos,
                    rely=y_pos,
                    anchor=tk.CENTER)

        self.entry = ttk.Entry(self, font=LABEL_FONT)
        entry = self.entry
        self.entry.place(relx=x_pos + x_inc,
                         rely=y_pos,
                         width=300,
                         height=30,
                         anchor=tk.CENTER)

        button1 = ttk.Button(self, text="Okay", style='my.TButton',
                             command=self.generate_ASLgif)
        button1.place(relx=0.5,
                      rely=y_pos + y_inc,
                      width=100,
                      height=50,
                      anchor=tk.CENTER)

        button2 = ttk.Button(self, text="Back to Main Menu", style='my.TButton',
                             command=lambda: controller.show_frame(MainMenu))
        button2.place(relx=0.5,
                      rely=y_pos + 2 * y_inc,
                      width=200,
                      height=50,
                      anchor=tk.CENTER)

    def generate_ASLgif(self):
        """Build, save and play a fingerspelling GIF for the entered text.

        For every whitespace-separated word a black title frame carrying the
        word is emitted, followed by every image in ``images/ASL_Images``
        whose file name starts with each upper-cased character of the word.
        The GIF is written to ``images/ASL_gifs`` and then played in an OpenCV
        window until ESC is pressed. Nothing happens for empty or blank input.
        """
        # Module-level name kept for backward compatibility.
        global input_str
        input_str = self.entry.get()
        str_list = input_str.split()
        if not str_list:
            # No words: there would be no frame to save.
            return

        image_width = 224
        image_height = 224

        try:
            font = ImageFont.truetype("cour.ttf", 30)
        except OSError:
            # Courier New is not installed everywhere; use Pillow's built-in font.
            font = ImageFont.load_default()

        folder_path = "images/ASL_Images"
        # List the directory once instead of once per character.
        image_names = os.listdir(folder_path)

        images = []
        for word in str_list:
            # A fresh black frame carrying the word acts as a separator.
            title_frame = Image.new("RGB", (image_width, image_height), 'black')
            ImageDraw.Draw(title_frame).text(
                (image_width / 4, image_height / 2 - 70), word, (255, 255, 255), font=font)
            images.append(title_frame)

            for element in word.upper():
                # Append every image whose file name starts with this character.
                for image_name in image_names:
                    if not image_name.startswith(element):
                        continue
                    img = cv2.imread(folder_path + '/' + image_name)
                    if img is None:
                        # Not a readable image file; skip it.
                        continue
                    # OpenCV loads BGR; PIL expects RGB.
                    images.append(Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)))

        # Storing as a gif. Characters that are not allowed in Windows file
        # names are replaced so any typed text yields a valid path.
        safe_name = "".join("_" if ch in '<>:"/\\|?*' else ch for ch in input_str)
        os.makedirs('images/ASL_gifs', exist_ok=True)
        path = 'images/ASL_gifs/' + 'ASL-' + safe_name + '.gif'
        images[0].save(path, format='GIF', append_images=images[1:],
                       save_all=True, duration=1000, loop=2)

        # Read the gif back from disk with `imageio.mimread` and convert
        # each frame from RGB to BGR for OpenCV.
        gif = imageio.mimread(path)
        imgs = [cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) for frame in gif]
        nums = len(imgs)

        # Display the gif, one frame roughly every 2.1 seconds. Waiting inside
        # cv2.waitKey (rather than time.sleep) keeps the window responsive:
        # ESC closes it at any time, any other key skips to the next frame.
        window_name = 'ASL-' + input_str
        i = 0
        while True:
            cv2.imshow(window_name, imgs[i])
            if cv2.waitKey(2100) & 0xFF == 27:
                break
            i = (i + 1) % nums
        cv2.destroyAllWindows()


## Main Function

In [13]:
# Fonts and colours shared by the page classes.
LABEL_FONT = ("Courier New", 15)
BUTTON_FONT = ("Roboto", 12)
SPECIAL_FONT = ("Courier New", 14)
deep_sky_blue = "#00BFFF"
alphabet_list = []

# Loaded once at start-up; asl_video reads this module-level model.
vid_model = load_asl_video_model()

# Create the fixed-size 800x600 main window and enter the Tk event loop.
app = ASL_RecogSys()
app.title("ASL RecogSys")
app.geometry("800x600")
app.resizable(False, False)
app.iconbitmap("images/asl_icon.ico")
app.mainloop()
