# CS 689A Project - Hindi To Indian Sign Language: Rule-Based Translation System
### Done by
1. Yashvir Singh Nathawat(231110059)
2. Karthik Jain (231110023)
3. Aditya Katare (231110005)

In [1]:
# import statements
import stanza
import nlp
import pandas as pd
import ast
import re

  from .autonotebook import tqdm as notebook_tqdm


# Input the sentence

In [2]:
# Read the Hindi sentence to translate from the user (blocks until a line is entered)
sentence = input('Enter the hindi sentence : ')

Enter the hindi sentence :  मेरा भारत महान है।


# POS Tagging and Dependency Parsing - STANZA

In [3]:
# Build the Stanza Hindi pipeline with the tokenize, lemma, pos and depparse processors.
# NOTE: this rebinds the name `nlp`, shadowing the `nlp` module imported in the first cell.
nlp = stanza.Pipeline('hi', processors='tokenize,lemma,pos,depparse')
# Print the xpos tags known to the POS processor (reference for the tag-based rules below)
print(nlp.processors['pos'].get_known_xpos())
# Run the pipeline on the input sentence; `doc` is consumed by the following cells
doc = nlp(sentence)

2024-11-03 19:21:40 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 7.82MB/s]                    
2024-11-03 19:21:41 INFO: Downloaded file to C:\Users\Lenovo\stanza_resources\resources.json
2024-11-03 19:21:42 INFO: Loading these models for language: hi (Hindi):
| Processor | Package       |
-----------------------------
| tokenize  | hdtb          |
| pos       | hdtb_charlm   |
| lemma     | hdtb_nocharlm |
| depparse  | hdtb_charlm   |

2024-11-03 19:21:42 INFO: Using device: cpu
2024-11-03 19:21:42 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-03 19:21:43 INFO: Loading: pos
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename,

['PSP', 'NN', 'VM', 'NNP', 'SYM', 'VAUX', 'JJ', 'NNPC', 'PRP', 'CC', 'NNC', 'QC', 'NST', 'DEM', 'RP', 'QF', 'NEG', 'RB', 'QCC', 'QO', 'INTF', 'JJC', 'WQ', 'RDP', 'UNK', 'PRPC', 'NSTC', 'RBC', 'QFC', 'CCC', 'INJ']


In [4]:
# Tabulate the parse for inspection: one row per token with its text, upos and
# xpos tags, head index and dependency relation.
# Every sentence found by Stanza is included (not only the first one), matching
# the tag-extraction loop that iterates over all of doc.sentences.
df_data = [
    (token['text'], token['upos'], token['xpos'], token['head'], token['deprel'])
    for parsed_sentence in doc.to_dict()
    for token in parsed_sentence
]

# Create DataFrame
df = pd.DataFrame(df_data, columns=['text', 'upos', 'xpos', 'head', 'deprel'])
df

Unnamed: 0,text,upos,xpos,head,deprel
0,मेरा,PRON,PRP,2,nmod
1,भारत,PROPN,NNP,3,nsubj
2,महान,ADJ,JJ,0,root
3,है,AUX,VM,3,cop
4,।,PUNCT,SYM,3,punct


# Extracting Tags for each word

In [5]:
# Collect, for every word of every parsed sentence, the fields used by the
# reordering rules: surface text, xpos tag, word id, lemma, dependency relation
# and head id.
word_tag = [
    {
        "text": token.text,
        "xpos": token.xpos,
        "id": token.id,
        "lemma": token.lemma,
        "deprel": token.deprel,
        "head": token.head,
    }
    for parsed in doc.sentences
    for token in parsed.words
]

# Removing Unwanted Tags

In [6]:
# Drop the words tagged VAUX, CC, SYM or PSP and index the remaining ones by
# their word id (insertion order follows the order of word_tag).
unwanted_tags = ['VAUX','CC','SYM','PSP']
word_tag_cleaned = {
    entry['id']: entry
    for entry in word_tag
    if entry['xpos'] not in unwanted_tags
}

In [7]:
# Show the words kept after tag filtering, one "id info-dict" pair per line
for id_and_info in word_tag_cleaned.items():
    print(*id_and_info)

1 {'text': 'मेरा', 'xpos': 'PRP', 'id': 1, 'lemma': 'मैं', 'deprel': 'nmod', 'head': 2}
2 {'text': 'भारत', 'xpos': 'NNP', 'id': 2, 'lemma': 'भारत', 'deprel': 'nsubj', 'head': 3}
3 {'text': 'महान', 'xpos': 'JJ', 'id': 3, 'lemma': 'महान', 'deprel': 'root', 'head': 0}
4 {'text': 'है', 'xpos': 'VM', 'id': 4, 'lemma': 'है', 'deprel': 'cop', 'head': 3}


# Grammar Rule-Based Reordering

1. Noun Verb

2. Assumption:  Subject always comes before object
    राम ने श्याम को कहा
    Subject Object Verb

3. Verb Auxiliary Verb

4. Verb Adverb

5. Noun Adjective

-----------------------------------------------------
#### Subjective S_Adjective Object O_Adjective Verb Adverb
-----------------------------------------------------
#### Arrange in Order : Pronoun Subjective S_Adjective Object O_Adjective -> Noun Adjective Verb Adverb

In [8]:
def change_dict_order(sample_dict,row_1,row_2):
    """Return a reordered copy of ``sample_dict`` with ``row_1`` moved to ``row_2``'s slot.

    The entry keyed ``row_1`` ends up at the index that ``row_2`` occupies in
    ``sample_dict``. Consequently it lands directly after ``row_2`` when it
    originally preceded it, and directly before ``row_2`` when it originally
    followed it. The relative order of all other entries is preserved.

    Args:
        sample_dict: Insertion-ordered dictionary to reorder (not modified).
        row_1: Key of the entry to move.
        row_2: Key whose current position ``row_1`` should take.

    Returns:
        ``sample_dict`` itself when ``row_1 == row_2``, otherwise a new dict
        holding the same key/value pairs in the new order.

    Raises:
        KeyError: If ``row_1`` or ``row_2`` is not a key of ``sample_dict``.
    """
    if row_1 == row_2:
        return sample_dict
    # Fail with a clear message instead of a TypeError from pop(None)/insert(None, ...)
    missing = [key for key in (row_1, row_2) if key not in sample_dict]
    if missing:
        raise KeyError(f"change_dict_order: key(s) {missing} not present in the dictionary")
    ordered_keys = list(sample_dict)
    # Record the target slot before removing row_1, so the entry ends up at
    # row_2's original index whichever side it started on
    target_index = ordered_keys.index(row_2)
    ordered_keys.remove(row_1)
    ordered_keys.insert(target_index, row_1)
    return {key: sample_dict[key] for key in ordered_keys}

# Format Subject and Object

In [9]:
# Start from the filtered words in sentence order; the reordering rules rebind this name
word_sign_form = word_tag_cleaned.copy()
# Format Subject then Object
# Locate the subject (deprel 'nsubj') and the object (deprel 'obj' or 'obl').
# When several words qualify, the last one in sentence order is kept.
subject_id = None
object_id = None
for key,value in word_tag_cleaned.items():
    if value['deprel'] in ['obj','obl']:
        object_id = key
    elif value['deprel']=='nsubj':
        subject_id = key
# If the subject comes after the object (larger word id), move the object into
# the subject's slot, which places it directly after the subject.
if subject_id is not None and object_id is not None and subject_id > object_id:
    word_sign_form = change_dict_order(word_sign_form,object_id,subject_id)

In [10]:
# Show the sign order after the subject/object rule, one "id info-dict" pair per line
for id_and_info in word_sign_form.items():
    print(*id_and_info)

1 {'text': 'मेरा', 'xpos': 'PRP', 'id': 1, 'lemma': 'मैं', 'deprel': 'nmod', 'head': 2}
2 {'text': 'भारत', 'xpos': 'NNP', 'id': 2, 'lemma': 'भारत', 'deprel': 'nsubj', 'head': 3}
3 {'text': 'महान', 'xpos': 'JJ', 'id': 3, 'lemma': 'महान', 'deprel': 'root', 'head': 0}
4 {'text': 'है', 'xpos': 'VM', 'id': 4, 'lemma': 'है', 'deprel': 'cop', 'head': 3}


# Handling Adjective and Adverb

In [11]:
# Arrange Adjective and Adverb
# An adjective (JJ) whose head is a noun (NN) is moved into the noun's slot, and
# an adverb (RB) whose head is a verb (VM/VAUX) is moved into the verb's slot.
for key,value in word_tag_cleaned.items():
    head_id = value['head']
    # head 0 means the word is the root and has no governing noun/verb - मैं खुश हूं।
    # The head may also have been removed by the tag filter (VAUX, CC, SYM, PSP);
    # a direct word_tag_cleaned[head_id] lookup would then raise KeyError.
    head_entry = word_tag_cleaned.get(head_id) if head_id != 0 else None
    if head_entry is None:
        continue
    if value['xpos'] == 'JJ' and head_entry['xpos'] == 'NN':
        word_sign_form = change_dict_order(word_sign_form,key,head_id)
    elif value['xpos'] == 'RB' and head_entry['xpos'] in ['VM','VAUX']:
        word_sign_form = change_dict_order(word_sign_form,key,head_id)

# Handling Negative Sentences

In [12]:
# Negation rule: each word tagged NEG is moved into the slot of the last word,
# i.e. to the end of the sign sequence.
for word_id, info in word_sign_form.items():
    if info['xpos'] != 'NEG':
        continue
    last_key = list(word_sign_form)[-1]
    word_sign_form = change_dict_order(word_sign_form, word_id, last_key)

# Handling Interrogative Sentences

In [13]:
# Interrogative rule: each question word (xpos WQ) is moved into the slot of the
# last word, i.e. to the end of the sign sequence.
for word_id, info in word_sign_form.items():
    if info['xpos'] != 'WQ':
        continue
    last_key = list(word_sign_form)[-1]
    word_sign_form = change_dict_order(word_sign_form, word_id, last_key)

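# StopWord Removal - For Sentences like मैं खुश हूं।
#### Stanza tags हूं as VM (main verb) rather than VAUX, so it survives the tag filter above and has to be removed as a stopword instead.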

In [14]:
# Load the stopword list: one entry per line of the file, surrounding whitespace removed
with open('./utility/final_stopwords.txt', 'r',encoding='utf8') as file:
    stopword_list = [line.strip() for line in file]

In [15]:
# StopWord Removal: keep only the words whose surface text is not a stopword
stopword_removed_list = {
    word_id: info
    for word_id, info in word_sign_form.items()
    if info['text'] not in stopword_list
}

In [16]:
# Show the words left after stopword removal, one "id info-dict" pair per line
for id_and_info in stopword_removed_list.items():
    print(*id_and_info)

1 {'text': 'मेरा', 'xpos': 'PRP', 'id': 1, 'lemma': 'मैं', 'deprel': 'nmod', 'head': 2}
2 {'text': 'भारत', 'xpos': 'NNP', 'id': 2, 'lemma': 'भारत', 'deprel': 'nsubj', 'head': 3}
3 {'text': 'महान', 'xpos': 'JJ', 'id': 3, 'lemma': 'महान', 'deprel': 'root', 'head': 0}


# Mapping xpos to POS Tag

In [17]:
# Mapping xpos to POS tags
# Translates the xpos tag of each remaining word into the coarser label that is
# stored with it in sign_words_list; tags missing from this table are labelled
# 'extra' there.
xpos_to_pos = {
    'NNP': 'pnoun',
    'VM': 'verb',
    'VAUX': 'verb',
    'JJ': 'adjective',
    'RB': 'adverb',
    'PRP' : 'pronoun',
    'NEG' : 'negative',
    'NN' : 'noun',
    'RDP' : 'adverb',
    'QF': 'adjective',            # e.g. 'अधिक'
    'WQ': 'wh_adverb',
    'NST': 'noun',
    'DEM': 'noun_refer_specific',
    'INTF': 'intensifier',
    # Add more mappings as needed
}

In [18]:
# Pair each remaining word with its coarse POS label; words whose xpos tag has
# no entry in xpos_to_pos are labelled 'extra'.
sign_words_list = [
    (info['text'], xpos_to_pos.get(info['xpos'], 'extra'))
    for info in stopword_removed_list.values()
]
print(sign_words_list)

[('मेरा', 'pronoun'), ('भारत', 'pnoun'), ('महान', 'adjective')]


# READ ISL Dictionary

In [19]:
# Load the ISL dictionary: the file content is a Python literal, parsed with
# ast.literal_eval (literals only, no code execution).
with open('./utility/isl_dict.txt', encoding='utf8') as isl_file:
    isl_dict = ast.literal_eval(isl_file.read())

In [20]:
## Use case: a key such as "Why_(Sign_2)" should become "why"
# Normalise every key: remove a "_(...)" part, trim surrounding whitespace and lowercase it.
# If two keys normalise to the same string, the entry seen later replaces the earlier one.
isl_dict = {re.sub(r'_\(.*\)', '', key).strip().lower(): value for key, value in isl_dict.items()}

In [21]:
# Rebuild the dictionary with lowercase keys (the key-normalisation cell above
# already lowercases them, so this changes nothing when that cell has run)
isl_dict = {key.lower(): value for key, value in isl_dict.items()}
# Add (or override) the entry for 'school' with its Hindi form
isl_dict['school'] = 'स्कूल'

# Synonym Substitution

In [22]:
# Synonym Substitution
# pyiwn (IndoWordNet) is used for Hindi synset lookups and NLTK WordNet for
# English synonyms in the synonym-substitution loop further down.
import pyiwn
from nltk.corpus import wordnet as wn
# IndoWordNet handle queried with iwn.synsets(<hindi word>)
iwn = pyiwn.IndoWordNet() 

2024-11-03:19:21:57,282 INFO     [iwn.py:43] Loading hindi language synsets...


In [23]:
# Google Translator
from googletrans import Translator
# Translator instance used to translate each Hindi word to English during
# synonym substitution
translator = Translator()

In [24]:
# Handling Special Case for Mapping to Videos
# Keys are lemmatized English words. Each value is a video file path prefixed
# with '@'; search_videos recognises the prefix, strips it and uses the path
# directly instead of searching folder_path.
# NOTE(review): these absolute paths point under D:\desktop\project\Linguistic,
# not under the folder_path configured later (D:\projects\Linguistic-Videos) -
# confirm they exist on the machine running the notebook.
special_videos = {
    'i' : '@D:\\desktop\\project\\Linguistic\\I\\I_Me.mp4',
    'who': '@D:\\desktop\\project\\Linguistic\\W\\Who_Whom.mp4',
    'whom': '@D:\\desktop\\project\\Linguistic\\W\\Who_Whom.mp4'
}

In [29]:
import spacy
lemmatizer = spacy.load("en_core_web_sm")

# Reverse view of isl_dict: Hindi word -> ISL dictionary key. When several keys
# share the same Hindi word, the first key in dictionary order is kept.
# Built once so the per-word lookups below do not rescan the whole dictionary.
hindi_to_isl_key = {}
for isl_key, hindi_value in isl_dict.items():
    hindi_to_isl_key.setdefault(hindi_value, isl_key)

# Resolve every (word, pos_tag) pair to an ISL dictionary tag. Each result is a
# (word, pos_tag, tag) tuple where tag is an ISL key / English word, an
# '@'-prefixed special video path, or '#' meaning "finger-spell the Hindi word".
synonym_substituted_list = []
for word,pos_tag in sign_words_list:

    # Translate the Hindi word to English and lemmatize the translation
    english_word = translator.translate(word, src='hi', dest='en').text.lower()
    lemma_doc = lemmatizer(english_word)
    # An empty translation yields no spaCy tokens; fall back to the raw text
    # instead of failing on lemma_doc[0]
    english_word_lemmatized = lemma_doc[0].lemma_.lower() if len(lemma_doc) else english_word

    # Special words map straight to a fixed video path
    if english_word_lemmatized in special_videos:
        synonym_substituted_list.append((word,pos_tag,special_videos[english_word_lemmatized]))
        continue

    # Proper nouns keep their English translation (search_videos spells them letter by letter)
    if pos_tag == 'pnoun':
        synonym_substituted_list.append((word,pos_tag,english_word))
        continue

    # Case 1 : Hindi word is in isl_dict -> use the dictionary key it belongs to.
    # The translator output is not used here because it is not guaranteed to be
    # an ISL dictionary key; this mirrors what Case 2 does for synonyms.
    if word in hindi_to_isl_key:
        synonym_substituted_list.append((word,pos_tag,hindi_to_isl_key[word]))
        continue

    # Case 2 : Check synonym of hindi_word in isl_dict
    try:
        all_hindi_synsets = iwn.synsets(word)
    except Exception:
        # Best effort: a failed IndoWordNet lookup is treated as "no synsets"
        all_hindi_synsets = []
    matched_key = next(
        (
            hindi_to_isl_key[synset._head_word]
            for synset in all_hindi_synsets
            if synset._head_word in hindi_to_isl_key
        ),
        None,
    )
    if matched_key is not None:
        synonym_substituted_list.append((word,pos_tag,matched_key))
        continue

    # Case 3 : Check english word in isl_dict
    if english_word in isl_dict:
        synonym_substituted_list.append((word,pos_tag,english_word))
        continue

    # Case 4 : Check lemmatized english word in isl_dict
    if english_word_lemmatized in isl_dict:
        synonym_substituted_list.append((word,pos_tag,english_word_lemmatized))
        continue

    # Case 5 : Check synonyms of the english word in isl_dict
    # (wn.synonyms returns one list of synonyms per synset; scan them in order)
    synonym_match = next(
        (
            synonym.lower()
            for synonym_group in wn.synonyms(english_word)
            for synonym in synonym_group
            if synonym.lower() in isl_dict
        ),
        None,
    )
    if synonym_match is not None:
        synonym_substituted_list.append((word,pos_tag,synonym_match))
        continue

    # Case 6  : Nothing matched - mark the word for finger spelling
    synonym_substituted_list.append((word,pos_tag,'#'))
print(synonym_substituted_list)

[('मेरा', 'pronoun', 'my'), ('भारत', 'pnoun', 'india'), ('महान', 'adjective', 'neat')]


In [30]:
# Final ISL List (shallow copy of the synonym-substitution result)
final_isl_list = synonym_substituted_list.copy()
# Create DataFrame for display.
# NOTE: this rebinds `df`, which earlier held the token/POS table.
df = pd.DataFrame(final_isl_list, columns=['Hindi Word', 'POS Tag', 'ISL Dictionary Tag'])
df

Unnamed: 0,Hindi Word,POS Tag,ISL Dictionary Tag
0,मेरा,pronoun,my
1,भारत,pnoun,india
2,महान,adjective,neat


# Video Translation

In [31]:
# Reversed dictionary mapping Hindi words to English words
# (when several English keys share a Hindi word, the last one wins).
# NOTE(review): no later cell of this notebook appears to read this dictionary -
# confirm it is still needed.
isl_hindi_english_dict = {hindi_word: english_word for english_word, hindi_word in isl_dict.items()}

In [32]:
# Mapping of Devanagari vowel signs to their vowels
# Used by search_videos when a Hindi word is finger-spelled: a character found
# here is replaced by the mapped vowel before the "<character>.mp4" lookup.
sign_mapping_vowels = {
    'ा': 'आ',  # Aa
    'ि': 'इ',  # I
    'ी': 'ई',  # II
    'ु': 'उ',  # U
    'ू': 'ऊ',  # UU
    'ृ': 'ऋ',  # R
    'े': 'ए',  # E
    'ै': 'ऐ',  # AI
    'ो': 'ओ',  # O
    'ौ': 'औ',  # AU
    'ं': 'अं', # Anusvara
    'ः': 'अः'  # Visarga
}

In [37]:
import os
import subprocess
import time

def search_videos(folder_path, final_isl_list):
    """
    Collects the video files that sign each entry of ``final_isl_list``.

    Args:
        folder_path: Root directory that is searched recursively for video files.
        final_isl_list: Iterable of ``(original_word, pos_tag, isl_word)`` tuples.
            Each entry is resolved by the first rule that applies:
              * ``isl_word`` starts with ``'@'`` - the rest of the string is used
                as the video path as-is (special words);
              * ``pos_tag == 'pnoun'`` - ``isl_word`` is finger-spelled, one
                ``<Letter>.mp4`` lookup per character;
              * ``isl_word == '#'`` - ``original_word`` is finger-spelled, one
                ``<char>.mp4`` lookup per character, with characters found in
                ``sign_mapping_vowels`` replaced by their mapped vowel;
              * otherwise - ``<Isl_word>.mp4`` (capitalized) is looked up.

    Returns:
        A list of video paths in signing order. A name with no matching file is
        skipped; a name present in several sub-directories contributes one path
        per match.
    """
    # Walk the tree once; every lookup below reuses this directory list instead
    # of re-walking folder_path for each letter or word.
    directories = [root for root, _dirs, _files in os.walk(folder_path)]

    def _locate(video_name):
        """Return the paths of ``video_name`` in every searched directory that has it."""
        candidates = (os.path.join(directory, video_name) for directory in directories)
        return [path for path in candidates if os.path.isfile(path)]

    found_videos = []
    for or_word, pos_tag, isl_word in final_isl_list:
        if isl_word.startswith('@'):
            # Special words carry their own path. This is checked first so that a
            # proper noun tagged with a special path is not spelled out
            # character by character.
            found_videos.append(isl_word[1:])
        elif pos_tag == 'pnoun':
            # Alphabets and FingerSpell
            for letter in isl_word:
                found_videos.extend(_locate(f"{letter.capitalize()}.mp4"))
        elif isl_word == '#':
            # No dictionary match: finger-spell the original Hindi word
            for char in or_word:
                found_videos.extend(_locate(f"{sign_mapping_vowels.get(char, char)}.mp4"))
        else:
            found_videos.extend(_locate(f"{isl_word.capitalize()}.mp4"))

    return found_videos


def play_videos(video_paths, vlc_path=r'C:\Program Files\VideoLAN\VLC\vlc.exe', delay=5):
    """
    Plays the given videos one after another with VLC.

    A separate VLC process is started in fullscreen for each video; the function
    does not wait for a process to exit, it only pauses ``delay`` seconds before
    starting the next one.

    Args:
        video_paths: Iterable of video file paths, played in order.
        vlc_path: Path to the VLC media player executable.
        delay: Seconds to wait after launching each video.
    """
    for video_path in video_paths:
        subprocess.Popen([vlc_path, '--fullscreen', video_path])
        time.sleep(delay)  # Delay before playing the next video

# Set the folder path where the ISL videos are stored

In [42]:
# Set it accordingly
# Raw string, so the backslashes of the Windows path are never interpreted as
# escape sequences (a plain literal with "\p" and "\L" triggers an
# invalid-escape warning).
folder_path = r'D:\projects\Linguistic-Videos'

In [43]:
# Resolve every entry of the final ISL list to the video file(s) that sign it
video_paths = search_videos(folder_path, final_isl_list)
print(video_paths)

['D:\\projects\\Linguistic-Videos\\M\\My.mp4', 'D:\\projects\\Linguistic-Videos\\Alphabets\\I.mp4', 'D:\\projects\\Linguistic-Videos\\Alphabets\\N.mp4', 'D:\\projects\\Linguistic-Videos\\Alphabets\\D.mp4', 'D:\\projects\\Linguistic-Videos\\Alphabets\\I.mp4', 'D:\\projects\\Linguistic-Videos\\Alphabets\\A.mp4', 'D:\\projects\\Linguistic-Videos\\N\\Neat.mp4']


In [45]:
from moviepy.editor import VideoFileClip, concatenate_videoclips
# Path to the VLC media player executable used to play the merged video
vlc_path = r'C:\Program Files\VideoLAN\VLC\vlc.exe' 
def merge_videos(video_paths):
    """
    Concatenates the given video files into a single clip.

    Args:
        video_paths: Iterable of video file paths, in playback order.

    Returns:
        The clip produced by ``concatenate_videoclips(..., method="compose")``
        over one ``VideoFileClip`` per path.

    Raises:
        ValueError: If ``video_paths`` is empty (e.g. the video search found
            nothing), since there is nothing to concatenate.
    """
    video_paths = list(video_paths)
    if not video_paths:
        raise ValueError("merge_videos: no video paths were given, nothing to merge")
    clips = [VideoFileClip(path) for path in video_paths]
    final_clip = concatenate_videoclips(clips, method="compose")
    return final_clip

# Merge the videos into a single video
merged_clip = merge_videos(video_paths)


# Export the merged video to a file (24 frames per second)
# NOTE(review): the output saved with this notebook ends in
# "TypeError: must be real number, not NoneType" right after
# "Writing video merged_video.mp4", so this export appears to have failed in the
# recorded run - investigate before relying on the playback below.
merged_clip.write_videofile("merged_video.mp4",fps=24)

# Play the merged video in fullscreen with VLC (the process is started without waiting for it)
subprocess.Popen([vlc_path, '--fullscreen', 'merged_video.mp4'])

Moviepy - Building video merged_video.mp4.
MoviePy - Writing audio in merged_videoTEMP_MPY_wvf_snd.mp3


                                                                    

MoviePy - Done.
Moviepy - Writing video merged_video.mp4





TypeError: must be real number, not NoneType

# Hope You Like the project. :D