# 1. Import and Install Dependencies

In [1]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, LambdaCallback
from tensorflow.keras.optimizers import Adam
import pyttsx3
import threading
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM




  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# !pip install pyttsx3

# 2. Keypoints using MP Holistic

In [2]:
# Initialize MediaPipe Hands solution
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

# Initialize NLU Model
model_NLU = TFAutoModelForSeq2SeqLM.from_pretrained("C:/Users/zaeem/Desktop/sih/sih/ActionDetectionforSignLanguage/fine_tuned_t5(3)/")
tokenizer = AutoTokenizer.from_pretrained("C:/Users/zaeem/Desktop/sih/sih/ActionDetectionforSignLanguage/fine_tuned_t5(3)/")





All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at C:/Users/zaeem/Desktop/sih/sih/ActionDetectionforSignLanguage/fine_tuned_t5(3)/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [3]:
actions = np.array(['hello', 'good', 'morning', 'thank you', 'name', 'you', 'I', 'work', 'engineer', 'from', 'okay', 
                    'fine', 'mumbai', 'how', 'what', 'who', 'fullstop','no'])

In [5]:
from tensorflow.keras.optimizers import Adam
# Load pre-trained action recognition model (LSTM-based)
model = Sequential()
model.add(LSTM(64, return_sequences=True, input_shape=(30, 126)))  # First LSTM layer returns sequences
model.add(LSTM(64, return_sequences=False))  # Second LSTM layer outputs only the final vector
model.add(Dense(64, activation='relu'))
model.add(Dense(18, activation='softmax'))  # Output layer with 19 classes
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Load model weights (use your actual model's path here)
model.load_weights('C:/Users/zaeem/Desktop/sih/sih/ActionDetectionforSignLanguage/model_epoch_24.h5')

In [6]:
def mediapipe_hand_detection(image, model):
    # Convert the image to RGB
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False  # Make image non-writeable to optimize processing
    results = model.process(image) # Process the image and detect hands
    image.flags.writeable = True   # Make image writeable again
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # Convert back to BGR for OpenCV
    return image, results

In [7]:
def draw_hand_landmarks(image, results):
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            mp_drawing.draw_landmarks(
                image,
                hand_landmarks,
                mp_hands.HAND_CONNECTIONS,
                mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4),
                mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2),
            )

# 3. Extract Keypoint Values

In [8]:
import numpy as np

def extract_keypoints(results):
    # Initialize left and right hand keypoints as zero arrays
    lh = np.zeros(21 * 3)
    rh = np.zeros(21 * 3)

    # Check if hands are detected
    if results.multi_hand_landmarks and results.multi_handedness:
        for hand_index, hand_landmarks in enumerate(results.multi_hand_landmarks):
            # Determine if the hand is left or right
            hand_label = results.multi_handedness[hand_index].classification[0].label
            hand_array = np.array([[res.x, res.y, res.z] for res in hand_landmarks.landmark]).flatten()

            if hand_label.lower() == "left":
                lh = hand_array
            elif hand_label.lower() == "right":
                rh = hand_array

    # Return concatenated hand keypoints
    return np.concatenate([lh, rh])

# 4. Setup Folders for Collection

In [9]:
# Path for exported data, numpy arrays
DATA_PATH = os.path.join('first_pov_DATA')

# Thirty videos worth of data (Number of sequences/videos for each action)
no_sequences = 100

# Videos are going to be 30 frames in length (frames per sequence)
sequence_length = 30

# 11. Test in Real Time

In [10]:
from scipy import stats

In [11]:
import random

colors = [tuple(random.randint(0, 255) for _ in range(3)) for _ in range(len(actions))]  # Random colors
def prob_viz(res, actions, input_frame, colors):
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num], -1)
        cv2.putText(output_frame, actions[num], (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)
        
    return output_frame

In [12]:
# plt.figure(figsize=(18,18))
# plt.imshow(prob_viz(res, actions, image, colors))

In [13]:
# Function to translate unstructured sentences to structured sentences
def predict_sentence(model, sentence):
    inputs = tokenizer(sentence, return_tensors='tf', padding=True, truncation=True)
    output = model.generate(inputs['input_ids'])
    predicted_sentence = tokenizer.decode(output[0], skip_special_tokens=True)
    return predicted_sentence

In [14]:
speaker_engine = pyttsx3.init()

def initialize_speaker():
    speaker_engine.setProperty('rate', 150)  # Speed (default: 200 words per minute)
    speaker_engine.setProperty('volume', 1.0)  # Volume (0.0 to 1.0)

def text_to_speech(text):
    # Speak the text
    speaker_engine.say(text)
    speaker_engine.runAndWait()


# # Function for Text-to-Speech
# def text_to_speech(text):
#     # Initialize Text-to-Speech Engine
#     engine = pyttsx3.init()
#     engine.setProperty('rate', 150)  # Speed (default: 200 words per minute)
#     engine.setProperty('volume', 1.0)  # Volume (0.0 to 1.0)
#     engine.say(text)
#     engine.runAndWait()

In [15]:
import time
import threading

# Initialize other variables
sequence = []
sentence = []
predictions = []
sign_sequence = []  # To store the complete sequence of signs
threshold = 0.7

last_prediction_time = 3
cooldown_period = 3  # in seconds

# TTS Lock (to prevent multiple threads accessing the TTS engine simultaneously)
tts_lock = threading.Lock()

# cap = cv2.VideoCapture(url, cv2.CAP_FFMPEG)
cap = cv2.VideoCapture(1)

# Define the function to process NLU and TTS
def nlu_and_tts(sign_sequence):
    # Ensure that TTS engine is not accessed by multiple threads
    with tts_lock:
        # Convert sequence into the desired format
        sign_sequence_str = ', '.join([f'"{word}"' for word in sign_sequence])  # Add inverted commas and comma

        # Print the input sentence (sign_sequence)
        print(f"Input sentence to NLU: {sign_sequence_str}")
        
        # Process NLU - convert sequence to structured sentence
        structured_sentence = predict_sentence(model_NLU, sign_sequence_str)
        
        # Print the structured output (corrected sentence)
        print(f"Structured sentence from NLU: {structured_sentence}")
        
        # Use text_to_speech to speak out the structured sentence
        text_to_speech(structured_sentence)

# Function to process NLU and TTS in a separate thread
def process_nlu_and_tts_thread(sign_sequence):
    # Ensure that the NLU and TTS logic runs in a separate thread
    threading.Thread(target=nlu_and_tts, args=(sign_sequence,)).start()

# Set mediapipe model
with mp_hands.Hands(static_image_mode=False, 
                    max_num_hands=2, 
                    min_detection_confidence=0.5, 
                    min_tracking_confidence=0.5) as hands:
    time.sleep(3)
    while cap.isOpened():
        # Read feed
        ret, frame = cap.read()
        if not ret:
            break

        # Make detections
        image, results = mediapipe_hand_detection(frame, hands)
        print(results)
        
        # Draw landmarks
        draw_hand_landmarks(image, results)
        
        # 2. Prediction logic
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]
        
        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            print(actions[np.argmax(res)])
            predictions.append(np.argmax(res))
            
            # 3. Viz logic
            current_time = time.time()

            if current_time - last_prediction_time > cooldown_period:  # Check cooldown
                if np.unique(predictions[-10:])[0] == np.argmax(res): 
                    if res[np.argmax(res)] > threshold: 

                        if len(sentence) > 0: 
                            if actions[np.argmax(res)] != sentence[-1]:
                                sentence.append(actions[np.argmax(res)])
                                last_prediction_time = current_time  # Reset cooldown
                        else:
                            sentence.append(actions[np.argmax(res)])
                            last_prediction_time = current_time  # Reset cooldown

            if len(sentence) > 5: 
                sentence = sentence[-5:]

            # Viz probabilities
            image = prob_viz(res, actions, image, colors)
        
            # Check for "fullstop"
            if 'fullstop' in sentence:
                sign_sequence = sentence
                print(f"Detected sentence: {sign_sequence}")

                # Check if sign_sequence is correctly formed
                print(f"Debug - sign_sequence: {sign_sequence}")
                
                # Process NLU and TTS in a separate thread
                process_nlu_and_tts_thread(sign_sequence)
                
                # Clear sentence after fullstop for next sequence
                sentence = []
            else:
                # Debugging message: when fullstop is not detected
                print("Debug - Fullstop not detected yet.")
            
        cv2.rectangle(image, (0, 0), (640, 40), (245, 117, 16), -1)
        cv2.putText(image, ' '.join(sentence), (3, 30), 
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        
        # Show to screen
        cv2.imshow('OpenCV Feed', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
            
cap.release()
cv2.destroyAllWindows()

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

In [16]:
cap.release()
cv2.destroyAllWindows()

In [None]:
# Start video capture (adjust to your camera)
cap = cv2.VideoCapture(2)

# Set up MediaPipe hands model
with mp_hands.Hands(static_image_mode=False, max_num_hands=2, min_detection_confidence=0.5, min_tracking_confidence=0.5) as hands:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        image, results = mediapipe_hand_detection(frame, hands)

        # Draw hand landmarks
        draw_hand_landmarks(image, results)

        # Extract keypoints
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]

        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            predictions.append(np.argmax(res))

            current_time = time.time()

            # If fullstop is detected, trigger NLU and TTS
            if actions[np.argmax(res)] == 'fullstop' and (current_time - cooldown_nlu_time) > cooldown_period:
                sign_sequence = [actions[i] for i in predictions if actions[i] != 'fullstop']
                print(f"Sign sequence: {sign_sequence}")

                # NLU processing
                structured_sentence = predict_sentence(model_NLU, sign_sequence)
                print("Structured Sentence:", structured_sentence)

                # Text-to-Speech
                text_to_speech(structured_sentence)

                # Reset sentence and prediction list
                sentence = []
                predictions = []
                sign_sequence = []

                cooldown_nlu_time = current_time  # Set cooldown to prevent overloading

        # Show the final image
        cv2.imshow('OpenCV Feed', image)

        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
cap.release()
cv2.destroyAllWindows()

In [18]:
cap.release()
cv2.destroyAllWindows()

In [13]:
import time

# New detection variables
sequence = []
sentence = []
predictions = []
sign_sequence = []  # To store the complete sequence of signs
threshold = 0.5

# Cooldown timer
last_prediction_time = 3
cooldown_period = 3  # in seconds

# cap = cv2.VideoCapture(url, cv2.CAP_FFMPEG)
cap = cv2.VideoCapture(2)

# Set mediapipe model
with mp_hands.Hands(static_image_mode=False, 
                    max_num_hands=2, 
                    min_detection_confidence=0.5, 
                    min_tracking_confidence=0.5) as hands:
    while cap.isOpened():

        # Read feed
        ret, frame = cap.read()
        if not ret:
            break

        # Make detections
        image, results = mediapipe_hand_detection(frame, hands)
        print(results)
        
        # Draw landmarks
        draw_hand_landmarks(image, results)
        
        # 2. Prediction logic
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]
        
        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            print(actions[np.argmax(res)])
            predictions.append(np.argmax(res))
            
        
        # 3. Viz logic
            current_time = time.time()

            if current_time - last_prediction_time > cooldown_period:  # Check cooldown
                if np.unique(predictions[-10:])[0] == np.argmax(res): 
                    if res[np.argmax(res)] > threshold: 

                        if len(sentence) > 0: 
                            if actions[np.argmax(res)] != sentence[-1]:
                                sentence.append(actions[np.argmax(res)])
                                last_prediction_time = current_time  # Reset cooldown
                        else:
                            sentence.append(actions[np.argmax(res)])
                            last_prediction_time = current_time  # Reset cooldown

            if len(sentence) > 5: 
                sentence = sentence[-5:]

            # Viz probabilities
            image = prob_viz(res, actions, image, colors)
            
        cv2.rectangle(image, (0, 0), (640, 40), (245, 117, 16), -1)
        cv2.putText(image, ' '.join(sentence), (3, 30), 
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        
        # Show to screen
        cv2.imshow('OpenCV Feed', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

In [35]:
cap.release()
cv2.destroyAllWindows()

In [None]:
import time
import cv2
import numpy as np

# New detection variables
sequence = []
sentence = []
predictions = []
sign_sequence = []  # To store the complete sequence of signs
threshold = 0.5

# Cooldown timer
last_prediction_time = 3
cooldown_period = 5  # in seconds

# cap = cv2.VideoCapture(url, cv2.CAP_FFMPEG)
cap = cv2.VideoCapture(2)
res = np.zeros(len(actions)) 

# Set mediapipe model
with mp_hands.Hands(static_image_mode=False, 
                    max_num_hands=2, 
                    min_detection_confidence=0.5, 
                    min_tracking_confidence=0.5) as hands:
    # Wait for 3 seconds before starting detection
    print("Initializing feed... Waiting for 3 seconds...")
    time.sleep(3)

    while cap.isOpened():
        # Read feed
        ret, frame = cap.read()
        if not ret:
            break

        # Make detections
        image, results = mediapipe_hand_detection(frame, hands)
        print(results)
        
        # Draw landmarks
        draw_hand_landmarks(image, results)
        
        # 2. Prediction logic
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]
        
        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            predicted_sign = actions[np.argmax(res)]
            print(f"Predicted Sign: {predicted_sign}")  # Display the current sign in the cmd
            predictions.append(np.argmax(res))

            # Add sign to sign_sequence if not repeated
            if res[np.argmax(res)] > threshold:
                if len(sign_sequence) == 0 or sign_sequence[-1] != predicted_sign:
                    sign_sequence.append(predicted_sign)

        # Check for full stop
        if "fullstop" in sign_sequence:
            print("Full stop detected. Final sequence:", sign_sequence)
            break  # Exit the loop upon detecting "fullstop"

        # 3. Viz logic
        current_time = time.time()

        if current_time - last_prediction_time > cooldown_period:  # Check cooldown
            if len(predictions) >= 10:  # Ensure there are at least 10 predictions
                if np.unique(predictions[-10:])[0] == np.argmax(res): 
                    if res[np.argmax(res)] > threshold: 

                        if len(sentence) > 0: 
                            if actions[np.argmax(res)] != sentence[-1]:
                                sentence.append(actions[np.argmax(res)])
                                last_prediction_time = current_time  # Reset cooldown
                        else:
                            sentence.append(actions[np.argmax(res)])
                            last_prediction_time = current_time  # Reset cooldown

        if len(sentence) > 5: 
            sentence = sentence[-5:]

        # Viz probabilities
        image = prob_viz(res, actions, image, colors)
        
        # Show sentence on the screen
        cv2.rectangle(image, (0, 0), (640, 40), (245, 117, 16), -1)
        cv2.putText(image, ' '.join(sentence), (3, 30), 
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        
        # Print the sentence in cmd
        print(f"Signed Text: {' '.join(sentence)}")  # Display the sentence in the cmd
        print("Current Sign Sequence:", sign_sequence)  # Display the sign sequence

        # Show to screen
        cv2.imshow('OpenCV Feed', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()


In [26]:
cap.release()
cv2.destroyAllWindows()

In [176]:
print(f"Input shape for prediction: {np.expand_dims(sequence, axis=0).shape}")


Input shape for prediction: (1, 30, 126)


In [172]:
pip list


Package                      VersionNote: you may need to restart the kernel to use updated packages.

---------------------------- -----------
absl-py                      2.1.0
asttokens                    2.4.1
astunparse                   1.6.3
attrs                        24.2.0
cachetools                   5.5.0
certifi                      2024.8.30
cffi                         1.17.1
charset-normalizer           3.4.0
colorama                     0.4.6
comm                         0.2.2
contourpy                    1.3.0
cycler                       0.12.1
debugpy                      1.8.9
decorator                    5.1.1
exceptiongroup               1.2.2
executing                    2.1.0
filelock                     3.16.1
flatbuffers                  24.3.25
fonttools                    4.55.0
fsspec                       2024.10.0
gast                         0.6.0
google-auth                  2.36.0
google-auth-oauthlib         1.2.1
google-pasta                 0.2.0


You should consider upgrading via the 'C:\Users\zaeem\desktop\sih\sih\scripts\python.exe -m pip install --upgrade pip' command.


In [41]:
import tensorflow as tf
from transformers import T5Tokenizer, T5ForConditionalGeneration, TFAutoModelForSeq2SeqLM, AutoTokenizer

# Load your fine-tuned model and tokenizer
model = TFAutoModelForSeq2SeqLM.from_pretrained("C:/Users/zaeem/Desktop/sih/sih/ActionDetectionforSignLanguage/NLU_model")
tokenizer = AutoTokenizer.from_pretrained("C:/Users/zaeem/Desktop/sih/sih/ActionDetectionforSignLanguage/NLU_model")
model = T5ForConditionalGeneration.from_pretrained(nlu_model_path)

# Function to translate unstructured sentences to structured sentences
def predict_sentence(model, sentence):
    inputs = tokenizer(sentence, return_tensors='tf', padding=True, truncation=True)
    output = model.generate(inputs['input_ids'])
    predicted_sentence = tokenizer.decode(output[0], skip_special_tokens=True)
    return predicted_sentence

# Function to process the sign sequence and convert it to text
def process_sign_to_text(sign_sequence):
    # Assuming 'sign_sequence' is a list of actions predicted by the sign language model
    sentence = " ".join(sign_sequence)  # Join the actions into a sentence
    return sentence

# Step 1: Convert the sign sequence into a sentence
test_sentence = process_sign_to_text(sign_sequence)

# Step 2: Use the function to predict the structured sentence from the unstructured input
print("Original (from sign language model):", test_sentence)
predicted_sentence = predict_sentence(model, test_sentence)
print("Predicted Structured Sentence:", predicted_sentence)
print("Prediction completed.")

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at C:/Users/zaeem/Desktop/sih/sih/ActionDetectionforSignLanguage/NLU_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


ImportError: 
T5ForConditionalGeneration requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.


# NLU Integration 

In [97]:
!pip install transformers==4.20.0

Collecting transformers==4.20.0
  Downloading transformers-4.20.0-py3-none-any.whl (4.4 MB)
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp39-cp39-win_amd64.whl (3.3 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.0
    Uninstalling tokenizers-0.21.0:
      Successfully uninstalled tokenizers-0.21.0
  Attempting uninstall: transformers
    Found existing installation: transformers 4.47.0
    Uninstalling transformers-4.47.0:
      Successfully uninstalled transformers-4.47.0
Successfully installed tokenizers-0.12.1 transformers-4.20.0


You should consider upgrading via the 'c:\users\zaeem\desktop\sih\sih\scripts\python.exe -m pip install --upgrade pip' command.


In [15]:
import tensorflow as tf
import transformers

print("Tensorflow version:", tf.__version__)
print("Transformers version:", transformers.__version__)

  from .autonotebook import tqdm as notebook_tqdm


Tensorflow version: 2.15.1
Transformers version: 4.20.0


In [100]:
!pip install pandas==1.3.5


Collecting pandas==1.3.5
  Downloading pandas-1.3.5-cp39-cp39-win_amd64.whl (10.2 MB)
Collecting pytz>=2017.3
  Downloading pytz-2024.2-py2.py3-none-any.whl (508 kB)
Installing collected packages: pytz, pandas
Successfully installed pandas-1.3.5 pytz-2024.2


You should consider upgrading via the 'c:\users\zaeem\desktop\sih\sih\scripts\python.exe -m pip install --upgrade pip' command.


In [16]:
import pandas as pd
import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer

In [17]:
model = TFAutoModelForSeq2SeqLM.from_pretrained("C:/Users/zaeem/Desktop/sih/sih/ActionDetectionforSignLanguage/NLU_model")
tokenizer = AutoTokenizer.from_pretrained("C:/Users/zaeem/Desktop/sih/sih/ActionDetectionforSignLanguage/NLU_model")




All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at C:/Users/zaeem/Desktop/sih/sih/ActionDetectionforSignLanguage/NLU_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [28]:
# Enable mixed precision
policy = tf.keras.mixed_precision.Policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy(policy)

The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


In [23]:
# Function to translate unstructured sentences to structured sentences
def predict_sentence(model, sentence):
    inputs = tokenizer(sentence, return_tensors='tf', padding=True, truncation=True)
    output = model.generate(inputs['input_ids'])
    predicted_sentence = tokenizer.decode(output[0], skip_special_tokens=True)
    return predicted_sentence

# Example usage
# Example prediction
test_sentence = sign_sequence
sent=predict_sentence(model, test_sentence)
print("Original:", test_sentence)
print("Predicted:", predict_sentence(model, test_sentence))

Original: ['who', 'I', 'okay', 'fullstop']
Predicted: Who is the recipient?
Prediction completed.


In [28]:
# Function to translate unstructured sentences to structured sentences
def predict_sentence(model, sentence):
    inputs = tokenizer(sentence, return_tensors='tf', padding=True, truncation=True)
    output = model.generate(inputs['input_ids'])
    predicted_sentence = tokenizer.decode(output[0], skip_special_tokens=True)
    return predicted_sentence

# Example usage
# Example prediction
test_sentence = 'I okay'
sent=predict_sentence(model, test_sentence)
print("Original:", test_sentence)
print("Predicted:", predict_sentence(model, test_sentence))
print("Prediction completed.")

Original: I okay
Predicted: I am okay.
Prediction completed.


# Text to Speech

In [20]:
pip install pyttsx3

Collecting pyttsx3
  Downloading pyttsx3-2.98-py3-none-any.whl (34 kB)
Collecting pypiwin32; platform_system == "Windows"
  Downloading pypiwin32-223-py3-none-any.whl (1.7 kB)
Collecting comtypes; platform_system == "Windows"
  Downloading comtypes-1.4.8-py3-none-any.whl (229 kB)
Installing collected packages: pypiwin32, comtypes, pyttsx3
Successfully installed comtypes-1.4.8 pypiwin32-223 pyttsx3-2.98
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\zaeem\desktop\sih\sih\scripts\python.exe -m pip install --upgrade pip' command.


In [24]:
import pyttsx3

def text_to_speech(text):
    # Initialize the text-to-speech engine
    engine = pyttsx3.init()
    
    # Optional: Customize speech properties
    engine.setProperty('rate', 150)  # Speed (default: 200 words per minute)
    engine.setProperty('volume', 1.0)  # Volume (0.0 to 1.0)
    
    # Speak the text
    engine.say(text)
    engine.runAndWait()

# if _name_ == "_main_":
    # Example text input
if __name__ == "__main__":
    # Example text input
    text_to_speech(sent)