In [1]:
def preprocess_video_clip_VGG16(video_path):
    """
    Decode every frame of a video clip and resize it to 224x224.

    Args:
    - video_path: Path to the video clip

    Returns:
    - A list with one resized frame per decoded frame, in decoding order.
      The list is empty when the clip cannot be opened or holds no frames.
    """
    # Open the video file
    cap = cv2.VideoCapture(video_path)

    frames = []
    try:
        if not cap.isOpened():
            # Without this hint the summary below would look like a success
            # even though nothing was decoded.
            print(f"\t...WARNING: could not open video: {video_path}", flush=True)

        # Read frames until the decoder reports that none are left
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            print(f"\r\t...extracting and resizing frame: {len(frames) + 1}", end='', flush=True)

            # Resize the frame to 224x224
            frames.append(cv2.resize(frame, (224, 224)))
    finally:
        # Release the capture even when decoding or resizing raises
        cap.release()
    print(" of", len(frames), "frames")

    # NOTE(review): no colour conversion happens in this function and OpenCV
    # normally delivers BGR frames, so the 'color=RGB' line below may be
    # inaccurate - confirm against the pipeline used to train the model before
    # adding a cv2.cvtColor call or changing the message.
    print(f"\n\n   SUMMARY: \n\tThe clip {video_path} has been transformed in frames with the right format", flush=True)
    print(f"\t    Total frames created: {len(frames)}", flush=True)
    print(f"\t    Frame format: 'size=224x224', 'color=RGB'", flush=True)

    return frames

In [3]:
def prepare_single_clip_data_VGG16(clip_frames):
    """
    Turn the frames of one video clip into per-frame VGG16 feature vectors.

    Every frame goes through `preprocess_input`, is fed on its own to the
    ImageNet VGG16 network without its classification layers, and the output
    is reshaped to a flat vector of 25088 values (the reshape raises if the
    network output holds a different number of values).

    Args:
    - clip_frames: Sequence of frames from a single video clip. Must not be
      empty. Presumably 224x224 3-channel images - confirm against the caller.

    Returns:
    - A one-element list wrapping the list of per-frame feature vectors.

    Raises:
    - ValueError: If clip_frames is empty.
    """
    # len() instead of truthiness so that a NumPy array of frames also works.
    # Checked before the model is loaded: an empty clip used to fail only in
    # the summary, with an IndexError on shape[1].
    if len(clip_frames) == 0:
        raise ValueError("clip_frames is empty: no frames to extract features from")

    print('\t...loading the VGG16 pre-trained model', flush=True)
    # Load VGG16 model without the top classification layers
    base_model = VGG16(weights='imagenet', include_top=False)
    feature_extractor = Model(inputs=base_model.input, outputs=base_model.layers[-1].output)

    feature_vector_size = 25088
    total_frames = len(clip_frames)
    clip_features = []  # List to store features for each frame

    for i, frame in enumerate(clip_frames):
        print(f"\r\t...resizing, converting to RGB, normalising and extracting features from frame: {i+1:3} of {total_frames} ", end='', flush=True)

        # Preprocess the frame for VGG16 and add the batch dimension
        batch = np.expand_dims(preprocess_input(frame), axis=0)

        # Extract features and flatten them into a single vector
        feature_vector = feature_extractor.predict(batch)
        clip_features.append(np.reshape(feature_vector, (feature_vector_size,)))

    # Build the array once; it is only needed for the shape report below
    features_shape = np.array(clip_features).shape

    print('\n')
    print(f"\n   SUMMARY: \n\tThe {total_frames} frames have been transformed into a vector of features through the VGG16 model", flush=True)
    print(f"\t    Total vectors created: '1'", flush=True)
    print(f"\t    Total frames: '{features_shape[0]}'", flush=True)
    print(f"\t    Total feature per frame: '{features_shape[1]}' (as per VGG16 output)", flush=True)
    print(f"\t    Reality check: \n\t\t'Print shape of resulting array' = {features_shape}", flush=True)
    return [clip_features]


In [54]:
def transcribe_audio(file_path, speech_recognition_lang):
    """
    Transcribe an audio file through the recognizer's `recognize_google` call.

    Parameters:
    - file_path: Path to the audio file to transcribe.
    - speech_recognition_lang: Language code forwarded as the `language`
      argument of `recognize_google`.

    Returns:
    - The recognised text. When recognition fails, a fixed explanatory
      message is returned in its place (nothing is raised), so callers
      cannot tell a failure from a transcription by type alone.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(file_path) as audio_file:
        recording = recognizer.record(audio_file)
        try:
            transcript = recognizer.recognize_google(recording, language=speech_recognition_lang)
        except sr.UnknownValueError:
            # The audio was received but no speech could be made out
            return "Google Speech Recognition could not understand audio"
        except sr.RequestError:
            # The recognition service could not be reached or rejected the call
            return "API unavailable or unresponsive"
        else:
            print( '\t...transcritpion extracted as:', "'",transcript,"'")
            return transcript

In [31]:

def extract_audio_from_video(video_path, audio_output_path, video_format="avi"):
    """
    Extracts audio from the given video and saves it as a .wav file using pydub.

    Parameters:
    - video_path: Path to the input video file.
    - audio_output_path: Path to save the extracted audio.
    - video_format: Container format of the input video, forwarded to
      `AudioSegment.from_file`. Defaults to "avi", the value that used to be
      hard-coded, so existing callers behave exactly as before.
    """
    print('\t...extracting the audio from the video:', video_path)
    audio = AudioSegment.from_file(video_path, format=video_format)
    audio.export(audio_output_path, format="wav")
    print('\t...audio saved successfully on a temp .wav file')
          


In [10]:
def convert_to_mp4(avi_path):
    """
    Converts the given .avi video to .mp4 format with ffmpeg.

    Parameters:
    - avi_path: Path to the input video file.

    Returns:
    - The path of the .mp4 file: the input path with its extension replaced
      by ".mp4". The path is returned even when ffmpeg fails or is missing,
      as the conversion has always been best-effort.
    """
    import subprocess

    print('\t...converting in compatible format')
    # Swap only the final extension; str.replace would also rewrite ".avi"
    # occurring elsewhere in the path (e.g. inside a directory name).
    mp4_path = os.path.splitext(avi_path)[0] + ".mp4"
    try:
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed to ffmpeg verbatim.
        subprocess.run(["ffmpeg", "-i", avi_path, mp4_path], check=False)
    except OSError as exc:
        # os.system never raised when ffmpeg was unavailable; keep that contract
        print(f"\t...WARNING: ffmpeg could not be started: {exc}")
    return mp4_path

def display_video_clip(clip_path):
    """
    Show a video clip inline in the notebook.

    Paths ending in ".avi" are first passed through `convert_to_mp4` and the
    converted file is displayed; any other path is displayed as given.
    """
    is_avi = clip_path.endswith(".avi")
    playable_path = convert_to_mp4(clip_path) if is_avi else clip_path
    print('\n')
    player = Video(playable_path)
    display(player)

In [7]:
def process_audio_from_video(video_path, speech_recognition_lang, translation_lang):
    """
    Processes the audio from the given video clip:
    1. Extracts audio from the video into a temporary .wav file.
    2. Transcribes the audio.
    3. Translates the transcription to English unless translation_lang is
       empty or already 'en'.
    4. Optionally shows the clip to the user for verification.

    No language detection is performed: the source language of the
    translation is whatever the caller passes as translation_lang.

    Parameters:
    - video_path: Path to the video file.
    - speech_recognition_lang: Language code passed to the transcription step.
    - translation_lang: Language code of the transcription, used as the
      translation source.

    Returns:
    - The English translation, or the untranslated transcription when no
      translation was needed.
    """
    audio_output_path = "temp_audio.wav"  # Temporary path to save the extracted audio
    try:
        # Extract audio from the video and transcribe it
        extract_audio_from_video(video_path, audio_output_path)
        transcription = transcribe_audio(audio_output_path, speech_recognition_lang)

        # Translate to English only when the declared language is not English
        source_language = translation_lang
        if source_language and source_language != 'en':
            translator = Translator()
            translation = translator.translate(transcription, src=source_language, dest='en').text
            print('\t... text translated as:', "'",translation,"'")
            res = translation
        else:
            res = transcription
    finally:
        # Delete the temporary audio file even when a step above raised,
        # so a failed run does not leave it behind.
        if os.path.exists(audio_output_path):
            print("\t...deleting temporary audio file", flush=True)
            os.remove(audio_output_path)

    while True:
        # Ask the user if they want to view the clip; surrounding whitespace
        # is ignored so that "yes " is not rejected.
        view_clip = input("\n\tDo you want to view the video clip for verification? (yes/no): ").strip().lower()

        if view_clip == "yes":
            display_video_clip(video_path)
            input("\tPress Enter once you're done watching the video...")  # Pause execution
            break
        elif view_clip == "no":
            print("\tOkay, proceeding without displaying the video clip.")
            break
        else:
            print("Invalid response. Please answer with 'yes' or 'no'.")

    print(f"\n   SUMMARY: \n\tThe audio of the clip: {video_path} has been captured and translated", flush=True)
    print(f"\t    Input language: '{translation_lang}'", flush=True)
    print(f"\t    Translated language: 'en'", flush=True)

    return res


In [11]:
def preprocess_text(data):
    """
    Sanitize a text and convert it into a padded token sequence.

    Hashtag-like and mention-like tokens are stripped, the text is tokenized
    with the tokenizer unpickled from 'Models/tokenizer_new.pickle', and the
    resulting sequence is padded with `pad_sequences(..., maxlen=500)`.

    Parameters:
    - data: The text to preprocess.

    Returns:
    - The single padded sequence built from the text (row 0 of the padded batch).
    """
    # Drop '#...' tokens first, then '@...' tokens
    for tag_pattern in (r"(#[\d\w\.]+)", r"(@[\d\w\.]+)"):
        data = re.sub(tag_pattern, '', data)
    print("\t...'#' and '@' removed from the text", flush=True)

    # NOTE: pickle.load executes code stored in the file; only load a
    # tokenizer file that comes from a trusted source.
    with open('Models/tokenizer_new.pickle', 'rb') as tokenizer_file:
        tokenizer = pickle.load(tokenizer_file)
    print('\t...tokenizer:', tokenizer, 'has been loaded', flush=True)

    # Tokenize the text as a batch holding one document
    token_batch = tokenizer.texts_to_sequences([data])
    print('\t...the text has been tokenized into sequences', flush=True)

    # Pad the batch to the length expected by the model
    padded_batch = pad_sequences(token_batch, maxlen=500)
    print("\t...the text has been padded with 'PRE' 500 characters")

    padded_text = padded_batch[0]
    summary_lines = (
        "\n   SUMMARY: \n\tThe text has been sanitized and preprocessed to meet the model input requirements",
        f"\t    Tokenizer used: '{tokenizer}' (as per trained model)",
        f"\t    sequences shape '{padded_batch.shape}' (as per trained model)",
        f"\t    sequence example {padded_text}",
    )
    for summary_line in summary_lines:
        print(summary_line, flush=True)

    return padded_text

In [86]:
 def run_multi_input_VGG16(video_sequences, transcriptions, emotion_labels):
    """
    Score how well a clip's video features and its transcription match.

    Loads the pre-trained video model ('Models/best_rnn_model.h5') and text
    model ('Models/new_textmodel.h5'), joins their outputs with a normalised
    dot product (cosine similarity), passes that through a one-unit sigmoid
    Dense layer, runs a prediction and prints the score with a verbal level.
    Execution pauses three times waiting for Enter after a model summary.

    Args:
    - video_sequences: Video feature sequences; converted with np.array and
      fed to the model as-is. Assumed to already carry the batch dimension,
      i.e. (1, frames, 25088) - TODO confirm against the caller.
    - transcriptions: One padded token sequence; a batch dimension is added
      here. Assumed to have length 500 - TODO confirm.
    - emotion_labels: Not used by this function.

    Returns:
    - None. The result is only printed.
    """
    print("Video Sequences",np.array(video_sequences).shape)
    print('\t...importing pre-trained rnn model for image stream input', flush=True)
    # Load the pre-trained model for the video frame features
    video_model_path = 'Models/best_rnn_model.h5'  # Replace with the correct path to your model
    video_model = load_model(video_model_path)

    print('\n')
    print('*'*70)
    video_model.summary()
    print('*'*70, '\n')
    input("\tPress Enter once you're done checking the model...")  # Pause execution
    print('\n\t...importing pre-trained rnn model for text stream input', flush=True)
    # Load the pre-trained text classifier model for audio transcription
    text_model_path = 'Models/new_textmodel.h5'  # Replace with the correct path to your model
    text_model = load_model(text_model_path)
    # Check the models' architectures
    print('\n')
    print('*'*70)
    text_model.summary()
    print('*'*70, '\n')
    input("\tPress Enter once you're done checking the model...")  # Pause execution
    print('\n\t...creating our Multi_Input model', flush=True)

    # Video branch: a variable number of frames, each a 25088-value vector
    num_frames = None
    feature_vector_size = 25088
    video_frames_input = Input(shape=(num_frames, feature_vector_size))
    video_output = video_model(video_frames_input)

    # Text branch: sequences padded to a fixed length
    text_sequence_length = 500  # This is the max length used for padding
    text_input = Input(shape=(text_sequence_length,))
    text_output = text_model(text_input)

    # Cosine similarity between the two branch outputs
    similarity = Dot(axes=1, normalize=True)([video_output, text_output])

    # NOTE(review): this Dense layer is created here and never trained or
    # loaded from disk, so its weights are freshly initialised on every call
    # and the reported score depends on that initialisation. Confirm whether
    # trained weights should be loaded or the layer replaced by a fixed mapping.
    similarity_score = Dense(1, activation='sigmoid')(similarity)

    # Define the multi-input model
    multi_input_model = Model(inputs=[video_frames_input, text_input], outputs=similarity_score, name="Our_MultiInputModel")

    # Compile the model
    multi_input_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    print('\n')
    print('*'*70)
    multi_input_model.summary()
    # Report the loss the model was actually compiled with
    print('Activation: Sigmoid', '\nOptimizer: Adam', '\nLoss: binary_crossentropy', '\nSimilarity: Dot')
    print('*'*70, '\n')
    input("\tPress Enter once you're done checking the model...")  # Pause execution
    print('\n\t...predicting Similarity', flush=True)

    # Only the transcription gets a batch dimension added here; the video
    # sequences are passed through unchanged.
    transcriptions = np.expand_dims(transcriptions, axis=0)

    similarity_prediction = multi_input_model.predict([np.array(video_sequences), np.array(transcriptions)])
    score = similarity_prediction[0].item()

    print('\t...evaluating results', flush=True)
    # Define the thresholds and corresponding verbal levels
    thresholds = [0.2, 0.4, 0.6, 0.8]
    levels = ["Completely Unmatched", "Slightly Matched - ", "Moderately Matched", "Highly Matched", "Perfect Match"]

    # The thresholds are ascending, so the number of thresholds strictly
    # below the score is the index of the matching level.
    level = levels[sum(score > threshold for threshold in thresholds)]
    print('\n')
    print('*'*70)
    print("    Predicted Similarity Score: {:.4f} - {}".format(score, level))
    print('*'*70)
    