In [1]:
!apt-get install -y ffmpeg libsm6 libxext6
!apt-get install -y tesseract-ocr
!apt-get install -y portaudio19-dev

!pip install streamlit
!pip install pyngrok
!pip install pydub
!pip install sounddevice
!pip install wavio
!pip install numpy
!pip install openai-whisper
!pip install PyAudio
!pip install SpeechRecognition

!pip install deep-translator
!pip install joblib
!pip install pandas
!pip install Pillow
!pip install praw
!pip install protobuf
!pip install pytesseract
!pip install Requests
!pip install scikit-learn
!pip install google-generativeai

!pip install librosa

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libsm6 is already the newest version (2:1.2.3-1build2).
libxext6 is already the newest version (2:1.3.4-1build1).
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~

In [2]:
%%writefile app2.py

import cv2
import os
import streamlit as st
from pyngrok import ngrok
from io import BytesIO
import numpy as np
from PIL import Image
import pytesseract
import tempfile
import subprocess
from pydub import AudioSegment
import whisper
from deep_translator import GoogleTranslator
import librosa
import librosa.display
import tensorflow as tf

# Load the Whisper model
whisper_model = whisper.load_model("base")

# Function to extract 20 frames from a video file
def extract_frames(video_path, num_frames=20):
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    frames = []
    frame_interval = total_frames // num_frames  # Calculate frame interval

    for i in range(num_frames):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i * frame_interval)

        ret, frame = cap.read()

        if ret:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(frame)

    cap.release()
    return frames

# Function to extract text from an image using Tesseract
def extract_text_from_image(image):
    extracted_text = pytesseract.image_to_string(image)
    return extracted_text if extracted_text else ""  # Return empty string if no text is found

# Function to extract audio from a video file and transcribe it
def transcribe_audio_from_video(video_file):
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video_file:
            temp_video_file.write(video_file.read())
            temp_video_path = temp_video_file.name

        audio_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name

        subprocess.run(["ffmpeg", "-i", temp_video_path, "-q:a", "0", "-map", "a", audio_path, "-y"])
        audio = AudioSegment.from_file(audio_path)

        result = whisper_model.transcribe(audio_path)
        transcribed_text = result["text"]

        os.remove(temp_video_path)
        os.remove(audio_path)

        return transcribed_text

    except Exception as e:
        if "duration" in str(e).lower() or "length" in str(e).lower():
            return "The video is too long to process. Please upload a shorter video."
        else:
            return f"An error occurred: {e}"

# Function to translate text using DeepL
def translate_text(text, target_lang="en"):
    try:
        if text:
            translated_text = GoogleTranslator(source="auto", target=target_lang).translate(text)
            return translated_text
        return ""  # Return empty string if text is empty or None
    except Exception as e:
        return f"Error translating text: {str(e)}"

# Function to extract audio from a video file
def extract_audio_from_video(video_path):
    try:
        # Generate a temporary audio file path
        audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name

        # Use FFmpeg to extract audio from video
        subprocess.run(["ffmpeg", "-i", video_path, "-q:a", "0", "-map", "a", audio_path, "-y"])

        # Return the path of the extracted audio
        return audio_path

    except Exception as e:
        return f"Error extracting audio: {str(e)}"

# Function to analyze audio mood based on extracted audio
def analyze_audio_mood(video_path):
    try:
        # Extract audio from the video (assuming extract_audio_from_video is implemented)
        audio_path = extract_audio_from_video(video_path)

        # Load the audio file using librosa
        y, sr = librosa.load(audio_path)

        # Extract MFCCs (Mel-frequency cepstral coefficients) from the audio signal
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)

        # Divide the MFCC array into 4 frequency bands and calculate scalar mean for each band

        # Low Frequencies: MFCC 0, 1, 2
        low_freq_mfcc = np.mean(mfcc[0:3], axis=1)
        mean_low = np.mean(low_freq_mfcc)  # Scalar mean for low frequencies

        # Mid-Low Frequencies: MFCC 3, 4
        mid_low_freq_mfcc = np.mean(mfcc[3:5], axis=1)
        mean_mid_low = np.mean(mid_low_freq_mfcc)  # Scalar mean for mid-low frequencies

        # Mid-High Frequencies: MFCC 5, 6, 7
        mid_high_freq_mfcc = np.mean(mfcc[5:8], axis=1)
        mean_mid_high = np.mean(mid_high_freq_mfcc)  # Scalar mean for mid-high frequencies

        # High Frequencies: MFCC 8, 9, 10, 11, 12
        high_freq_mfcc = np.mean(mfcc[8:13], axis=1)
        mean_high = np.mean(high_freq_mfcc)  # Scalar mean for high frequencies

        # Now use these scalar means for classification

        if mean_high <= mean_low and mean_high <= mean_mid_low and mean_high <= mean_mid_high:
            return "Audio sounds normal, with no dominant emotion detected"

        elif mean_mid_high <= mean_low and mean_mid_high <= mean_mid_low and mean_mid_high <= mean_high:
            return "Audio sounds neutral, calm, or peaceful"

        elif mean_mid_low <= mean_low and mean_mid_low <= mean_mid_high and mean_mid_low <= mean_high:
            return "Audio sounds slightly melancholic or neutral"

        elif mean_low <= mean_mid_low and mean_low <= mean_mid_high and mean_low <= mean_high:
            return "Audio sounds calm or melancholic, with less intensity"

        elif mean_high > mean_low and mean_high > mean_mid_low and mean_high <= mean_mid_high:
            return "Audio sounds depressive or anxious in nature"

        else :
            return "Audio sounds upbeat and energetic (Happy)"

    except Exception as e:
        return f"Error analyzing audio mood: {str(e)}"


# Streamlit UI components
st.title("Video Frame Extractor & Audio Mood Analysis")
st.markdown("Upload a video and get 20 frames extracted along with transcribed text and audio mood analysis.")

# File upload widget
video_file = st.file_uploader("Choose a video file", type=["mp4", "mov", "avi"])

if video_file:
    # Save the uploaded video file temporarily
    video_path = "/tmp/uploaded_video.mp4"
    with open(video_path, "wb") as f:
        f.write(video_file.getbuffer())

    st.video(video_file)  # Display the uploaded video

    # Extract frames from the uploaded video
    frames = extract_frames(video_path)
    combined_text = ""

    for idx, frame in enumerate(frames):
        st.image(frame, caption=f"Frame {idx + 1}", use_column_width=True)
        text_from_frame = extract_text_from_image(frame)

        if text_from_frame and text_from_frame not in combined_text:
            combined_text += text_from_frame + "\n"

    st.write("Text Extracted from Video Frames:")
    st.text(combined_text)

    # Translate the extracted text from frames
    translated_frame_text = translate_text(combined_text)
    st.write("Translated Text from Video Frames:")
    st.text(translated_frame_text)

    # Extract audio and transcribe it
    st.write("Transcribing Audio from Video...")
    transcribed_audio_text = transcribe_audio_from_video(video_file)

    st.write("Transcribed Audio Text:")
    st.text(transcribed_audio_text)

    translated_audio_text = translate_text(transcribed_audio_text)
    st.write("Translated Audio Text:")
    st.text(translated_audio_text)

    # Combine the text extracted from both images and audio
    full_combined_text = combined_text + "\n" + transcribed_audio_text
    st.write("Combined Extracted Text (from both video frames and audio):")
    st.text(full_combined_text)

    translated_combined_text = translate_text(full_combined_text)
    st.write("Translated Combined Text (Frames + Audio):")
    st.text(translated_combined_text)

    # Analyze audio mood
    st.write("Analyzing Audio Mood...")
    mood_result = analyze_audio_mood(video_path)
    st.write(mood_result)


Writing app2.py


In [None]:
# Import ngrok
from pyngrok import ngrok

# Set your authtoken
ngrok.set_auth_token("<NGROK_AUTH_TOKEN>") # Replace YOUR_AUTHTOKEN with your actual authtoken

# Kill any existing ngrok processes
ngrok.kill()

# Start Streamlit with nohup
!nohup streamlit run app2.py &

# Create a public URL with ngrok to access the app
public_url = ngrok.connect(addr='8501')
print(f"Public URL: {public_url}")

nohup: appending output to 'nohup.out'
Public URL: NgrokTunnel: "https://11dd-35-243-137-45.ngrok-free.app" -> "http://localhost:8501"


In [4]:
ngrok.kill()