In [3]:
#############################
# audio_processor.py
#############################
import os
import whisper
from moviepy.audio.io.AudioFileClip import AudioFileClip
from utils import time_to_seconds
import re
import pickle
# Path of the pickle file where transcribe_with_timestamps stores the Whisper
# result. The path is fixed (not derived from the audio file), so delete this
# file when switching to a different audio input.
CACHE_FILE = "processed_result_cache.pkl"



def transcribe_with_timestamps(audio_path):
    """Convert audio to text with word-level timestamps using Whisper.

    The complete Whisper result is cached in ``CACHE_FILE`` so the model only
    has to run once. On a cache hit the model is not loaded and the cache
    file is left untouched; on a miss the audio is transcribed and the result
    is written to the cache.

    NOTE: the cache is not keyed by ``audio_path``. If ``CACHE_FILE`` exists
    it is returned regardless of which audio file was requested, so remove
    the cache file when the audio changes.

    Args:
        audio_path (str): Path of the audio file to transcribe.

    Returns:
        list: The ``"segments"`` entry of the Whisper result.
    """
    if os.path.exists(CACHE_FILE):
        print("Loading cached processed_audio...")
        # NOTE(review): pickle.load can execute arbitrary code; only use a
        # cache file that this program wrote itself.
        with open(CACHE_FILE, "rb") as f:
            result = pickle.load(f)
    else:
        print("Processing audio and caching the result...")
        print("loading speech to text model ....    ")
        model = whisper.load_model("medium")
        print("speech to text model running ....    ")
        result = model.transcribe(audio_path, fp16=False, word_timestamps=True)

        # Only a freshly computed result needs to be written; rewriting the
        # file on every cache hit was redundant I/O.
        with open(CACHE_FILE, "wb") as f:
            pickle.dump(result, f)

    return result["segments"]

def align_script_with_audio(script_path, audio_segments):
    """
    Aligns the original script with the audio transcription.

    The script file is read line by line and the transcribed words of all
    audio segments are flattened into one list. Each non-blank script line
    then takes as many consecutive audio words as it has script words, so the
    result is grouped by script line rather than by the transcriber's own
    segment boundaries. Blank lines (and lines with no alphabetic words) are
    skipped because they have no audio words to anchor timestamps to.

    Words are counted after removing every character that is not an ASCII
    letter or whitespace, so digits and punctuation do not count as words.

    The input ``audio_segments`` are not modified.

    Args:
        script_path (str): The file path to the script (read as UTF-8).
        audio_segments (list): A list of dictionaries, where each dictionary
            represents an audio segment and contains 'words' (a list of word
            dictionaries with 'word', 'start', and 'end' keys).
    Returns:
        list: One dictionary per non-blank script line, containing:
              - 'script_line' (str): The stripped line from the script.
              - 'words' (list): Word dictionaries with 'text' (the transcribed
                word), 'start', and 'end' keys.
              - 'start' (float): Start time of the first word of the line.
              - 'end' (float): End time of the last word of the line.
    Raises:
        ValueError: If the number of words in the audio segments does not
            match the number of words in the script.
    """
    with open(script_path, encoding="utf-8") as f:
        script_lines = [line.strip() for line in f]

    # Flatten the per-segment word lists into one chronological word list.
    audio_words = [word for segment in audio_segments for word in segment['words']]

    # Tokenise every script line once, after dropping all special characters.
    line_words = [
        re.findall(r"\b\w+(?:'\w+)?\b", re.sub(r'[^a-zA-Z\s]', '', text))
        for text in script_lines
    ]
    script_word_count = sum(len(tokens) for tokens in line_words)

    if len(audio_words) != script_word_count:
        raise ValueError(
            "Number of words in audio and script are not equal "
            f"(audio: {len(audio_words)}, script: {script_word_count})"
        )

    aligned_data = []
    cursor = 0
    for script_line, tokens in zip(script_lines, line_words):
        if not tokens:
            # Nothing was spoken for this line, so it has no timestamps.
            continue
        chunk = audio_words[cursor:cursor + len(tokens)]
        cursor += len(tokens)
        aligned_data.append({
            'script_line': script_line,
            'words': [
                {'text': word['word'], 'start': word['start'], 'end': word['end']}
                for word in chunk
            ],
            'start': chunk[0]['start'],
            'end': chunk[-1]['end'],
        })

    return aligned_data

def process_audio(audio_path, script_path):
    """Run the audio pipeline: transcribe, align with the script, bundle.

    Args:
        audio_path (str): Path of the audio file; it is transcribed and also
            opened as an ``AudioFileClip``.
        script_path (str): Path of the script text file to align against.

    Returns:
        dict: ``'raw_audio'`` holds the ``AudioFileClip`` for ``audio_path``
        and ``'aligned_data'`` holds whatever ``align_script_with_audio``
        returned.
    """
    print("Processing audio...")
    segments = transcribe_with_timestamps(audio_path)
    alignment = align_script_with_audio(script_path, segments)
    print("audio process is completed")

    # The clip is opened only after transcription and alignment are done.
    bundle = {}
    bundle['raw_audio'] = AudioFileClip(audio_path)
    bundle['aligned_data'] = alignment
    return bundle


In [28]:
#############################
# video_processor.py (Updated for MoviePy 2.1.1)
#############################
from moviepy.video.VideoClip import ImageClip, TextClip
from moviepy.video.compositing.CompositeVideoClip import CompositeVideoClip
from moviepy.video.compositing.CompositeVideoClip import concatenate_videoclips
import os

def create_image_clips(image_dir, aligned_data):
    """Create image clips with proper sequencing and duration.

    Every file in ``image_dir`` is treated as an image and ordered by the
    integer value of its file name without extension (``1.png``, ``2.png``,
    ...). Image ``i`` is shown for aligned segment ``i``; surplus images or
    surplus segments are ignored.

    Each image stays on screen until the next segment starts; the last
    segment's image stays until that segment ends. The clips are joined with
    ``concatenate_videoclips(method="chain")``, which plays them back-to-back
    based on their durations only. The first clip is therefore anchored at
    t=0 (instead of the first segment's start) so that the durations add up
    to the audio timeline and later image changes are not shifted early.

    Args:
        image_dir (str): Directory containing the numerically named images.
        aligned_data (list): Segment dictionaries with 'start' and 'end' keys.

    Returns:
        The concatenated video clip of all image clips.
    """
    images = sorted(
        (os.path.join(image_dir, name) for name in os.listdir(image_dir)),
        key=lambda path: int(os.path.splitext(os.path.basename(path))[0]),
    )

    clips = []
    last_idx = len(aligned_data) - 1
    # zip() stops at the shorter of the two sequences.
    for idx, (image_path, segment) in enumerate(zip(images, aligned_data)):
        duration = segment['end'] - segment['start']
        print(f"Segment {idx}: Start={segment['start']}, End={segment['end']}, Duration={duration}")

        # Chained concatenation ignores clip.start, so any silence before the
        # first spoken word has to be part of the first clip's duration.
        clip_start = 0 if idx == 0 else segment['start']
        if idx == last_idx:
            # final clip ends with the last segment's end
            clip_end = segment['end']
        else:
            # clip ends where the next segment starts
            clip_end = aligned_data[idx + 1]['start']

        clip = ImageClip(image_path).with_start(clip_start, change_end=False)
        clip = clip.with_end(clip_end)

        print(f"Clip {idx} duration after assignment: {clip.duration}")
        print(f"Clip {idx} start  after assignment: {clip.start}")
        print(f"Clip {idx} end  after assignment: {clip.end}")

        clips.append(clip)

    return concatenate_videoclips(clips, method="chain")

def create_subtitles(aligned_data, sub_position):
    """Generate animated word-level subtitles.

    One ``TextClip`` is created per word; it becomes visible at the word's
    start time and stays for the word's duration (``end - start``).

    Args:
        aligned_data (list): Segment dictionaries, each with a 'words' list
            of dictionaries holding 'text', 'start' and 'end'.
        sub_position: Vertical component of the subtitle position; the
            horizontal component is always ``'center'``.

    Returns:
        list: The subtitle ``TextClip`` objects, in segment and word order.
    """
    # Build the path with the platform separator; a hard-coded backslash
    # only resolves on Windows.
    font_name = os.path.join("fonts", "Arial.otf")

    subtitle_clips = []
    for segment in aligned_data:
        for word in segment['words']:
            txt_clip = TextClip(
                text=word['text'],
                font_size=70,
                font=font_name,
                color='white',
                stroke_color='black',
                stroke_width=2
            ).with_start(word['start']).with_duration(word['end'] - word['start'])

            txt_clip = txt_clip.with_position(('center', sub_position))
            subtitle_clips.append(txt_clip)

    return subtitle_clips

def process_video(image_dir, script_path, audio_data, output_path, sub_position):
    """Render the final video: images, word subtitles and the audio track.

    Args:
        image_dir (str): Directory handed to ``create_image_clips``.
        script_path (str): Not used by this function.
        audio_data (dict): Must provide 'aligned_data' (segment list) and
            'raw_audio' (the audio clip attached to the video).
        output_path (str): Destination file of the rendered video.
        sub_position: Vertical subtitle position handed to
            ``create_subtitles``.
    """
    alignment = audio_data['aligned_data']

    background = create_image_clips(image_dir, alignment)
    captions = create_subtitles(alignment, sub_position)

    # The image track is the bottom layer; subtitles are drawn on top of it.
    layers = [background] + captions
    composed = CompositeVideoClip(layers).with_audio(audio_data['raw_audio'])

    export_options = {
        'codec': 'libx264',
        'audio_codec': 'aac',
        'fps': 24,
        'threads': 4,
        'preset': 'fast',
    }
    composed.write_videofile(output_path, **export_options)




In [None]:
# Driver cell: build the video from the test data set.
# Directory of images; create_image_clips sorts the files by the integer value
# of their file name, so every file in it must be named like "1.png", "2.png".
image_dir = "Test_data/images"
# Script text file and the narration audio it is aligned against.
script_path = "Test_data/dronacharya killed/Script/script.txt"
audio_path = "Test_data/dronacharya killed/audio/script.mp3"
# Output file of the rendered video.
output = "final_video.mp4"
# Vertical component of the subtitle position (horizontal is 'center').
# NOTE(review): presumably pixels from the top of the frame -- confirm.
sub_pos = 30

# Transcribe and align the audio first, then render images + subtitles + audio.
processed_audio = process_audio(audio_path, script_path)
process_video(image_dir, script_path, processed_audio,
                output, sub_pos)
