In [None]:
!pip install yt_dlp
!pip install pydub
!pip install openai-whisper torch
!pip install youtube_transcript_api



In [1]:
import yt_dlp
import os
from pydub import AudioSegment
import whisper
import json
from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled

In [None]:
def download_audio_from_playlist_with_arabic_transcription(playlist_url, output_path='downloads', playlist_items='1:24'):
    """
    Downloads the audio of the videos in a YouTube playlist as WAV files named
    sequentially (video1.wav, video2.wav, ...), then, for every video that has
    an Arabic transcript, splits the audio and the transcript into time-aligned
    segments via ``split_audio_and_transcript``.

    Args:
        playlist_url: URL of the YouTube playlist.
        output_path: Directory that receives the WAV files (created if missing).
            Segments are written by ``split_audio_and_transcript`` below it.
        playlist_items: yt-dlp ``playlist_items`` selector restricting which
            playlist positions are processed. Defaults to ``'1:24'``, the range
            that used to be hard-coded.

    Returns:
        None. Progress and errors are reported with ``print``; a failure on one
        video does not stop the remaining videos from being processed.
    """
    output_path = os.path.abspath(output_path)
    os.makedirs(output_path, exist_ok=True)

    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': os.path.join(output_path, 'video%(playlist_index)d.%(ext)s'),  # Sequential naming
        'quiet': True,
        'noplaylist': False,
        'playlist_items': playlist_items,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '50',
        }],
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        print(f"Downloading audio from playlist: {playlist_url}")
        try:
            # download=True is required: with download=False only metadata is
            # fetched, no audio is written and the post-processor never runs,
            # so the WAV files opened below would not exist.
            playlist_info = ydl.extract_info(playlist_url, download=True)
        except Exception as e:
            print(f"Error downloading playlist: {e}")
            return

    if not playlist_info or 'entries' not in playlist_info:
        print("No entries found in the playlist.")
        return

    # Process each video in the playlist
    for index, entry in enumerate(playlist_info['entries'], start=1):
        if not entry:
            # Entries can be empty when a video could not be resolved.
            continue

        # Files on disk are named after yt-dlp's playlist_index (see 'outtmpl'),
        # so prefer it over the loop counter, which drifts if an entry is skipped.
        video_number = entry.get('playlist_index') or index
        video_id = entry.get('id')
        video_title = f"video{video_number}"  # Simple naming
        audio_path = os.path.join(output_path, f"{video_title}.wav")
        print(f"Processing {video_title}: {audio_path}")

        if not video_id or not os.path.isfile(audio_path):
            print(f"Skipping {video_title}: missing video id or audio file")
            continue

        # Retrieve Arabic transcription and split audio + transcript together
        try:
            transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['ar'])
            split_audio_and_transcript(audio_path, transcript, video_title, output_path)
        except (NoTranscriptFound, TranscriptsDisabled):
            print(f"No Arabic transcription available for {video_title}")
        except Exception as e:
            # Keep going: one broken video should not abort the whole playlist.
            print(f"Error processing {video_title}: {e}")

In [3]:
def split_audio_and_transcript(audio_path, transcript, title, output_path, segment_duration=300):
    """
    Splits the audio file and transcript into segments of a specified duration.

    For every segment, ``<title>_segment_<n>.wav`` and ``<title>_segment_<n>.txt``
    are written to ``<output_path>/video_segments``. The text file holds, one per
    line, the ``'text'`` of every transcript entry whose ``'start'`` falls inside
    the segment's time window.

    Args:
        audio_path: Path to the source WAV file.
        transcript: Iterable of dict entries with at least ``'text'`` and
            ``'start'`` (seconds) keys.
        title: Prefix used for the generated file names.
        output_path: Directory under which ``video_segments`` is created.
        segment_duration: Segment length in seconds (must be positive).

    Raises:
        ValueError: If ``segment_duration`` is not positive.
    """
    segment_ms = int(segment_duration * 1000)
    if segment_ms <= 0:
        raise ValueError("segment_duration must be a positive number of seconds")

    audio = AudioSegment.from_wav(audio_path)
    audio_length_ms = len(audio)  # the audio length is measured in milliseconds

    segments_dir = os.path.join(output_path, "video_segments")
    os.makedirs(segments_dir, exist_ok=True)

    # Step in milliseconds over the real length. Truncating the duration to whole
    # seconds first would silently drop a trailing remainder shorter than 1 s.
    for segment_number, start_ms in enumerate(range(0, audio_length_ms, segment_ms), start=1):
        end_ms = min(start_ms + segment_ms, audio_length_ms)

        segment_path = os.path.join(segments_dir, f"{title}_segment_{segment_number}.wav")
        audio[start_ms:end_ms].export(segment_path, format="wav")

        # Transcript entries are assigned to the segment containing their start time.
        start_s = start_ms / 1000
        end_s = end_ms / 1000
        transcript_segment = [
            entry['text'] for entry in transcript
            if start_s <= entry['start'] < end_s
        ]

        transcript_path = os.path.join(segments_dir, f"{title}_segment_{segment_number}.txt")
        with open(transcript_path, 'w', encoding='utf-8') as f:
            f.write("\n".join(transcript_segment))

        print(f"Saved audio segment: {segment_path}")
        print(f"Saved transcript segment: {transcript_path}")


In [None]:
# YouTube playlist whose videos are downloaded and segmented.
playlist_url = "https://www.youtube.com/playlist?list=PLK6R01zBa3vJKurhwVvRhjQxpC60jrSO3"
# Run the pipeline with the function's default output directory ('downloads').
download_audio_from_playlist_with_arabic_transcription(playlist_url)