In [2]:
from youtube_transcript_api import YouTubeTranscriptApi
import youtube_dl
from pytube import Playlist
import re
import pandas as pd
import os




In [3]:
def obtener_enlaces_lista_reproduccion(url_lista):
    ydl_opts = {'extract_flat': True}
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        playlist_info = ydl.extract_info(url_lista, download=False)
        enlaces = [video['url'] for video in playlist_info['entries']]
    return enlaces

def obtener_subtitulos_video(url_video):
    try:
        srt = YouTubeTranscriptApi.get_transcript(url_video, languages=['es'])
        subtitulos = [i['text'] for i in srt]
        subtitulos = ' '.join(subtitulos)
        return subtitulos
    except:
        return None

def obtener_subtitulos_videos(lista_enlaces):
    subtitulos = []
    for enlace in lista_enlaces:
        sub = obtener_subtitulos_video(enlace)
        if sub:
            subtitulos.append(sub)
        else:
            subtitulos.append(None)
    return subtitulos

def obtener_titulos_lista_reproduccion(url_lista):
    ydl_opts = {
        'extract_flat': True,
        'quiet': True
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        playlist_info = ydl.extract_info(url_lista, download=False)
        titulos = [video['title'] for video in playlist_info['entries']]
    return titulos

In [4]:
# Diccionario de listas de reproducción con sus nombres
URL = { 'Casanare': 'https://www.youtube.com/playlist?list=PLhXvc-slmy2EWBpKNgLcXGfHr-_6ZvvGR',
    "CostaCaribe": 'https://www.youtube.com/playlist?list=PLbtegW3d3L4IjZw5ZdFqDEr3H8x3dZ_sh',
    "Meta": 'https://www.youtube.com/playlist?list=PLbtegW3d3L4IP1S5w5_jxrCtI686L03dh',
    "Huila": 'https://www.youtube.com/playlist?list=PLbtegW3d3L4Jxv0UENoIIqo3WYHoNQ3GA',
    "Antioquia": 'https://www.youtube.com/playlist?list=PLbtegW3d3L4IzLqaub8l87_fTQvSiZ47C',
    "NorteSantander": 'https://www.youtube.com/playlist?list=PLbtegW3d3L4KerLKcosDcd9FpE8CPQJVk'
}

In [5]:
for region, playlist_url in URL.items():
    print(f"Processing playlist for {region}...")
    
    lista_enlaces = obtener_enlaces_lista_reproduccion(playlist_url)
    subtitulos = obtener_subtitulos_videos(lista_enlaces)
    titulos_videos = obtener_titulos_lista_reproduccion(playlist_url)

    df = pd.DataFrame({
        'Region': [region] * len(titulos_videos),
        'Titulo': titulos_videos,
        'Subtitulos': subtitulos
    })

    # Save to CSV
    filename = f'{region}_subtitles.csv'
    df.to_csv(filename, index=False, encoding='utf-8-sig')
    print(f"Saved data for {region} to {filename}")

Processing playlist for Casanare...
[youtube:tab] PLhXvc-slmy2EWBpKNgLcXGfHr-_6ZvvGR: Downloading webpage
[download] Downloading playlist: Caso 03 | Subcaso Casanare
[youtube:tab] Downloading page 1
[youtube:tab] playlist Caso 03 | Subcaso Casanare: Downloading 101 videos
[download] Downloading video 1 of 101
[download] Downloading video 2 of 101
[download] Downloading video 3 of 101
[download] Downloading video 4 of 101
[download] Downloading video 5 of 101
[download] Downloading video 6 of 101
[download] Downloading video 7 of 101
[download] Downloading video 8 of 101
[download] Downloading video 9 of 101
[download] Downloading video 10 of 101
[download] Downloading video 11 of 101
[download] Downloading video 12 of 101
[download] Downloading video 13 of 101
[download] Downloading video 14 of 101
[download] Downloading video 15 of 101
[download] Downloading video 16 of 101
[download] Downloading video 17 of 101
[download] Downloading video 18 of 101
[download] Downloading video 19 of

In [6]:
print("All playlists processed.")

All playlists processed.
