-----

## Setup

### Load package(s)

In [22]:
from pytubefix import Playlist, YouTube

from youtube_transcript_api import YouTubeTranscriptApi

import os


### Definitions

In [15]:
def list_files(directory):
    try:
        # Get list of all files in the directory
        files = [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))]
        return files
    except FileNotFoundError:
        print(f"Directory '{directory}' not found.")
        return []
    except PermissionError:
        print(f"Permission denied for directory '{directory}'.")
        return []


In [23]:
def get_transcript(video_url):
    try:
        # Extract the video ID from the URL
        video_id = video_url.split('v=')[1]

        # Retrieve the transcript for the video
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages = ("en",))

        # Concatenate the text from each transcript segment
        transcript_text = ' '.join([segment['text'] for segment in transcript])

        return transcript_text

    except Exception as e:
        print(f'An error occurred: {str(e)}')

----

## Check

In [2]:
yt = YouTube('https://www.youtube.com/watch?v=0LN-Y-kEgmY')

In [3]:
print(yt.captions)
print(yt.title)
print(yt.publish_date)

{'a.en': <Caption lang="English (auto-generated)" code="a.en">}
Michael Hudson: The Industrial Capitalism of China and Russia versus US Neoliberalism
2025-04-20 05:00:02-07:00


----- 

## Manual playlist

In [None]:
urlsFiltered = ['ODbGERF32U8', '96r3ahP0N6c', '0LN-Y-kEgmY', 'bTnVECM81H0', 'BBMgw95gR-8']

----

## Playlist

In [4]:
# Dialogue works
# channelName = "DialogueWorks"
URL_PLAYLIST = "https://www.youtube.com/playlist?list=PLzImU_KHY9-JjAIg0AZXxHaMhJD6UXfNj"

# channelName = "NewAtlas"
# URL_PLAYLIST = "https://www.youtube.com/playlist?list=PLkhlYgGXRhhxHzOmBH10Cp0nGBWjLATjc"

# channelName = "GlennDiesen"
# URL_PLAYLIST = "https://www.youtube.com/@GDiesen1/videos"

# Retrieve URLs of videos from playlist
playlist = Playlist(URL_PLAYLIST)
print('Number Of Videos In playlist: %s' % len(playlist.video_urls))

urls = []
for url in playlist:
    urls.append(url)

Number Of Videos In playlist: 1227


In [5]:
ytObjects = []
for u in urls:
    ytObjects.append(YouTube(u))

In [6]:
titles = [obj.title for obj in ytObjects]
len(titles)

1227

In [7]:
focusIndexes = []
k = 0
for t in titles:
    if "Wolff" in t or 'Hundson' in t:
        focusIndexes.append(k)
    k = k + 1 

len(focusIndexes)

70

In [8]:
ytObjectsFiltered = [ ytObjects[i] for i in focusIndexes ]
len(ytObjectsFiltered)

70

In [9]:
ytObjectsFiltered[0].title

'Richard Wolff and Michael Hudson: Is Trump Winning?'

In [10]:
ytObjectsFiltered[0].watch_url

'https://youtube.com/watch?v=NNHBd5t-rk0'

In [11]:
urlsFiltered = [ obj.watch_url for obj in ytObjectsFiltered ]
len(urlsFiltered)

70

In [12]:
urlToTitleRules = {obj.watch_url.split("v=")[1] : obj.title for obj in ytObjectsFiltered}
len(urlToTitleRules)

70

In [13]:
list(urlToTitleRules.items())[0:5]

[('NNHBd5t-rk0', 'Richard Wolff and Michael Hudson: Is Trump Winning?'),
 ('4G98DeCZcT8',
  'Richard Wolff & Michael Hudson: Trump’s Trade War Collapse: How China Forced a U.S. Retreat'),
 ('u6y2oUC4ROg',
  'Richard Wolff & Michael Hudson Unleash on Trump’s Trade Claims | Brutal Takedown'),
 ('NGfr2Xu8qt4',
  'Richard D. Wolff: Trump and the Fall of the American Empire'),
 ('oIJx9l6HqjA',
  'Richard D. Wolff and Michael Hudson: China Strikes Back as Trump Ignites New Tariff War!')]

-------

## Get YouTube transcripts

In [35]:
previously_ingested = list_files('../../texts/DialogueWorks/Wolff-Hudson')
previously_ingested = [x.replace(".txt", "") for x in previously_ingested]

len(previously_ingested)

67

In [28]:
urlsFiltered2 = [x for x in urlsFiltered if x.replace("https://youtube.com/watch?v=", "") not in previously_ingested]
len(urlsFiltered2)

3

In [37]:
transcripts = { u : get_transcript(u) for u in urlsFiltered2}
print(len(transcripts))

3


In [None]:
list(transcripts.items())[0][1]

In [None]:
list(transcripts.items())[0]

In [31]:
docs = { u.split("v=")[1] : t for u, t in transcripts.items() }
len(docs)

3

------

## Export

In [38]:
dirName = "../../texts/DialogueWorks/New"

In [39]:
for p in list(docs.items()):
    fileName = dirName + "/" + p[0] + ".txt"
    with open(fileName, 'w') as f:
        f.write(p[1])