In [105]:
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
from spacy.tokens import DocBin
import json
from youtube_transcript_api import YouTubeTranscriptApi
import string
import contractions
import os
import sys

In [106]:
list_video_id = [
    'rm_j6O8y148',
    'Lv0PkSkKeSo',
    '5zA6OFpkPe0',
    'Oz18u64bM8I',
    'p0MvovsCxCk',
    '7cPLbiblb84'
]

In [107]:
def cleaning_sentence(text):
    """
    Cleans a sentence by normalizing case, expanding contractions, 
    removing punctuation and unwanted tokens like 'music'.

    Steps performed:
    - Converts text to lowercase
    - Expands contractions (e.g., "don't" -> "do not")
    - Removes all punctuation characters
    - Removes the word 'music'
    - Removes extra spaces

    Args:
        text (str): The input sentence to be cleaned.

    Returns:
        str: The cleaned and normalized sentence.
    """

    text = text.lower()
    PUNCT_TO_REMOVE = string.punctuation
    ans = contractions.fix(text).translate(str.maketrans('', '', PUNCT_TO_REMOVE))
    ans = ans.replace('music','')
    ans = " ".join(ans.split())
    return ans

In [None]:
def fetch_transcript(video_id):
    """
    Fetches the English transcript for a given YouTube video and saves it to a text file.

    This function attempts up to 3 times to retrieve and translate the transcript of a YouTube 
    video into English or US English. The transcript is then processed using spaCy's NLP pipeline 
    and each cleaned sentence is written to a file named `transcript_<video_id>.txt` inside the 
    `wanderly.ai/data/transcripts/` directory.

    Args:
        video_id (str): The unique ID of the YouTube video.

    Raises:
        Exception: If the transcript could not be generated after 3 attempts.

    """

    try:
        attempt = 0
        success = False
        # while ((not success) and (attempt<=3)):
        ytt_api = YouTubeTranscriptApi()
        fetched_transcript = ytt_api.list(video_id)
        compleat_en_transcrit = []
        for transcript in fetched_transcript:
            print('here')
            ts = (transcript.translate('en').fetch())   
            for i in ts:
                compleat_en_transcrit.append(i.text)
            print('here 2')

        full_ts = ' '.join(compleat_en_transcrit)
        nlp = spacy.load("en_core_web_lg")
        doc = nlp(full_ts)
        root = os.getcwd().split('wanderly.ai')[0]
        file_path = os.path.join(root, f'wanderly.ai/data/transcripts/')
        if not os.path.exists(file_path):
            os.mkdir(file_path)
        file = open(file_path+f'transcript_{video_id}.txt',"w")
        for sent in doc.sents:
            try:
                file.write(cleaning_sentence(sent.text))
                file.write('\n')
                print(sent)
            except:
                print(sent.text)

        file.close()
        #     success = True

        # if not success: 
        #     raise Exception("Error in generating transcripts")

    except Exception as err:
        print(err)
    finally:
        print(f'Finish {video_id} the transcripts')


In [43]:
for video_id in tqdm(list_video_id):
    fetch_transcript(video_id)

  0%|          | 0/6 [00:00<?, ?it/s]

here


 17%|█▋        | 1/6 [00:01<00:07,  1.47s/it]

no element found: line 1, column 0
Finish rm_j6O8y148 the transcripts


 33%|███▎      | 2/6 [00:02<00:05,  1.38s/it]

here
no element found: line 1, column 0
Finish Lv0PkSkKeSo the transcripts


 50%|█████     | 3/6 [00:03<00:03,  1.11s/it]

here
no element found: line 1, column 0
Finish 5zA6OFpkPe0 the transcripts


 67%|██████▋   | 4/6 [00:04<00:01,  1.04it/s]

here
no element found: line 1, column 0
Finish Oz18u64bM8I the transcripts
here


 83%|████████▎ | 5/6 [00:05<00:00,  1.02it/s]

no element found: line 1, column 0
Finish p0MvovsCxCk the transcripts


100%|██████████| 6/6 [00:06<00:00,  1.04s/it]

here
no element found: line 1, column 0
Finish 7cPLbiblb84 the transcripts





In [10]:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import (
    TranscriptsDisabled,
    NoTranscriptFound,
    NotTranslatable,
    VideoUnavailable
)

def fetch_transcript_text(video_id):
    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

        # Try to get directly available English transcript
        try:
            transcript = transcript_list.find_transcript(['en', 'en-US'])
            ts = transcript.fetch()
            print('here')
        except NoTranscriptFound:
            # Try translating if English transcript isn't available
            try:
                transcript = transcript_list.find_transcript('hi')
                print('here 1')
                if transcript.is_translatable:
                    ts = transcript.translate('en').fetch()
                    print('here 2')
                else:
                    print(f"Transcript for {video_id} is not translatable.")
                    return None
                print(3)
            except Exception as e:
                print(f"Translation failed for {video_id}: {e}")
                return None

        return [entry['text'] for entry in ts]

    except (TranscriptsDisabled, NoTranscriptFound, NotTranslatable, VideoUnavailable) as e:
        print(f"Transcript not available for {video_id}: {e}")
        return None


In [11]:
fetch_transcript_text('rm_j6O8y148')

Translation failed for rm_j6O8y148: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=rm_j6O8y148! This is most likely caused by:

No transcripts were found for any of the requested language codes: hi

For this video (rm_j6O8y148) transcripts are available in the following languages:

(MANUALLY CREATED)
None

(GENERATED)
 - hi ("Hindi (auto-generated)")[TRANSLATABLE]

(TRANSLATION LANGUAGES)
 - ab ("Abkhazian")
 - aa ("Afar")
 - af ("Afrikaans")
 - ak ("Akan")
 - sq ("Albanian")
 - am ("Amharic")
 - ar ("Arabic")
 - hy ("Armenian")
 - as ("Assamese")
 - ay ("Aymara")
 - az ("Azerbaijani")
 - bn ("Bangla")
 - ba ("Bashkir")
 - eu ("Basque")
 - be ("Belarusian")
 - bho ("Bhojpuri")
 - bs ("Bosnian")
 - br ("Breton")
 - bg ("Bulgarian")
 - my ("Burmese")
 - ca ("Catalan")
 - ceb ("Cebuano")
 - zh-Hans ("Chinese (Simplified)")
 - zh-Hant ("Chinese (Traditional)")
 - co ("Corsican")
 - hr ("Croatian")
 - cs ("Czech")
 - da ("Danish")
 - dv ("Divehi")
 - nl ("Dut

In [None]:
transcript_list = YouTubeTranscriptApi.list_transcripts('5zA6OFpkPe0')
for i in transcript_list:
    i.translate('en').fetch()
    

TypeError: string argument expected, got 'ExpatError'

TypeError: string argument expected, got 'ExpatError'

In [30]:
transcript_list = YouTubeTranscriptApi.get_transcript('5zA6OFpkPe0')
for i in transcript_list:
    print(i)
    

NoTranscriptFound: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=5zA6OFpkPe0! This is most likely caused by:

No transcripts were found for any of the requested language codes: ('en',)

For this video (5zA6OFpkPe0) transcripts are available in the following languages:

(MANUALLY CREATED)
None

(GENERATED)
 - hi ("Hindi (auto-generated)")[TRANSLATABLE]

(TRANSLATION LANGUAGES)
 - ab ("Abkhazian")
 - aa ("Afar")
 - af ("Afrikaans")
 - ak ("Akan")
 - sq ("Albanian")
 - am ("Amharic")
 - ar ("Arabic")
 - hy ("Armenian")
 - as ("Assamese")
 - ay ("Aymara")
 - az ("Azerbaijani")
 - bn ("Bangla")
 - ba ("Bashkir")
 - eu ("Basque")
 - be ("Belarusian")
 - bho ("Bhojpuri")
 - bs ("Bosnian")
 - br ("Breton")
 - bg ("Bulgarian")
 - my ("Burmese")
 - ca ("Catalan")
 - ceb ("Cebuano")
 - zh-Hans ("Chinese (Simplified)")
 - zh-Hant ("Chinese (Traditional)")
 - co ("Corsican")
 - hr ("Croatian")
 - cs ("Czech")
 - da ("Danish")
 - dv ("Divehi")
 - nl ("Dutch")
 - dz ("Dzongkha")
 - en ("English")
 - eo ("Esperanto")
 - et ("Estonian")
 - ee ("Ewe")
 - fo ("Faroese")
 - fj ("Fijian")
 - fil ("Filipino")
 - fi ("Finnish")
 - fr ("French")
 - gaa ("Ga")
 - gl ("Galician")
 - lg ("Ganda")
 - ka ("Georgian")
 - de ("German")
 - el ("Greek")
 - gn ("Guarani")
 - gu ("Gujarati")
 - ht ("Haitian Creole")
 - ha ("Hausa")
 - haw ("Hawaiian")
 - iw ("Hebrew")
 - hi ("Hindi")
 - hmn ("Hmong")
 - hu ("Hungarian")
 - is ("Icelandic")
 - ig ("Igbo")
 - id ("Indonesian")
 - iu ("Inuktitut")
 - ga ("Irish")
 - it ("Italian")
 - ja ("Japanese")
 - jv ("Javanese")
 - kl ("Kalaallisut")
 - kn ("Kannada")
 - kk ("Kazakh")
 - kha ("Khasi")
 - km ("Khmer")
 - rw ("Kinyarwanda")
 - ko ("Korean")
 - kri ("Krio")
 - ku ("Kurdish")
 - ky ("Kyrgyz")
 - lo ("Lao")
 - la ("Latin")
 - lv ("Latvian")
 - ln ("Lingala")
 - lt ("Lithuanian")
 - lua ("Luba-Lulua")
 - luo ("Luo")
 - lb ("Luxembourgish")
 - mk ("Macedonian")
 - mg ("Malagasy")
 - ms ("Malay")
 - ml ("Malayalam")
 - mt ("Maltese")
 - gv ("Manx")
 - mi ("Māori")
 - mr ("Marathi")
 - mn ("Mongolian")
 - mfe ("Morisyen")
 - ne ("Nepali")
 - new ("Newari")
 - nso ("Northern Sotho")
 - no ("Norwegian")
 - ny ("Nyanja")
 - oc ("Occitan")
 - or ("Odia")
 - om ("Oromo")
 - os ("Ossetic")
 - pam ("Pampanga")
 - ps ("Pashto")
 - fa ("Persian")
 - pl ("Polish")
 - pt ("Portuguese")
 - pt-PT ("Portuguese (Portugal)")
 - pa ("Punjabi")
 - qu ("Quechua")
 - ro ("Romanian")
 - rn ("Rundi")
 - ru ("Russian")
 - sm ("Samoan")
 - sg ("Sango")
 - sa ("Sanskrit")
 - gd ("Scottish Gaelic")
 - sr ("Serbian")
 - crs ("Seselwa Creole French")
 - sn ("Shona")
 - sd ("Sindhi")
 - si ("Sinhala")
 - sk ("Slovak")
 - sl ("Slovenian")
 - so ("Somali")
 - st ("Southern Sotho")
 - es ("Spanish")
 - su ("Sundanese")
 - sw ("Swahili")
 - ss ("Swati")
 - sv ("Swedish")
 - tg ("Tajik")
 - ta ("Tamil")
 - tt ("Tatar")
 - te ("Telugu")
 - th ("Thai")
 - bo ("Tibetan")
 - ti ("Tigrinya")
 - to ("Tongan")
 - ts ("Tsonga")
 - tn ("Tswana")
 - tum ("Tumbuka")
 - tr ("Turkish")
 - tk ("Turkmen")
 - uk ("Ukrainian")
 - ur ("Urdu")
 - ug ("Uyghur")
 - uz ("Uzbek")
 - ve ("Venda")
 - vi ("Vietnamese")
 - war ("Waray")
 - cy ("Welsh")
 - fy ("Western Frisian")
 - wo ("Wolof")
 - xh ("Xhosa")
 - yi ("Yiddish")
 - yo ("Yoruba")
 - zu ("Zulu")

If you are sure that the described cause is not responsible for this error and that a transcript should be retrievable, please create an issue at https://github.com/jdepoix/youtube-transcript-api/issues. Please add which version of youtube_transcript_api you are using and provide the information needed to replicate the error. Also make sure that there are no open issues which already describe your problem!

In [23]:
for i in transcript_list:
    print(i.is_translatable)
    i.translate('en')


True


ERROR! Session/line number was not unique in database. History logging moved to new session 77


In [26]:
transcript_list = YouTubeTranscriptApi().list('rm_j6O8y148')
for i in transcript_list:
    print(i.is_translatable)
    k = i.translate('en')
    k['text']

ERROR! Session/line number was not unique in database. History logging moved to new session 78
True


TypeError: 'Transcript' object is not subscriptable

In [27]:
ytt_api = YouTubeTranscriptApi()
fetched_transcript = ytt_api.fetch('rm_j6O8y148')

# is iterable
for snippet in fetched_transcript:
    print(snippet.text)


ERROR! Session/line number was not unique in database. History logging moved to new session 79


NoTranscriptFound: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=rm_j6O8y148! This is most likely caused by:

No transcripts were found for any of the requested language codes: ('en',)

For this video (rm_j6O8y148) transcripts are available in the following languages:

(MANUALLY CREATED)
None

(GENERATED)
 - hi ("Hindi (auto-generated)")[TRANSLATABLE]

(TRANSLATION LANGUAGES)
 - ab ("Abkhazian")
 - aa ("Afar")
 - af ("Afrikaans")
 - ak ("Akan")
 - sq ("Albanian")
 - am ("Amharic")
 - ar ("Arabic")
 - hy ("Armenian")
 - as ("Assamese")
 - ay ("Aymara")
 - az ("Azerbaijani")
 - bn ("Bangla")
 - ba ("Bashkir")
 - eu ("Basque")
 - be ("Belarusian")
 - bho ("Bhojpuri")
 - bs ("Bosnian")
 - br ("Breton")
 - bg ("Bulgarian")
 - my ("Burmese")
 - ca ("Catalan")
 - ceb ("Cebuano")
 - zh-Hans ("Chinese (Simplified)")
 - zh-Hant ("Chinese (Traditional)")
 - co ("Corsican")
 - hr ("Croatian")
 - cs ("Czech")
 - da ("Danish")
 - dv ("Divehi")
 - nl ("Dutch")
 - dz ("Dzongkha")
 - en ("English")
 - eo ("Esperanto")
 - et ("Estonian")
 - ee ("Ewe")
 - fo ("Faroese")
 - fj ("Fijian")
 - fil ("Filipino")
 - fi ("Finnish")
 - fr ("French")
 - gaa ("Ga")
 - gl ("Galician")
 - lg ("Ganda")
 - ka ("Georgian")
 - de ("German")
 - el ("Greek")
 - gn ("Guarani")
 - gu ("Gujarati")
 - ht ("Haitian Creole")
 - ha ("Hausa")
 - haw ("Hawaiian")
 - iw ("Hebrew")
 - hi ("Hindi")
 - hmn ("Hmong")
 - hu ("Hungarian")
 - is ("Icelandic")
 - ig ("Igbo")
 - id ("Indonesian")
 - iu ("Inuktitut")
 - ga ("Irish")
 - it ("Italian")
 - ja ("Japanese")
 - jv ("Javanese")
 - kl ("Kalaallisut")
 - kn ("Kannada")
 - kk ("Kazakh")
 - kha ("Khasi")
 - km ("Khmer")
 - rw ("Kinyarwanda")
 - ko ("Korean")
 - kri ("Krio")
 - ku ("Kurdish")
 - ky ("Kyrgyz")
 - lo ("Lao")
 - la ("Latin")
 - lv ("Latvian")
 - ln ("Lingala")
 - lt ("Lithuanian")
 - lua ("Luba-Lulua")
 - luo ("Luo")
 - lb ("Luxembourgish")
 - mk ("Macedonian")
 - mg ("Malagasy")
 - ms ("Malay")
 - ml ("Malayalam")
 - mt ("Maltese")
 - gv ("Manx")
 - mi ("Māori")
 - mr ("Marathi")
 - mn ("Mongolian")
 - mfe ("Morisyen")
 - ne ("Nepali")
 - new ("Newari")
 - nso ("Northern Sotho")
 - no ("Norwegian")
 - ny ("Nyanja")
 - oc ("Occitan")
 - or ("Odia")
 - om ("Oromo")
 - os ("Ossetic")
 - pam ("Pampanga")
 - ps ("Pashto")
 - fa ("Persian")
 - pl ("Polish")
 - pt ("Portuguese")
 - pt-PT ("Portuguese (Portugal)")
 - pa ("Punjabi")
 - qu ("Quechua")
 - ro ("Romanian")
 - rn ("Rundi")
 - ru ("Russian")
 - sm ("Samoan")
 - sg ("Sango")
 - sa ("Sanskrit")
 - gd ("Scottish Gaelic")
 - sr ("Serbian")
 - crs ("Seselwa Creole French")
 - sn ("Shona")
 - sd ("Sindhi")
 - si ("Sinhala")
 - sk ("Slovak")
 - sl ("Slovenian")
 - so ("Somali")
 - st ("Southern Sotho")
 - es ("Spanish")
 - su ("Sundanese")
 - sw ("Swahili")
 - ss ("Swati")
 - sv ("Swedish")
 - tg ("Tajik")
 - ta ("Tamil")
 - tt ("Tatar")
 - te ("Telugu")
 - th ("Thai")
 - bo ("Tibetan")
 - ti ("Tigrinya")
 - to ("Tongan")
 - ts ("Tsonga")
 - tn ("Tswana")
 - tum ("Tumbuka")
 - tr ("Turkish")
 - tk ("Turkmen")
 - uk ("Ukrainian")
 - ur ("Urdu")
 - ug ("Uyghur")
 - uz ("Uzbek")
 - ve ("Venda")
 - vi ("Vietnamese")
 - war ("Waray")
 - cy ("Welsh")
 - fy ("Western Frisian")
 - wo ("Wolof")
 - xh ("Xhosa")
 - yi ("Yiddish")
 - yo ("Yoruba")
 - zu ("Zulu")

If you are sure that the described cause is not responsible for this error and that a transcript should be retrievable, please create an issue at https://github.com/jdepoix/youtube-transcript-api/issues. Please add which version of youtube_transcript_api you are using and provide the information needed to replicate the error. Also make sure that there are no open issues which already describe your problem!

In [None]:
# ytt_api = YouTubeTranscriptApi()
# fetched_transcript = ytt_api.fetch('rm_j6O8y148')

# # is iterable
# for snippet in fetched_transcript:
#     print(snippet.text)

NoTranscriptFound: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=rm_j6O8y148! This is most likely caused by:

No transcripts were found for any of the requested language codes: ('en',)

For this video (rm_j6O8y148) transcripts are available in the following languages:

(MANUALLY CREATED)
None

(GENERATED)
 - hi ("Hindi (auto-generated)")[TRANSLATABLE]

(TRANSLATION LANGUAGES)
 - ab ("Abkhazian")
 - aa ("Afar")
 - af ("Afrikaans")
 - ak ("Akan")
 - sq ("Albanian")
 - am ("Amharic")
 - ar ("Arabic")
 - hy ("Armenian")
 - as ("Assamese")
 - ay ("Aymara")
 - az ("Azerbaijani")
 - bn ("Bangla")
 - ba ("Bashkir")
 - eu ("Basque")
 - be ("Belarusian")
 - bho ("Bhojpuri")
 - bs ("Bosnian")
 - br ("Breton")
 - bg ("Bulgarian")
 - my ("Burmese")
 - ca ("Catalan")
 - ceb ("Cebuano")
 - zh-Hans ("Chinese (Simplified)")
 - zh-Hant ("Chinese (Traditional)")
 - co ("Corsican")
 - hr ("Croatian")
 - cs ("Czech")
 - da ("Danish")
 - dv ("Divehi")
 - nl ("Dutch")
 - dz ("Dzongkha")
 - en ("English")
 - eo ("Esperanto")
 - et ("Estonian")
 - ee ("Ewe")
 - fo ("Faroese")
 - fj ("Fijian")
 - fil ("Filipino")
 - fi ("Finnish")
 - fr ("French")
 - gaa ("Ga")
 - gl ("Galician")
 - lg ("Ganda")
 - ka ("Georgian")
 - de ("German")
 - el ("Greek")
 - gn ("Guarani")
 - gu ("Gujarati")
 - ht ("Haitian Creole")
 - ha ("Hausa")
 - haw ("Hawaiian")
 - iw ("Hebrew")
 - hi ("Hindi")
 - hmn ("Hmong")
 - hu ("Hungarian")
 - is ("Icelandic")
 - ig ("Igbo")
 - id ("Indonesian")
 - iu ("Inuktitut")
 - ga ("Irish")
 - it ("Italian")
 - ja ("Japanese")
 - jv ("Javanese")
 - kl ("Kalaallisut")
 - kn ("Kannada")
 - kk ("Kazakh")
 - kha ("Khasi")
 - km ("Khmer")
 - rw ("Kinyarwanda")
 - ko ("Korean")
 - kri ("Krio")
 - ku ("Kurdish")
 - ky ("Kyrgyz")
 - lo ("Lao")
 - la ("Latin")
 - lv ("Latvian")
 - ln ("Lingala")
 - lt ("Lithuanian")
 - lua ("Luba-Lulua")
 - luo ("Luo")
 - lb ("Luxembourgish")
 - mk ("Macedonian")
 - mg ("Malagasy")
 - ms ("Malay")
 - ml ("Malayalam")
 - mt ("Maltese")
 - gv ("Manx")
 - mi ("Māori")
 - mr ("Marathi")
 - mn ("Mongolian")
 - mfe ("Morisyen")
 - ne ("Nepali")
 - new ("Newari")
 - nso ("Northern Sotho")
 - no ("Norwegian")
 - ny ("Nyanja")
 - oc ("Occitan")
 - or ("Odia")
 - om ("Oromo")
 - os ("Ossetic")
 - pam ("Pampanga")
 - ps ("Pashto")
 - fa ("Persian")
 - pl ("Polish")
 - pt ("Portuguese")
 - pt-PT ("Portuguese (Portugal)")
 - pa ("Punjabi")
 - qu ("Quechua")
 - ro ("Romanian")
 - rn ("Rundi")
 - ru ("Russian")
 - sm ("Samoan")
 - sg ("Sango")
 - sa ("Sanskrit")
 - gd ("Scottish Gaelic")
 - sr ("Serbian")
 - crs ("Seselwa Creole French")
 - sn ("Shona")
 - sd ("Sindhi")
 - si ("Sinhala")
 - sk ("Slovak")
 - sl ("Slovenian")
 - so ("Somali")
 - st ("Southern Sotho")
 - es ("Spanish")
 - su ("Sundanese")
 - sw ("Swahili")
 - ss ("Swati")
 - sv ("Swedish")
 - tg ("Tajik")
 - ta ("Tamil")
 - tt ("Tatar")
 - te ("Telugu")
 - th ("Thai")
 - bo ("Tibetan")
 - ti ("Tigrinya")
 - to ("Tongan")
 - ts ("Tsonga")
 - tn ("Tswana")
 - tum ("Tumbuka")
 - tr ("Turkish")
 - tk ("Turkmen")
 - uk ("Ukrainian")
 - ur ("Urdu")
 - ug ("Uyghur")
 - uz ("Uzbek")
 - ve ("Venda")
 - vi ("Vietnamese")
 - war ("Waray")
 - cy ("Welsh")
 - fy ("Western Frisian")
 - wo ("Wolof")
 - xh ("Xhosa")
 - yi ("Yiddish")
 - yo ("Yoruba")
 - zu ("Zulu")

If you are sure that the described cause is not responsible for this error and that a transcript should be retrievable, please create an issue at https://github.com/jdepoix/youtube-transcript-api/issues. Please add which version of youtube_transcript_api you are using and provide the information needed to replicate the error. Also make sure that there are no open issues which already describe your problem!

In [32]:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import NoTranscriptFound, TranscriptsDisabled, NotTranslatable
import time

video_id = '5zA6OFpkPe0'

try:
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    print(f"Available transcripts for video {video_id}:\n")

    for transcript in transcript_list:
        print(f"Language: {transcript.language_code}, Translatable: {transcript.is_translatable}")

        try:
            # Prefer original English transcript if available
            if transcript.language_code in ['en', 'en-US']:
                ts = transcript.fetch()
            elif transcript.is_translatable:
                translated = transcript.translate('en')
                print('here')
                ts = translated.fetch()
                print('here 1')
            else:
                print(f"Skipping: transcript in {transcript.language_code} is not translatable.")
                continue

            print(f"\n✅ Successfully fetched {len(ts)} transcript segments from {transcript.language_code}\n")
            print("Sample transcript:")
            for entry in ts[:5]:  # print first 5 lines
                print("•", entry['text'])

            break  # Stop after first successful fetch

        except Exception as fetch_err:
            print(f"❌ Failed to fetch transcript for {transcript.language_code}: {fetch_err}")
        time.sleep(1)  # Add delay to avoid rate limit

except (NoTranscriptFound, TranscriptsDisabled) as e:
    print(f"❌ Transcript not available for video {video_id}: {e}")


Available transcripts for video 5zA6OFpkPe0:

Language: hi, Translatable: True
here
❌ Failed to fetch transcript for hi: no element found: line 1, column 0


In [33]:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import (
    NoTranscriptFound,
    TranscriptsDisabled,
    NotTranslatable,
    VideoUnavailable
)

def safe_fetch_transcript(video_id):
    """
    Tries to fetch English transcript for a video.
    Falls back to translation if no native English transcript is found.
    Skips invalid or broken transcript fetch attempts.
    """
    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

        # First, try native English transcript
        try:
            transcript = transcript_list.find_transcript(['en', 'en-US'])
            print("✅ Found native English transcript.")
            return transcript.fetch()
        except NoTranscriptFound:
            print("⚠️ No native English transcript found. Trying translation...")

        # Try to fetch translated transcript from any available source
        for transcript in transcript_list:
            if transcript.is_translatable:
                try:
                    translated = transcript.translate('en')
                    return translated.fetch()
                except Exception as e:
                    print(f"❌ Translation failed for {transcript.language_code}: {e}")
            else:
                print(f"Skipping {transcript.language_code} — not translatable.")

        raise Exception("No valid transcript found or fetchable.")

    except (NoTranscriptFound, TranscriptsDisabled, VideoUnavailable) as e:
        print(f"❌ Transcript not available for video {video_id}: {e}")
        return None

# Example usage:
video_id = "5zA6OFpkPe0"
transcript_data = safe_fetch_transcript(video_id)

if transcript_data:
    print("\n✅ Transcript sample:")
    for entry in transcript_data[:5]:
        print("•", entry['text'])
else:
    print("❌ Could not fetch transcript.")


⚠️ No native English transcript found. Trying translation...
❌ Translation failed for hi: no element found: line 1, column 0


Exception: No valid transcript found or fetchable.

In [38]:
! yt-dlp --write-auto-sub --sub-lang en --skip-download -o "%(id)s.%(ext)s" https://www.youtube.com/watch?v=5zA6OFpkPe0

[youtube] Extracting URL: https://www.youtube.com/watch?v=5zA6OFpkPe0
[youtube] 5zA6OFpkPe0: Downloading webpage
[youtube] 5zA6OFpkPe0: Downloading tv client config
[youtube] 5zA6OFpkPe0: Downloading tv player API JSON
[youtube] 5zA6OFpkPe0: Downloading ios player API JSON
[youtube] 5zA6OFpkPe0: Downloading m3u8 information
[info] 5zA6OFpkPe0: Downloading subtitles: en
[info] 5zA6OFpkPe0: Downloading 1 format(s): 18
[info] Writing video subtitles to: 5zA6OFpkPe0.en.vtt
[download] Destination: 5zA6OFpkPe0.en.vtt

[download]    1.00KiB at   39.10KiB/s (00:00:00)
[download]    3.00KiB at  110.74KiB/s (00:00:00)
[download]    7.00KiB at  246.83KiB/s (00:00:00)
[download]   15.00KiB at  506.21KiB/s (00:00:00)
[download]   31.00KiB at  979.19KiB/s (00:00:00)
[download]   63.00KiB at    1.88MiB/s (00:00:00)
[download]   94.81KiB at    2.75MiB/s (00:00:00)
[download] 100% of   94.81KiB in 00:00:00 at 203.19KiB/s




In [36]:
! pip install -U yt-dlp

Collecting yt-dlp
  Downloading yt_dlp-2025.6.9-py3-none-any.whl (3.3 MB)
     ---------------------------------------- 0.0/3.3 MB ? eta -:--:--
     --- ------------------------------------ 0.3/3.3 MB 8.6 MB/s eta 0:00:01
     ----- ---------------------------------- 0.5/3.3 MB 5.9 MB/s eta 0:00:01
     ----------- ---------------------------- 0.9/3.3 MB 7.2 MB/s eta 0:00:01
     ------------- -------------------------- 1.1/3.3 MB 6.2 MB/s eta 0:00:01
     -------------- ------------------------- 1.2/3.3 MB 6.1 MB/s eta 0:00:01
     --------------- ------------------------ 1.3/3.3 MB 4.7 MB/s eta 0:00:01
     ------------------ --------------------- 1.5/3.3 MB 4.9 MB/s eta 0:00:01
     --------------------- ------------------ 1.8/3.3 MB 4.8 MB/s eta 0:00:01
     ----------------------- ---------------- 1.9/3.3 MB 4.7 MB/s eta 0:00:01
     ------------------------- -------------- 2.1/3.3 MB 4.6 MB/s eta 0:00:01
     ---------------------------- ----------- 2.3/3.3 MB 4.6 MB/s eta 0:00:


[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [37]:
import yt_dlp

def download_video(url, output_dir="."):
    ydl_opts = {
        'outtmpl': f'{output_dir}/%(title)s.%(ext)s',
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

if __name__ == '__main__':
    video_url = 'https://www.youtube.com/watch?v=5zA6OFpkPe0'
    download_video(video_url)

[youtube] Extracting URL: https://www.youtube.com/watch?v=5zA6OFpkPe0
[youtube] 5zA6OFpkPe0: Downloading webpage
[youtube] 5zA6OFpkPe0: Downloading tv client config
[youtube] 5zA6OFpkPe0: Downloading player 94f771d8-main
[youtube] 5zA6OFpkPe0: Downloading tv player API JSON
[youtube] 5zA6OFpkPe0: Downloading ios player API JSON




[youtube] 5zA6OFpkPe0: Downloading m3u8 information




[info] 5zA6OFpkPe0: Downloading 1 format(s): 18
[download] Destination: Your Offbeat JAIPUR Itinerary - 11 Less Known Places You MUST Visit ｜ Unique Things to do in Jaipur.mp4
[download] 100% of  109.31MiB in 00:01:51 at 1000.64KiB/s  


In [98]:
fetched_transcript = ytt_api.list(list_video_id[0])

In [50]:
fetched_transcript = ytt_api.list(list_video_id[0])

In [62]:
async def func():
    for i in fetched_transcript:
        a = await i.fetch().to_raw_data
        print(a)


In [64]:
a = func()

In [66]:
a.cr_running

False

In [104]:
for i in fetched_transcript:
    print(i.language_code=='hi')
    # print(language+"_"+len_code.language_code for len_code in k]

True


In [100]:
j

<youtube_transcript_api._transcripts.Transcript at 0x21267efa290>

In [90]:
# k = [len_code for len_code in k]
# 'en-US' in k

In [92]:
English

NameError: name 'English' is not defined

In [93]:
', '.join(k)

'Abkhazian_ab, Afar_aa, Afrikaans_af, Akan_ak, Albanian_sq, Amharic_am, Arabic_ar, Armenian_hy, Assamese_as, Aymara_ay, Azerbaijani_az, Bangla_bn, Bashkir_ba, Basque_eu, Belarusian_be, Bhojpuri_bho, Bosnian_bs, Breton_br, Bulgarian_bg, Burmese_my, Catalan_ca, Cebuano_ceb, Chinese (Simplified)_zh-Hans, Chinese (Traditional)_zh-Hant, Corsican_co, Croatian_hr, Czech_cs, Danish_da, Divehi_dv, Dutch_nl, Dzongkha_dz, English_en, Esperanto_eo, Estonian_et, Ewe_ee, Faroese_fo, Fijian_fj, Filipino_fil, Finnish_fi, French_fr, Ga_gaa, Galician_gl, Ganda_lg, Georgian_ka, German_de, Greek_el, Guarani_gn, Gujarati_gu, Haitian Creole_ht, Hausa_ha, Hawaiian_haw, Hebrew_iw, Hindi_hi, Hmong_hmn, Hungarian_hu, Icelandic_is, Igbo_ig, Indonesian_id, Inuktitut_iu, Irish_ga, Italian_it, Japanese_ja, Javanese_jv, Kalaallisut_kl, Kannada_kn, Kazakh_kk, Khasi_kha, Khmer_km, Kinyarwanda_rw, Korean_ko, Krio_kri, Kurdish_ku, Kyrgyz_ky, Lao_lo, Latin_la, Latvian_lv, Lingala_ln, Lithuanian_lt, Luba-Lulua_lua, Luo_lu

In [68]:
import asyncio

async def fetch_data():
    print("Fetching data...")
    for i in fetched_transcript:
        a = await i.fetch().to_raw_data
        print(a)
    return {"data": "sample data"}

async def main():
    print("Program start")
    data = await fetch_data() # Pause main until fetch_data completes
    print("Received:", data)
    print("Program end")



In [69]:
if __name__ == "__main__":
    asyncio.run(main())

RuntimeError: asyncio.run() cannot be called from a running event loop