<a href="https://colab.research.google.com/github/bit-bangin/GetYouTubeTranscript/blob/main/GetYouTubeTranscript.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Given URL > Video ID

In [None]:
from urllib.parse import urlparse, parse_qs

def extract_video_id(url):
  '''
  Function to extract the video ID from a YouTube URL.

  Recognised URL shapes:
    - https://www.youtube.com/watch?v=<id>   ('v' query parameter, any host)
    - https://youtu.be/<id>                   (short link)
    - https://www.youtube.com/embed/<id>, /shorts/<id>, /live/<id>, /v/<id>
  PARAMETERS:
  url (type: str): The YouTube URL
  RETURN:
  video_id (type: str): Extracted video ID, or None if no ID found
  (this includes an empty url or a url that is not a string).
  '''
  # Nothing to parse for empty or non-string input.
  if not url or not isinstance(url, str):
    return None

  # Use urlparse to break up the URL into components.
  parsed_url = urlparse(url)

  # Standard watch URLs carry the ID in the 'v' query parameter. parse_qs
  # maps each parameter name to a list of values, so take the first one.
  query_values = parse_qs(parsed_url.query).get('v')
  if query_values:
    return query_values[0]

  # The remaining URL shapes carry the ID in the path instead of the query.
  path_parts = [part for part in parsed_url.path.split('/') if part]
  if not path_parts:
    return None

  host = (parsed_url.hostname or '').lower()

  # Short links: the ID is the first path segment.
  if host == 'youtu.be' or host.endswith('.youtu.be'):
    return path_parts[0]

  # Embed/shorts/live links: the ID follows a fixed path prefix. Restrict
  # this to YouTube hosts so unrelated sites with similar paths yield None.
  is_youtube_host = (
    host in ('youtube.com', 'youtube-nocookie.com')
    or host.endswith(('.youtube.com', '.youtube-nocookie.com'))
  )
  if (is_youtube_host and len(path_parts) > 1
      and path_parts[0] in ('embed', 'shorts', 'live', 'v')):
    return path_parts[1]

  # No video ID could be located in the given URL.
  return None

# Given Video ID > Return Transcript

In [None]:
# Import the YouTubeTranscriptApi library.
# Provides functionality with YouTube to fetch transcripts
from youtube_transcript_api import YouTubeTranscriptApi

def get_transcript(video_id):
  '''
  Function to fetch a transcript given a YouTube video ID.
  PARAMETERS:
  video_id (type: str): YouTube video ID
  RETURNS:
  transcript: Whatever YouTubeTranscriptApi.get_transcript returns for the
  video, or None if the fetch raised any exception (the error is printed).
  NOTE(review): the original notes describe the result as a list of dicts
  holding text, start time, and duration - confirm against the installed
  library version.
  '''
  try:
    segments = YouTubeTranscriptApi.get_transcript(video_id)
  except Exception as error:
    # Best-effort fetch: report the failure and signal it with None
    # instead of letting the exception propagate to the caller.
    print(f"An error occurred: {error}")
    return None
  else:
    # Fetch succeeded; hand the result back unchanged.
    return segments

# Format data

In [None]:
def format_transcript(transcript):
  '''
  Function to format the returned transcript.
  PARAMETERS:
  transcript (list): Data from the transcript as a list of dictionaries,
  each with a 'text' key. May be None or empty.
  RETURNS:
  str: The segment texts joined by single spaces, or an empty string when
  there is nothing to format.
  '''
  # A failed fetch is signalled with None rather than a list; treat that
  # (and an empty list) as "no text" instead of raising TypeError.
  if not transcript:
    return ''

  # Pull the 'text' of each segment and join them with a space.
  return ' '.join(segment['text'] for segment in transcript)

# Write data to specified location

In [1]:
def output_transcript(transcript, filename):
  '''
  Function to output the transcript to a text file.
  PARAMETERS:
  transcript (str): Reformatted script
  filename (str): Name (or full path) of the file to write; a bare name is
  created in the current working directory
  RETURNS:
  None
  '''

  # Open the specified file in write mode ('w').
  # If file exists - overwrite. If not - create.
  # Encoding is pinned to UTF-8 because the platform default may be unable
  # to encode non-ASCII caption text and would raise UnicodeEncodeError.
  with open(filename, 'w', encoding='utf-8') as file:
    # Write the transcript to the file.
    file.write(transcript)

# Notes
When given a bare filename, the function `output_transcript` creates (or overwrites) that file in the current working directory. 
To write to a different location, provide the full filepath, like so: 
### UNIX
`output_transcript(formatted_transcript, '/Users/yourusername/Desktop/transcripts/transcript.txt')`
### WINDOWS
`output_transcript(formatted_transcript, 'C:\\Users\\yourusername\\Desktop\\transcripts\\transcript.txt')`

### os Module
Alternatively, you can leverage Python's **os** module to build the path in a manner that is independent of the native operating system.

> `import os`
> Construct the path, starting from your home directory so the result is absolute
> `path = os.path.join(os.path.expanduser('~'), 'Desktop', 'transcripts', 'transcript.txt')`
> Pass the path to output_transcript
> `output_transcript(formatted_transcript, path)`











# Integrate components
Before running the cell below, install the <b>youtube_transcript_api</b> package: 


---


`pip install youtube_transcript_api`

In [6]:
from urllib.parse import urlparse, parse_qs
from youtube_transcript_api import YouTubeTranscriptApi

# Function to extract video ID from URL
def extract_video_id(url):
    '''
    Function to extract the video ID from a YouTube URL.

    Recognised URL shapes:
      - https://www.youtube.com/watch?v=<id>   ('v' query parameter, any host)
      - https://youtu.be/<id>                   (short link)
      - https://www.youtube.com/embed/<id>, /shorts/<id>, /live/<id>, /v/<id>

    Parameters:
    url (str): The YouTube URL.

    Returns:
    str: The extracted video ID, or None if no ID is found (this includes
    an empty url or a url that is not a string).
    '''

    # Nothing to parse for empty or non-string input.
    if not url or not isinstance(url, str):
        return None

    # Parse the URL
    parsed_url = urlparse(url)

    # Standard watch URLs: the ID is the first value of the 'v' parameter.
    query_values = parse_qs(parsed_url.query).get('v')
    if query_values:
        return query_values[0]

    # The remaining URL shapes carry the ID in the path instead.
    path_parts = [part for part in parsed_url.path.split('/') if part]
    if not path_parts:
        return None

    host = (parsed_url.hostname or '').lower()

    # Short links: the ID is the first path segment.
    if host == 'youtu.be' or host.endswith('.youtu.be'):
        return path_parts[0]

    # Embed/shorts/live links: the ID follows a fixed path prefix. Restrict
    # this to YouTube hosts so unrelated sites with similar paths yield None.
    is_youtube_host = (
        host in ('youtube.com', 'youtube-nocookie.com')
        or host.endswith(('.youtube.com', '.youtube-nocookie.com'))
    )
    if (is_youtube_host and len(path_parts) > 1
            and path_parts[0] in ('embed', 'shorts', 'live', 'v')):
        return path_parts[1]

    return None

# Function to obtain transcript
def get_transcript(video_id):
    '''
    Function to get the transcript of a YouTube video.

    Parameters:
    video_id (str): The YouTube video ID.

    Returns:
    The value returned by YouTubeTranscriptApi.get_transcript for the video,
    or None if that call raised any exception (the error is printed).
    '''

    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as error:
        # Best-effort fetch: report the failure and signal it with None.
        print(f"An error occurred: {error}")
        return None
    else:
        return segments

# Function to format transcript
def format_transcript(transcript):
    '''
    Function to format the transcript.

    Parameters:
    transcript (list): The transcript data as a list of dictionaries, each
    with a 'text' key. May be None or empty.

    Returns:
    str: The segment texts joined by single spaces, or an empty string when
    there is nothing to format.
    '''

    # get_transcript returns None when the fetch fails; treat that (and an
    # empty list) as "no text" instead of raising TypeError.
    if not transcript:
        return ''

    # Pull the 'text' of each segment and join them with a space.
    return ' '.join(segment['text'] for segment in transcript)
"""
# Function to output transcript (print) - disabled: this triple-quoted block is a string literal and is never executed
def output_transcript(transcript):
  '''
  Function to output the transcript to the console.
  
  Parameters:
  transcript (str): The reformatted transcript.
  
  Returns:
  None
  '''
  print(transcript)
 """
# Function to output transcript (save)
def output_transcript(transcript, filename):
    '''
    Function to output the transcript to a file.

    Parameters:
    transcript (str): The reformatted transcript.
    filename (str): Name (or full path) of the file to write; a bare name
    is created in the current working directory.

    Returns:
    None
    '''
    # Open the specified file in write mode ('w').
    # If file already exists - overwrite. If new - create.
    # Encoding is pinned to UTF-8 because the platform default may be unable
    # to encode non-ASCII caption text and would raise UnicodeEncodeError.
    with open(filename, 'w', encoding='utf-8') as file:
        # Write the transcript to the file.
        file.write(transcript)

# Main function
def main(url, filename):
    '''
    Main function to get and output the transcript of a YouTube video.

    Stops early, writing no file, when the URL yields no video ID or when
    the transcript could not be fetched.

    Parameters:
    url (str): The YouTube video URL.
    filename (str): The name of the file (within cd) to which to save transcript.

    Returns:
    None
    '''

    # Extract the video ID from the URL.
    video_id = extract_video_id(url)
    if not video_id:
        # extract_video_id returns None when no ID is found.
        print(f"Could not find a video ID in URL: {url}")
        return

    # Get the transcript.
    transcript = get_transcript(video_id)
    if transcript is None:
        # get_transcript prints the error itself and returns None on failure;
        # formatting None would raise TypeError, so stop here.
        return

    # Format the transcript.
    formatted_transcript = format_transcript(transcript)

    # Output the transcript.
    output_transcript(formatted_transcript, filename)

# Test the main function.
# Guarded so the network fetch and file write happen only when this code is
# executed directly (a notebook cell or script run has __name__ set to
# "__main__"), not when the definitions are imported from elsewhere.
if __name__ == "__main__":
    main("https://www.youtube.com/watch?v=dQw4w9WgXcQ", "transcript.txt")
