In [None]:
# ✅ Step 2: Import necessary modules
import ast
import re

import pandas as pd
from transformers import pipeline
from youtube_transcript_api import YouTubeTranscriptApi

# ✅ Step 3: Function to extract video ID and transcript
def get_clean_transcript(video_url: str) -> str:
    """Fetch the transcript of a YouTube video identified by its URL.

    Two URL shapes are recognised: ``...watch?v=<id>`` and ``youtu.be/<id>``.
    The video ID is the text that follows that marker, up to the next URL
    delimiter (``&``, ``?``, ``#`` or ``/``), so extra query parameters,
    fragments and trailing slashes do not leak into the ID.

    Args:
        video_url: The YouTube URL to read the video ID from.

    Returns:
        Always a string. On success it is the transcript: the snippet texts
        joined with spaces when the API returns a list of dicts, otherwise
        ``str()`` of whatever object the API returned. When no video ID can
        be found it is ``"Invalid YouTube URL"``. When any exception is
        raised it is ``"❌ Error fetching transcript: <reason>"``; errors are
        reported through the return value instead of being raised.
    """
    try:
        # Locate the part of the URL that starts with the video ID.
        if "watch?v=" in video_url:
            remainder = video_url.split("watch?v=", 1)[1]
        elif "youtu.be/" in video_url:
            remainder = video_url.split("youtu.be/", 1)[1]
        else:
            return "Invalid YouTube URL"

        # Cut at the first delimiter so "&t=10s", "?si=...", "#t=30" or a
        # trailing "/" never become part of the ID.
        video_id = re.split(r"[&?#/]", remainder, maxsplit=1)[0]
        if not video_id:
            # A marker with nothing after it cannot identify a video.
            return "Invalid YouTube URL"

        # Fetch transcript using video ID
        transcript_data = YouTubeTranscriptApi().fetch(video_id)

        if isinstance(transcript_data, list):
            # List-of-dicts shape: combine the text of every entry.
            return " ".join(item['text'] for item in transcript_data)

        # NOTE(review): youtube-transcript-api's fetch() appears to return a
        # FetchedTranscript object rather than a list, which would make this
        # the usual path. The string form is returned unchanged because
        # extract_from_fetched_transcript() parses the text='...' fields
        # out of it.
        return str(transcript_data)
    except Exception as e:
        return f"❌ Error fetching transcript: {str(e)}"

# ✅ Step 4: Prompt for the YouTube video URL (surrounding whitespace is stripped)
video_url = input("🔗 Paste YouTube URL: ").strip()

# ✅ Step 5: Fetch the transcript. The result is always a string: either the
# transcript content or a message explaining why it could not be fetched.
# Nothing is printed here; the text is cleaned and displayed further down.
transcript_text = get_clean_transcript(video_url)


def extract_from_fetched_transcript(fetched_data):
    """
    Extract clean text from the string form of a FetchedTranscript.

    Every ``text=<quoted string>`` field found in ``fetched_data`` is decoded
    and the pieces are joined with single spaces, then passed through
    ``clean_transcript_text``.

    Both quoting styles of a Python string repr are handled: single quotes,
    and the double quotes Python switches to when the text contains an
    apostrophe. Escape sequences inside the quotes are decoded to the
    characters they stand for.

    Args:
        fetched_data: Text containing ``text='...'`` / ``text="..."`` fields.

    Returns:
        The cleaned transcript text; an empty string when no field is found.
    """
    # A quoted literal: any run of non-quote, non-backslash characters or
    # backslash-escaped characters between matching quotes.
    single_quoted = r"'(?:[^'\\]|\\.)*'"
    double_quoted = r'"(?:[^"\\]|\\.)*"'
    pattern = rf"text=({single_quoted}|{double_quoted})"

    snippets = []
    for literal in re.findall(pattern, fetched_data):
        try:
            # Decode the literal so \n, \', \xa0 etc. become real characters.
            snippet = ast.literal_eval(literal)
        except (ValueError, SyntaxError):
            # Not a well-formed Python literal: keep the raw inner text.
            snippet = literal[1:-1]
        if snippet:
            snippets.append(snippet)

    # Join all text snippets
    full_text = " ".join(snippets)

    # Clean the text
    return clean_transcript_text(full_text)

def clean_transcript_text(text):
    """
    Tidy raw transcript text.

    Removes every literal "[Music]" marker (case-sensitive), squeezes each
    run of whitespace into one space and trims both ends of the result.
    """
    # Apply the substitutions in order: markers first, so the gaps they
    # leave behind are absorbed by the whitespace squeeze that follows.
    for regex, replacement in ((r'\[Music\]', ''), (r'\s+', ' ')):
        text = re.sub(regex, replacement, text)

    trimmed = text.strip()

    # No doubled space can survive the squeeze above; this final pass is
    # kept only so the function mirrors its previous steps exactly.
    return trimmed.replace('  ', ' ')

# Turn the fetched result into clean text and print it.
# `line`, `lines` and `fetched_data` stay defined at module level in case
# code outside this section refers to them.
line = transcript_text
lines = [line]
fetched_data = "\n".join(lines)
clean_text = extract_from_fetched_transcript(fetched_data)

print("\n📝 Clean Transcript:")
print("=" * 40)
if clean_text:
    print(clean_text)
else:
    # Nothing was extracted. This happens when the fetch step returned an
    # error message or an "invalid URL" notice instead of transcript data;
    # show that result rather than a blank transcript so the failure is
    # visible.
    print(f"⚠️ No transcript text could be extracted. Fetch step returned: {transcript_text}")