In [47]:
import os
import json
import time
import argparse

import pandas as pd

from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options

from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser

In [51]:
parser = argparse.ArgumentParser(description='Scrape music data from a YouTube channel')
parser.add_argument('--url', type=str, help='URL to scrape chat data from')
parser.add_argument('--output', type=str, help='Output file path')
parser.add_argument('--temperature', type=float, default=0.5, help='Temperature for the model')
parser.add_argument('--full', type=bool, default=False, help='Scrape all videos data if True')

_StoreAction(option_strings=['--full'], dest='full', nargs=None, const=None, default=False, type=<class 'bool'>, choices=None, required=False, help='Scrape all videos data if True', metavar=None)

In [50]:
import sys

sys.argv = [
    "notebook",
    "--url", "https://example.com",
    "--output", "output.json",
    "--temperature", "0.8",
    "--full", "True"
]

args = parser.parse_args()

# Verifique os valores
print("URL:", args.url)
print("Output file path:", args.output)
print("Temperature:", args.temperature)
print("Full scrape:", args.full)


URL: https://example.com
Output file path: output.json
Temperature: 0.8
Full scrape: True


In [3]:
DOMAIN = 'https://www.youtube.com/'
CHANNEL = '@GreatStonedDragon'
URL = f'{DOMAIN}{CHANNEL}/videos'

In [None]:
class App:
    def __init__(
        self,
        url: str,
        full: bool=False,
        model_name: str="llama-3.3-70b-versatile", 
        temperature: float=0.7
        ):
        
        self.url = url
        self.full = full
        self.model_name = model_name
        self.temperature = temperature
        self.llm = ChatGroq(model_name=model_name, temperature=temperature)

        self.parser = JsonOutputParser(pydantic_object={
            "type": "object",
            "properties": {
                "artist": {"type": "string"},
                "track": {"type": "string"},
                "title": {"type": "string"},
                "original_title": {"type": "string"},
            }
        })
        
        self.prompt = ChatPromptTemplate.from_messages([
            ("system", """"Extract music details into JSON with this structure:
                {{
                    "artist": "artist name here",
                    "track": "track name here",
                    "title": "full title here, artist + track",
                    "original_title": "original title here"
                }}"""),
            ("user", "{input}")
        ])
        
        self.chain = self.prompt | self.llm | self.parser

        self.titles = self._get_content()
    

    def _parse(self, html: str) -> dict:
        titles = []
        soup = BeautifulSoup(html, "html.parser")
        titles_element = soup.find_all(id="video-title")

        for title in titles_element:
            f_title = title.get_text(strip=True)
            
            titles.append(f_title)
        
        return titles
    

    def _get_content(self) -> list:
        options = Options()
        options.add_argument("--headless")
        driver = webdriver.Firefox(options=options)
        
        driver.get(self.url)
        time.sleep(3)
        
        if self.full:
            last_height = driver.execute_script("return document.documentElement.scrollHeight")

            while True:
                driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
                time.sleep(2)

                new_height = driver.execute_script("return document.documentElement.scrollHeight")
                
                if new_height == last_height:
                    break
                
                last_height = new_height
        
        html = driver.page_source
        driver.quit()
        
        return self._parse(html)
    

    def _ask(self, description: str) -> dict:
        return self.chain.invoke({"input": description})
    
    
app = App(URL)
app.titles