In [1]:
import argparse
import json
import logging
import math
import os
import re
import time
import urllib.parse
import webbrowser
from http.server import BaseHTTPRequestHandler, HTTPServer
from typing import List, Dict, Optional

import langchain
import pandas as pd
import requests
from bs4 import BeautifulSoup
from langchain.schema import OutputParserException
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from pydantic import BaseModel
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from tqdm import tqdm

In [None]:
# Spotify application settings.
#
# The account id, client id and client secret are read from the environment so
# that they are never stored in the notebook. Export SPOTIFY_USER_ID,
# SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET before starting the kernel.
# The literal values that used to live in this cell should be considered
# leaked and rotated.
user_id = os.environ.get("SPOTIFY_USER_ID", "")
client_id = os.environ.get("SPOTIFY_CLIENT_ID", "")
client_secret = os.environ.get("SPOTIFY_CLIENT_SECRET", "")

# Where Spotify sends the browser back after consent, and the permissions
# requested. NOTE(review): the redirect URI presumably has to match one that is
# registered for the Spotify app -- confirm in the developer dashboard.
redirect_uri = 'http://localhost:8888/callback'
scopes = 'user-read-private user-read-email playlist-modify-public'

def get_spotify_auth_code(client_id, redirect_uri, scopes):
    """Obtain a Spotify authorization code through the user's browser.

    Opens the Spotify consent page and waits for a single HTTP request on a
    temporary local server, which is expected to be the redirect carrying the
    ``code`` query parameter.

    Args:
        client_id: Client id of the Spotify application.
        redirect_uri: Redirect URI sent to Spotify. The local server listens on
            the host and port found in this URI, so the redirect actually
            reaches it (port 3000 is used when the URI names no port).
        scopes: Space-separated scopes to request.

    Returns:
        The authorization code as a string, or ``None`` when the request that
        arrived carried no ``code`` parameter.
    """
    auth_url = 'https://accounts.spotify.com/authorize'
    params = {
        'client_id': client_id,
        'response_type': 'code',
        'redirect_uri': redirect_uri,
        'scope': scopes
    }
    authorization_url = f"{auth_url}?{urllib.parse.urlencode(params)}"

    # Listen where the redirect will land instead of on a fixed port.
    redirect_target = urllib.parse.urlparse(redirect_uri)
    listen_host = redirect_target.hostname or 'localhost'
    listen_port = redirect_target.port or 3000

    class RequestHandler(BaseHTTPRequestHandler):
        def do_GET(self):
            query = urllib.parse.urlparse(self.path).query
            query_params = urllib.parse.parse_qs(query)
            if 'code' in query_params:
                self.send_response(200)
                self.end_headers()
                self.wfile.write(b"Autorizacao concluida! Pode fechar esta janela.")
                self.server.auth_code = query_params['code'][0]
            else:
                self.send_response(400)
                self.end_headers()
                self.wfile.write(b"Codigo de autorizacao nao encontrado.")

        def log_message(self, format, *args):
            # The default implementation writes the request line to stderr,
            # which would put the authorization code into the cell output.
            pass

    # The context manager closes the listening socket even if handling fails,
    # so the port is free again for the next run of the cell.
    with HTTPServer((listen_host, listen_port), RequestHandler) as server:
        server.auth_code = None
        # Open the browser only once the socket is bound, so the redirect
        # cannot arrive before anything is listening.
        webbrowser.open(authorization_url)
        print("Aguardando autorizacao...")
        server.handle_request()
        return server.auth_code

# Start the interactive consent flow; `code` is None when the callback carried
# no authorization code.
code = get_spotify_auth_code(
    client_id=client_id,
    redirect_uri=redirect_uri,
    scopes=scopes,
)


In [2]:
# Quieten the chatty third-party loggers first, then configure timestamped
# INFO-level logging for the notebook itself.
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("langchain").setLevel(logging.WARNING)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

In [3]:
# Gist ids are identifiers rather than credentials, so they are kept as
# defaults; a value already present in the environment takes precedence.
os.environ.setdefault('GIST_ID_ACCESS', '5269bb5c3d073e28f8f77921f8049f2e')
os.environ.setdefault('GIST_ID_DATA', '08c2efdb315b0504e9f2f5dbca6b020d')

# The GitHub personal access token must be supplied from outside the notebook
# (e.g. `export GIST_ACCESS_TOKEN=...` before starting the kernel). The token
# that used to be written in this cell should be considered leaked and revoked.
if not os.environ.get('GIST_ACCESS_TOKEN'):
    logging.warning(
        "GIST_ACCESS_TOKEN is not set; export it before running the cells that read the gists."
    )

In [4]:
# Read the gist settings from the environment in one pass; a missing variable
# raises KeyError naming it.
GIST_ID_CREDENTIAL, GIST_ID_DATA, GIST_TOKEN = (
    os.environ[variable]
    for variable in ('GIST_ID_ACCESS', 'GIST_ID_DATA', 'GIST_ACCESS_TOKEN')
)

In [5]:
def get_file_from_gist(gist_id: str, gist_token: str, file_name: str, timeout: float = 30.0) -> Optional[Dict]:
    """Download one file from a GitHub gist and parse it as JSON.

    Args:
        gist_id: Id of the gist to read.
        gist_token: GitHub token sent in the ``Authorization`` header.
        file_name: Name of the file inside the gist.
        timeout: Seconds to wait for GitHub before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON content of the file, ``{}`` when the gist has no file
        with that name, or ``None`` when the request fails, the content is not
        valid JSON, or the response lacks an expected key. Failures are logged.
    """
    headers = {'Authorization': f'token {gist_token}'}
    url = f'https://api.github.com/gists/{gist_id}'

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        files = response.json().get('files', {})
        if file_name not in files:
            return {}
        return json.loads(files[file_name]['content'])

    # Errors go through logging (configured by the notebook) rather than print.
    except requests.exceptions.RequestException as e:
        logging.error(f"Erro ao fazer a requisição: {e}")
    except json.JSONDecodeError as e:
        logging.error(f"Erro ao decodificar o JSON: {e}")
    except KeyError as e:
        logging.error(f"Erro: Chave ausente na resposta: {e}")

    return None

In [6]:
# Load the stored track report and the service credentials from their gists.
# get_file_from_gist returns None on failure and {} when the file is absent;
# an empty list keeps the report iterable in both cases.
database = get_file_from_gist(GIST_ID_DATA, GIST_TOKEN, 'report.json') or []
credentials = get_file_from_gist(GIST_ID_CREDENTIAL, GIST_TOKEN, "youtube-music-scrapping.json")

# Fail with an explicit message instead of a TypeError/KeyError further down.
if not credentials:
    raise RuntimeError(
        "Could not load 'youtube-music-scrapping.json' from the credentials gist; "
        "check GIST_ID_ACCESS and GIST_ACCESS_TOKEN."
    )

required_keys = ('user_id', 'client_id', 'client_secret', 'redirect_uri', 'scopes', 'groq_api_key')
missing_keys = [key for key in required_keys if key not in credentials]
if missing_keys:
    raise KeyError(f"Credentials gist is missing the keys: {', '.join(missing_keys)}")

user_id = credentials['user_id']
client_id = credentials['client_id']
client_secret = credentials['client_secret']
redirect_uri = credentials['redirect_uri']
scopes = credentials['scopes']

os.environ['GROQ_API_KEY'] = credentials['groq_api_key']

In [None]:
# Upload the serialized data as report.json of the data gist.
# NOTE(review): `data` is not assigned anywhere above this cell; it is
# presumably the list returned by app.run() -- confirm, and make sure it still
# holds the track records (not some other payload) when this cell runs.
content = json.dumps(data, indent=4)

payload = {
    "description": "Descrição atualizada do Gist",
    "files": {
        "report.json": {
            "content": content
        }
    }
}

# Kept under its own name so the Spotify request headers used by other cells
# are not replaced by the GitHub ones.
gist_headers = {
    "Authorization": f"token {GIST_TOKEN}",
    "Accept": "application/vnd.github.v3+json"
}

# Endpoint of the gist being updated (the original cell referenced this name
# without ever defining it).
json_file = f"https://api.github.com/gists/{GIST_ID_DATA}"

response = requests.patch(json_file, json=payload, headers=gist_headers, timeout=30)

if response.status_code == 200:
    print("Gist editado com sucesso!")
    # Only the link is shown; the full response repeats the whole gist content.
    print("Resposta:", response.json().get("html_url"))
else:
    print(f"Falha ao editar o Gist. Status code: {response.status_code}")
    # An error body is not guaranteed to be JSON, so show the raw text.
    print("Erro:", response.text)

In [None]:
# self.dataframe = self._define_dataframe(output)

# def _identify(self, title: str) -> bool:
#     if 'original_title' in self.dataframe.columns and title in self.dataframe['original_title'].values:
#         return True
#     return False

# def _define_dataframe(self, output: str) -> pd.DataFrame:
#     if not output.lower().endswith('.xlsx'):
#         raise ValueError(f"O arquivo de saída deve ter a extensão .xlsx. Arquivo fornecido: {output}")
    
#     if os.path.exists(output):
#         return pd.read_excel(output)
    
#     return pd.DataFrame()

In [8]:
class App():
    """Scrape video titles from a YouTube channel page and extract song metadata.

    On construction the channel page is loaded in headless Firefox, the video
    titles are collected, and titles already present in ``database`` are
    dropped. ``run()`` then sends every remaining title to a Groq-hosted chat
    model that answers with ``artist`` / ``track`` / ``title`` fields.
    """

    def __init__(self, channel_url: str, full: bool, output: str, database: List[Dict[str, str]], temperature: float):
        """Store the settings, scrape the channel and build the LLM chain.

        Args:
            channel_url: URL of the channel's videos page; must start with "http".
            full: When True, keep scrolling until the page stops growing.
            output: Output file name; stored but not used by this class.
            database: Previously processed records; entries whose
                ``original_title`` equals a scraped title are skipped.
            temperature: Sampling temperature passed to the chat model.

        Raises:
            ValueError: If ``channel_url`` does not start with "http".
        """
        self.channel_url = channel_url
        self.channel_name = self._extract_channel_name(channel_url)
        self.full = full
        self.output = output
        self.database = database
        self.titles = self._get_content()
        self.model_name = "llama-3.3-70b-versatile"
        self.temperature = temperature
        self.chat = self._instance_model()

    @staticmethod
    def _extract_channel_name(channel_url: str) -> str:
        """Return the channel handle found in the URL, without the leading "@".

        The "@handle" path segment is used wherever it sits, so URLs with or
        without a trailing "/videos" or "/" give the same name. URLs with no
        handle fall back to the second-to-last "/"-separated piece.
        """
        path = urllib.parse.urlparse(channel_url).path
        for segment in path.split("/"):
            if segment.startswith("@"):
                return segment[1:]

        pieces = channel_url.rsplit("/")
        fallback = pieces[-2] if len(pieces) >= 2 else pieces[-1]
        return fallback.replace("@", "")

    def _instance_model(self) -> callable:
        """Build the prompt | model | JSON-parser chain used to extract song details."""
        class MusicDetails(BaseModel):
            artist: str
            track: str
            title: str
            
        llm = ChatGroq(model_name=self.model_name, temperature=self.temperature)
        parser = JsonOutputParser(pydantic_object=MusicDetails)
        # The system prompt used to start with a stray double quote (four
        # opening quotes); it now starts directly with the instruction text.
        prompt = ChatPromptTemplate.from_messages([
            ("system", """You are a JSON extraction assistant. Always respond with a valid JSON using the structure below.\n
            If there are no explicit mentions of a song (artist name and/or track title), return unknown for the fields.

                {{
                    "artist": "artist name here",
                    "track": "track name here",
                    "title": "full title here, artist + track"
                }}"""),
            ("user", "{input}")
        ])
        
        return prompt | llm | parser
       
        
    def _identify(self, title: str) -> bool:
        """Return True when ``title`` is the ``original_title`` of a database record.

        A missing database (None / empty) and entries that are not dicts are
        treated as "not found" instead of raising.
        """
        return any(
            isinstance(entry, dict) and entry.get('original_title') == title
            for entry in (self.database or [])
        )
        

    def _parse(self, html: str) -> List[str]:
        """Extract the text of every ``id="video-title"`` element not yet in the database."""
        soup = BeautifulSoup(html, "html.parser")
        titles_elements = soup.find_all(id="video-title")
    
        titles = [title.get_text(strip=True) for title in titles_elements]
        
        return [title for title in titles if not self._identify(title)]


    def _get_content(self) -> List[str]:
        """Load the channel page in headless Firefox and return the new video titles.

        Raises:
            ValueError: If ``channel_url`` does not start with "http".
        """

        if not self.channel_url.startswith('http'):
            raise ValueError(f"O endereço informado é invalido. Endereço fornecido: {self.channel_url}")
        
        options = Options()
        options.add_argument("--headless")

        with webdriver.Firefox(options=options) as driver:
            driver.get(self.channel_url)
            # Give the page time to render its initial content.
            time.sleep(3)

            if self.full:
                last_height = driver.execute_script("return document.documentElement.scrollHeight")

                # Scroll to the bottom until the document height stops changing.
                while True:
                    driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
                    time.sleep(2)
                    new_height = driver.execute_script("return document.documentElement.scrollHeight")

                    if new_height == last_height:
                        break
                    last_height = new_height

            html = driver.page_source

        return self._parse(html)
    
    
    def _get_wait_time(self, error: str) -> int:
        """Return how many seconds to wait, read from a "try again in ..." hint.

        The duration after "try again in" is parsed as a sequence of
        number+unit tokens (``h``, ``m``, ``s``, ``ms``), e.g. "7.5s",
        "1m23.456s" or "450ms", and rounded up to whole seconds (at least 1).
        When no such hint is present a warning is logged and 120 is returned.
        """
        unit_seconds = {'h': 3600.0, 'm': 60.0, 's': 1.0, 'ms': 0.001}
        # "ms" is listed before "m" so that milliseconds are not read as minutes.
        token = r'\d+(?:\.\d+)?(?:ms|h|m|s)'

        hint = re.search(r'try again in\s+((?:' + token + r')+)', str(error))
        if hint is not None:
            total = sum(
                float(amount) * unit_seconds[unit]
                for amount, unit in re.findall(r'(\d+(?:\.\d+)?)(ms|h|m|s)', hint.group(1))
            )
            return max(1, math.ceil(total))

        logging.warning("Failed to extract wait time from error message.")
        return 120
    
    
    def _ask(self, description: str) -> Dict[str, str]:
        """Ask the model to extract song details from ``description``.

        A response that cannot be parsed yields "Unknown" fields right away.
        Any other error triggers one retry after the wait suggested by the
        error message; if that also fails, "Unknown" fields are returned.
        """
        default_response = {
            "artist": "Unknown",
            "track": "Unknown",
            "title": "Unknown",
        }
        
        try:
            return self.chat.invoke(input=description)
        except OutputParserException:
            logging.warning(f"Failed to parse response for: {description}")
            return default_response
        except Exception as error:
            wait_time = self._get_wait_time(error)
            logging.error(f"Error occurred. Retrying in {wait_time} seconds.")
            time.sleep(wait_time)
            try:
                return self.chat.invoke(input=description)
            except Exception:
                logging.error(f"Final failure for input: {description}")
                return default_response
            
            
    def _get_new_tracks(self) -> List[Dict[str, str]]:
        """Run the extraction for every scraped title and return the records.

        String fields are title-cased; ``original_title`` and ``channel`` are
        added to each record. Non-string field values are left as they are, and
        a response that is not a dict is replaced by "Unknown" fields.
        """
        new_tracks = []
        
        for title in tqdm(self.titles, desc="Extracting", ncols=80):
            extracted = self._ask(title)

            if not isinstance(extracted, dict):
                logging.warning(f"Unexpected response shape for: {title}")
                extracted = {"artist": "Unknown", "track": "Unknown", "title": "Unknown"}

            music_obj = {
                key: value.title() if isinstance(value, str) else value
                for key, value in extracted.items()
            }

            music_obj['original_title'] = title
            music_obj['channel'] = self.channel_name
            
            new_tracks.append(music_obj)
        
        return new_tracks
    
    
    def run(self):
        """Return the list of extracted records for the titles found at construction."""
        new_tracks = self._get_new_tracks()
        return new_tracks

In [9]:
# Run settings: which channel to scrape, whether to scroll through the whole
# page (full) or use only what loads initially, the output file name, and the
# sampling temperature for the model.
channel_url, full = "https://www.youtube.com/@GreatStonedDragon/videos", False
output, temperature = 'report.xlsx', 0.7

# Arguments are passed in the constructor's declared order.
app = App(channel_url, full, output, database, temperature)

data = app.run()

Extracting: 100%|███████████████████████████████| 10/10 [00:11<00:00,  1.17s/it]


In [10]:
# Shallow copy of the list returned by app.run(), kept under its own name.
tracks = list(data)

In [11]:
import urllib.parse
import webbrowser
from http.server import BaseHTTPRequestHandler, HTTPServer

def get_spotify_auth_code(client_id, redirect_uri, scopes):
    """Obtain a Spotify authorization code through the user's browser.

    Opens the Spotify consent page (always asking for consent again via
    ``prompt=consent``) and waits for a single HTTP request on a temporary
    local server, which is expected to be the redirect carrying the ``code``
    query parameter.

    Args:
        client_id: Client id of the Spotify application.
        redirect_uri: Redirect URI sent to Spotify. The local server listens on
            the host and port found in this URI, so the redirect actually
            reaches it (port 3000 is used when the URI names no port).
        scopes: Space-separated scopes to request.

    Returns:
        The authorization code as a string, or ``None`` when the request that
        arrived carried no ``code`` parameter.
    """
    auth_url = 'https://accounts.spotify.com/authorize'
    params = {
        'client_id': client_id,
        'response_type': 'code',
        'redirect_uri': redirect_uri,
        'scope': scopes,
        'prompt': 'consent'
    }
    authorization_url = f"{auth_url}?{urllib.parse.urlencode(params)}"

    # Listen where the redirect will land instead of on a fixed port.
    redirect_target = urllib.parse.urlparse(redirect_uri)
    listen_host = redirect_target.hostname or 'localhost'
    listen_port = redirect_target.port or 3000

    class RequestHandler(BaseHTTPRequestHandler):
        def do_GET(self):
            query = urllib.parse.urlparse(self.path).query
            query_params = urllib.parse.parse_qs(query)
            # The parsed query is deliberately not printed: it contains the
            # authorization code, which would be saved with the cell output.
            if 'code' in query_params:
                self.send_response(200)
                self.end_headers()
                self.wfile.write(b"Autorizacao concluida! Pode fechar esta janela.")
                self.server.auth_code = query_params['code'][0]
            else:
                self.send_response(400)
                self.end_headers()
                self.wfile.write(b"Codigo de autorizacao nao encontrado.")

        def log_message(self, format, *args):
            # The default implementation writes the request line to stderr,
            # which would put the authorization code into the cell output.
            pass

    # The context manager closes the listening socket even if handling fails,
    # so the port is free again for the next run of the cell.
    with HTTPServer((listen_host, listen_port), RequestHandler) as server:
        server.auth_code = None
        # Open the browser only once the socket is bound, so the redirect
        # cannot arrive before anything is listening.
        webbrowser.open(authorization_url)
        print("Aguardando autorizacao...")
        server.handle_request()  # Handles exactly one request, then returns
        return server.auth_code


In [None]:
# spotify_credentials = {
#     client_id, 
#     redirect_uri, 
#     scopes
# }

# code = get_spotify_auth_code(*spotify_credentials)
# code
code = get_spotify_auth_code(
    client_id=client_id,
    redirect_uri=redirect_uri,
    scopes=scopes,
)

In [17]:
# An authorization code is a credential and must not be pasted into the
# notebook. A manual override can be supplied through SPOTIFY_AUTH_CODE;
# otherwise the code obtained by the consent flow (if any) is kept.
# NOTE(review): authorization codes are presumably single-use and short-lived,
# so a stored one would stop working anyway -- confirm against Spotify's docs.
code = os.environ.get("SPOTIFY_AUTH_CODE") or globals().get("code")

In [13]:
# Exchange the authorization code for an access token.
url = 'https://accounts.spotify.com/api/token'

# Named `token_request` rather than `data`: `data` is used elsewhere in the
# notebook for the scraped records, and this dict contains the client secret.
token_request = {
    'grant_type': 'authorization_code',
    'code': code,
    'redirect_uri': redirect_uri,
    'client_id': client_id,
    'client_secret': client_secret
}

response = requests.post(url, data=token_request, timeout=30)
# Stop here on a rejected exchange instead of continuing with a missing token.
response.raise_for_status()

access_token = response.json().get('access_token')
if not access_token:
    raise RuntimeError("Spotify token response did not contain an access_token.")

NameError: name 'code' is not defined

In [19]:
# Bearer header for the Spotify Web API and the playlists endpoint of the user.
headers = {'Authorization': 'Bearer {}'.format(access_token)}
playlist_url = 'https://api.spotify.com/v1/users/{}/playlists'.format(user_id)

In [21]:
def get_playlist(playlist_url, headers) -> str:
    """Return the id of the 'Youtube Scrapping' playlist, creating it if needed.

    Every page of the user's playlists is checked (following the ``next`` link
    of each response) and the first playlist whose name contains
    'Youtube Scrapping' wins. A new playlist is created only after all pages
    were searched without a match, including when the user has no playlists.

    Args:
        playlist_url: Playlists endpoint of the user; used both to list and to
            create playlists.
        headers: Request headers carrying the authorization.

    Returns:
        The id of the existing or newly created playlist.

    Raises:
        requests.HTTPError: If a request to the API is rejected.
    """
    page_url = playlist_url

    while page_url:
        page_response = requests.get(page_url, headers=headers, timeout=30)
        page_response.raise_for_status()
        page = page_response.json()

        for playlist in page.get('items') or []:
            if playlist and 'Youtube Scrapping' in (playlist.get('name') or ''):
                return playlist['id']

        page_url = page.get('next')

    created = requests.post(playlist_url, headers=headers, timeout=30, json={
        'name': 'Youtube Scrapping',
        'description': 'Musicas que foram retiradas do Youtube',
        'public': True
    })
    created.raise_for_status()
    return created.json()['id']

In [22]:
# Id of the target playlist on the user's account.
playlist_id = get_playlist(playlist_url=playlist_url, headers=headers)

In [112]:
search_url = 'https://api.spotify.com/v1/search'

# Defined here so the cell does not depend on a cell further down the notebook.
add_item_url = f'https://api.spotify.com/v1/playlists/{playlist_id}/tracks'
get_itens_url = add_item_url

# Read the playlist once, following the `next` links, instead of re-downloading
# only its first page for every track.
existing_tracks = set()
page_url = get_itens_url
while page_url:
    page_response = requests.get(page_url, headers=headers, timeout=30)
    page_response.raise_for_status()
    page = page_response.json()

    for item in page.get('items') or []:
        playlist_track = item.get('track')
        # Entries without track data or without artists cannot be compared.
        if playlist_track and playlist_track.get('artists'):
            existing_tracks.add((playlist_track['name'], playlist_track['artists'][0]['name']))

    page_url = page.get('next')

for track in tracks:
    artist = track['artist']
    music = track['track']

    # "Unknown" is what the extractor returns when it found no song; searching
    # for it would only add an unrelated track.
    if 'Unknown' in (artist, music):
        continue

    search_response = requests.get(search_url, headers=headers, timeout=30, params={
        'q': f'artist:{artist} track:{music}',
        'type': 'track'
    })
    if not search_response.ok:
        print(f"Erro ao buscar música: {search_response.status_code} - {search_response.text}")
        continue

    music_metadata = search_response.json().get('tracks', {}).get('items', [])
    if not music_metadata:
        continue

    best_match = music_metadata[0]
    music_uri = best_match['uri']
    new_track = best_match['name']
    new_artist = best_match['artists'][0]['name']

    if (new_track, new_artist) in existing_tracks:
        continue

    response = requests.post(add_item_url, headers=headers, timeout=30, json={'uris': [music_uri]})

    if response.status_code in (200, 201):
        # Remember the addition so a repeated title is not added twice.
        existing_tracks.add((new_track, new_artist))
        print(f"Adicionado: {new_track} - {new_artist}")
    else:
        print(f"Erro ao adicionar música: {response.status_code} - {response.text}")


In [104]:
# Adding items and listing items use the same playlist endpoint, so the URL is
# built once and shared by both names.
add_item_url = 'https://api.spotify.com/v1/playlists/{}/tracks'.format(playlist_id)
get_itens_url = add_item_url

In [None]:
# Manual probe: look up a single artist/track pair on Spotify.
# Change the two values below to probe a different song.
artist = 'Bilmuri'
music = 'Emptyhanded'

# NOTE(review): the original cell read `metadata` without defining it; this
# lookup mirrors the search performed in the playlist loop -- confirm that is
# what was intended.
metadata = requests.get('https://api.spotify.com/v1/search', headers=headers, timeout=30, params={
    'q': f'artist:{artist} track:{music}',
    'type': 'track'
}).json().get('tracks', {}).get('items', [])

# None (instead of an unbound name) when the search returned nothing.
music_to_add = metadata[0] if metadata else None

music_to_add

In [None]:
# Export the extracted track records to a spreadsheet. The frame is built from
# the list of records; `content` holds a JSON string, which the DataFrame
# constructor rejects.
df = pd.DataFrame(tracks)
df.to_excel("output.xlsx")