In [146]:
import os
import json
import time
import pandas as pd

from bs4 import BeautifulSoup
from pydantic import BaseModel

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options

from groq import Groq

from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser

In [154]:
class MusicDetails(BaseModel):
    artist: str
    track: str
    title: str
    original_title: str

In [166]:
llm = ChatGroq(
    model_name="llama-3.3-70b-versatile",
    temperature=0.7
)

# parser = JsonOutputParser(pydantic_object={
#     "type": "object",
#     "properties": {
#         "artist": {"type": "string"},
#         "track": {"type": "string"},
#         "title": {"type": "string"},
#         "original_title": {"type": "string"},
#     }
# })

parser = JsonOutputParser(pydantic_object=MusicDetails)

prompt = ChatPromptTemplate.from_messages([
    ("system", """Extract music details into JSON with this structure:
        {{
            "artist": "artist name here",
            "track": "track name here",
            "title": "full title here, artist + track",
            "original_title": "original title here"
        }}"""),
    ("user", "{input}")
])

chain = prompt | llm | parser

def chat(description: str) -> dict:
    try:
        chain.invoke({"input": description})
    except:
        return {"artist": "", "track": "", "title": "", "original_title": description}

In [159]:
def normalize_content(html):
    title_base = []
    soup = bs(html, "html.parser")
    titles = soup.find_all(id="video-title")

    for title in titles:
        title_cell = title.get_text(strip=True)
        
        title_base.append(title_cell)
    
    return title_base

In [160]:
def get_page_content(url, full=False):
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Firefox(options=options)
    
    driver.get(url)
    time.sleep(3)
    
    if full:
        last_height = driver.execute_script("return document.documentElement.scrollHeight")

        while True:
            driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
            time.sleep(2)

            new_height = driver.execute_script("return document.documentElement.scrollHeight")
            
            if new_height == last_height:
                break
            
            last_height = new_height
    
    html = driver.page_source
    driver.quit()
    
    return normalize_content(html)

In [161]:
DOMAIN = 'https://www.youtube.com/'
CHANNEL = '@GreatStonedDragon'
URL = f'{DOMAIN}{CHANNEL}/videos'

In [163]:
titles = get_page_content(URL, True)

In [None]:
j_titles = []

for title in titles:
    if '||' in title:
        title = title.rsplit('||', 1)[0]

        response = chat(title)

        j_titles.append(response)
        print(response)

None
None
{'artist': '', 'track': '', 'title': '', 'original_title': 'Yo the switch 2 looks tight '}
{'artist': '', 'track': '', 'title': '', 'original_title': "Dragon is SLEEP TOKIN' || SLEEP TOKEN - TAKE ME BACK TO EDEN "}
{'artist': '', 'track': '', 'title': '', 'original_title': "Dragon is SLEEP TOKIN' || SLEEP TOKEN - RAIN "}
{'artist': '', 'track': '', 'title': '', 'original_title': "Dragon is SLEEP TOKIN' || SLEEP TOKEN - DYWTYLM "}
{'artist': '', 'track': '', 'title': '', 'original_title': "Dragon is SLEEP TOKIN' || SLEEP TOKEN - THE APPARITION "}
{'artist': '', 'track': '', 'title': '', 'original_title': "Dragon is SLEEP TOKIN' || SLEEP TOKEN - ARE YOU REALLY OKAY? "}
{'artist': '', 'track': '', 'title': '', 'original_title': "Dragon is SLEEP TOKIN' || SLEEP TOKEN - ASCENSIONISM "}
{'artist': '', 'track': '', 'title': '', 'original_title': "Dragon is SLEEP TOKIN' || SLEEP TOKEN - Vore "}
{'artist': '', 'track': '', 'title': '', 'original_title': 'My first time trying to sing '

In [145]:
for title in j_titles:
    print(title)

{"artist": "SLEEP TOKEN", "track": "EUCLID", "title": "SLEEP TOKEN - EUCLID", "original_title": "Dragon is SLEEP TOKIN' || SLEEP TOKEN - EUCLID + more"}
{"artist": "Sleep Token", "track": "Bluegrass", "title": "Sleep Token - Bluegrass", "original_title": "Bluegrass"}
{'artist': 'Unkown', 'track': 'Unkown', 'title': 'Unkown', 'original_title': 'Yo the switch 2 looks tight '}
{"artist": "SLEEP TOKEN", "track": "TAKE ME BACK TO EDEN", "title": "SLEEP TOKEN - TAKE ME BACK TO EDEN", "original_title": "TAKE ME BACK TO EDEN"}
{"artist": "SLEEP TOKEN", "track": "RAIN", "title": "SLEEP TOKEN - RAIN", "original_title": "Dragon is SLEEP TOKIN' || SLEEP TOKEN - RAIN"}
{'artist': 'Unkown', 'track': 'Unkown', 'title': 'Unkown', 'original_title': "Dragon is SLEEP TOKIN' || SLEEP TOKEN - DYWTYLM "}
{'artist': 'Unkown', 'track': 'Unkown', 'title': 'Unkown', 'original_title': "Dragon is SLEEP TOKIN' || SLEEP TOKEN - THE APPARITION "}
{'artist': 'Unkown', 'track': 'Unkown', 'title': 'Unkown', 'original_t

In [144]:
for title in j_titles:
    for key in ['artist', 'track', 'title']:
        title[key] = title[key].title()

    print(title)

IndentationError: expected an indented block after 'for' statement on line 2 (3378752758.py, line 3)

In [139]:
df = pd.json_normalize(j_titles)

df

Unnamed: 0,artist,track,title,original_title
0,Blood Incantation,The Stargate,Blood Incantation - The Stargate,The Stargate
1,Unknown,Unknown,Until Dawn Movie Trailer Reaction,UNTIL DAWN Movie trailer
2,The Warning,Queen Of The Murder Scene,The Warning - Queen Of The Murder Scene,Queen of the Murder Scene
3,The Browning,Blue (Da Ba Dee),The Browning - Blue (Da Ba Dee),Blue (Da Ba Dee)
4,Breaking Benjamin,Awaken,Breaking Benjamin - Awaken,BREAKING BENJAMIN - AWAKEN
5,King Gizzard And The Lizard Wizard,K.G.L.W.,King Gizzard And The Lizard Wizard - K.G.L.W.,K.G.L.W.
6,Sleep Token,Euclid,Sleep Token - Euclid,EUCLID
7,Bilmuri,Absolutelycrankinmymfinhog,Bilmuri - Absolutelycrankinmymfinhog,ABSOLUTELYCRANKINMYMFINHOG
8,Sleep Token,Missing Limbs,Sleep Token - Missing Limbs,Missing Limbs
9,Bad Omens And Too Close To Touch,Sympathy,Bad Omens And Too Close To Touch - Sympathy,Sympathy


In [316]:
response.sort()
titles = []

for title in response:
    new_title = None
    baked = 'i got baked'
    
    if baked in title.lower():
        if baked + ' and watched' in title.lower():
            continue
        
        if baked + ' and played' in title.lower():
            continue
        
        if 'live on' in title.lower():
            continue
        
        if 'live at' in title.lower():
            continue
        
        elif 'checked out new' in title.lower():
            new_title = title.split(' new ', maxsplit=1)[-1]
        elif 'checked out' in title.lower():
            new_title = title.split(' out ', maxsplit=1)[-1]
        elif 'reacted to' in title.lower() or 'listened to' in title.lower():
            new_title = title.split(' to ', maxsplit=1)[-1]
        elif 'bumped' in title.lower() or 'listened to' in title.lower():
            new_title = title.split(' bumped ', maxsplit=1)[-1]
        
        if not new_title:
            print(title)
            continue
        
        if '||' in new_title:
            parts = new_title.split('||')
            parts.pop()
            
            new_title = '-'.join(parts)
        
        new_title = new_title.title().replace('!', '')
        new_title = new_title.replace(' For The First Time ', '')
        
        content = {
            "title": title,
            "new_title": new_title.strip()
        }
        
        titles.append(content)
        
        
    
    # print(title)
    
df = pd.json_normalize(titles)

df


Unnamed: 0,title,new_title
0,I got baked and bumped KUBLAI KHAN TX || MUD |...,Kublai Khan Tx - Mud
1,I got baked and checked out BILMURI || THE END...,Bilmuri - The End
2,I got baked and checked out BILMURI || TOO LAT...,Bilmuri - Too Late
3,I got baked and checked out ELEINE for the fir...,Eleine- Never Forget
4,I got baked and checked out GORE. || Angels Li...,Gore. - Angels Like You
5,I got baked and checked out Holy Wars - 21st C...,Holy Wars - 21St Century Bitch
6,I got baked and checked out IMMINENCE - Death ...,Imminence - Death By A Thousand Cuts
7,I got baked and checked out IMMINENCE - Heaven...,Imminence - Heaven Shall Burn
8,I got baked and checked out IMMINENCE - TEMPTA...,Imminence - Temptation
9,I got baked and checked out JINJER - ROGUE || ...,Jinjer - Rogue


In [None]:
for item in response:
    ignore = ['trailer', 'movie', 'and watched', 'and played', 'live at', 'live on']
    
    title = item['title'].replace('for the first time', '').replace(' ! ', ' ')
    
    if any(keyword in title.lower() for keyword in ignore):
        continue
    
    if 'i got baked' in title.lower():
        continue
        parts = title.split(' to ', maxsplit=1)
        
        if len(parts) == 1:
            parts = title.split(' out ', maxsplit=1)
            
        if len(parts) == 1:
            parts = title.split(' bumped ', maxsplit=1)
        
        new_title = parts[-1]
        new_parts = new_title.split()
        
        
        if 'greatstone' in new_parts[-1].lower():
            new_parts.pop()
            
            new_title = ' '.join(new_parts)
            
        title = new_title
        
        continue
        
    print(title)
        

In [None]:
text = 'JINJER - ROGUE || GreatStonedReactions'

parts = text.split()

JINJER - ROGUE ||
