In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import pandas as pd
import datetime
import os

# Get the current date and time
current_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H-%M-%S")
if not os.path.exists(current_datetime):
    os.makedirs(current_datetime)

In [3]:
print(current_datetime)

2024-08-11 19-27-06


## Initialize WebDriver

In [4]:
# Initialize WebDriver with the path to ChromeDriver using Service
driver_path = r"C:\Users\bayaz\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"
service = Service(driver_path)
driver = webdriver.Chrome(service=service)

## Get Video Title from chanels

In [5]:
def get_video_title_from_channel_url(channel_url):    
    # Open YouTube channel
    # channel_url = "https://www.youtube.com/@Apple"
    driver.get(channel_url)
    
    # Wait for the page to load
    time.sleep(5)
    
    # Scroll down to load more content
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)
    
    # Example: Scraping the number of subscribers
    try:
        subscribers = driver.find_element(By.XPATH, '//span[contains(@class, "yt-content-metadata-view-model-wiz__metadata-text") and contains(text(), "subscribers")]').text
        print(f"Subscribers: {subscribers}")
    except Exception as e:
        print(f"Error finding subscribers: {e}")
    
    # Example: Scraping recent video statistics
    videos = driver.find_elements(By.XPATH, '//*[@id="video-title"]')
    
    print(len(videos))
    return videos

# get_video_title_from_channel_url('https://www.youtube.com/@amazon')


## Load the video page

In [6]:
def get_html_from_video_link(video_url):
      
    # Open the YouTube video page
    driver.get(video_url)
    
    # Wait for the page to load
    time.sleep(5)
    # Pause the video using JavaScript
    pause_script = """
        var video = document.querySelector('video');
        if (video) {
            video.pause();
        }
    """
    driver.execute_script(pause_script)
    
    time.sleep(5)
    
    # Get the page source
    html = driver.page_source
    # print(html)
    return html



## Get Number

In [7]:
def get_number(str):
    # Dictionary to map suffixes to their respective multipliers
    multipliers = {'k': 1_000, 'K': 1_000, 'm': 1_000_000, 'M': 1_000_000, 'b': 1_000_000_000, 'B': 1_000_000_000}

    if not str.strip():
        return 0
    elif str[-1] in multipliers:
        # Extract the number and the suffix
        num = float(str[:-1].replace(',', '.'))
        multiplier = multipliers[str[-1]]
        return int(num * multiplier)
    else:
        # If there's no suffix, just return the integer value
        return int(str.replace(',', ''))

# print(get_number('  '))

## Find the views Count

In [8]:
def get_views_count(_soup):
    ### Find the views count element
    try:
        info_element = _soup.find('yt-formatted-string', {'id': 'info'})
        if info_element:
            views_element = info_element.find('span', {'class': 'bold style-scope yt-formatted-string'})
            if views_element:
                views_text = views_element.get_text(strip=True)
                # print(f"Views: {views_text}")
            else:
                print("Could not find views element.")
                return 0
        else:
            print("Could not find info element.")
            return 0
    except Exception as e:
        print(f"Error finding views: {e}")
        return 0

    return get_number(views_text.split(' ')[0])


## Find the likes count

In [9]:
def get_like_count(_soup):
    ### Finding the likes on a video
    
    factoid_element = _soup.find('factoid-renderer', {'class': 'YtwFactoidRendererHost'})
    try:
        # Extract the number of likes
        if factoid_element:
            like_count_element = factoid_element.find('span', {'class': 'yt-core-attributed-string yt-core-attributed-string--white-space-pre-wrap'})
            if like_count_element:
                like_count = like_count_element.get_text(strip=True)
                # print(f"Likes: {like_count}")
            else:
                print("Could not find like count element.")
                return 0
        else:
            print("Could not find factoid element.")
            return 0
    except Exception as e:
        print(f"Error finding views: {e}")
        return 0

    return get_number(like_count)


## Find the comment count

In [10]:
def get_comment_count(_soup):
    ### find comments
    # Find the element containing the comment count
    comment_count_element = _soup.findAll('yt-formatted-string', {'class': 'style-scope ytd-engagement-panel-title-header-renderer'}, id='contextual-info')
    # print(f'Comment count e : {comment_count_element}')
    try:
        # Extract the text from the element containing the number of comments
        if comment_count_element[1]:
            comment_count = comment_count_element[1].text.strip()
            # print(f"Number of comments: {comment_count}")
            # print(comment_count_element[1].text)
        else:
            print("Could not find the comment count element.")
            return 0
    except Exception as e:
        print(f"Error finding views: {e}")
        return 0
    

    return get_number(comment_count)


## Load Channel Data 

In [13]:
# channel_url = "https://www.youtube.com/@IBM"
# channel_name = "IBM"
def load_channel_data(channel_name, channel_url):
    videos_title = get_video_title_from_channel_url(channel_url)
    # get_video_title_from_channel_url('https://www.youtube.com/@amazon')
    
    video_title_list = list()
    video_url_list = list()
    for video in videos_title[:5]:
        video_title = video.get_attribute('title')
        video_url = video.get_attribute('href')
        video_title_list.append(video_title)
        video_url_list.append(video_url)
    
    # print(video_title_list)
    # print(video_url_list)
    
    info = list()
    for ind in range(len(video_url_list)):
        # video_title = video.get_attribute('title')
        # video_url = video.get_attribute('href')
        # print(f'title : {video_title}')
        html = get_html_from_video_link(video_url_list[ind])
        time.sleep(3)
        # Parse with BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')
        views = get_views_count(soup)
        likes = get_like_count(soup)
        comments = get_comment_count(soup)
        info.append([video_title_list[ind], video_url_list[ind], views, likes, comments])
    
        print(f'ind : {ind} {video_title_list[ind]} views : {views} likes : {likes}, comments : {comments}')
        # driver.back()
        # time.sleep(1)

    # Convert the list of lists into a DataFrame
    df = pd.DataFrame(info, columns=['Video Title', 'Video URL', 'views', 'likes', 'comments'])
    
    # Specify the file name
    filename = f"{current_datetime}/{channel_name}_data.csv"
    
    # Save the DataFrame to a CSV file
    df.to_csv(filename, index=False)
    
    print(f"Data saved to {filename}")

# load_channel_data('densy', 'https://www.youtube.com/@disneychannel')


Subscribers: 9.28M subscribers
60
ind : 0 Primos Chibi Tiny Tales | NEW CHIBI SHORT | The Summer of Silenciosa | @disneychannel views : 11000 likes : 348, comments : 0
Could not find views element.
Could not find factoid element.
Error finding views: list index out of range
ind : 1 🔴 LIVE! | Wizards of Waverly Place Season 1 Full Episodes | 21 THROWBACK Episodes | @disneychannel views : 0 likes : 0, comments : 0
ind : 2 Wizards Beyond Waverly Place First-Look 👀🪄 | NEW SERIES! | @disneychannel views : 312000 likes : 14000, comments : 0
ind : 3 Moon Girl’s Lab Shorts #8 | Save the Moon 🌕 | Marvel’s Moon Girl and Devil Dinosaur |@disneychannel views : 739698 likes : 428, comments : 0
ind : 4 Descendants 3 As Told By Chibi  #DisneyDescendants #ChibiTinyTales #DisneyChannel views : 35000 likes : 1700, comments : 0
Data saved to 2024-08-11 19-27-06/densy_data.csv


In [12]:
# Load the Excel file
file_path = 'data/data.csv'

try:
    # Attempt to read the file
    channels = pd.read_csv(file_path, encoding='ISO-8859-1')  # Adjust encoding if necessary

    # Assuming the columns are named 'Brand' and 'YouTube Link'
    if 'BRANDS' in channels.columns and 'YouTube' in channels.columns:
        for index, row in channels.iterrows():
            brand = row['BRANDS']
            youtube_link = row['YouTube']
            print(f"{index} Brand: {brand}, YouTube Link: {youtube_link} Going to Load: ")
            load_channel_data(brand, youtube_link)
    else:
        print("The required columns are not found in the CSV file.")
except Exception as e:
    print(f"An error occurred: {e}")
driver.quit()

0 Brand: Apple, YouTube Link: https://www.youtube.com/@Apple Going to Load: 


KeyboardInterrupt: 

In [None]:
# df = pd.DataFrame([[2,1],[4,5]], columns=['Video Title', 'Video URL'])
# filename = f"2024-08-11 18-40-26/Apple_data.csv"
# df.to_csv(filename, index=False)