# **Data Scraping**

### **Selenium**

In [None]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.14.0-py3-none-any.whl (9.9 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/9.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/9.9 MB[0m [31m5.9 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/9.9 MB[0m [31m27.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m9.3/9.9 MB[0m [31m89.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m72.0 MB/s[0m eta [36m0:00:00[0m
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.22.2-py3-none-any.whl (400 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.2/400.2 kB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocke

In [None]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import pandas as pd
import time
from scipy import stats
from datetime import datetime
import matplotlib.pyplot as plt

In [None]:
def web_driver():
    """Build a headless Chrome WebDriver from a fixed set of command-line flags.

    Original author's note: this setup is intended to resolve driver
    compatibility issues. No driver path or service object is supplied; only
    the options assembled here are passed to ``webdriver.Chrome``.

    Returns:
        The object returned by ``webdriver.Chrome(options=...)``.
    """
    chrome_flags = (
        "--verbose",
        "--no-sandbox",
        "--headless",
        "--disable-gpu",
        # The value must be a single "width,height" token: the original had a
        # space after the comma ("1920, 1200").
        "--window-size=1920,1200",
        "--disable-dev-shm-usage",
    )

    options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        options.add_argument(flag)

    return webdriver.Chrome(options=options)

In [None]:
# Start the browser session. Later cells (navigation, scrolling, scraping)
# use this module-level `driver` directly.
driver = web_driver()

In [None]:
# Open the channel page in the browser session.
# NOTE(review): the scraping cell further down navigates to this same URL
# again before collecting videos, so this call may be redundant - confirm.
driver.get("https://www.youtube.com/@UnfoldDataScience")

In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService

In [None]:
# URL of the YouTube channel page to scrape.
channel_url = "https://www.youtube.com/@UnfoldDataScience"
# NOTE(review): web_driver() builds Chrome from options only and never receives
# this service object; it appears unused in the visible cells, and the path is
# still a placeholder - confirm whether it is needed.
chrome_service = ChromeService("path/to/chromedriver")  # Replace with the path to your ChromeDriver executable

def scroll_down(pause=2, max_scrolls=1):
    """Scroll the page in the module-level ``driver`` to the bottom.

    With the defaults this performs exactly one scroll followed by a
    two-second pause, as before. Pass ``max_scrolls > 1`` to keep scrolling so
    that lazily loaded content can appear; scrolling then stops early as soon
    as the document height no longer changes between scrolls.

    Args:
        pause: Seconds to wait after each scroll.
        max_scrolls: Upper bound on the number of scrolls performed.

    Returns:
        None.
    """
    scroll_js = "window.scrollTo(0, document.documentElement.scrollHeight);"
    height_js = "return document.documentElement.scrollHeight"

    # Height is only tracked when more than one scroll is requested, so the
    # default call issues the same single script as the original.
    track_height = max_scrolls > 1
    previous_height = driver.execute_script(height_js) if track_height else None

    for _ in range(max_scrolls):
        driver.execute_script(scroll_js)
        time.sleep(pause)

        if track_height:
            current_height = driver.execute_script(height_js)
            if current_height == previous_height:
                break  # nothing new was loaded by the last scroll
            previous_height = current_height

def _parse_count(token):
    """Convert a count token such as "1,234", "15K" or "1.2M" to an int."""
    suffix_multipliers = {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}
    cleaned = token.replace(",", "")
    multiplier = suffix_multipliers.get(cleaned[-1:].upper())
    if multiplier is None:
        return int(cleaned)
    # round() guards against float artefacts such as 2.3 * 1000 == 2299.99...
    return int(round(float(cleaned[:-1]) * multiplier))


def scrape_video_info(video_element):
    """Extract title, views, likes, upload date and comments for one video.

    The title is read from the ``title`` attribute of the child with id
    ``video-title``. All other fields are taken from the whitespace-separated
    text of the first descendant carrying both the ``style-scope`` and
    ``ytd-grid-video-renderer`` classes: token 0 is the view count, token 2
    the like count, the third-from-last token the comment count and the last
    token the upload date.

    NOTE(review): those token positions are kept from the original code and
    assume a specific text layout - confirm against the live page markup.

    Args:
        video_element: Element exposing ``find_element(by, value)``.

    Returns:
        dict with keys "Title", "Views", "Likes", "Upload Date", "Comments".

    Raises:
        IndexError: if the metadata text has fewer tokens than expected.
        ValueError: if a count token is not numeric.
    """
    title = video_element.find_element(By.ID, "video-title").get_attribute("title")

    # By.CLASS_NAME accepts a single class only; with two space-separated names
    # it ends up as a descendant selector. A CSS selector matches an element
    # that has both classes. One lookup is enough: every field comes from the
    # same element's text.
    metadata = video_element.find_element(
        By.CSS_SELECTOR, ".style-scope.ytd-grid-video-renderer"
    )
    tokens = metadata.text.split()

    return {
        "Title": title,
        "Views": _parse_count(tokens[0]),
        "Likes": _parse_count(tokens[2]),
        "Upload Date": tokens[-1],
        "Comments": _parse_count(tokens[-3]),
    }

# Scrape the channel page, keep the videos uploaded inside the date window and
# save them to youtube_data.csv.
# The browser is closed in `finally` so that a failure while scraping does not
# leave a headless Chrome process running.
try:
    driver.get(channel_url)
    time.sleep(5)  # give the page time to render before scrolling

    scroll_down()

    video_elements = driver.find_elements(By.ID, "dismissable")

    window_start = pd.Timestamp("2019-09-10")
    window_end = pd.Timestamp("2023-09-10")

    filtered_videos = []
    for video_element in video_elements:
        video_info = scrape_video_info(video_element)
        upload_date = pd.to_datetime(video_info["Upload Date"])
        if window_start <= upload_date <= window_end:
            filtered_videos.append(video_info)
finally:
    driver.quit()

df = pd.DataFrame(filtered_videos)

df.to_csv("youtube_data.csv", index=False)

In [None]:
def calculate_average_views_last_30_days(df, reference_date=None, window_days=30):
    """Return the mean of "Views" for videos uploaded within a recent window.

    A row is included when its "Upload Date" is on or after
    ``reference_date - window_days``. No upper bound is applied, matching the
    original behaviour.

    Args:
        df: DataFrame with "Upload Date" (anything ``pd.to_datetime`` accepts)
            and "Views" columns.
        reference_date: End of the window. Defaults to the current local time;
            pass an explicit value for reproducible results.
        window_days: Length of the window in days (default 30).

    Returns:
        The mean view count, or NaN when no row falls inside the window.
    """
    reference = pd.Timestamp(datetime.now() if reference_date is None else reference_date)
    cutoff = reference - pd.Timedelta(days=window_days)

    upload_dates = pd.to_datetime(df["Upload Date"])
    recent_views = df.loc[upload_dates >= cutoff, "Views"]
    return recent_views.mean()

In [None]:
def find_video_with_highest_ratio(df):
    """Return the row(s) with the highest likes-to-views ratio.

    The input frame is left untouched; the ratio is attached to a copy, so the
    returned frame still carries a "Likes-to-Views Ratio" column. Rows with
    zero (or negative) views get a NaN ratio instead of infinity and therefore
    can never be selected as the maximum.

    Args:
        df: DataFrame with numeric "Likes" and "Views" columns.

    Returns:
        DataFrame holding every row tied for the highest ratio, including the
        "Likes-to-Views Ratio" column. Empty when no ratio can be computed.
    """
    views = df["Views"]
    # where() blanks out non-positive view counts so the division yields NaN
    # rather than inf, which max() then skips.
    ratio = df["Likes"] / views.where(views > 0)

    scored = df.assign(**{"Likes-to-Views Ratio": ratio})
    return scored[ratio == ratio.max()]

In [None]:
def find_likes_dislikes_correlation(df):
    """Return the correlation between the "Likes" and "Comments" columns.

    Despite the function's name, no dislike data is read: the value is
    ``Series.corr`` of "Likes" against "Comments".
    """
    likes = df["Likes"]
    comments = df["Comments"]
    return likes.corr(comments)

In [None]:
def find_most_common_upload_day(df):
    """Return the weekday name on which most videos were uploaded.

    The input frame is not modified: weekday names are derived from a parsed
    copy of the "Upload Date" column.

    Args:
        df: DataFrame with an "Upload Date" column that ``pd.to_datetime`` can
            parse.

    Returns:
        The most frequent weekday name (e.g. "Monday"). When several days tie,
        the first entry of ``Series.mode()`` is returned. ``None`` when the
        frame has no usable dates.
    """
    upload_days = pd.to_datetime(df["Upload Date"]).dt.day_name()
    most_frequent = upload_days.mode()
    if most_frequent.empty:
        return None
    return most_frequent.iloc[0]

In [None]:
def detect_view_count_outliers(df, threshold=3):
    """Return the rows whose "Views" z-score exceeds ``threshold`` in magnitude.

    Missing view counts are ignored when computing the mean and standard
    deviation, so a single NaN no longer turns every z-score into NaN (which
    silently produced an empty result). Rows with a missing view count are
    never reported as outliers.

    Args:
        df: DataFrame with a numeric "Views" column.
        threshold: Absolute z-score above which a row is flagged (default 3).

    Returns:
        DataFrame containing only the flagged rows, with the same columns as
        ``df``.
    """
    views = df["Views"].astype(float).to_numpy()
    z_scores = stats.zscore(views, nan_policy="omit")
    # Comparisons against NaN are False, so rows without a view count drop out.
    is_outlier = abs(z_scores) > threshold
    return df[is_outlier]

In [None]:
def load_data(file_path):
    """Read the CSV at ``file_path`` into a DataFrame using pandas' defaults."""
    frame = pd.read_csv(file_path)
    return frame

In [None]:
# Analysis driver: loads the scraped CSV, prints each summary statistic and
# draws a box plot of the view counts.
if __name__ == "__main__":
    # Same file name the scraping cell writes its results to.
    youtube_data = load_data("youtube_data.csv")

    # Mean view count of recently uploaded videos, printed with two decimals.
    average_views_last_30_days = calculate_average_views_last_30_days(youtube_data)
    print(f"Average views count per video in the last 30 days: {average_views_last_30_days:.2f}")

    # Row(s) with the highest likes-to-views ratio; only title and ratio are shown.
    highest_ratio_video = find_video_with_highest_ratio(youtube_data)
    print("Video with the highest likes-to-views ratio:")
    print(highest_ratio_video[["Title", "Likes-to-Views Ratio"]])

    # Despite the helper's name, this is the likes-vs-comments correlation,
    # as the printed message states.
    likes_comments_correlation = find_likes_dislikes_correlation(youtube_data)
    print(f"Correlation between likes and comments: {likes_comments_correlation:.2f}")

    # Weekday name that occurs most often among the upload dates.
    most_common_day = find_most_common_upload_day(youtube_data)
    print(f"Most common day of the week for video uploads: {most_common_day}")

    # Rows flagged by the z-score based outlier check on "Views".
    view_count_outliers = detect_view_count_outliers(youtube_data)
    print("Outliers in views count:")
    print(view_count_outliers[["Title", "Views"]])

    # Box plot of every video's view count.
    plt.boxplot(youtube_data["Views"])
    plt.title("View Count Boxplot")
    plt.show()