In [None]:
import sys
sys.path.insert(0, '..')
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
import json
import sys
import pandas as pd
import re
import time

In [None]:
from src.utils import get_full_url, AuditVideo

In [None]:
class Audit:
    """
    Namespace of settings for one audit run.

    Nothing here is instantiated; the rest of the notebook reads the values
    directly as ``Audit.<NAME>``.
    """

    # ---- Persona under audit ------------------------------------------
    # Age group of the simulated user: 'young' or 'old'.
    USER_AGE = 'young'
    # Finance content category: 'traditional', 'blockchain', 'mixed' or 'unrelated'.
    FINANCE_VIDEO_TYPE = 'traditional'

    # ---- Watch-time policy --------------------------------------------
    # True  -> watch WATCH_RATIO of each video's duration.
    # False -> watch every video for WATCH_DURATION seconds.
    WATCH_BY_RATIO = False
    WATCH_DURATION = 10
    WATCH_RATIO = 0.5

    # ---- Recommendation collection ------------------------------------
    # Number of recommendations to collect; very large values may throw errors.
    NUM_RECOMMENDATIONS = 10

    # ---- Seed video lists (CSV) ---------------------------------------
    YOUNG_SEED_AGE_VIDEO_PATH = '../data/seed/youtube/young_videos.csv'
    OLD_SEED_AGE_VIDEO_PATH = '../data/seed/youtube/old_videos.csv'
    SEED_FINANCE_VIDEO_PATH = '../data/seed/youtube/seed_videos.csv'
    # Alternative seed locations (currently disabled):
    # SEED_AGE_VIDEO_PATH = '../data/raw/youtube/seed_age_videos.csv'
    # SEED_FINANCE_VIDEO_PATH = '../data/raw/youtube/seed_finance_videos.csv'

    # Name of the seed-CSV column that holds the video id.
    VIDEO_ID_COLUMN = 'id'

    # ---- Output --------------------------------------------------------
    # Directory prefix under which audit result CSVs are written.
    AUDIT_RESULTS_PATH = '../data/audit/youtube/'

In [None]:
import datetime

# Preview of the finance seed list: read the CSV, then rewrite the
# 'HH:MM:SS' duration strings as whole seconds before displaying the frame.
df = pd.read_csv(Audit.SEED_FINANCE_VIDEO_PATH, index_col=0)
df['duration'] = (
    df['duration']
    .apply(lambda text: datetime.datetime.strptime(text, '%H:%M:%S').time())
    .apply(lambda clock: clock.hour * 3600 + clock.minute * 60 + clock.second)
)
df

In [None]:
#from config import audit

def get_age_seed_videos():
    """
    Load the age-persona seed videos selected by ``Audit.USER_AGE``.

    Returns:
        tuple: ``(video_urls, durations)`` where ``video_urls`` is a list
        built by passing every value of the ``Audit.VIDEO_ID_COLUMN`` column
        through ``get_full_url`` and ``durations`` is the array of the
        'duration' column converted to seconds.

    Raises:
        ValueError: If ``Audit.USER_AGE`` is neither 'young' nor 'old'.
    """
    # Pick the CSV first so the file is read in exactly one place.
    if Audit.USER_AGE == 'young':
        seed_path = Audit.YOUNG_SEED_AGE_VIDEO_PATH
    elif Audit.USER_AGE == 'old':
        seed_path = Audit.OLD_SEED_AGE_VIDEO_PATH
    else:
        raise ValueError('User age must be "young" or "old"')
    seeds = pd.read_csv(seed_path, index_col=0)

    def to_seconds(stamp):
        # 'HH:MM:SS' text -> number of seconds
        clock = datetime.datetime.strptime(stamp, '%H:%M:%S').time()
        return clock.hour * 3600 + clock.minute * 60 + clock.second

    seeds['duration'] = seeds['duration'].apply(to_seconds)
    watch_lengths = seeds['duration'].values

    # Filtering rows by a 'label' column equal to Audit.USER_AGE is currently disabled.
    urls = [get_full_url(vid) for vid in seeds[Audit.VIDEO_ID_COLUMN].values]
    return urls, watch_lengths

def get_finance_seed_videos():
    """
    Load the finance seed videos from ``Audit.SEED_FINANCE_VIDEO_PATH``.

    Returns:
        tuple: ``(video_urls, durations)`` where ``video_urls`` is a list
        built by passing every value of the ``Audit.VIDEO_ID_COLUMN`` column
        through ``get_full_url`` and ``durations`` is the array of the
        'duration' column converted to seconds.
    """
    seeds = pd.read_csv(Audit.SEED_FINANCE_VIDEO_PATH, index_col=0)
    # Filtering rows by a 'label' column equal to Audit.FINANCE_VIDEO_TYPE is currently disabled.

    def to_seconds(stamp):
        # 'HH:MM:SS' text -> number of seconds
        clock = datetime.datetime.strptime(stamp, '%H:%M:%S').time()
        return clock.hour * 3600 + clock.minute * 60 + clock.second

    seeds['duration'] = seeds['duration'].apply(to_seconds)
    watch_lengths = seeds['duration'].values

    urls = [get_full_url(vid) for vid in seeds[Audit.VIDEO_ID_COLUMN].values]
    return urls, watch_lengths

def process_durations_list(durations):
    """
    Turn full video durations into the number of seconds to watch each video.

    Args:
        durations: Iterable of video durations in seconds.

    Returns:
        list: One entry per input duration. When ``Audit.WATCH_BY_RATIO`` is
        true each entry is ``int(duration * Audit.WATCH_RATIO)``; otherwise
        every entry is ``Audit.WATCH_DURATION``.
    """
    if not Audit.WATCH_BY_RATIO:
        # Fixed watch time: the same value for every video.
        return [Audit.WATCH_DURATION for _ in durations]
    return [int(length * Audit.WATCH_RATIO) for length in durations]

# Quick check of the loader: as the last expression of the cell, the returned
# (video_urls, durations) pair for the current Audit.USER_AGE is displayed.
get_age_seed_videos()

In [None]:
from src.YTDriver import YTDriver

In [None]:
def run_audit():
    """
    Run one audit session and save the collected recommendations.

    Steps:
        1. Start a Firefox ``YTDriver`` and record the start time.
        2. Play the age seed videos, then save a first set of result CSVs
           via ``to_csv`` as a checkpoint.
        3. Play the finance seed videos.
        4. Close the browser and save the result CSVs again (same start
           time, hence the same file names as the checkpoint).

    Watch times come from ``process_durations_list`` and the number of
    recommendations requested is ``Audit.NUM_RECOMMENDATIONS``.

    The browser is closed even when a step raises; in that case the
    exception propagates and the final save is skipped.
    """
    # Record start time; it tags the result rows and names the output files.
    start_time = time.time()
    driver = YTDriver(browser='firefox', verbose=True)

    try:
        # Watch age seed videos
        age_seed_videos, age_seed_video_durations = get_age_seed_videos()
        age_seed_video_durations = process_durations_list(age_seed_video_durations)
        driver.play_list(
            age_seed_videos,
            age_seed_video_durations,
            homepage_interval=0,
            topn=Audit.NUM_RECOMMENDATIONS,
        )
        # Checkpoint after the age phase so those results are on disk
        # before the finance phase starts.
        to_csv(driver, start_time)

        # Watch finance videos
        finance_seed_videos, finance_seed_video_durations = get_finance_seed_videos()
        finance_seed_video_durations = process_durations_list(finance_seed_video_durations)
        driver.play_list(
            finance_seed_videos,
            finance_seed_video_durations,
            homepage_interval=10,
            topn=Audit.NUM_RECOMMENDATIONS,
        )
    finally:
        # Always release the browser, including on errors.
        # Only closes the browser, object and results are still available.
        driver.close()

    # Save results to csv
    to_csv(driver, start_time)
# NOTE: run_audit() calls to_csv, which is defined in a later cell of this
# notebook. On a fresh kernel that cell has to be executed before this line,
# otherwise the call fails with a NameError once it reaches to_csv.
run_audit()

In [None]:
def to_csv(driver, start_time):
    """
    Write the recommendations held by ``driver`` to two CSV files.

    Files are written under ``Audit.AUDIT_RESULTS_PATH`` as
    ``<start>-video_recs.csv`` and ``<start>-homepage_recs.csv``, where
    ``<start>`` is ``str(start_time)`` with the '.' removed. Each file gets
    three extra columns: 'Start Time', 'Age' and 'Finance Video Type'.

    Args:
        driver: Object exposing ``video_recs`` and ``homepage_recs``; each is
            passed to ``pd.DataFrame``.
        start_time: Start time of the audit run (``run_audit`` passes the
            value of ``time.time()``).
    """
    import os  # local import so this cell does not depend on the imports cell

    # Make sure the target directory exists so the save cannot fail on a
    # missing folder after a long audit run.
    os.makedirs(Audit.AUDIT_RESULTS_PATH, exist_ok=True)

    start_time_str = str(start_time).replace('.', '')  # Remove period from start time for filenames

    # Both result sets are tagged and written the same way; the attribute
    # name doubles as the file name suffix.
    for recs_name in ('video_recs', 'homepage_recs'):
        recs_df = pd.DataFrame(getattr(driver, recs_name))
        recs_df['Start Time'] = start_time
        recs_df['Age'] = Audit.USER_AGE
        recs_df['Finance Video Type'] = Audit.FINANCE_VIDEO_TYPE
        recs_df.to_csv(Audit.AUDIT_RESULTS_PATH + f'{start_time_str}-{recs_name}.csv', index=False)

In [None]:
def scrape_video(url):
    """
    Collect the titles and links of the videos listed on a YouTube page.

    Opens ``url`` in a new Chrome session, scrolls to the bottom of the page
    and reads every element whose id is 'video-title'.

    Args:
        url (str): Must be a url for a valid YouTube video

    Returns:
        list[dict]: One ``{'title', 'link', 'source'}`` dict per element
        found (the last element is skipped); ``source`` is ``url``.

    Raises:
        AssertionError: If 'youtube.com' is not part of ``url``.
    """
    assert('youtube.com' in url.lower())

    driver = webdriver.Chrome()
    try:
        driver.get(url)
        time.sleep(5)  # fixed pause before scrolling; presumably lets the page render
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        elements = driver.find_elements(By.ID, 'video-title')
        out = []
        for elem in elements[:-1]:  # Last element is not actually a video
            title = elem.get_attribute('innerHTML').strip()
            # The href is read from the element two levels above the title node.
            link = elem.find_element(By.XPATH, "./../..").get_attribute('href')

            out.append({'title': title, 'link': link, 'source': url})
    finally:
        # quit() ends the whole WebDriver session (close() would only close
        # the current window), and it runs even when scraping fails.
        driver.quit()

    print(out)
    return out

def scrape_ytdriver(filepath, recursions=5):
    """
    Walk YouTube recommendations from the homepage and save the visited URLs.

    Starting from the homepage videos, the first video of the current list is
    played for 10 seconds and its recommendations become the next list. This
    repeats up to ``recursions`` times and stops early when a list is empty.

    Args:
        filepath: Destination handed to ``list_to_csv``.
        recursions (int): Maximum number of play-and-collect rounds.

    Returns:
        list: 'https://www.youtube.com' followed by the ``url`` attribute of
        every collected video, in collection order.
    """
    driver = YTDriver(browser='firefox', verbose=True)

    videos = []
    try:
        cur_videos = driver.get_homepage()
        if not cur_videos:
            # Retry once when the first homepage fetch comes back empty.
            cur_videos = driver.get_homepage()
        videos += cur_videos or []

        for _ in range(recursions):
            if not cur_videos:
                # Nothing left to play; stop instead of failing on cur_videos[0].
                break
            driver.play(cur_videos[0], 10)

            cur_videos = driver.get_recommendations()
            videos += cur_videos or []
    finally:
        # Release the browser even when a step above raises.
        driver.close()

    video_urls = [video.url for video in videos]
    video_urls.insert(0, 'https://www.youtube.com')  # Demark start of sessions

    # NOTE(review): list_to_csv is neither defined nor imported in the visible
    # part of this notebook - presumably it lives in src.utils; confirm and import it.
    list_to_csv(video_urls, filepath)
    return video_urls