In [1]:
from selenium.webdriver.chrome.options import Options
from os import path, listdir, rename, getcwd
from fake_useragent import UserAgent
from dateutil.parser import parse
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import pickle
import time

In [2]:
def chrome_user_agent():
    """Return the `chrome` user-agent string offered by a fresh UserAgent instance."""
    agent_source = UserAgent()
    return agent_source.chrome

# Display one sample user-agent string as the cell output.
chrome_user_agent()

'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36'

In [3]:
def chrome_driver():
    """Build a Chrome WebDriver with a fixed window size and a spoofed user agent.

    Returns:
        A `webdriver.Chrome` instance configured through `Options`.
    """
    chrome_options = Options()

    # Headless mode is intentionally left off; add '--headless' to this tuple to enable it.
    launch_arguments = (
        "window-size=1600,1080",
        f'user-agent={chrome_user_agent()}',
    )
    for argument in launch_arguments:
        chrome_options.add_argument(argument)

    return webdriver.Chrome(options=chrome_options)

In [4]:
def socialblade(channel, driver):
    """Load a channel's SocialBlade page and return its <body> markup.

    Args:
        channel: Channel identifier appended to the SocialBlade channel URL.
        driver: Selenium WebDriver used to fetch the page.

    Returns:
        The prettified HTML of the page's <body> element, as a string.
    """
    driver.get(f'https://socialblade.com/youtube/channel/{channel}')

    # Wait *before* reading page_source: the random pause both spaces out
    # requests and gives late-rendering content time to appear in the DOM.
    # Previously the snapshot was taken first, so the wait could not help it.
    time.sleep(np.random.randint(4, 7))

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    return soup.find('body').prettify()

In [5]:
def save_html(channel, soup):
    """Write a downloaded page to ../download/socialblade/new/<channel>.html.

    Args:
        channel: Channel identifier; used as the file name (without extension).
        soup: Page content; it is converted with str() before being written.
    """
    # Pin the encoding explicitly: without it open() falls back to the
    # platform default, which is not UTF-8 everywhere and can fail on
    # non-ASCII channel names or descriptions.
    with open(f"../download/socialblade/new/{channel}.html", "w", encoding="utf-8") as f:
        f.write(str(soup))

In [6]:
def open_html(channel):
    """Return the full text of a saved page.

    Despite its name, `channel` is passed straight to open(), so callers
    must supply a file path rather than a bare channel identifier.
    """
    with open(channel, 'r') as html_file:
        contents = html_file.read()
    return contents

In [7]:
def check_channel(channel, channels_dict, driver=None):
    """Download a channel's page unless the channel is already tracked.

    Args:
        channel: Channel identifier to look up.
        channels_dict: Mapping of channel identifier -> status string.
            It is updated in place when a download happens.
        driver: Optional Selenium WebDriver to reuse. When omitted, a driver
            is created with chrome_driver() for this call and shut down
            afterwards.

    Returns:
        The same `channels_dict` object that was passed in.
    """
    # Already tracked (in any status): nothing to download.
    if channel in channels_dict:
        return channels_dict

    # socialblade() requires a driver; the previous call omitted it and
    # raised TypeError for every new channel.
    owns_driver = driver is None
    if owns_driver:
        driver = chrome_driver()

    try:
        save_html(channel, socialblade(channel, driver))
    finally:
        # Only dispose of a driver this call created itself.
        if owns_driver:
            driver.quit()

    channels_dict[channel] = 'downloaded'
    return channels_dict

In [8]:
def load_channels_dict():
    """Unpickle and return the object stored at ../data/channels_dict."""
    with open('../data/channels_dict', "rb") as handle:
        return pickle.load(handle)

In [9]:
def save_channels_dict(channels_dict):
    """Pickle `channels_dict` to ../data/channels_dict, replacing any existing file."""
    serialized = pickle.dumps(channels_dict)
    with open('../data/channels_dict', "wb") as handle:
        handle.write(serialized)

In [10]:
def reset_channels_dictionary():
    """Rebuild every channel's status from the files present on disk.

    All channels in the saved dictionary are first set to 'pending'. Then
    each `.html` file found under ../download/socialblade/{new,valid,invalid}
    sets its channel to 'downloaded', 'valid' or 'invalid' respectively.
    Folders are scanned in that order, so a later folder wins if a channel
    appears in more than one. The result is written back with
    save_channels_dict().
    """
    my_dict = load_channels_dict()

    for key in my_dict:
        my_dict[key] = 'pending'

    folder_status = (
        ('new', 'downloaded'),
        ('valid', 'valid'),
        ('invalid', 'invalid'),
    )

    for folder, status in folder_status:
        for file_name in listdir(f'../download/socialblade/{folder}'):
            # Only saved pages count. Blindly slicing off five characters
            # turned stray entries (e.g. hidden files or checkpoint folders)
            # into bogus channel keys.
            if file_name.endswith('.html'):
                my_dict[file_name[:-len('.html')]] = status

    save_channels_dict(my_dict)

In [14]:
my_dict = load_channels_dict()

# Tally how many channels currently sit in each known status.
statuses = list(my_dict.values())
p_counter = statuses.count('pending')
v_counter = statuses.count('valid')
i_counter = statuses.count('invalid')
d_counter = statuses.count('downloaded')

print(f'pending - {p_counter}')
print(f'valid - {v_counter}')
print(f'invalid - {i_counter}')
print(f'downloaded - {d_counter}')

pending - 0
valid - 616
invalid - 19
downloaded - 877


In [15]:
def download_missing_channels():
    """Download the page of every channel whose status is 'pending'.

    Opens a Chrome driver, visits the SocialBlade home page once, then
    fetches and saves each pending channel. After every successful download
    the channel is marked 'downloaded' and the dictionary is saved, so an
    interrupted run keeps its progress.
    """
    driver = chrome_driver()

    try:
        driver.get('https://socialblade.com')
        # Fixed pause after the first page load before any channel is requested.
        time.sleep(7)

        channels_dict = load_channels_dict()

        # Snapshot the work list up front so statuses can be updated freely
        # while looping.
        pending_channels = [
            channel
            for channel, status in channels_dict.items()
            if status == 'pending'
        ]

        for channel in pending_channels:
            save_html(channel, socialblade(channel, driver))

            channels_dict[channel] = 'downloaded'
            save_channels_dict(channels_dict)
    finally:
        # quit() ends the whole browser session, not just the window, and
        # runs even when a download raises so no Chrome process is left behind.
        driver.quit()

In [16]:
# Fetch and save the page of every channel still marked 'pending'
# (drives a Chrome browser through Selenium).
download_missing_channels()

In [14]:
def _locate_channel_html(channel):
    """Return the path of the saved page for `channel`.

    The location this notebook has always read from is tried first, followed
    by the `new/` folder that save_html writes to and the `valid/` folder
    that reset_channels_dictionary scans.
    NOTE(review): the folder layout is inferred from those two functions;
    confirm where processed files are expected to live.
    """
    candidates = [
        f'../download/socialblade/{channel}.html',
        f'../download/socialblade/new/{channel}.html',
        f'../download/socialblade/valid/{channel}.html',
    ]
    for candidate in candidates:
        if path.exists(candidate):
            return candidate

    # Nothing found: hand back the historical location so the caller's
    # open() raises the same FileNotFoundError as before.
    return candidates[0]


def _extract_channel_row(channel, channel_soup):
    """Pull one channel's summary fields out of its parsed page as a list."""
    top_info = channel_soup.find_all('div', {'class': 'YouTubeUserTopInfo'})

    name = channel_soup.find('div', {'id': 'YouTubeUserTopInfoBlockTop'}).find('h1').text.strip()
    uploads = int(top_info[0].find_all('span')[1].text.replace(',', ''))
    subscribers = top_info[1].find_all('span')[1].text.strip()
    total_views = int(top_info[2].find_all('span')[1].text.replace(',', ''))
    created = parse(top_info[5].find_all('span')[1].text.strip()).date()
    daily_avg_sub = channel_soup.find('div', {'id': 'averagedailysubs'}).text.strip().replace(',', '')
    daily_avg_views = channel_soup.find('div', {'id': 'averagedailyviews'}).text.strip().replace(',', '')

    return [channel, name, uploads, subscribers, total_views, created, daily_avg_sub, daily_avg_views]


def process_channels():
    """Classify downloaded channels and extract the data of all valid ones.

    A 'downloaded' channel becomes 'valid' when its page contains the
    monthly video-views graph container and 'invalid' otherwise. Channels in
    any other status keep that status. Every channel that is 'valid'
    (newly or already) contributes one row. The updated dictionary is saved
    before returning.

    Returns:
        A list of rows, each
        [channel, name, uploads, subscribers, total_views, created,
         daily_avg_sub, daily_avg_views].
    """
    channels_dict = load_channels_dict()

    my_list = []

    # Only values of existing keys are reassigned below, so iterating the
    # live dictionary is safe.
    for channel, status in channels_dict.items():
        if status not in ('downloaded', 'valid'):
            continue

        # Parse each page once, with the same parser socialblade() uses.
        channel_soup = BeautifulSoup(open_html(_locate_channel_html(channel)), 'html.parser')

        if status == 'downloaded':
            # The 'invalid' branch belongs to the graph check. It used to
            # hang off the status check, which flagged every non-downloaded
            # channel as invalid and never flagged graph-less pages.
            if channel_soup.find('div', {'id': 'graph-youtube-monthly-vidviews-container'}):
                status = 'valid'
            else:
                status = 'invalid'
            channels_dict[channel] = status

        if status == 'valid':
            my_list.append(_extract_channel_row(channel, channel_soup))

    save_channels_dict(channels_dict)

    return my_list
    

In [15]:
# Column order must match the row layout returned by process_channels().
df_columns = ['channel', 'name', 'uploads', 'subscribers', 'total_views', 'created', 'daily_avg_sub', 'daily_avg_views']

channel_rows = process_channels()
channels_df = pd.DataFrame(channel_rows, columns=df_columns)
channels_df = channels_df.set_index('channel')

In [16]:
# Preview the parsed channel table as the cell output.
channels_df

Unnamed: 0_level_0,name,uploads,subscribers,total_views,created,daily_avg_sub,daily_avg_views
channel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
UCSF0PqIaps7jindnj6ICTfQ,EyeQew,370,6.64K,1179397,2019-05-17,+12,+2124
UCiDLfLRyN09V7k14MLf_56Q,MxghtyJxstin,390,26.8K,3945939,2019-07-07,+67,+21948
UCIHpnxcP9qGozuEJMYwrJWA,iSighttt,27,245,12160,2020-03-21,+1,+88
UCG-3jh0W8E4igpKFU2IV8UA,Shoobie and Toys,60,1.23K,630801,2019-11-07,+1,+2553
UC1r60D56JuOPv8BBUM0t8qA,Creeper Noob - Minecraft,356,376K,35097982,2020-01-16,+667,+80302
...,...,...,...,...,...,...,...
UCJ3eQY5XU9IqFWymrQlDDAA,Retr0 _ONG,49,20,250,2019-12-20,--,--
UCwhYxesK3zq7ML41AlBsqVg,BloodRodd3097,94,246,15483,2020-02-24,+1,+60
UCvCKPrYpQdO6wtiT9G4RJLw,ChickenDock6549 Gaming,81,155,8108,2019-05-28,+1,+45
UCUKEGjiSoFnemBsCLKrdeJA,BoardGameCo,548,25.6K,3624607,2019-12-11,+74,+14521


In [17]:
# Normalise the daily-average columns: '--' (no data) becomes '0' and the
# leading '+' sign is dropped.
# The result is assigned back to the frame instead of using inplace=True on
# `channels_df.<column>`: that chained form operates on a temporary Series,
# warns in recent pandas and does not modify the frame under Copy-on-Write.
for column in ('daily_avg_sub', 'daily_avg_views'):
    channels_df[column] = (
        channels_df[column]
        .replace('--', '0')
        .replace(r'\+', '', regex=True)  # raw string: '\+' is an invalid escape otherwise
    )

In [18]:
def _abbreviated_count_to_int(value):
    """Convert a count such as '245', '6.64K' or '1.2M' to an int.

    The product is rounded rather than truncated, because float arithmetic
    can land just below the intended integer (e.g. 2.01 * 1e3). Values that
    are already numeric pass through unchanged, so the cell can be re-run.
    """
    suffix_multipliers = {'K': 1e3, 'M': 1e6}
    text = str(value).strip()
    multiplier = suffix_multipliers.get(text[-1:].upper())
    if multiplier is None:
        return int(round(float(text)))
    return int(round(float(text[:-1]) * multiplier))


# Parse the scraped text directly instead of building expressions for
# pd.eval: evaluating scraped strings is unnecessary and unsafe, and the same
# K/M handling now applies to all three columns.
for column in ('subscribers', 'daily_avg_sub', 'daily_avg_views'):
    channels_df[column] = channels_df[column].map(_abbreviated_count_to_int).astype(int)

In [19]:
# Channel age in whole days, measured from the creation date to the day this cell runs.
today = pd.Timestamp.today().date()
channels_df['age'] = (today - channels_df['created']).dt.days

In [20]:
# Keep only the numeric columns, in this order.
feature_columns = ['total_views', 'uploads', 'subscribers', 'daily_avg_sub', 'daily_avg_views', 'age']
channels_df = channels_df[feature_columns]

In [21]:
def save_channels_df(df):
    """Pickle `df` to ../data/channels_df, replacing any existing file.

    Args:
        df: The object to persist (the channels DataFrame in this notebook).
    """
    with open('../data/channels_df', "wb") as f:
        # Dump the argument itself; the previous body pickled the global
        # `channels_df` and silently ignored whatever was passed in.
        pickle.dump(df, f)

In [22]:
# save_channels_df(channels_df)