# Import Libraries

In [None]:
import pandas as pd
import requests
import json
import re
import time
import os
import csv
import random
import pickle
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from multiprocessing import Pool

# Helper Functions

In [None]:
def make_url(class_type_list):
    """Build the Skillshare browse URL for every class type.

    Key: class type
    Value: the rating-sorted "see all" browse URL for that type
           (a single string, with the class type substituted for '%')
    """
    url_template = 'https://www.skillshare.com/browse/%?sort=rating&seeAll=1'
    return {
        class_type: url_template.replace('%', class_type)
        for class_type in class_type_list
    }

In [None]:
def get_class_link(class_types, urls_dict, scroll_pause=120):
    """Collect the class links listed on each class type's browse page.

    For every class type a Chrome session is opened on
    ``urls_dict[class_type]``, the page is scrolled to the bottom until its
    height stops growing, and the ``href`` of the first link inside the first
    ``<p>`` of every class card is collected.

    Args:
        class_types: iterable of class type names to process.
        urls_dict: mapping of class type -> browse page URL.
        scroll_pause: seconds to wait after each scroll before measuring the
            page height again (defaults to the original 120 seconds).

    Returns:
        dict mapping each processed class type to its list of class links.
        The same entries are also stored in the module-level
        ``class_links_dict`` (created if it does not exist yet), which is
        where the original implementation left its results.

    Relies on a module-level ``chromedriver`` value that is passed to
    ``webdriver.Chrome``.
    """
    # Imported locally so the block does not depend on a new file-level import.
    from selenium.common.exceptions import WebDriverException

    collected = {}
    # Keep the historical side effect, but do not fail with a NameError after
    # a long scrape just because the global was never initialised.
    shared_results = globals().setdefault('class_links_dict', {})

    for class_type in class_types:
        print('Working on getting list of ' + class_type + ' classes')
        url = urls_dict[class_type]

        driver = webdriver.Chrome(chromedriver)
        try:
            driver.get(url)

            # A popup may cover the page; closing it is best-effort.
            try:
                close = driver.find_element('xpath', "//a[contains(@class,'btn-close ss-icon-close')]")
                close.click()
            except WebDriverException:
                pass

            # Keep scrolling until the document height stops changing.
            load_count = 0
            lastHeight = driver.execute_script("return document.body.scrollHeight")
            while True:
                load_count += 1
                print("Load ", load_count)
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(scroll_pause)
                newHeight = driver.execute_script("return document.body.scrollHeight")
                if newHeight == lastHeight:
                    break
                lastHeight = newHeight

            page_source = driver.page_source
        finally:
            # Always end the browser session, even when scraping fails.
            driver.quit()

        soup = BeautifulSoup(page_source, 'html.parser')
        class_link_tags = soup.find_all('div', attrs={"class": "col-4 class-column rendered "})

        class_links = []
        for class_link_tag in class_link_tags:
            paragraph = class_link_tag.find('p')
            anchor = paragraph.find('a') if paragraph is not None else None
            # Skip cards without a usable link instead of aborting the run.
            if anchor is not None and anchor.get('href'):
                class_links.append(anchor['href'])

        collected[class_type] = class_links
        shared_results[class_type] = class_links

    return collected

In [None]:
def get_links_from_txt(file):
    """Read class links from a comma-separated text file.

    Each non-blank line is split on commas. The first item is the class type
    and every following item is a link belonging to it. Surrounding
    whitespace, square brackets and single quotes are stripped from every
    item, so a line such as ``fine-art,['url1', 'url2']`` is accepted.

    Args:
        file: path of the file to read.

    Returns:
        dict mapping class type -> list of links. If a class type appears on
        several lines, the last line wins.

    Raises:
        OSError: if the file cannot be opened.
    """
    def clean(item):
        # Same stripping order as before: whitespace, '[', ']', quote, whitespace.
        return item.strip().strip("[").strip("]").strip("'").strip()

    urls_dict = {}
    with open(file) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines used to create a bogus '' class type.
                continue
            class_type, *links = [clean(item) for item in line.split(',')]
            urls_dict[class_type] = links
    return urls_dict

In [None]:
# get javascript that contains class info
def get_javascript_data(soup):
    """Extract the JSON assigned to ``SS.serverBootstrap`` in the page scripts.

    Looks at every ``<script type="text/javascript">`` tag, and for each one
    whose text mentions ``SS.serverBootstrap =`` takes the text after
    ``"SS.serverBootstrap = "`` up to the first ``";\\n        "`` and parses
    it as JSON.

    Returns:
        The parsed JSON value (from the last matching script if several
        match), or None when there are no script tags, no script matches, or
        a matching script cannot be split/parsed.
    """
    projects_section = soup.find_all('script', attrs={"type": "text/javascript"})

    if not projects_section:
        return None

    # Must be initialised: without it, pages with script tags but no
    # bootstrap assignment raised UnboundLocalError at the final return.
    javascript_data = None

    for js in projects_section:
        if not js.string or not re.search('.*SS.serverBootstrap =.*', js.text):
            continue
        try:
            bootstrap_text = js.string.split("SS.serverBootstrap = ", 1)[1]
            javascript_data = json.loads(bootstrap_text.split(";\n        ")[0])
        except (IndexError, ValueError):
            # IndexError: marker (with trailing space) not present in js.string.
            # ValueError: the extracted text is not valid JSON.
            return None

    return javascript_data

# get the parts of the page title (presumably class name, author, site name)
def get_title_and_author(soup):
    """Return the '|'-separated parts of the page ``<title>``, stripped.

    Returns an empty list when the page has no ``<title>`` (or ``soup`` is
    not a parsed page), so callers must check before indexing.
    """
    try:
        title = soup.find('title')
        return [item.strip() for item in title.text.split('|')]
    except (AttributeError, TypeError):
        # AttributeError covers a missing <title> (find() returned None).
        return []

# get teacher detail info from the javascript data
def get_detial_author_info(javascript_data):
    """Return the teacher's ``(fullName, headline, profileUrl)``.

    The values are read from
    ``javascript_data['pageData']['sectionData']['teacherInfo']``.
    Returns ``(None, None, None)`` when ``javascript_data`` is None or any of
    the keys is missing.
    """
    try:
        teacherInfo = javascript_data['pageData']['sectionData']['teacherInfo']
        return teacherInfo['fullName'], teacherInfo['headline'], teacherInfo['profileUrl']
    except (LookupError, TypeError):
        # LookupError: missing key; TypeError: None / non-subscriptable value.
        return None, None, None
    
# get class description info
def get_class_description(soup):
    """Measure the class description block of a class page.

    The description is the ``div.rich-content-wrapper`` inside
    ``div.about-this-class``.

    Returns:
        ``(text_length, image_count, link_count)`` where ``text_length`` is
        the number of characters of description text, ``image_count`` the
        number of ``<img>`` tags and ``link_count`` the number of ``<a>`` tags
        inside the description. ``(None, None, None)`` when the description
        block is not found.

    NOTE: the previous implementation concatenated ``.text`` of every
    descendant tag, so text inside nested tags (e.g. a link inside a
    paragraph) was counted once per enclosing tag. The text is now counted
    exactly once, so lengths can be smaller than previously scraped values.
    """
    try:
        description_tag = (soup.find('div', attrs={"class": "about-this-class"})
                           .find('div', attrs={"class": "rich-content-wrapper"}))

        description_length = len(description_tag.get_text())
        images = len(description_tag.find_all('img'))
        hyperlinks = len(description_tag.find_all('a'))

        return description_length, images, hyperlinks

    except (AttributeError, TypeError):
        # AttributeError: one of the find() calls returned None.
        return None, None, None
            
# get class length
def get_video_length(soup):
    """Return the stripped text of the page's ``div.summary`` element.

    The text is returned as-is (it is not split into a video count and a
    duration). Returns None when the element is missing.
    """
    try:
        summary_tag = soup.find('div', attrs={"class": "summary"})
        # A single strip() already removes newlines and other whitespace.
        return summary_tag.text.strip()
    except (AttributeError, TypeError):
        return None


# get tags linked to classes
def get_tags(soup):
    """Return the stripped text of every link inside ``div.tags-section``.

    Returns an empty list when the page has no tags section.
    """
    try:
        tags_section = soup.find('div', attrs={"class": "tags-section"})
        return [tag.text.strip() for tag in tags_section.find_all('a')]
    except (AttributeError, TypeError):
        # AttributeError: no tags section on the page (find() returned None).
        return []
        
# get authors of the projects submitted for classes
def get_project_authors(javascript_data):
    """Return the author full names of the class's top projects.

    Reads ``javascript_data['pageData']['sectionData']['topProjects']`` and
    returns each project's ``['author']['fullName']``. Returns an empty list
    when ``javascript_data`` is None or any expected key is missing.
    """
    try:
        top_projects = javascript_data['pageData']['sectionData']['topProjects']
        return [project['author']['fullName'] for project in top_projects]
    except (LookupError, TypeError):
        return []

# get class sku as index
def get_class_sku(javascript_data):
    """Return ``javascript_data['classData']['sku']``, or None if unavailable."""
    try:
        return javascript_data['classData']['sku']
    except (LookupError, TypeError):
        # LookupError: missing key; TypeError: javascript_data is None.
        return None

# identify if class is a premium class
def isPremium(javascript_data):
    """Return whether the class header tag reads ``'Premium class'``.

    Returns True/False based on
    ``javascript_data['pageData']['headerData']['tagText']``, or None when
    that value is unavailable.
    """
    try:
        return javascript_data['pageData']['headerData']['tagText'] == 'Premium class'
    except (LookupError, TypeError):
        return None

# get class start date
def get_start_date(javascript_data):
    """Return ``javascript_data['pageData']['syllabusData']['startTs']``.

    The value is returned unchanged. Returns None when ``javascript_data``
    is None or the keys are missing.
    """
    try:
        return javascript_data['pageData']['syllabusData']['startTs']
    except (LookupError, TypeError):
        return None

# get enrollment number
def get_enrollment_number(javascript_data):
    """Return ``javascript_data['pageData']['sectionData']['numStudents']``.

    Returns None when ``javascript_data`` is None or the keys are missing.
    """
    try:
        return javascript_data['pageData']['sectionData']['numStudents']
    except (LookupError, TypeError):
        # The original ``try`` had no ``except`` clause, which is a SyntaxError.
        return None

def request_soup(url, timeout=30):
    """Download *url* with a random User-Agent and return it parsed with lxml.

    Args:
        url: page to download.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        BeautifulSoup object built from the response text. The HTTP status
        code is not checked here.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    ua = UserAgent()
    user_agent = {'User-agent': ua.random}
    response = requests.get(url, headers=user_agent, timeout=timeout)
    return BeautifulSoup(response.text, "lxml")

# Data Scraping

In [None]:
# Load the previously saved class links; fall back to generating the
# browse-page URLs when the saved file cannot be read.
file = 'class_urls-1.csv'

try:
    urls_dict = get_links_from_txt(file)
except OSError:
    # Only a missing/unreadable file triggers the fallback; other errors
    # (e.g. a bug in the parser) are no longer silently hidden.
    # NOTE: make_url maps each class type to ONE browse-page URL string,
    # not to a list of class links like get_links_from_txt does.
    urls_dict = make_url(class_types)
    # newline='' is what the csv module expects for files it writes to;
    # without it extra blank rows can appear on some platforms.
    with open('class_list_urls.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        for key, val in urls_dict.items():
            writer.writerow([key, val])

In [None]:
# Field names recorded for every scraped class, in output order.
columns = [
    'class_name',
    'teacher',
    'teacher_title',
    'teacher_profile',
    'description_length',
    'description_image_number',
    'description_link_number',
    'class_length',
    'tags',
    'sample_project',
    'class_sku',
    'paid_class',
    'start_date',
    'enrollment_number',
]

# One independent, initially empty list per column.
class_data = {}
for col in columns:
    class_data[col] = []

In [None]:
def _fetch_page(url):
    """Download *url* and return ``(status_code, soup)``.

    Defined in this cell because ``request_soup`` returns only the parsed
    page, while the block check below also needs the HTTP status code.
    """
    headers = {'User-agent': UserAgent().random}
    response = requests.get(url, headers=headers, timeout=30)
    return response.status_code, BeautifulSoup(response.text, "lxml")


def _is_blocked(status_code, soup):
    """Return True for a non-200 response or an 'Access denied' page title."""
    if status_code != 200:
        return True
    title = soup.find('title')
    # A page without <title> is not treated as an "Access denied" page.
    return title is not None and re.search('Access denied.*', title.text) is not None


def _save_class_data(class_type, class_data):
    """Pickle *class_data* to ``<class_type>.pickle`` and close the file."""
    with open(class_type + ".pickle", "wb") as pickle_out:
        pickle.dump(class_data, pickle_out)


# Scrape every class page of every class type and pickle one dict of column
# lists per class type.
for class_type in class_types:

    print("working on ", class_type)
    class_data = {col: [] for col in columns}

    i = 0

    for this_class in urls_dict[class_type]:
        # Randomised delay between requests.
        time.sleep(5+2*random.random())

        try:
            status_code, soup = _fetch_page(this_class)
        except Exception as err:
            # Best-effort scraping: skip pages that cannot be downloaded.
            print("skipping", this_class, "-", err)
            continue

        if not soup:
            continue

        # While the response is not a normal page: checkpoint once, then wait
        # five minutes between retries.
        # NOTE(review): a URL that never returns 200 is retried forever.
        isSaved = False
        while _is_blocked(status_code, soup):
            print("stop here")

            if not isSaved:
                _save_class_data(class_type, class_data)
                isSaved = True

            time.sleep(60*5)
            try:
                status_code, soup = _fetch_page(this_class)
            except Exception as err:
                # Keep the previous (blocked) result so the loop retries.
                print("retry failed for", this_class, "-", err)

        i += 1
        if i%10 == 0:
            print(i)

        javascript_data = get_javascript_data(soup)
        title_parts = get_title_and_author(soup)
        teacher, teacher_title, teacher_profile = get_detial_author_info(javascript_data)
        description_length, description_image_number, description_link_number = get_class_description(soup)

        # Build the whole row first so every column list grows by exactly one
        # entry per class and the columns cannot drift out of alignment.
        row = {
            # get_title_and_author returns [] when the page has no title.
            'class_name': title_parts[0] if title_parts else None,
            'teacher': teacher,
            'teacher_title': teacher_title,
            'teacher_profile': teacher_profile,
            'description_length': description_length,
            'description_image_number': description_image_number,
            'description_link_number': description_link_number,
            'class_length': get_video_length(soup),
            'tags': get_tags(soup),
            'sample_project': teacher in get_project_authors(javascript_data),
            'class_sku': get_class_sku(javascript_data),
            'paid_class': isPremium(javascript_data),
            'start_date': get_start_date(javascript_data),
            'enrollment_number': get_enrollment_number(javascript_data),
        }
        for col in columns:
            class_data[col].append(row[col])

    _save_class_data(class_type, class_data)

# __Appendix__

In [None]:
# Class type names; each one is used as a path segment of the browse URL and
# as the base name of the per-type pickle file.
class_types = ['fine-art',
               'photography',
               'graphic-design',
               'illustration',
               'writing',
               'music-production',
               'animation',
               'ui-ux-design',
               'film-production',
               'marketing',
               'entrepreneurship',
               'productivity',
               'finance',
               'freelance',
               'business-analytics',
               'management',
               'leadership',
               'sales',
               'human-resources',
               'accounting',
               'web-development',
               'mobile-development',
               'it-security',
               'data-science',
               'game-design',
               'product-management',
               'crafts',
               'culinary',
               'health-and-wellness',
               'other',
               'teaching',
               'home-business',
               'languages',
               'gaming']

# Use a context manager so the file is flushed and closed right away.
with open("class_types.pickle", "wb") as class_types_file:
    pickle.dump(class_types, class_types_file)