In [1]:
import pandas as pd
import requests
import json
import re
import time
import os
import csv
import random
import pickle
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from multiprocessing import Pool

In [2]:
# Skillshare browse-category slugs handled by this notebook.
# make_url substitutes each slug into the browse URL template, and the
# scraping loop further down uses the same slugs as keys into urls_dict.
class_types = ['fine-art',
               'photography',
               'graphic-design',
               'illustration',
               'writing',
               'music-production',
               'animation',
               'ui-ux-design',
               'film-production',
               'marketing',
               'entrepreneurship',
               'productivity',
               'finance',
               'freelance',
               'business-analytics',
               'management',
               'leadership',
               'sales',
               'human-resources',
               'accounting',
               'web-development',
               'mobile-development',
               'it-security',
               'data-science',
               'game-design',
               'product-management',
               'crafts',
               'culinary',
               'health-and-wellness',
               'other',
               'teaching',
               'home-business',
               'languages',
               'gaming']


In [3]:
def make_url(class_type_list):
    """Build the Skillshare browse URL for each class type.

    Args:
        class_type_list: iterable of class-type slugs.

    Returns:
        dict mapping every slug to the browse URL template with its ``%``
        placeholder replaced by that slug.
    """
    template = 'https://www.skillshare.com/browse/%?sort=rating&seeAll=1'
    return {slug: template.replace('%', slug) for slug in class_type_list}

#urls_dict = make_url(class_types)

#with open('class_list_urls.csv', 'w') as f:
#    writer = csv.writer(f)
#    for key, val in urls_dict.items():
#        writer.writerow([key, val])


In [4]:
def get_class_link(class_types, urls_dict, scroll_pause=120):
    """Collect the class-page links listed on each class type's browse page.

    For every class type a Chrome window is opened on ``urls_dict[class_type]``,
    the page is scrolled to the bottom repeatedly until its height stops
    growing, and the href of the first ``<p><a>`` inside every
    ``div.col-4.class-column.rendered`` card is collected.

    Relies on two notebook-level globals: ``chromedriver`` (path handed to
    ``webdriver.Chrome``) and ``class_links_dict`` (filled in place with
    ``{class_type: [href, ...]}``).

    Args:
        class_types: iterable of class-type slugs to process.
        urls_dict: mapping of class-type slug to its browse URL.
        scroll_pause: seconds to wait after each scroll before re-measuring
            the page height (defaults to the original 120).

    Returns:
        None; results are stored in ``class_links_dict``.
    """
    for class_type in class_types:
        i = 0
        print('Working on getting list of ' + class_type + ' classes')
        url = urls_dict[class_type]

        # NOTE(review): passing the driver path positionally depends on the
        # installed Selenium version -- confirm against the environment.
        driver = webdriver.Chrome(chromedriver)
        try:
            driver.get(url)

            # Best effort: dismiss the pop-up if it is present.  Exception
            # (not a bare except) so Ctrl-C still interrupts the run.
            try:
                close = driver.find_element('xpath', "//a[contains(@class,'btn-close ss-icon-close')]")
                close.click()
            except Exception:
                pass

            # Keep scrolling until the document height no longer changes.
            lastHeight = driver.execute_script("return document.body.scrollHeight")
            while True:
                i += 1
                print("Load ", i)
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(scroll_pause)
                newHeight = driver.execute_script("return document.body.scrollHeight")
                if newHeight == lastHeight:
                    break
                lastHeight = newHeight

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            class_link_tags = soup.find_all('div', attrs={"class": "col-4 class-column rendered "})
            class_links = [class_link_tag.find('p').find('a')['href'] for class_link_tag in class_link_tags]
            class_links_dict[class_type] = class_links
        finally:
            # quit() ends the whole driver session even when scraping raised,
            # so no browser is left behind.
            driver.quit()

In [5]:
def get_links_from_txt(file):
    """Read a class-link listing into ``{class_type: [url, ...]}``.

    Every non-blank line is split on commas: the first field is the class
    type, the remaining fields are its URLs.  Each field is stripped of
    surrounding whitespace, an optional CSV double quote, list brackets and
    single quotes, so both plain rows and rows holding the ``str()`` of a
    Python list (as ``csv.writer`` emits it, wrapped in double quotes) parse
    to clean values.

    Args:
        file: path of the text/CSV file to read.

    Returns:
        dict mapping class type to a list of URL strings.  Blank lines are
        ignored and empty URL fields (e.g. from ``[]``) are dropped.  A class
        type that appears on several lines keeps only its last line.
    """
    def _clean(field):
        # Same stripping order as before, plus the CSV quote character.
        return field.strip().strip('"').strip("[").strip("]").strip("'").strip()

    urls_dict = {}
    with open(file) as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank line used to create a bogus '' class type.
                continue
            fields = [_clean(item) for item in stripped.split(',')]
            class_type = fields[0]
            urls_dict[class_type] = [url for url in fields[1:] if url]
    return urls_dict

In [6]:
def get_javascript_data(soup):
    """Extract the ``SS.serverBootstrap`` JSON payload embedded in the page.

    Scans every ``<script type="text/javascript">`` element and decodes the
    JSON value that follows ``SS.serverBootstrap = `` in the first script
    containing that marker.

    Args:
        soup: parsed page exposing ``find_all`` (a BeautifulSoup document).

    Returns:
        The decoded payload (a dict for the pages this notebook scrapes).

    Raises:
        ValueError: no script contains the marker (previously this surfaced
            as an UnboundLocalError), or the text after the marker is not
            valid JSON (``json.JSONDecodeError`` is a ValueError subclass).
    """
    marker = "SS.serverBootstrap = "
    decoder = json.JSONDecoder()

    for js in soup.find_all('script', attrs={"type": "text/javascript"}):
        source = js.string
        if not source or marker not in source:
            continue
        payload = source.split(marker, 1)[1]
        # raw_decode parses the leading JSON value and ignores whatever
        # JavaScript follows it, so no assumption about the exact
        # ";\n<indent>" terminator is needed.
        javascript_data, _end = decoder.raw_decode(payload.lstrip())
        return javascript_data

    raise ValueError("no SS.serverBootstrap payload found in page")

# Split the page <title> into its '|'-separated parts
def get_title_and_author(soup):
    """Return the page ``<title>`` text split on ``|``, each part stripped.

    Args:
        soup: parsed page exposing ``find``.

    Returns:
        list of str, one entry per ``|``-separated segment of the title.
    """
    title_text = soup.find('title').text

    parts = []
    for segment in title_text.split('|'):
        parts.append(segment.strip())
    return parts

def get_detial_author_info(javascript_data):
    """Return the teacher's ``(fullName, headline, profileUrl)``.

    The values are read from
    ``javascript_data['pageData']['sectionData']['teacherInfo']``.
    """
    section = javascript_data['pageData']['sectionData']
    teacher = section['teacherInfo']

    full_name = teacher['fullName']
    headline = teacher['headline']
    profile_url = teacher['profileUrl']
    return full_name, headline, profile_url
    
# Measure the class description: text length plus image and link counts
def get_class_description(soup):
    """Measure the "about this class" description.

    Looks up ``div.about-this-class`` and, inside it,
    ``div.rich-content-wrapper``.

    The previous implementation walked every descendant element and added
    each one's full text, so text inside nested elements (e.g. a link inside
    a paragraph) was counted once per enclosing element, while text placed
    directly in the wrapper was not counted at all.  The length is now taken
    from the wrapper's text once.

    Args:
        soup: parsed page exposing ``find``.

    Returns:
        tuple ``(description_length, images, hyperlinks)``: number of
        characters of description text, number of ``<img>`` elements and
        number of ``<a>`` elements inside the wrapper.

    Raises:
        AttributeError: either ``div`` is missing from the page.
    """
    about_section = soup.find('div', attrs={"class": "about-this-class"})
    description_tag = about_section.find('div', attrs={"class": "rich-content-wrapper"})

    description_length = len(description_tag.get_text())
    images = len(description_tag.find_all('img'))
    hyperlinks = len(description_tag.find_all('a'))

    return description_length, images, hyperlinks

# Extract the video count and the minutes figure from the summary block
def get_video_length(soup):
    """Pull the video count and the minutes figure out of ``div.summary``.

    Args:
        soup: parsed page exposing ``find``.

    Returns:
        tuple ``(video_num, video_length)`` of strings: the first run of
        digits in the summary text (``''`` when there is none) and the run
        of digits immediately followed by ``m``.

    Raises:
        AttributeError: the summary ``div`` is missing, or no ``<digits>m``
            figure is present in it.
    """
    summary_text = soup.find('div', attrs={"class": "summary"}).text.strip()

    # The old pattern '(\d+)*' could match the empty string at position 0, so
    # the count came back as '' whenever the text did not begin with a digit.
    # Raw strings also avoid the invalid-escape warning for '\d'.
    count_match = re.search(r'\d+', summary_text)
    video_num = count_match.group(0) if count_match else ''

    # NOTE(review): only the figure directly before 'm' is captured; if the
    # summary can also carry an hours part it is ignored here -- confirm the
    # page format before treating this value as the total length.
    video_length = re.search(r'(\d+)(?=m)', summary_text).group(0)

    return video_num, video_length


# Collect the text of every tag link in the tags section
def get_tags(soup):
    """Return the stripped text of every ``<a>`` inside ``div.tags-section``.

    Args:
        soup: parsed page exposing ``find``.

    Returns:
        list of str, in document order.
    """
    section = soup.find('div', attrs={"class": "tags-section"})
    return [anchor.text.strip() for anchor in section.find_all('a')]
        
def get_project_authors(javascript_data):
    """Return the author names of the page's top projects.

    Reads ``javascript_data['pageData']['sectionData']['topProjects']`` and
    collects each project's ``['author']['fullName']``.

    Returns:
        list of names, or ``None`` when any part of that structure is
        missing or not subscriptable.  Only lookup failures are swallowed;
        a bare ``except`` would also hide KeyboardInterrupt and unrelated
        errors.
    """
    try:
        projects = javascript_data['pageData']['sectionData']['topProjects']
        return [project['author']['fullName'] for project in projects]
    except (KeyError, TypeError, IndexError):
        return None

def get_class_sku(javascript_data):
    """Return the ``sku`` stored under the payload's ``classData``."""
    class_info = javascript_data['classData']
    return class_info['sku']

def isPremium(javascript_data):
    """Tell whether the page header's ``tagText`` equals ``'Premium class'``."""
    header = javascript_data['pageData']['headerData']
    tag_text = header['tagText']
    return tag_text == 'Premium class'

def get_start_date(javascript_data):
    """Return ``startTs`` from the payload's ``pageData.syllabusData``."""
    syllabus = javascript_data['pageData']['syllabusData']
    return syllabus['startTs']

def get_enrollment_number(javascript_data):
    """Return ``numStudents`` from the payload's ``pageData.sectionData``."""
    section = javascript_data['pageData']['sectionData']
    return section['numStudents']

In [7]:
# Load the previously saved class links: urls_dict maps each class type to
# the list of class-page URLs parsed from the file by get_links_from_txt.
file = 'class_urls-1.csv'
urls_dict = get_links_from_txt(file)
#urls_dict.keys()
#class_types

In [8]:
# Names of the per-class fields collected by the scraping loop; class_data
# holds one (initially empty) list per field, filled in parallel so that
# index k of every list belongs to the same class page.
columns = ['class_name',
          'teacher',
          'teacher_title',
          'teacher_profile',
          'description_length',
          'description_image_number',
          'description_link_number',
          'video_number',
          'video_length_min',
          'tags',
          'sample_project',
          'class_sku',
          'paid_class',
          'start_date',
          'enrollment_number']
class_data = {col: [] for col in columns}
# Bare expression: displays the empty structure as the cell output.
class_data

{'class_name': [],
 'class_sku': [],
 'description_image_number': [],
 'description_length': [],
 'description_link_number': [],
 'enrollment_number': [],
 'paid_class': [],
 'sample_project': [],
 'start_date': [],
 'tags': [],
 'teacher': [],
 'teacher_profile': [],
 'teacher_title': [],
 'video_length_min': [],
 'video_number': []}

In [13]:
#def parse(class_type):
# Scrape every class page of the selected class types and pickle one dict of
# column lists per class type (<class_type>.pickle).  As soon as the site
# answers with an "Access denied" page, the rows gathered so far are also
# written to <class_type><partition>.pickle and scraping stops for good.
access_denied = False
request_timeout = 30  # seconds; without a timeout requests.get can block forever
ua = UserAgent()      # built once; ua.random still yields a fresh UA string per request
for class_type in class_types[20:25]:
    if access_denied:
        break

    print("working on ", class_type)
    class_data = {col: [] for col in columns}

    i = 0
    partition = 0

    for this_class in urls_dict[class_type]:
        # Randomised pause between requests (0.5 s to 2.5 s).
        time.sleep(.5 + 2 * random.random())
        user_agent = {'User-agent': ua.random}
        try:
            response = requests.get(this_class, headers=user_agent, timeout=request_timeout)
        except requests.RequestException as exc:
            # A network failure on one page should not abort the whole run.
            print("request failed for ", this_class, ": ", repr(exc))
            continue
        page = response.text
        soup = BeautifulSoup(page, "lxml")

        # An error page may have no <title>; treat that as "not access denied"
        # instead of crashing on None.text.
        title_tag = soup.find('title')
        title_text = title_tag.text if title_tag is not None else ''

        if response.status_code != 200 and re.search('Access denied.*', title_text):
            partition += 1
            pickle_title = class_type + str(partition) + ".pickle"
            with open(pickle_title, "wb") as pickle_out:
                pickle.dump(class_data, pickle_out)
            print("stop here")
            access_denied = True
            break

        # Previously considered and left disabled: sleeping and retrying on
        # other non-200 responses instead of parsing them.

        i += 1
        if i % 10 == 0:
            print(i)

        # Extract the complete row first and append only if every field was
        # obtained.  Appending field by field left the column lists with
        # different lengths whenever a later extractor raised.
        try:
            javascript_data = get_javascript_data(soup)

            teacher, teacher_title, teacher_profile = get_detial_author_info(javascript_data)
            description_length, description_image_number, description_link_number = get_class_description(soup)
            video_number, video_length_min = get_video_length(soup)

            # get_project_authors returns None when the page has no usable
            # top-project data; that means "no sample project by the teacher".
            project_authors = get_project_authors(javascript_data)
            sample_project = project_authors is not None and teacher in project_authors

            row = {
                'class_name': get_title_and_author(soup)[0],
                'teacher': teacher,
                'teacher_title': teacher_title,
                'teacher_profile': teacher_profile,
                'description_length': description_length,
                'description_image_number': description_image_number,
                'description_link_number': description_link_number,
                'video_number': video_number,
                'video_length_min': video_length_min,
                'tags': get_tags(soup),
                'sample_project': sample_project,
                'class_sku': get_class_sku(javascript_data),
                'paid_class': isPremium(javascript_data),
                'start_date': get_start_date(javascript_data),
                'enrollment_number': get_enrollment_number(javascript_data),
            }
        except Exception as exc:
            # Best effort: skip pages that do not parse, but say so.
            print("skipping ", this_class, ": ", repr(exc))
            continue

        for col in columns:
            class_data[col].append(row[col])

    pickle_title = class_type + ".pickle"
    with open(pickle_title, "wb") as pickle_out:
        pickle.dump(class_data, pickle_out)
    #return 0

#type(p)
#with Pool(3) as p:
#    p.map(parse, class_types[5:])

working on  web-development


KeyboardInterrupt: 

In [None]:
from urllib.request import urlopen


In [None]:
# Proxy endpoints keyed by URL scheme, gathered into proxyDict.
# NOTE(review): nothing in the visible cells passes proxyDict to a request,
# and the addresses look like placeholders -- confirm before relying on them.
http_proxy  = "http://10.10.1.10:3128"
https_proxy = "https://10.10.1.11:1080"
ftp_proxy   = "ftp://10.10.1.10:3128"

proxyDict = { 
              "http"  : http_proxy, 
              "https" : https_proxy, 
              "ftp"   : ftp_proxy
            }


In [None]:
# Manually persist the current class_data under the current class_type's
# name (e.g. after interrupting the scraping loop).  The with-block closes
# the file even if pickling fails.
pickle_title = class_type + ".pickle"
with open(pickle_title, "wb") as pickle_out:
    pickle.dump(class_data, pickle_out)

In [None]:
# Smoke test: fetch each URL in test_cases and print the result of every
# extractor so the output can be checked by eye.
# NOTE(review): test_cases is not defined in the visible cells -- it must be
# bound to an iterable of class-page URLs before this cell is run.
for test_case in test_cases:
    response = requests.get(test_case)
    page = response.text
    soup = BeautifulSoup(page, "lxml")
    
    javascript_data = get_javascript_data(soup)
    
    print(get_title_and_author(soup))
    print(get_detial_author_info(javascript_data))
    print(get_class_description(soup))    
    print(get_video_length(soup))
    print(get_tags(soup))
    print(get_project_authors(javascript_data))
    print(get_class_sku(javascript_data))
    print(isPremium(javascript_data))
    # Same comparison isPremium performs, printed a second time.
    print(javascript_data['pageData']['headerData']['tagText'] == 'Premium class')
    print(get_start_date(javascript_data))
    print(get_enrollment_number(javascript_data))

In [None]:
# Collect the class links of every class type and save them to
# class_urls.csv, one row per class type: [class_type, str(list of links)].
#
# get_class_link needs {class_type: browse URL}.  Build that mapping with
# make_url here instead of passing `urls_dict`, which cell In [7] rebinds to
# {class_type: [class-page URLs]}.
# NOTE: get_class_link reads the globals `chromedriver` and
# `class_links_dict`, which this notebook defines in its last cell -- run
# that cell first.
get_class_link(class_types, make_url(class_types))

# newline='' is what the csv module asks for on files handed to csv.writer;
# without it extra blank rows can appear on some platforms.
with open('class_urls.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for key, val in class_links_dict.items():
        writer.writerow([key, val])

In [None]:
# Path to the chromedriver executable.  Set the CHROMEDRIVER_PATH environment
# variable to override the default, which is specific to one machine.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")
os.environ["webdriver.chrome.driver"] = chromedriver

# Filled in place by get_class_link with {class_type: [class link, ...]}.
class_links_dict = {}