# edu-cater 

In [None]:
# imports

from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import os

import pandas as pd
import numpy as np
import pickle
import time
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

import nltk
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

from sklearn.feature_extraction.text import TfidfVectorizer # ???

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import pyLDAvis
from pyLDAvis import gensim as pyldagensim

from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
# Scrape course URLs
#
# For each difficulty level, open the Coursera search page in Chrome, walk
# through the paginated results, and collect every result link whose href
# contains "/learn/". The links for each level are written to
# "<level_name>_urls.txt", and all per-level files are then concatenated
# into "AllLevels_urls.txt".

level_dict = {'AllIntAdv': 'https://www.coursera.org/search?query=%22%22&indices%5Bprod_all_products%5D%5BrefinementList%5D%5Blanguage%5D%5B0%5D=English&indices%5Bprod_all_products%5D%5BrefinementList%5D%5BproductDifficultyLevel%5D%5B0%5D=Intermediate&indices%5Bprod_all_products%5D%5BrefinementList%5D%5BproductDifficultyLevel%5D%5B1%5D=Advanced&indices%5Bprod_all_products%5D%5Bpage%5D=1&indices%5Bprod_all_products%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_all_products%5D%5Bconfigure%5D%5BhitsPerPage%5D=10&configure%5BclickAnalytics%5D=true',
                'AllMixed': 'https://www.coursera.org/search?query=%22%22&indices%5Bprod_all_products%5D%5BrefinementList%5D%5Blanguage%5D%5B0%5D=English&indices%5Bprod_all_products%5D%5BrefinementList%5D%5BproductDifficultyLevel%5D%5B0%5D=Mixed&indices%5Bprod_all_products%5D%5Bpage%5D=1&indices%5Bprod_all_products%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_all_products%5D%5Bconfigure%5D%5BhitsPerPage%5D=10&configure%5BclickAnalytics%5D=true',
                'AllBeg': 'https://www.coursera.org/search?query=%22%22&indices%5Bprod_all_products%5D%5BrefinementList%5D%5Blanguage%5D%5B0%5D=English&indices%5Bprod_all_products%5D%5BrefinementList%5D%5BproductDifficultyLevel%5D%5B0%5D=Beginner&indices%5Bprod_all_products%5D%5BrefinementList%5D%5BentityTypeDescription%5D%5B0%5D=Courses&indices%5Bprod_all_products%5D%5BrefinementList%5D%5Bskills%5D=&indices%5Bprod_all_products%5D%5Bpage%5D=1&indices%5Bprod_all_products%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_all_products%5D%5Bconfigure%5D%5BhitsPerPage%5D=10&configure%5BclickAnalytics%5D=true'}


# get all course URLs for a difficulty level and save to text file

level_names = ['AllBeg', 'AllIntAdv', 'AllMixed']

for level_name in level_names:
    url = level_dict[level_name]

    driver = webdriver.Chrome("/mnt/c/Users/easso/docs/neurohackademy/insight_examples/chromedriver.exe")
    urls_all = []
    try:
        driver.get(url)
        while True:
            try:
                courses = driver.find_elements_by_xpath("//li[@class='ais-InfiniteHits-item']//a")
                # Read each href once; anchors with no href yield None and are
                # skipped instead of raising and ending pagination early.
                hrefs = [course.get_attribute("href") for course in courses]
                urls_all.extend(href for href in hrefs if href and "/learn/" in href)
                button = driver.find_element_by_xpath("//button[@id='pagination_right_arrow_button' and @class='label-text box arrow']")
                button.click()
                # give the next page of results time to load
                time.sleep(2)
            except Exception:
                # Best effort: any failure to find or click the "next" button is
                # treated as the last page. Catching Exception (not a bare
                # except) lets Ctrl-C still interrupt the scrape.
                print("Reached end of course list")
                break
    finally:
        # always close the browser so one Chrome process is not leaked per level
        driver.quit()

    with open(level_name + "_urls.txt", "w") as file:
        file.writelines(link + "\n" for link in urls_all)

# combine into one file (same order as level_names; no dependency on a shell `cat`)
with open('AllLevels_urls.txt', 'w') as combined:
    for level_name in level_names:
        with open(level_name + '_urls.txt') as part:
            combined.write(part.read())

In [None]:
# get course info
#
# Read the combined URL list, download each course page, and extract its
# title, description, syllabus headings and syllabus descriptions.
# Results are stored in `course_info`, keyed by the URL's position in
# `urls_all`, and pickled to 'Coursera_allinfo.pkl'.

with open('AllLevels_urls.txt') as f:
    # one URL per line; blank lines are skipped so requests never gets an empty URL
    urls_all = [line.strip() for line in f if line.strip()]

course_info = {}

# enumerate supplies the integer key `i`; the original loop never defined it
for i, url in enumerate(urls_all):

    r = requests.get(url)
    data = r.text
    soup = BeautifulSoup(data)

    # get course title, description, syllabus headings, syllabus descriptions and add to dictionary
    title = soup.find(class_="H2_1pmnvep-o_O-weightNormal_s9jwp5-o_O-fontHeadline_1uu0gyz max-text-width-xl m-b-1s").text
    description = soup.find_all(class_='AboutCourse')[0].find(class_="content-inner").text
    syllabus_headings_all = soup.find_all(class_='H2_1pmnvep-o_O-weightBold_uvlhiv-o_O-bold_1byw3y2 m-b-2')
    # each heading is followed by a single space, as in the original concatenation
    syllabus_headings = "".join(heading.text + " " for heading in syllabus_headings_all)

    # pages without a 'Syllabus' element get an empty description string
    syllabus = soup.find(class_='Syllabus')
    if syllabus is None:
        syllabus_descriptions = ""
    else:
        syllabus_descriptions = "".join(
            desc.text + " " for desc in syllabus.find_all(class_="content-inner")
        )

    course_info[i] = {'title': title,
                      'description': description,
                      'syllabus_headings': syllabus_headings,
                      'syllabus_descriptions': syllabus_descriptions}

# save course info dictionary to a pickle file
with open('Coursera_allinfo.pkl', 'wb') as file:
    pickle.dump(course_info, file)


In [None]:
# get number of reviews for each course
#
# For every URL in `urls_all`, fetch "<url>/reviews" and parse the review
# count from the page's headline element. Courses whose count cannot be
# found or parsed are recorded as 0. The list is saved both as a pickle
# ('nreviews_all.pkl') and as plain text, one count per line.

nreviews_all = []
for url in urls_all:
    r = requests.get(url + '/reviews')
    data = r.text
    soup = BeautifulSoup(data)

    try:
        nreviews_tmp = soup.find_all(class_="H2_1pmnvep-o_O-weightNormal_s9jwp5-o_O-fontHeadline_1uu0gyz m-y-2 text-secondary")[0].text
        # take the text between 'of ' and 'Reviews for', drop its last
        # character and any thousands separators, then convert to int
        # (presumably the headline reads "... of 1,234 Reviews for ..." — TODO confirm)
        nreviews = int(nreviews_tmp.split('Reviews for')[0].split('of ')[1][:-1].replace(',',''))
    except (IndexError, ValueError):
        # IndexError: headline element or 'of ' marker missing;
        # ValueError: extracted text is not a number
        nreviews = 0

    nreviews_all.append(nreviews)

# `with` guarantees the pickle is flushed and closed
# (the original `file.close` lacked parentheses and never ran)
with open('nreviews_all.pkl', 'wb') as file:
    pickle.dump(nreviews_all, file)

with open("nreviews_all.txt", "w") as file:
    file.writelines(str(review) + "\n" for review in nreviews_all)