# edu-cater 

In [2]:
# imports

from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import os
import json
import sqlite3

import pandas as pd
import numpy as np
import pickle
import time
import seaborn as sns
from scipy.io import savemat, loadmat
from matplotlib import pyplot as plt
%matplotlib inline

import nltk
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

from sklearn.feature_extraction.text import TfidfVectorizer # ???

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import pyLDAvis
from pyLDAvis import gensim as pyldagensim

import networkx as nx
from networkx.algorithms import shortest_path

from warnings import filterwarnings
filterwarnings('ignore')

[nltk_data] Downloading package wordnet to /home/amandae/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
class course_scraper():
    """Scrape Coursera course URLs, per-course metadata and the recommendation network.

    Typical order of use: ``scrape_urls()`` writes the list of course URLs to
    disk; ``scrape_courses()`` and ``scrape_course_network()`` read that list
    back from ``edu-cater_urls.pkl`` and write one file per course.
    """

    # Path to the ChromeDriver executable used by the Selenium-driven methods.
    CHROMEDRIVER_PATH = "/mnt/c/Users/easso/docs/neurohackademy/insight_examples/chromedriver.exe"
    # Files written by scrape_urls(); the pickle is read back by load_urls().
    URLS_TXT = "edu-cater_urls.txt"
    URLS_PKL = 'edu-cater_urls.pkl'

    def __init__(self):
        # Search-result URLs, one per difficulty filter (English-language results only).
        self.level_dict = {'AllIntAdv': 'https://www.coursera.org/search?query=%22%22&indices%5Bprod_all_products%5D%5BrefinementList%5D%5Blanguage%5D%5B0%5D=English&indices%5Bprod_all_products%5D%5BrefinementList%5D%5BproductDifficultyLevel%5D%5B0%5D=Intermediate&indices%5Bprod_all_products%5D%5BrefinementList%5D%5BproductDifficultyLevel%5D%5B1%5D=Advanced&indices%5Bprod_all_products%5D%5Bpage%5D=1&indices%5Bprod_all_products%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_all_products%5D%5Bconfigure%5D%5BhitsPerPage%5D=10&configure%5BclickAnalytics%5D=true',
                           'AllMixed': 'https://www.coursera.org/search?query=%22%22&indices%5Bprod_all_products%5D%5BrefinementList%5D%5Blanguage%5D%5B0%5D=English&indices%5Bprod_all_products%5D%5BrefinementList%5D%5BproductDifficultyLevel%5D%5B0%5D=Mixed&indices%5Bprod_all_products%5D%5Bpage%5D=1&indices%5Bprod_all_products%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_all_products%5D%5Bconfigure%5D%5BhitsPerPage%5D=10&configure%5BclickAnalytics%5D=true',
                           'AllBeg': 'https://www.coursera.org/search?query=%22%22&indices%5Bprod_all_products%5D%5BrefinementList%5D%5Blanguage%5D%5B0%5D=English&indices%5Bprod_all_products%5D%5BrefinementList%5D%5BproductDifficultyLevel%5D%5B0%5D=Beginner&indices%5Bprod_all_products%5D%5BrefinementList%5D%5BentityTypeDescription%5D%5B0%5D=Courses&indices%5Bprod_all_products%5D%5BrefinementList%5D%5Bskills%5D=&indices%5Bprod_all_products%5D%5Bpage%5D=1&indices%5Bprod_all_products%5D%5Bconfigure%5D%5BclickAnalytics%5D=true&indices%5Bprod_all_products%5D%5Bconfigure%5D%5BhitsPerPage%5D=10&configure%5BclickAnalytics%5D=true'}

        # Order in which the difficulty levels are scraped.
        self.level_names = ['AllBeg', 'AllIntAdv', 'AllMixed']

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _join_text(tags):
        """Concatenate the text of each tag, each followed by a single space."""
        return "".join(tag.text + " " for tag in tags)

    @staticmethod
    def _build_url_index(urls):
        """Map each URL to the index of its first occurrence in ``urls``."""
        first_index = {}
        for position, course_url in enumerate(urls):
            first_index.setdefault(course_url, position)
        return first_index

    @staticmethod
    def _recommended_indices(url_index, rec_urls):
        """Return the sorted indices of the known '/learn/' URLs found in ``rec_urls``."""
        return sorted({url_index[rec] for rec in rec_urls
                       if rec and '/learn/' in rec and rec in url_index})

    @staticmethod
    def _recommendation_hrefs(driver):
        """Return the href of every recommendation link on the page the driver is showing."""
        links = driver.find_elements_by_xpath('//div[@class="m-a-1s"]//div//a[@data-click-value]')
        return [link.get_attribute("href") for link in links]

    @staticmethod
    def _scrape_reviews(url):
        """Collect the review texts from ``url + '/reviews'``, page by page.

        Stops at the first page without reviews, or at the first error (the
        previous implementation retried the same page forever on any error).
        """
        reviews = []
        counter = 1
        while True:
            if counter % 10 == 1:
                print(counter)
            page_url = url + '/reviews'
            if counter != 1:
                page_url += '?page=' + str(counter)
            try:
                r = requests.get(page_url)
                soup = BeautifulSoup(r.text)
                reviews_all = soup.find_all(class_="reviewText")
            except Exception as err:
                print("Stopped scraping reviews for", url, "at page", counter, ":", err)
                break
            if not reviews_all:
                break
            reviews.extend(review.text for review in reviews_all)
            counter += 1
        return reviews

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------
    def scrape_urls(self):
        """Collect the '/learn/' course URLs from the search results of every level.

        Each level's result list is paged through by clicking the "next" arrow
        until that fails. The combined list (duplicates included) is written to
        ``edu-cater_urls.txt`` (one URL per line) and ``edu-cater_urls.pkl``.
        """
        urls_all = []
        driver = webdriver.Chrome(self.CHROMEDRIVER_PATH)
        try:
            for level_name in self.level_names:
                print('Scraping', level_name, 'urls')
                driver.get(self.level_dict[level_name])
                while True:
                    try:
                        courses = driver.find_elements_by_xpath("//li[@class='ais-InfiniteHits-item']//a")
                        hrefs = [course.get_attribute("href") for course in courses]
                        # Links without an href are skipped instead of ending the level early.
                        urls_all.extend(href for href in hrefs if href and "/learn/" in href)
                        button = driver.find_element_by_xpath("//button[@id='pagination_right_arrow_button' and @class='label-text box arrow']")
                        button.click()
                        time.sleep(2)
                    except Exception:
                        # Any failure here (typically: no clickable "next" button) is
                        # treated as the end of this level's result list.
                        print("Reached end of", level_name, "course list")
                        break
        finally:
            # Always release the browser, even if scraping fails part-way.
            driver.quit()

        with open(self.URLS_TXT, "w") as file:
            for link in urls_all:
                file.write(link + "\n")

        with open(self.URLS_PKL, 'wb') as file:
            pickle.dump(urls_all, file)

    def load_urls(self):
        """Load the URL list written by ``scrape_urls`` into ``self.urls_all``."""
        # NOTE: pickle must only be used on files this project produced itself.
        with open(self.URLS_PKL, 'rb') as file:
            self.urls_all = pickle.load(file)

    def scrape_courses(self, get_reviews=False):
        """Download every course page and save its metadata as JSON.

        For course ``i`` a dictionary with title, description, syllabus text,
        review count, level, hours, star rating, enrollment, skills,
        occupations and reviews is written to ``course_info/course<i>.json``.
        Optional fields fall back to a default (``np.nan``, ``""``, ``"Mixed"``
        or an empty list) when they cannot be parsed; a missing title,
        description or enrollment count raises.

        Parameters
        ----------
        get_reviews : bool, optional
            When true, also download the text of every review (slow).
            Defaults to False, matching the previously hard-coded behaviour.
        """
        import re  # previously only in scope via `from nltk.stem.porter import *`

        self.load_urls()
        os.makedirs('course_info', exist_ok=True)

        for i, url in enumerate(self.urls_all):
            print(url)
            r = requests.get(url)
            soup = BeautifulSoup(r.text)

            # course title
            title = soup.find(class_="H2_1pmnvep-o_O-weightNormal_s9jwp5-o_O-fontHeadline_1uu0gyz max-text-width-xl m-b-1s").text
            # course description
            description = soup.find_all(class_='AboutCourse')[0].find(class_="content-inner").text
            # syllabus headings
            syllabus_headings = self._join_text(
                soup.find_all(class_='H2_1pmnvep-o_O-weightBold_uvlhiv-o_O-bold_1byw3y2 m-b-2'))
            # syllabus descriptions
            try:
                syllabus_descriptions = self._join_text(
                    soup.find_all(class_='Syllabus')[0].find_all(class_="content-inner"))
            except Exception:
                syllabus_descriptions = ""
            # number of reviews
            try:
                nreviews = int(soup.find(itemprop="reviewCount").text)
            except Exception:
                nreviews = np.nan
            # level (first word of the matching <title> element)
            try:
                level = soup.find('title', id=re.compile('Level')).text.split()[0]
            except Exception:
                level = "Mixed"
            # hours (course length), parsed from text of the form "Approx. N hours"
            try:
                hours_tmp = soup.find_all(text=re.compile("Approx. "))[0]
                hours = int(hours_tmp.split('Approx. ')[1].split(' hours')[0])
            except Exception:
                hours = np.nan
            # stars (overall rating)
            try:
                stars = float(soup.find_all(class_="H4_1k76nzj-o_O-weightBold_uvlhiv-o_O-bold_1byw3y2 m-l-1s m-r-1 m-b-0")[0].text)
            except Exception:
                stars = np.nan
            # enrollment, read from the inline script that mentions totalEnrollment
            enrollment_tmp = soup.find('script', text=re.compile('totalEnrollment')).text
            enrollment = int(enrollment_tmp.split('"totalEnrollmentCount":')[1].split('}')[0])
            # skills you'll gain (only collected when the skills box is present)
            skills = []
            if soup.find_all(class_="Box_120drhm-o_O-displayflex_poyjc-o_O-wrap_rmgg7w"):
                skills = [tag.text for tag in soup.find_all(class_="centerContent_dqfu5r")]
            # occupations (Learners taking this course are...): the child nodes
            # of every occupation tag, flattened into one list
            occupations = [child
                           for tag in soup.find_all(class_="occupation-name")
                           for child in tag]
            # reviews
            print("number of reviews:", nreviews)
            reviews = self._scrape_reviews(url) if get_reviews else []

            course_info = {'title': title,
                           'description': description,
                           'syllabus_headings': syllabus_headings,
                           'syllabus_descriptions': syllabus_descriptions,
                           'nreviews': nreviews,
                           'level': level,
                           'hours': hours,
                           'stars': stars,
                           'enrollment': enrollment,
                           'skills': skills,
                           'occupations': occupations,
                           'reviews': reviews}

            # save course_info
            with open('course_info/course' + str(i) + '.json', 'w') as fp:
                json.dump(course_info, fp)

    def scrape_course_network(self, start=1302):
        """Build the course-to-course recommendation matrix.

        For every course from index ``start`` onward, the course page is opened
        in Chrome, the recommendation carousel is clicked through until no new
        links appear, and entry ``[i, j]`` is set to 1 when course ``i``
        recommends the known course ``j``. Row ``i`` is also saved to
        ``course_nets/course<i>.mat`` under the key ``coursenet``.

        Parameters
        ----------
        start : int, optional
            Index of the first course to process. Defaults to 1302, the resume
            point that was previously hard-coded; pass 0 to process all courses.

        Returns
        -------
        numpy.ndarray
            Square 0/1 matrix; rows below ``start`` are left as zeros.
        """
        self.load_urls()
        n_courses = len(self.urls_all)
        course_network = np.zeros((n_courses, n_courses))
        # First-occurrence lookup, equivalent to list.index() without the linear scan.
        url_index = self._build_url_index(self.urls_all)
        os.makedirs('course_nets', exist_ok=True)

        driver = webdriver.Chrome(self.CHROMEDRIVER_PATH)
        try:
            for i in range(start, n_courses):
                if i % 100 == 0:
                    print(i)
                url = self.urls_all[i]
                print(url)
                driver.get(url)

                seen = set(self._recommendation_hrefs(driver))
                time.sleep(2)

                while True:
                    try:
                        button = driver.find_element_by_xpath("//button[@class='Button_1w8tm98-o_O-icon_1rbfoc-o_O-md_1jvotax']")
                    except Exception:
                        # No carousel button on this page: nothing more to load.
                        break
                    button.click()
                    time.sleep(2)
                    fresh = [href for href in self._recommendation_hrefs(driver)
                             if href not in seen]
                    if not fresh:
                        # The click revealed nothing new: the carousel is exhausted.
                        break
                    seen.update(fresh)

                # Use this instance's URL list (the old code reached for a global `scraper`).
                for ind in self._recommended_indices(url_index, seen):
                    course_network[i, ind] = 1

                savename = 'course_nets/course' + str(i) + '.mat'
                savemat(savename, {'coursenet': course_network[i, :]})
        finally:
            # Always release the browser, even if scraping fails part-way.
            driver.quit()
        return course_network
        

In [None]:
# pipeline: build the scraper and run the stages that are currently enabled
scraper = course_scraper() #thisworks
# The two scraping stages below are disabled. scrape_urls() is what writes
# edu-cater_urls.pkl, so that file must already exist for load_urls() to succeed.
#scraper.scrape_urls()      #thisworks
#scraper.scrape_courses()
# Load the pickled URL list, then crawl the recommendation links between courses.
scraper.load_urls()
net = scraper.scrape_course_network()

https://www.coursera.org/learn/musicianship-chords
https://www.coursera.org/learn/cloud-applications-part1
https://www.coursera.org/learn/data-collection-framework
https://www.coursera.org/learn/conflict-resolution-mediation
https://www.coursera.org/learn/federal-taxation-business
https://www.coursera.org/learn/branding-and-cx
https://www.coursera.org/learn/technical-writing
https://www.coursera.org/learn/academic-literacy
https://www.coursera.org/learn/international-negotiation
https://www.coursera.org/learn/business-strategies
https://www.coursera.org/learn/onprem-fundamentals-apigee-gcp
https://www.coursera.org/learn/six-sigma-define-measure-advanced
https://www.coursera.org/learn/javascript-jquery-json
https://www.coursera.org/learn/robotics-perception
https://www.coursera.org/learn/pathophysiology
https://www.coursera.org/learn/advanced-r
https://www.coursera.org/learn/agile-software-development
https://www.coursera.org/learn/cataract-surgery
https://www.coursera.org/learn/geometr

# Latent Dirichlet Allocation (LDA)

In [None]:
# define preprocessing functions: tokenization, stemming, lemmatization 
# https://github.com/priya-dwivedi/Deep-Learning/blob/master/topic_modeling/LDA_Newsgroup.ipynb

def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then return its English Snowball stem."""
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    english_stemmer = SnowballStemmer("english")
    return english_stemmer.stem(verb_lemma)

# Tokenize and lemmatize
def preprocess(text):
    """Turn raw text into a list of stemmed tokens.

    The text is tokenized with gensim's ``simple_preprocess``. A token is
    dropped when it is a gensim stopword, has three characters or fewer, or is
    the mis-encoded fragment 'youâ'; every remaining token is passed through
    ``lemmatize_stemming``.
    """
    stopword_set = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if not (word in stopword_set or len(word) <= 3 or word == 'youâ')
    ]

In [None]:
# aggregate all text info (description, syllabus) and preprocess

# Number of course files to read: course0.json ... course<n_courses - 1>.json.
# TODO: derive this from the contents of course_info_short/ instead of hard-coding it.
n_courses = 2635

processed_info = []   # preprocessed token list per course
nreviews = []         # per-course values copied from the JSON files
stars = []
hours = []
levels = []
enrollment = []
course_info_all = []  # raw aggregated text per course
titles_all = []
for i in range(n_courses):
    if i % 100 == 0:
        print(i)
    with open('course_info_short/course' + str(i) + '.json') as json_file:
        course_info = json.load(json_file)

    # Every text field of the course, joined with single spaces.
    text_parts = [course_info['title'], course_info['description'],
                  course_info['syllabus_headings'], course_info['syllabus_descriptions']]
    text_parts.extend(course_info['skills'])
    text_parts.extend(course_info['occupations'])
    text_parts.extend(course_info['reviews'])
    allinfo = ' '.join(text_parts)

    nreviews.append(course_info['nreviews'])
    stars.append(course_info['stars'])
    hours.append(course_info['hours'])
    levels.append(course_info['level'])
    enrollment.append(course_info['enrollment'])
    titles_all.append(course_info['title'])

    processed_info.append(preprocess(allinfo))
    course_info_all.append(allinfo)

# save course titles for web app (the context manager closes the file even if the dump fails)
with open('simple_app/course_titles.pkl', 'wb') as file:
    pickle.dump(titles_all, file)


In [None]:
# EDA for quantitative info: three side-by-side histograms (20 bins each)

fig = plt.figure(figsize=(15,4))

panels = [
    (stars, 'Rating'),
    (hours, 'Length (hours)'),
    (np.log10(enrollment), 'Enrollment (log10)'),
]
for panel_position, (panel_data, panel_title) in enumerate(panels, start=1):
    # one row, three columns; panel_position selects the column
    fig.add_subplot(1, 3, panel_position)
    plt.hist(panel_data, 20)
    plt.title(panel_title)

plt.savefig('ed1.png')
plt.show()

In [None]:
# make dictionary: token <-> id mapping built from the preprocessed courses
dictionary = gensim.corpora.Dictionary(processed_info)
size_before_filter = len(dictionary)
print('Length of originial dictionary:', size_before_filter)

# remove rare and common words: keep tokens found in at least 10 documents and
# in at most 25% of them, capped at the 100000 most frequent
dictionary.filter_extremes(
    no_below=10,
    no_above=.25,
    keep_n=100000,
)
size_after_filter = len(dictionary)
print('Length of filtered dictionary:', size_after_filter)

# make bag of words: one sparse (token_id, count) list per course
bow_corpus = list(map(dictionary.doc2bow, processed_info))

In [None]:
# run LDA model and print topics
ntopics = 20

# Training settings: 10 passes over the corpus, spread over 2 worker processes.
lda_settings = {
    'num_topics': ntopics,
    'id2word': dictionary,
    'passes': 10,
    'workers': 2,
}
lda_model = gensim.models.LdaMulticore(bow_corpus, **lda_settings)

# print_topics(-1) yields one (topic index, description string) pair per topic
for idx, topic in lda_model.print_topics(-1):
    topic_summary = "Topic: {} \nWords: {}".format(idx, topic )
    print(topic_summary)
    print("\n")

In [None]:
# distribution of best-matching topics
# NOTE: `scoremat` is filled in by the "similarity b/w topic scores" cell further
# down in this notebook; run that cell before this one.

# Take the number of topics from the score matrix rather than hard-coding 20.
n_plot_topics = scoremat.shape[1]
bestmatch = np.argmax(scoremat,axis=1)
# One unit-wide bin centred on each topic index, so every bar sits on its tick
# label even when the lowest or highest topic is never the best match.
plt.hist(bestmatch, bins=np.arange(n_plot_topics + 1) - 0.5)
my_x_labels = ['Topic ' + str(x+1) for x in range(n_plot_topics)]
plt.xticks(np.arange(n_plot_topics),my_x_labels, rotation=90)
plt.gcf().subplots_adjust(bottom=0.20)  # leave room for the rotated labels
plt.title('Topic Distribution')
plt.savefig('topicdist.png')
plt.show()

In [None]:
# TODO: unseen document


In [None]:
# interactive visualization of topics with pyLDAvis
# Switch pyLDAvis to notebook display mode.
pyLDAvis.enable_notebook()
# Build the visualization data from the trained model, the bag-of-words corpus
# and the dictionary the corpus was encoded with.
data = pyldagensim.prepare(lda_model, bow_corpus, dictionary)
# Also write the visualization to an HTML file.
pyLDAvis.save_html(data,'courseviz.html')

In [None]:
# similarity b/w topic scores 

from sklearn.metrics.pairwise import cosine_similarity as cos_sim

def doctopics(ind):
    """Return the LDA topic scores of course ``ind`` as a (1, num_topics) array.

    The aggregated course text is preprocessed, converted to a bag of words and
    scored with ``lda_model``. Each score is stored in the column of its topic
    id; topics the model leaves out of its answer keep a score of 0, so the
    result always has one column per topic.
    """
    unseen_document = course_info_all[ind]
    bow_vector = dictionary.doc2bow(preprocess(unseen_document))
    # Positional arguments: minimum_probability=0, minimum_phi_value=0,
    # per_word_topics=True. Element [0] of the result is the list of
    # (topic_id, probability) pairs for the document.
    topic_tuples = lda_model.get_document_topics(bow_vector, 0, 0, True)[0]
    topic_scores = np.zeros((1, lda_model.num_topics))
    for topic_id, probability in topic_tuples:
        # Index by topic id, not list position: the list may skip topics.
        topic_scores[0, topic_id] = probability
    return topic_scores
    
# Topic-score matrix: one row per course, one column per topic.
scoremat = np.zeros((len(course_info_all),ntopics))
for i in range(len(course_info_all)):
    scoremat[i,:] = doctopics(i)

# Save only after the rows are filled in (the file used to be written while
# the matrix was still all zeros).
savemat('simple_app/scoremat.mat',{'scoremat': scoremat})

# Binarize the pairwise cosine similarities: 1 above the threshold, 0 otherwise.
thresh = .7
scorecorrs = cos_sim(scoremat)
scorecorrs[scorecorrs<=thresh] = 0
scorecorrs[scorecorrs>thresh] = 1

fig = plt.figure(figsize=(6,6))
plt.imshow(scorecorrs, cmap='magma')
plt.colorbar()
plt.show()

In [None]:
# graph theory with networkx
# from_numpy_matrix was removed in networkx 3.0; from_numpy_array (available
# since networkx 2.0) builds the same graph from a plain ndarray.
G = nx.from_numpy_array(scorecorrs)
# Weighted degree of every node, used to colour the nodes in the drawing.
weights = dict(G.degree(weight='weight'))
values = [weights.get(node, 0.25) for node in G.nodes()]

plt.figure(1,figsize=(12,12)) 
nx.draw(G, node_size=20, node_color = values, width=.1, cmap='plasma')
plt.savefig("cousera_lda_network.png", format="PNG")
plt.show(block=False)

# Persist the graph and the node values for the web app; the context managers
# close the files even if pickling fails.
with open('simple_app/networkx_graph.pkl','wb') as file:
    pickle.dump(G, file)

with open('simple_app/networkx_values.pkl', 'wb') as file:
    pickle.dump(values, file)