In [1]:
import ast
import os
import re
import time
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [2]:
# Query interpolated into the Udemy search URL. quote_plus() URL-encodes it:
# spaces become '+', and reserved characters such as '&' or '#' are
# percent-escaped so they cannot corrupt the query string.
search_term = quote_plus("natural language processing")
# Result ordering requested from the search page.
sort_by = 'most-reviewed' # most-reviewed, relevance, highest-rated, newest

In [3]:
# HTTP request headers describing a desktop Firefox browser.
# NOTE(review): no visible cell passes `headers` to a request — confirm it is still needed.
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0", 
           "Accept-Encoding":"gzip, deflate", 
           "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 
           "DNT":"1",
           "Connection":"close",
           "Upgrade-Insecure-Requests":"1"}

# Start Chrome with the "--headless" flag, driven by the chromedriver binary at
# ./webdriver/chromedriver (path is relative to the working directory).
# NOTE(review): `executable_path` appears to be deprecated/removed in newer
# Selenium releases — confirm against the installed version.
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(executable_path='./webdriver/chromedriver', options=chrome_options)

In [5]:
# Assemble the search-results address from the query and the sort order.
URL = ''.join(["https://www.udemy.com/courses/search/?q=", search_term, '&sort=', sort_by])
driver.get(URL)
# Fixed pause before the HTML is captured
# (presumably to let dynamically rendered content load — confirm).
time.sleep(10)
soup = BeautifulSoup(driver.page_source, "html5lib")

In [6]:
def _parse_course_card(card):
    """Return the listing fields of one search-result card as a dict.

    Parameters
    ----------
    card : bs4 element
        One element matched by the course-card selector on the search page.

    Returns
    -------
    dict
        Keys: link, title, description, instructor, price, rating,
        no_reviews, length, no_lectures, difficulty. All values are the
        scraped text; only `price` and `no_reviews` are lightly cleaned.

    Raises
    ------
    AttributeError, TypeError, IndexError
        If the card lacks one of the expected elements (a failed `find`
        returns None, and the meta-info row must hold at least three spans).
    """
    # The meta-info row supplies length, lecture count and difficulty
    # (spans 0, 1 and 2); look it up once instead of once per field.
    meta_spans = card.find('div', attrs={'class':'course-card--course-meta-info--1hHb3'}).find_all('span')
    return {
        'link': 'https://www.udemy.com'+card.find('a', attrs={'class':'udlite-custom-focus-visible'})['href'],
        'title': card.find('div', attrs={'class':'course-card--course-title--2f7tE'}).text,
        'description': card.find('p', attrs={'class':'course-card--course-headline--yIrRk'}).text,
        'instructor': card.find('div', attrs={'class':'course-card--instructor-list--lIA4f'}).text,
        # Drop the 'Current price' label that precedes the amount.
        'price': card.find('div', attrs={'data-purpose':'course-price-text'}).text.replace('Current price',''),
        'rating': card.find('span', attrs={'data-purpose':'rating-number'}).text,
        # Strip thousands separators and the surrounding parentheses.
        'no_reviews': card.find('span', attrs={'class':'course-card--reviews-text--12UpL'}).text.translate({ord(c):None for c in [',','(',')']}),
        'length': meta_spans[0].text,
        'no_lectures': meta_spans[1].text,
        'difficulty': meta_spans[2].text,
    }


# The first four matches of the selector are skipped.
# NOTE(review): presumably those are not course cards — confirm against the page.
course_cards = soup.find_all(attrs={"class":"popper--popper--19faV"})[4:]

# Build the frame from all records at once: DataFrame.append copied the whole
# frame on every iteration and no longer exists in pandas 2.x.
t20_courses = pd.DataFrame([_parse_course_card(card) for card in course_cards])

In [8]:
# Visit every course page and store its curriculum (section title -> list of
# lecture titles) as the string form of a dict in the 'curriculum' column.
for row, course in t20_courses.iterrows():
    driver.get(course.link)
    soup = BeautifulSoup(driver.page_source, "html5lib")

    curriculum = {}
    section_panels = soup.find_all('div', attrs={'class':'section--panel--1tqxC'})
    for panel in section_panels:
        section_title = panel.find('span', attrs={'class':'section--section-title--8blTh'}).text
        lecture_items = panel.find('ul', attrs={'class':'unstyled-list'}).find_all('li')
        lecture_titles = []
        for item in lecture_items:
            item_content = item.find('div', attrs={'class':'udlite-block-list-item-content'})
            lecture_titles.append(item_content.find('span').text)
        curriculum[section_title] = lecture_titles

    t20_courses.loc[row, 'curriculum'] = str(curriculum)
    print("Curriculum downloaded for course: {}".format(course.title))
    # Fixed pause between consecutive course pages.
    time.sleep(5)

Curriculum downloaded for course: Data Science: Natural Language Processing (NLP) in Python
Curriculum downloaded for course: NLP - Natural Language Processing with Python
Curriculum downloaded for course: Natural Language Processing with Deep Learning in Python
Curriculum downloaded for course: Deep Learning and NLP A-Z™: How to create a ChatBot
Curriculum downloaded for course: Deep Learning: Convolutional Neural Networks in Python
Curriculum downloaded for course: Deep Learning: Advanced NLP and RNNs
Curriculum downloaded for course: Deep Learning: Recurrent Neural Networks in Python
Curriculum downloaded for course: Introduction to Natural Language Processing (NLP)
Curriculum downloaded for course: Modern Natural Language Processing in Python
Curriculum downloaded for course: Hands On Natural Language Processing (NLP) using Python
Curriculum downloaded for course: U&P AI - Natural Language Processing (NLP) with Python
Curriculum downloaded for course: Learn Data Science Deep Learni

In [15]:
# Write one numbered outline file per course into curriculums/.
# Create the directory up front so the first open() cannot fail on a fresh checkout.
os.makedirs('curriculums', exist_ok=True)

for row, course in t20_courses.iterrows():
    # File name: "<rank>. <course slug>.txt", the slug being the course URL
    # without its fixed prefix and surrounding slashes.
    course_slug = course.link.replace('https://www.udemy.com/course/','').strip('/')
    outline_path = 'curriculums/'+str(row+1)+'. '+course_slug+'.txt'

    # The column holds the repr of a dict built from scraped page text.
    # literal_eval parses Python literals only, so that text is never executed as code.
    curriculum = ast.literal_eval(course.curriculum)

    # Explicit UTF-8: scraped titles may contain non-ASCII characters that the
    # platform default encoding cannot represent.
    with open(outline_path, 'w', encoding='utf-8') as f:
        for tindex, (topic, topic_subtopics) in enumerate(curriculum.items(), start=1):
            f.write("{}. {}\n".format(tindex,topic))
            for sindex, subtopic in enumerate(topic_subtopics, start=1):
                f.write('\t{}.{} {}\n'.format(tindex,sindex,subtopic))

In [10]:
# Close the browser session held by `driver`.
driver.quit()

In [16]:
# Save the scraped course table to t20_courses.csv, omitting the index column.
t20_courses.to_csv("t20_courses.csv", index=False)

In [203]:
from rapidfuzz import fuzz
import itertools
# Remove pandas' limit on displayed rows so long Series/DataFrames are shown in full.
pd.set_option('display.max_rows', None)

In [28]:
# Collect the distinct section titles (dict keys) and lecture titles
# (dict values) across the curricula of all courses.
topics = set()
subtopics = set()

for curriculum_repr in t20_courses['curriculum']:
    # Each cell holds the repr of a dict built from scraped page text.
    # literal_eval parses Python literals only, so that text is never executed as code.
    curriculum = ast.literal_eval(curriculum_repr)
    topics.update(curriculum)  # iterating a dict yields its keys
    for lecture_titles in curriculum.values():
        subtopics.update(lecture_titles)

In [65]:
import re

def remove_noise(text):
    """Reduce a title to lowercase ASCII letters separated by single spaces.

    Every character other than a-z, A-Z and the space is deleted (not
    replaced), so digits, punctuation, tabs and newlines vanish and the text
    on either side of them is joined. Runs of spaces collapse to one space
    and leading/trailing spaces are dropped.
    """
    letters_and_spaces = re.sub(r'[^a-zA-Z ]+', '', text)
    # Only letters and spaces remain, so a whitespace split + single-space
    # join collapses space runs and trims both ends in one step.
    words = letters_and_spaces.split()
    return ' '.join(words).lower()

In [66]:
# Normalise every distinct section / lecture title with remove_noise.
# NOTE: set iteration order is not guaranteed, so the positional index given
# to each title here should not be relied on across kernel sessions.
topics_cl = pd.Series(list(topics)).apply(remove_noise)
subtopics_cl = pd.Series(list(subtopics)).apply(remove_noise)

In [255]:
# Display the cleaned section titles sorted alphabetically.
topics_cl.sort_values()

137                                 advanced nltk topics
9                       annex artificial neural networks
93                                              appendix
51                                             attention
19                 basics of natural language processing
73            beginners corner working with word vectors
55                                    bidirectional rnns
105                                       bonus material
56                               bonus section thank you
36                     build your own sentiment analyzer
4                           build your own spam detector
126                     building a chatbot with deep nlp
138             categorizing and tagging words with nltk
116         cnn for nlp application sentimental analysis
53                                 cnn for nlp intuition
13                                  collecting text data
103             complete text cleaning and preprocessing
152                            

## Using TF-IDF

In [242]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [230]:
# Fit a TF-IDF model on the cleaned section titles using n-grams of length 1
# through 5; X is the resulting document-term matrix (one row per title).
vectorizer = TfidfVectorizer(ngram_range=(1, 5))
X = vectorizer.fit_transform(topics_cl)

In [241]:
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); use whichever the installed version provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense TF-IDF table: one row per cleaned title, one column per n-gram.
# toarray() yields a plain ndarray (todense() returns the legacy np.matrix type).
tfidf_matrix = pd.DataFrame(X.toarray(), columns=feature_names, index=topics_cl)

In [246]:
# Title-by-title cosine similarity of the TF-IDF rows, labelled with the
# cleaned titles on both axes. With a single argument the rows are compared
# against themselves.
csim_matrix = pd.DataFrame(
    cosine_similarity(tfidf_matrix),
    index=topics_cl,
    columns=topics_cl,
)

In [252]:
# Position (in csim_matrix's columns) of the title to inspect.
n=15
print(csim_matrix.columns[n])
# Cosine similarity between that title and every title, highest first.
csim_matrix.loc[csim_matrix.columns[n]].sort_values(ascending=False)

nltk and the basics


nltk and the basics                                                             1.000000
nltk exploration                                                                0.131584
keras and tensorflow basics                                                     0.096983
deep learning basics                                                            0.089797
advanced nltk topics                                                            0.089562
python text basics                                                              0.089163
theano and tensorflow basics review                                             0.078036
natural language processing basics                                              0.071523
categorizing and tagging words with nltk                                        0.071435
machine learning basics review                                                  0.069473
introduction to the course the key concepts and software tools                  0.062118
processing raw text w

## Checking using `rapidfuzz.process`

In [218]:
from rapidfuzz import process

# Ten cleaned section titles that best match the query under the WRatio scorer.
# NOTE(review): the query string ends with a trailing space; whether that
# affects the scores depends on the preprocessing of the installed rapidfuzz — confirm.
process.extract("basics of natural language processing ", topics_cl, scorer=fuzz.WRatio, limit=10)

[('basics of natural language processing', 100.0, 19),
 ('natural language processing basics', 95.0, 155),
 ('natural language processing nlp', 88.44827586206895, 113),
 ('nltk and the basics', 85.5, 15),
 ('metrics for language', 85.5, 27),
 ('language classification', 85.5, 66),
 ('deep learning basics', 85.5, 81),
 ('getting started with nltk natural language processing toolkit', 85.5, 89),
 ('find and represent the meaning or topic of natural language text',
  85.5,
  107),
 ('python text basics', 85.5, 133)]

## Checking using a fuzzy string-matching matrix

In [219]:
# Cross-tabulating the titles against themselves is used here only to obtain a
# square frame labelled by title on both axes; its counts are overwritten below.
matrix = pd.crosstab(topics_cl, topics_cl)
# Replace each column with the WRatio score between the column's title
# (col.name) and every row title (col.index).
matrix = matrix.apply(lambda col: [fuzz.WRatio(col.name, x) for x in col.index])

In [222]:
# Position (in matrix's columns) of the title to inspect.
n=2
print(matrix.columns[n])
# WRatio score between that title and every title, highest first.
matrix.loc[matrix.columns[n]].sort_values(ascending=False)

appendix


col_0
appendix                                                                        100.000000
attention                                                                        47.058824
outline review and logistical things                                             45.000000
transformer application                                                          45.000000
project predict the stock news headlines                                         45.000000
environment setup and installation                                               45.000000
project predict the sentiments of amazon customer                                45.000000
getting an idea of nlp and its applications                                      45.000000
getting started with nltk natural language processing toolkit                    45.000000
handling categorical data                                                        45.000000
handling missing values                                                          45.

## Checking using clustering

In [223]:
from sklearn.cluster import AffinityPropagation

# Cluster the titles with affinity propagation, fitted on the rows of the
# fuzzy-score matrix; random_state is fixed to 2021.
af = AffinityPropagation(random_state=2021).fit(matrix)
# Fitted attributes kept for later inspection: the exemplar indices and the
# per-row cluster labels.
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

In [224]:
# Rows (titles) whose affinity-propagation label is 1.
matrix[labels==1]

col_0,advanced nltk topics,annex artificial neural networks,appendix,attention,basics of natural language processing,beginners corner working with word vectors,bidirectional rnns,bonus material,bonus section thank you,build your own sentiment analyzer,...,webscraping extract data from webpages,welcome,welcome to the course,word embeddings,word embeddings and wordvec,word embeddings using glove,wordvec analysis,wordvec in detail and what is going on under the hood,working with text files,write your own article spinner
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cnn for nlp intuition,34.146341,41.509434,33.75,52.941176,38.571429,38.571429,38.974359,32.571429,38.863636,42.857143,...,30.0,15.545455,40.714286,27.777778,29.166667,33.333333,32.432432,40.714286,36.363636,39.215686
deep learning for nlp,34.146341,38.571429,33.75,40.0,34.285714,38.571429,35.897436,36.642857,30.227273,36.642857,...,34.285714,38.571429,33.333333,33.333333,37.5,37.5,32.432432,38.571429,34.545455,35.294118
effective learning strategies for machine learning faq by student request,33.75,39.375,22.5,30.4,36.486486,40.0,40.0,38.571429,37.173913,38.181818,...,40.263158,25.714286,38.571429,36.0,33.333333,33.333333,33.75,39.68254,43.043478,39.9
preprocessing for nlp,32.439024,31.090909,33.75,34.2,58.235294,42.857143,41.025641,30.535714,31.818182,34.285714,...,36.642857,38.571429,31.666667,33.333333,37.5,41.666667,32.432432,34.285714,34.545455,39.215686


In [225]:
from sklearn.cluster import KMeans

# Partition the titles into 25 groups using the rows of the fuzzy-score matrix.
# random_state is pinned (same seed as the AffinityPropagation cell): with
# n_init=1 the outcome rests on a single random initialisation, so without a
# seed the cluster ids inspected below would change on every run.
kmeans = KMeans(n_clusters=25, init='k-means++', max_iter=500, n_init=1, random_state=2021)
kmeans.fit(matrix)
predicted = kmeans.predict(matrix)

In [226]:
# Rows (titles) whose k-means label is 21.
matrix[predicted==21]

col_0,advanced nltk topics,annex artificial neural networks,appendix,attention,basics of natural language processing,beginners corner working with word vectors,bidirectional rnns,bonus material,bonus section thank you,build your own sentiment analyzer,...,webscraping extract data from webpages,welcome,welcome to the course,word embeddings,word embeddings and wordvec,word embeddings using glove,wordvec analysis,wordvec in detail and what is going on under the hood,working with text files,write your own article spinner
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
keras and tensorflow basics,46.808511,45.084746,42.75,47.5,47.5,34.782609,33.25,38.571429,30.4,38.0,...,40.0,25.714286,35.625,38.571429,42.222222,33.333333,42.75,85.5,32.0,42.105263
machine learning and neurons,39.583333,50.0,33.75,40.0,40.923077,42.857143,40.0,32.142857,39.215686,40.491803,...,36.363636,38.571429,36.734694,30.0,40.0,32.727273,32.0625,85.5,31.372549,39.310345
nlp and transformers,47.5,42.631579,33.75,40.0,40.5,36.0,35.0,35.294118,37.209302,39.375,...,40.5,38.571429,34.146341,28.571429,40.425532,28.297872,36.944444,85.5,32.55814,34.054054
nltk and the basics,58.461538,40.5,42.75,32.0625,85.5,36.0,30.810811,36.363636,36.190476,36.0,...,36.0,24.428571,45.0,35.294118,41.304348,26.086957,43.428571,85.5,42.857143,40.0
numpy and pandas,42.222222,33.75,45.0,30.0,39.375,24.137931,29.411765,33.333333,35.897436,39.375,...,33.75,12.857143,21.621622,25.806452,85.5,22.5,37.5,85.5,20.512821,33.75
question and answering,40.714286,38.703704,33.75,50.0,44.067797,40.909091,45.0,38.888889,44.444444,46.153846,...,36.666667,27.692308,32.55814,37.837838,40.816327,34.897959,36.842105,85.5,29.555556,34.615385


In [None]:
import scipy
import scipy.cluster.hierarchy as sch

def cluster_corr(corr_array, inplace=False):
    """Reorder a square similarity/correlation matrix so clustered items sit together.

    Rows are grouped by complete-linkage hierarchical clustering on their
    pairwise distances (pdist's default metric). The tree is cut at half the
    largest pairwise distance, and rows and columns are then permuted by
    ascending cluster id.

    Parameters
    ----------
    corr_array : numpy.ndarray or pandas.DataFrame
        Square (N, N) matrix.
    inplace : bool, default False
        Kept for backward compatibility. The reordered result is always a new
        object; the flag only decides whether an input with fewer than two
        rows is handed back as-is (True) or as a copy (False).

    Returns
    -------
    numpy.ndarray or pandas.DataFrame
        Same type as `corr_array`, with the same permutation applied to rows
        and columns.
    """
    # Fewer than two rows: there are no pairwise distances to cluster on
    # (linkage would raise), and nothing to reorder.
    if len(corr_array) < 2:
        return corr_array if inplace else corr_array.copy()

    pairwise_distances = sch.distance.pdist(corr_array)
    linkage = sch.linkage(pairwise_distances, method='complete')
    cluster_distance_threshold = pairwise_distances.max()/2
    idx_to_cluster_array = sch.fcluster(linkage, cluster_distance_threshold,
                                        criterion='distance')
    # Permutation that lists the rows cluster by cluster.
    idx = np.argsort(idx_to_cluster_array)

    if isinstance(corr_array, pd.DataFrame):
        # Apply the permutation to both axes directly. The former
        # `.iloc[idx, :].T.iloc[idx, :]` form returned the transpose, which
        # disagreed with the ndarray branch for non-symmetric input.
        return corr_array.iloc[idx, idx]
    return corr_array[idx, :][:, idx]