In [410]:
# Retrieve the Udemy API call credentials and Setup the HTTPS calller
# import package os to retrieve the secured API credentials from the environment variables
# import the installed package pyudemy

import os
from pyudemy import Udemy

UDEMY_API_CLIENT_ID = os.environ.get("UDEMY_API_CLIENT_ID")
UDEMY_API_CLIENT_SECRET = os.environ.get("UDEMY_API_CLIENT_SECRET")

udemy = Udemy(UDEMY_API_CLIENT_ID, UDEMY_API_CLIENT_SECRET)

In [411]:
# Testing to inspect the udemy api call result
# Returns sample list of courses.
test_result = udemy.courses(page = 1, page_size=2)

print(test_result.keys())
print("\n")
print(test_result["results"][0].keys())
print("\n")

dict_keys(['count', 'next', 'previous', 'results', 'aggregations'])


dict_keys(['_class', 'id', 'title', 'url', 'is_paid', 'price', 'price_detail', 'price_serve_tracking_id', 'visible_instructors', 'image_125_H', 'image_240x135', 'is_practice_test_course', 'image_480x270', 'published_title', 'tracking_id', 'locale', 'predictive_score', 'relevancy_score', 'input_features', 'lecture_search_result', 'curriculum_lectures', 'order_in_results', 'curriculum_items', 'headline', 'instructor_name'])




In [412]:
# Set the parameters required to call the API
# 'Object' respresents the type of item, 'Setting' and 'Additions' describes the fields to return and 'Minus' identifies the fields to exclude

paramters = [
    { 
        "Object": "course",
        "Setting": "title",
        "Additions": ["owner", "avg_rating", "headline", "url"],
        "Minus": ["images", "curriculum_items"]
    }
]

#pagination start page number
start_page = 1

#pagination items to return
total_pages_to_load = 51

# maximum page size is 100
total_page_size = 100

# a list that will store all returned items
courses = []

# because the maximum items the api can return on a call is 100, this loop handles recalling the endpoint to get more courses up to 5000
for page_number in range(start_page, total_pages_to_load):
    result = udemy.courses(fields= paramters, page = page_number, page_size=total_page_size)
    courses.extend(result["results"])


#verify the size of the list after all the api calls are done
len(courses)
 

5000

In [414]:
# import package pandas for the purpose of reading csv and creating DataFrames 
import pandas as pd

# Check if courses were not successfully loaded via the api calls, 
# then default to the already downloaded copy of the dataset from the official website into DataFrame.
if len(courses) == 0:
    df3 = pd.read_csv("/Users/ayo/Downloads/udemy_courses_dataset.csv", index=False, encoding = 'utf-8')

# Else loop through and pick only the required fields relevant to the recommender model and store in a DataFrame
else:
    records = []
    rec = {}
    for e in courses:
        records.append({key: e[key] for key in {'_class', 'id', 'title', 'headline', 'url', 'avg_rating'}})
    
    df3 = pd.DataFrame(records)

# Clean the dataset by dropping any N/A values
df3.dropna(inplace=True)

# Display a sample of 4 items from the DataFrame
df3.head(4)


Unnamed: 0,_class,id,headline,title,url,avg_rating
0,course,473160,Learn web design in 1 hour with 25+ simple-to-...,Web Design for Web Developers: Build Beautiful...,/course/web-design-secrets/,4.462351
1,course,433798,A Quick and Easy Intro to Python Programming,Introduction To Python Programming,/course/pythonforbeginnersintro/,4.399265
2,course,5340996,Prepare for your technical interview by going ...,Python Interview Preparation Coding Exercises,/course/python-interview-preparation-coding-ex...,3.596154
3,course,53600,Learn the basics of Microsoft Excel and become...,Useful Excel for Beginners,/course/useful-excel-for-beginners/,4.502582


In [415]:
# Show a statistical description of the courses DataFrame
df3.describe()

Unnamed: 0,id,avg_rating
count,5000.0,5000.0
mean,3321161.0,4.230427
std,1731873.0,0.781919
min,9061.0,0.0
25%,1712098.0,4.129365
50%,3738120.0,4.384615
75%,4972418.0,4.566667
max,5533080.0,5.0


In [417]:
# Create a new custom column 'title_headline' into the DataFrame 
# by concatenating the title and headline to increase the words frequency weight

df3['title_headline'] = df3['title'] + " " + df3['headline']

df3.head(4)

Unnamed: 0,_class,id,headline,title,url,avg_rating,title_headline
0,course,473160,Learn web design in 1 hour with 25+ simple-to-...,Web Design for Web Developers: Build Beautiful...,/course/web-design-secrets/,4.462351,Web Design for Web Developers: Build Beautiful...
1,course,433798,A Quick and Easy Intro to Python Programming,Introduction To Python Programming,/course/pythonforbeginnersintro/,4.399265,Introduction To Python Programming A Quick and...
2,course,5340996,Prepare for your technical interview by going ...,Python Interview Preparation Coding Exercises,/course/python-interview-preparation-coding-ex...,3.596154,Python Interview Preparation Coding Exercises ...
3,course,53600,Learn the basics of Microsoft Excel and become...,Useful Excel for Beginners,/course/useful-excel-for-beginners/,4.502582,Useful Excel for Beginners Learn the basics of...


In [418]:
# A Function that creates a similarity matrix between the courses.

# import TfidfVector from sklearn.
# import linear_kernel from sklearn.metrics.pairwise which does same process as cosine_similarity but at a much faster speed

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


def create_cosine_similarity_matrix(courses):

        # Define a tfidf vectorizer and remove all stopwords.
        tfidf = TfidfVectorizer(stop_words="english")
    
        #Convert tfidf matrix by fitting and transforming the data.
        tfidf_matrix = tfidf.fit_transform(courses)
    
        # output the shape of the matrix.
        tfidf_matrix.shape
    
        # calculating the cosine similarity matrix.
        cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

        #return the resulting smililarity matrix
        return cosine_sim

In [470]:
# Function that takes in a school student registered course title as input and outputs most similar Udemy Courses
def get_course_recommendations(course, courses):

    # Construct a reverse map of indices and course titles
    indices = pd.Series(courses.index, index=courses['title_headline']).drop_duplicates()

    # Get the index of the course that matches the title
    idx = indices[course]

    # create the similarity matrix
    cosine_sim = create_cosine_similarity_matrix(courses['title_headline'])

    # Get the pairwsie similarity scores of all courses with that course
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the courses based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    #filter for only scores that have exact or partial martial to exclude those with no match
    res = list(filter(lambda x: x[1] > 0, sim_scores))
  
    # Get the scores of the 5 most similar course
    
    sim_scores = res[1:6]

    # Get the course indices
    course_indices = [i[0] for i in sim_scores]

    # Return the top 5 most similar courses
    return indices.iloc[course_indices]


In [479]:
# Function to test the course recommender model

def test_course_recommender_model(student_registered_course):

    # create a copy of the initial DataFrame containing the original dataset for courses
    df5 = df3

    # append the student registered course to the DataFrame to allow computaion of similarity courses against
    search = {'title_headline': student_registered_course, '_class': '', 'id':0, 'title': student_registered_course, 
              'headline': '', 'url': '', 'avg_rating': ''}
    df5 = df5._append(search, ignore_index=True) 

    # call the get_course_recommendations function that returns a list of recommended courses
    results = get_course_recommendations(student_registered_course, df5)

    # print the recommended courses to console
    print(results)
    
    # Get the recommended courses indices
    course_indices_result = [i for i in results]

    # Get the items of courses from the original course DataFrame
    similar_courses = df3.iloc[course_indices_result]

    # Sort the recommended courses using the average rating for each of the courses
    sorted_similar_courses = similar_courses.sort_values(by=['avg_rating'], ascending=False)

    # return the recommended and sorted courses
    return sorted_similar_courses


In [472]:
# Test the recommender model with a student that registered for 'Statistics' as a course 
registered_course = 'Statistics'
result = test_course_recommender_model(registered_course)
result


title_headline
Essential Statistics for Data Science Statistics for Beginners                                                                1027
Explore Basic Statistics With IBM SPSS STATISTICS start learning data analysis                                                2221
Statistics and data literacy for non-statisticians Learn the key terms and analysis methods in statistics                       29
Statistics with MATLAB Statistics with MATLAB (Please don't give rank to the lecture before all the lectures are uploaded)    3183
Statistics with MATLAB Statistics with MATLAB (Please don't give rank to the lecture before all the lectures are uploaded)    3218
dtype: int64


Unnamed: 0,_class,id,headline,title,url,avg_rating,title_headline
3183,course,2756630,Statistics with MATLAB (Please don't give rank...,Statistics with MATLAB,/course/statistics-with-matlab/,4.75,Statistics with MATLAB Statistics with MATLAB ...
3218,course,2756630,Statistics with MATLAB (Please don't give rank...,Statistics with MATLAB,/course/statistics-with-matlab/,4.75,Statistics with MATLAB Statistics with MATLAB ...
29,course,3212317,Learn the key terms and analysis methods in st...,Statistics and data literacy for non-statistic...,/course/statistics-literacy-for-non-statistici...,4.522728,Statistics and data literacy for non-statistic...
1027,course,4700258,Statistics for Beginners,Essential Statistics for Data Science,/course/essential-statistics-for-data-science/,4.241228,Essential Statistics for Data Science Statisti...
2221,course,3721794,start learning data analysis,Explore Basic Statistics With IBM SPSS STATISTICS,/course/spss-statistics/,4.0,Explore Basic Statistics With IBM SPSS STATIST...


In [473]:
# Test the recommender model with another student that registered for 'MSc Big Data Analytics' as a course 
registered_course = 'MSc Big Data Analytics'
result = test_course_recommender_model(registered_course)
result


title_headline
Big data Internship Program - Foundation A Complete Guide to Learn Big data and Hadoop from Scratch.    2525
Knowledge Exchange: Data Analytics in a Nutshell An overview of data analytics for beginners            4830
Introduction to Data Analytics Journey to Data Analyst                                                  1998
Introduction to Data Analytics Journey to Data Analyst                                                  2007
Big Data and Hadoop Essentials Essential Knowledge for everyone associated with Big Data & Hadoop        475
dtype: int64


Unnamed: 0,_class,id,headline,title,url,avg_rating,title_headline
1998,course,4729010,Journey to Data Analyst,Introduction to Data Analytics,/course/microsoft-data-analytics/,4.337209,Introduction to Data Analytics Journey to Data...
2007,course,4729010,Journey to Data Analyst,Introduction to Data Analytics,/course/microsoft-data-analytics/,4.337209,Introduction to Data Analytics Journey to Data...
2525,course,873604,A Complete Guide to Learn Big data and Hadoop ...,Big data Internship Program - Foundation,/course/big-data-internship-program-part-1-fou...,4.3,Big data Internship Program - Foundation A Com...
475,course,225796,Essential Knowledge for everyone associated wi...,Big Data and Hadoop Essentials,/course/big-data-and-hadoop-essentials-free-tu...,3.866667,Big Data and Hadoop Essentials Essential Knowl...
4830,course,2997522,An overview of data analytics for beginners,Knowledge Exchange: Data Analytics in a Nutshell,/course/knowledge-exchange-data-analytics-in-a...,3.0,Knowledge Exchange: Data Analytics in a Nutshe...


In [478]:
# A block that provides a Web API endpoint for the recommender model
# Install package jupyter_kernel_gateway with pip3 install jupyter_kernel_gateway
# Start the API server with command run in terminal 

'''jupyter kernelgateway --api='kernel_gateway.notebook_http' 
  --seed_uri='/Users/ayo/ayotunde/Trainings/Python/python_training/dissertation/courses_recommender_model.ipynb' --port 8891
'''

# GET /recommender_model/:registered_course

import json
import requests

# an object representing the api request
REQUEST = json.dumps({
'path' : {},
'args' : {}
})

request = json.loads(REQUEST)

# Get the registered course from the query string
registered_course_param = request['path'].get('registered_course')

# pass the registered course
result = test_course_recommender_model(registered_course_param)

# return the recommdended courses to the user
print(json.dumps({
    'result': result.to_json(orient='records')
}))