In [1]:
#check python version
!python --version

Python 3.9.12


### Import Libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import time

### Read In Data File and Clean It

In [4]:
#load the Udemy course file, keeping only the two columns used in this notebook
udemy_df = pd.read_csv(
    'udemy_master_df.csv',
    usecols=['Title', 'Summary'],
)

In [5]:
#keep only the rows that have a Summary (Title is not checked by this step)
udemy_df = udemy_df.dropna(subset=['Summary'])

In [7]:
#work on a separate copy; the cleaning steps below reassign course_df, not udemy_df
course_df = udemy_df.copy(deep=True)

In [8]:
#sanity check: number of missing values per column
course_df.isna().sum()

Title      0
Summary    0
dtype: int64

In [10]:
#drop rows that repeat an earlier row in every column, keeping the first occurrence
course_df = course_df.drop_duplicates(keep='first')

In [11]:
#keep only the courses whose Summary is pure ASCII
#(whole rows are dropped, characters are not stripped; Title is not checked)
course_df = course_df.loc[course_df['Summary'].map(lambda summary: summary.isascii())]

In [12]:
#A contractions dictionary from Wikipedia found on Stack Overflow for expanding contractions: 
#https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
#
#Maps each contraction (key) to the text that replaces it (value).
#Things to keep in mind when using it:
# - keys are written in one fixed case (lower-case, apart from the "I..." forms)
# - several values list alternative expansions separated by " / "; nothing here picks
#   one of them, the value is a single string
# - some keys are prefixes of keys that come later (e.g. "can't" and "can't've")
contractions_dict = { 
"ain't": "am not / are not / is not / has not / have not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is / how does",
"I'd": "I had / I would",
"I'd've": "I would have",
"I'll": "I shall / I will",
"I'll've": "I shall have / I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

In [13]:
#This code is code to expand contractions in text created by Abhishek Sharma:
#https://www.analyticsvidhya.com/blog/2020/04/beginners-guide-exploratory-data-analysis-text-data/

#Regular expression for finding contractions.
#Keys are regex-escaped and tried longest-first: Python's alternation accepts the first
#alternative that matches, so in plain dict order "can't" would win over "can't've"
#and the text would end up as "cannot've".
def _compile_contractions_re(mapping):
    """Return a compiled pattern that matches any key of ``mapping`` (longest key first)."""
    ordered_keys = sorted(mapping, key=len, reverse=True)
    return re.compile('(%s)' % '|'.join(re.escape(key) for key in ordered_keys))

#Pattern for the default dictionary, compiled once and reused on every call
contractions_re = _compile_contractions_re(contractions_dict)
_contractions_re_source = contractions_dict

#Function for expanding contractions
def expand_contractions(text,contractions_dict=contractions_dict):
    """Replace every contraction found in ``text`` with its expansion.

    Parameters
    ----------
    text : str
        Text to process.
    contractions_dict : dict, optional
        Mapping of contraction -> replacement text. Defaults to the module-level
        ``contractions_dict``. When another mapping is passed, a pattern is built
        from that mapping's own keys.

    Returns
    -------
    str
        ``text`` with each matched key replaced by its value. Matching is
        case-sensitive, so a key only matches in the exact case it is written in.
        An empty mapping returns ``text`` unchanged.
    """
    if not contractions_dict:
        # an empty alternation would match the empty string at every position
        return text

    if contractions_dict is _contractions_re_source:
        pattern = contractions_re
    else:
        pattern = _compile_contractions_re(contractions_dict)

    def replace(match):
        return contractions_dict[match.group(0)]

    return pattern.sub(replace, text)

# Expand contractions in the Summary and Title columns of course_df
# (the original note adds that job_df contains no contractions; job_df is not visible in this part of the notebook)
course_df['Summary'] = course_df['Summary'].apply(expand_contractions)
course_df['Title'] = course_df['Title'].apply(expand_contractions)

### Apply Keyword Extractor YAKE and Vectorize It Using CountVectorizer

In [14]:
#import yake for keyword extraction, import cosine sim and countvec for word vectors and similarity measuring
import yake
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
#YAKE settings, passed unchanged to the extractor below
language = 'en'                  # -> lan
max_ngram_size = 2               # -> n
deduplication_threshold = 0.9    # -> dedupLim
numOfKeywords = 35               # -> top

#build a single extractor that is reused for every summary
custom_kw_extractor = yake.KeywordExtractor(
    lan=language,
    n=max_ngram_size,
    dedupLim=deduplication_threshold,
    top=numOfKeywords,
    features=None,
)

In [16]:
#helper that runs the YAKE extractor on one text and keeps only the keyword part
def extract_keywords(x):
    """Return the first element of every item ``custom_kw_extractor`` yields for ``x``."""
    extracted = custom_kw_extractor.extract_keywords(x)
    return [item[0] for item in extracted]

In [17]:
#run the keyword extractor over every course summary and report the wall-clock time
start_time = time.time()

course_df['keywords'] = course_df['Summary'].apply(lambda summary: extract_keywords(summary))

#elapsed time is printed in minutes, rounded to 2 decimals
print("Time takes to extract keywords: ", round((time.time() - start_time) / 60, 2), " mins")

Time takes to extract keywords:  5.27  mins


### Get Similarity Between Courses

In [19]:
#create word vectors from the per-course keyword lists in course_df.
#The identity tokenizer makes every extracted keyword phrase one token, so with
#ngram_range=(1,2) the features are single keyword phrases plus pairs of adjacent ones.
count = CountVectorizer(
    max_df=0.7,
    max_features=10000,
    ngram_range=(1, 2),
    lowercase=False,
    tokenizer=lambda i: i,
)
count_matrix = count.fit_transform(course_df['keywords'])

#get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
#prefer get_feature_names_out() and fall back only on versions that lack it
if hasattr(count, 'get_feature_names_out'):
    feature_names = count.get_feature_names_out()
else:
    feature_names = count.get_feature_names()

#dense course x feature count table, labelled by course title
#(rows stay in course_df order)
countvec_df = pd.DataFrame(count_matrix.toarray(), columns=feature_names)

countvec_df.index = course_df['Title']

In [20]:
#pairwise cosine similarity between all course keyword vectors, stored as a
#Title x Title DataFrame for the recommender to look rows up in
start_time = time.time()

cosine_similarity_array = cosine_similarity(countvec_df)

cosine_similarity_df = pd.DataFrame(
    data=cosine_similarity_array,
    index=countvec_df.index,
    columns=countvec_df.index,
)

#elapsed time is printed in minutes, rounded to 2 decimals
print("Time takes to make cosine_similarity_df: ", round((time.time() - start_time) / 60, 2), " mins")

Time takes to make cosine_similarity_df:  3.03  mins


In [21]:
#vectorize the course summaries (unigrams and bigrams, original casing kept);
#the recommender transforms a job title with this same vectorizer and compares it
#against these rows
count2 = CountVectorizer(ngram_range=(1, 2), lowercase=False)

vectorized_data = count2.fit_transform(course_df['Summary'])

### Creating Recommender Model

In [23]:
def recommender(job_name,num):
    """Recommend courses for a job title.

    The job title is vectorized with ``count2`` and compared with every course
    summary in ``vectorized_data``. The best-matching course becomes the seed, and
    the ``num`` courses most similar to the seed according to
    ``cosine_similarity_df`` are returned.

    Parameters
    ----------
    job_name : str
        Job title to search for. ``count2`` is built with ``lowercase=False``,
        so matching is case-sensitive.
    num : int
        Maximum number of courses to return; values <= 0 give an empty result.

    Returns
    -------
    pandas.Series
        Similarity scores in descending order, indexed by course title and named
        after the seed course. The seed course itself is never included. If the
        job title shares no terms with any summary (all similarities are 0), an
        empty Series named ``job_name`` is returned instead of an arbitrary match.
    """
    #create a vector of the inputted job title using the vectorizer fitted on the summaries
    query_vec = count2.transform([job_name])

    #similarity of the job title to every course summary
    similarity = cosine_similarity(query_vec, vectorized_data).flatten()

    #nothing to recommend when the catalogue is empty or nothing overlaps the query
    if similarity.size == 0 or not similarity.max() > 0:
        return pd.Series(dtype='float64', name=job_name)

    #position of the best-matching course. Only the single best match is used, so a
    #plain argmax replaces the earlier argpartition/reverse/first-element chain
    seed_pos = int(np.argmax(similarity))

    #look the seed up by position rather than by title: cosine_similarity_df rows are
    #in course_df order, and a title lookup returns several rows when titles repeat
    seed_similarities = cosine_similarity_df.iloc[seed_pos]

    #drop the seed explicitly instead of assuming it sorts first (it does not when its
    #keyword vector is all zeros, and it may not when another course ties with it)
    keep = np.ones(len(seed_similarities), dtype=bool)
    keep[seed_pos] = False
    other_courses = seed_similarities.iloc[keep]

    #order values in descending order and take the top num
    course_list = other_courses.sort_values(ascending=False).iloc[:max(num, 0)]

    return course_list

In [24]:
#time one top-5 recommendation request for the job title 'Registered Nurse'
#(elapsed time is printed in minutes rounded to 2 decimals, so sub-second runs show 0.0)
start_time = time.time()
print(recommender(job_name='Registered Nurse', num=5))
print("Time takes recommender to recommend: ", round((time.time() - start_time) / 60, 2), " mins")

Title
HP LoadRunner 12.55                                             0.734847
WebServices Performance Testing Using Loadrunner(SOAP &REST)    0.690268
Loadrunner 12.50 SAPGUI Protocol scripting -No Access to lab    0.670820
Performance Testing using LoadRunner 12.50                      0.593442
Pass the Real Estate Salesperson Exam                           0.507093
Name: Bitcoin and CryptoCurrency Jump Start Course, dtype: float64
Time takes recommender to recommend:  0.0  mins


In [25]:
#time one top-5 recommendation request for the job title 'Translator'
#(elapsed time is printed in minutes rounded to 2 decimals, so sub-second runs show 0.0)
start_time = time.time()
print(recommender(job_name='Translator', num=5))
print("Time takes recommender to recommend: ", round((time.time() - start_time) / 60, 2), " mins")

Title
Become an eBay Entrepreneur- Selling Clothes Online           0.447214
Freelance Work From Home: Top 5 Freelancing Skills            0.421637
Digital Nomad How-to Guide: Remote Work & Travel The World    0.400000
How to Become a Medical Transcriptionist                      0.381385
Seth Godin's Freelancer Course                                0.365148
Name: Work From Home in Translation | Upwork Translation Course, dtype: float64
Time takes recommender to recommend:  0.0  mins


In [26]:
#time one top-5 recommendation request for the job title 'Software Engineer'
#(elapsed time is printed in minutes rounded to 2 decimals, so sub-second runs show 0.0)
start_time = time.time()
print(recommender(job_name='Software Engineer', num=5))
print("Time takes recommender to recommend: ", round((time.time() - start_time) / 60, 2), " mins")

Title
PG Diploma in Piping Design Engineering    0.554700
How to Design Your Dream Kitchen           0.480384
Plumbing System Design Basics (MEP)        0.462250
The Fundamentals of Project Management     0.461538
Electrical Designing Basics (MEP)          0.438529
Name: The Complete Software Engineering Course for Beginners, dtype: float64
Time takes recommender to recommend:  0.0  mins


In [27]:
#time one top-5 recommendation request for the job title 'Driver'
#(elapsed time is printed in minutes rounded to 2 decimals, so sub-second runs show 0.0)
start_time = time.time()
print(recommender(job_name='Driver', num=5))
print("Time takes recommender to recommend: ", round((time.time() - start_time) / 60, 2), " mins")

Title
How to Create a Brilliant Newsletter People Want to Read?       0.500000
Leer betere teksten schrijven, ga leukere content schrijven     0.500000
How to live stream, step by step with Open Broadcaster (OBS)    0.447214
Bubble Milk Tea Basic Business Training                         0.353553
Create & Sell Courses Online (Course Creator's Sales Funnel)    0.353553
Name: Track Any HTML5 Content with Custom SCORM, dtype: float64
Time takes recommender to recommend:  0.0  mins


In [28]:
#time one top-5 recommendation request for the job title 'Retail Salesperson'
#(elapsed time is printed in minutes rounded to 2 decimals, so sub-second runs show 0.0)
start_time = time.time()
print(recommender(job_name='Retail Salesperson', num=5))
print("Time takes recommender to recommend: ", round((time.time() - start_time) / 60, 2), " mins")

Title
Find & Sell Products With Amazon FBA | Beginners Tutorial       0.707107
Amazon FBA Arbitrage: Make Money Without Private Labeling       0.617213
Retail Arbitrage                                                0.547723
Updated: Complete fast-track Amazon FBA beginner course 2020    0.544331
Amazon FBA Suspension Prevention Course                         0.544331
Name: Arbitrage 101: The Amazon Course, dtype: float64
Time takes recommender to recommend:  0.0  mins


In [29]:
#time one top-5 recommendation request for the job title 'Project Manager'
#(elapsed time is printed in minutes rounded to 2 decimals, so sub-second runs show 0.0)
start_time = time.time()
print(recommender(job_name='Project Manager', num=5))
print("Time takes recommender to recommend: ", round((time.time() - start_time) / 60, 2), " mins")

Title
Project Management: From Beginner to "Project Manager"         0.559017
Management Crash Course: Tactical Training for New Managers    0.547723
Financial Accounting for Beginners                             0.534522
Learn Project Management with Photoshop 2020                   0.534522
Executive Strategy & Management                                0.530330
Name: Project Management: Getting Started and Beyond, dtype: float64
Time takes recommender to recommend:  0.0  mins


In [30]:
#time one top-5 recommendation request for the job title 'DevOps Engineer'
#(elapsed time is printed in minutes rounded to 2 decimals, so sub-second runs show 0.0)
start_time = time.time()
print(recommender(job_name='DevOps Engineer', num=5))
print("Time takes recommender to recommend: ", round((time.time() - start_time) / 60, 2), " mins")

Title
Learn DevOps: Infrastructure Automation With Terraform         0.462910
Amazon Web Services(AWS) for Beginners                         0.408248
Docker from A to Z™: Swarm + Jenkins                           0.408248
Kubernetes + Docker Complete Course - 2 in 1 Hands On!         0.365148
Terraform indepth(2020) - With 10 Realworld Job Casestudies    0.365148
Name: DevOps Tutorial: Complete Beginners Training - 5 in 1 Bundle, dtype: float64
Time takes recommender to recommend:  0.0  mins


In [31]:
#time one top-5 recommendation request for the job title 'Warehouse Associate'
#(elapsed time is printed in minutes rounded to 2 decimals, so sub-second runs show 0.0)
start_time = time.time()
print(recommender(job_name='Warehouse Associate', num=5))
print("Time takes recommender to recommend: ", round((time.time() - start_time) / 60, 2), " mins")

Title
Microsoft Azure: Storage and Database                           0.612372
Implementing a Data Warehouse with SQL Server 2012              0.609272
Data Warehousing for beginners.                                 0.522233
Pentaho Data Integration(PDI) Fundamentals and DWH Concepts     0.522233
Microsoft SQL Server 2012 Certification Training Exam 70-463    0.476290
Name: Database Services on Microsoft AZURE, dtype: float64
Time takes recommender to recommend:  0.0  mins


In [32]:
#time one top-5 recommendation request for the job title 'Cashier'
#(elapsed time is printed in minutes rounded to 2 decimals, so sub-second runs show 0.0)
start_time = time.time()
print(recommender(job_name='Cashier', num=5))
print("Time takes recommender to recommend: ", round((time.time() - start_time) / 60, 2), " mins")

Title
 Neuroscience Marketing & Persuasion [OUT NOW]            0.377964
Complete Project management crash course                  0.377964
TikTok Anti Shadowban Class                               0.377964
The Data Science & Machine Learning Bootcamp in Python    0.377964
Learn Brave                                               0.377964
Name: Become Top Banker With Complete Bank Teller Training - 2019, dtype: float64
Time takes recommender to recommend:  0.0  mins


In [33]:
#time one top-5 recommendation request for the query 'Customer Service Representation'
#(elapsed time is printed in minutes rounded to 2 decimals, so sub-second runs show 0.0)
start_time = time.time()
print(recommender(job_name='Customer Service Representation', num=5))
print("Time takes recommender to recommend: ", round((time.time() - start_time) / 60, 2), " mins")

Title
How to Measure Customer Service                              0.866025
World Class Customer Service                                 0.750000
Customer Service Mastery: Delight Every Customer             0.666667
Perfect Customer Service Template: Easily Train Your Team    0.666667
Start Improving Customer Service                             0.577350
Name: Customer Service Basics, dtype: float64
Time takes recommender to recommend:  0.0  mins


In [49]:
#spot check: top-5 recommendations for the query 'Data Science' (shown as the cell output)
recommender(job_name='Data Science', num=5)

Title
Data Science Masterclass With R! 4 Projects+8 Case Studies    0.730297
Learn Data Science From Scratch                               0.654654
Learn Data Science Basics                                     0.589256
R programming from Scratch & Practice Case Studies workout    0.583333
Hands-on Tableau-10: Data Science Case Studies In Tableau     0.583333
Name: Data Science and Machine Learning Masterclass with R, dtype: float64

In [50]:
#spot check: top-5 recommendations for the query 'Administrative Assistant' (shown as the cell output)
recommender(job_name='Administrative Assistant', num=5)

Title
Sentiment analysis for chatbots - DialogFlow, IBM Watson        0.572078
Building Chatbots with Amazon Lex and IBM Watson                0.426401
IBM Watson for Artificial Intelligence & Cognitive Computing    0.402015
Comprehensive Guide to Artificial Intelligence(AI) for All      0.363636
Machine Learning with AWS AI and IBM Watson                     0.334497
Name: Building Chatbots with IBM Watson Assistant: End-to-End, dtype: float64