In [2]:
import pandas as pd
import neattext.functions as nfx
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
#!pip install neattext

In [3]:
data = pd.read_csv('udemy_courses.csv')

In [4]:
data.columns

Index(['course_id', 'course_title', 'url', 'is_paid', 'price',
       'num_subscribers', 'num_reviews', 'num_lectures', 'level',
       'content_duration', 'published_timestamp', 'subject'],
      dtype='object')

In [5]:
data.head(1)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance


In [6]:
data.isnull().sum()

course_id              0
course_title           0
url                    0
is_paid                0
price                  0
num_subscribers        0
num_reviews            0
num_lectures           0
level                  0
content_duration       0
published_timestamp    0
subject                0
dtype: int64

In [7]:
data.duplicated().any()

np.True_

In [8]:
data[data.duplicated()]

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
787,837322,Essentials of money value: Get a financial Life !,https://www.udemy.com/essentials-of-money-value/,True,20,0,0,20,All Levels,0.616667,2016-05-16T18:28:30Z,Business Finance
788,1157298,Introduction to Forex Trading Business For Beg...,https://www.udemy.com/introduction-to-forex-tr...,True,20,0,0,27,Beginner Level,1.5,2017-04-23T16:19:01Z,Business Finance
894,1035638,Understanding Financial Statements,https://www.udemy.com/understanding-financial-...,True,25,0,0,10,All Levels,1.0,2016-12-15T14:56:17Z,Business Finance
1100,1084454,CFA Level 2- Quantitative Methods,https://www.udemy.com/cfa-level-2-quantitative...,True,40,0,0,35,All Levels,5.5,2017-07-02T14:29:35Z,Business Finance
1473,185526,MicroStation - Células,https://www.udemy.com/microstation-celulas/,True,20,0,0,9,Beginner Level,0.616667,2014-04-15T21:48:55Z,Graphic Design
2561,28295,Learn Web Designing & HTML5/CSS3 Essentials in...,https://www.udemy.com/build-beautiful-html5-we...,True,75,43285,525,24,All Levels,4.0,2013-01-03T00:55:31Z,Web Development


In [9]:
data = data.drop_duplicates()

In [10]:
data.shape

(3672, 12)

In [11]:
data['course_title']

0                      Ultimate Investment Banking Course
1       Complete GST Course & Certification - Grow You...
2       Financial Modeling for Business Analysts and C...
3       Beginner to Pro - Financial Analysis in Excel ...
4            How To Maximize Your Profits Trading Options
                              ...                        
3673    Learn jQuery from Scratch - Master of JavaScri...
3674    How To Design A WordPress Website With No Codi...
3675                        Learn and Build using Polymer
3676    CSS Animations: Create Amazing Effects on Your...
3677    Using MODX CMS to Build Websites: A Beginner's...
Name: course_title, Length: 3672, dtype: object

In [12]:
data.columns

Index(['course_id', 'course_title', 'url', 'is_paid', 'price',
       'num_subscribers', 'num_reviews', 'num_lectures', 'level',
       'content_duration', 'published_timestamp', 'subject'],
      dtype='object')

### Popularity-based recommendation system 

In [17]:
def popularity_based_recommendation(df,top_n=10):
    # Calculate popularity score for each course
    data['popularity_score'] = 0.6 * data['num_subscribers'] + 0.4 * data['num_reviews']
    
    # Sort courses by popularity score in descending order
    df_sorted = data.sort_values(by='popularity_score', ascending=False)
    
    # Return the recommended courses (course titles and popularity scores)
    recommended_courses = df_sorted[['course_title', 'popularity_score']].head(top_n)
    
    return recommended_courses

In [18]:
popularity_based_recommendation(data)

Unnamed: 0,course_title,popularity_score
2827,Learn HTML5 Programming From Scratch,164805.4
3032,Coding for Entrepreneurs Basic,96729.0
3230,The Web Developer Bootcamp,83928.4
3232,The Complete Web Developer Course 2.0,77672.0
2783,Build Your First Website in 1 Week with HTML5 ...,74544.2
2589,Web Design for Web Developers: Build Beautiful...,61925.0
1896,Free Beginner Electric Guitar Lessons,61109.2
3247,JavaScript: Understanding the Weird Parts,54557.6
3204,Angular 4 (formerly Angular 2) - The Complete ...,52129.4
3289,Practical PHP: Master the Basics and Code Dyna...,52081.4


### Content-Based Recommendation System

In [19]:
data['course_title'] = data['course_title'].apply(nfx.remove_stopwords)
data['course_title']  =data['course_title'].apply(nfx.remove_special_characters)

In [20]:
data.sample(5)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,popularity_score
963,160322,Basics Finance Budgeting,https://www.udemy.com/basics-of-finance-and-bu...,True,20,578,7,34,Beginner Level,6.0,2014-05-19T04:02:10Z,Business Finance,349.6
3506,737468,Create Local Wordpress Environment,https://www.udemy.com/how-to-create-a-local-wo...,False,0,4306,163,12,Beginner Level,0.516667,2016-02-07T18:58:40Z,Web Development,2648.8
340,363404,FuturesCommodity Training Basics,https://www.udemy.com/commodity/,False,0,4863,202,6,Beginner Level,1.5,2015-02-07T07:04:07Z,Business Finance,2998.6
1011,1153854,Working Capital assessment bankers credit anal...,https://www.udemy.com/working-capital-assessme...,True,20,4,0,29,Beginner Level,7.0,2017-04-06T23:30:07Z,Business Finance,2.4
2277,854694,Instant Harmonica play Star Wars Lion Sleeps...,https://www.udemy.com/instant-harmonica-play-s...,True,40,33,3,20,All Levels,1.5,2016-08-13T12:50:31Z,Musical Instruments,21.0


In [21]:
data['title_subject']  =data['course_title'] +' '+data['subject']

In [22]:
cv = CountVectorizer(max_features=3000)
vectors = cv.fit_transform(data['title_subject']).toarray()

In [23]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], shape=(3000,))

In [27]:
#len(cv.get_feature_names())
len(cv.get_feature_names_out())


3000

In [28]:
from sklearn.metrics.pairwise import cosine_similarity

In [29]:
similarity = cosine_similarity(vectors)

In [30]:
sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x:x[1])[1:6]

[(39, np.float64(0.7715167498104596)),
 (240, np.float64(0.6666666666666669)),
 (417, np.float64(0.6666666666666669)),
 (418, np.float64(0.6172133998483676)),
 (657, np.float64(0.6172133998483676))]

In [31]:
def recommend(course):
    # let's featch the index
    course_index = data[data['course_title']==course].index[0]
    distances = similarity[course_index]
    courses_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]
    for i in courses_list:
        print(data.iloc[i[0]]['course_title'])

In [32]:
#recommend("know HTML Learn HTML Basics")

In [33]:
recommend("know HTML Learn HTML Basics")

WordPress Development Beginners
Wordpress Theme Development Beginners
Wordpress beginners Build Websites Fast Coding
Website Coding WordPress  Web Skills
Kids Coding  Beginners CSS


In [34]:
data.iloc[39]['course_title']

'Complete Investment Banking Course 2017'

In [35]:
#sorted(similarity[0],reverse=True)

In [30]:
import pickle

In [31]:
#pickle.dump(data.to_dict(),open('course_dict.pkl','wb'))
pickle.dump(data,open('course_dict.pkl','wb'))

In [32]:
pickle.dump(similarity,open('similarity.pkl','wb'))