In [52]:
!pip install neattext



### Import libraries

In [53]:
import pandas as pd
import numpy as np
import neattext.functions as nfx

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Read Data from CSV file

In [54]:
# Load the Udemy course catalogue from disk and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="udemy_course_data.csv")
df.head(n=5)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,profit,published_date,published_time,year,month,day
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5 hours,2017-01-18T20:58:58Z,Business Finance,429400,2017-01-18,20:58:58Z,2017,1,18
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39 hours,2017-03-09T16:34:20Z,Business Finance,209400,2017-03-09,16:34:20Z,2017,3,9
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5 hours,2016-12-19T19:26:30Z,Business Finance,97830,2016-12-19,19:26:30Z,2016,12,19
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3 hours,2017-05-30T20:07:24Z,Business Finance,232845,2017-05-30,20:07:24Z,2017,5,30
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2 hours,2016-12-13T14:57:18Z,Business Finance,255200,2016-12-13,14:57:18Z,2016,12,13


### Check for null values

In [55]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   course_id            3683 non-null   int64 
 1   course_title         3683 non-null   object
 2   url                  3683 non-null   object
 3   is_paid              3683 non-null   bool  
 4   price                3683 non-null   int64 
 5   num_subscribers      3683 non-null   int64 
 6   num_reviews          3683 non-null   int64 
 7   num_lectures         3683 non-null   int64 
 8   level                3683 non-null   object
 9   content_duration     3683 non-null   object
 10  published_timestamp  3683 non-null   object
 11  subject              3683 non-null   object
 12  profit               3683 non-null   int64 
 13  published_date       3683 non-null   object
 14  published_time       3682 non-null   object
 15  year                 3683 non-null   int64 
 16  month 

### Remove stop words and special characters

In [56]:
# Derive a normalised title column in one pipeline:
# stop words are stripped first, then special characters.
df['Clean_title'] = (
    df['course_title']
    .apply(nfx.remove_stopwords)
    .apply(nfx.remove_special_characters)
)

In [57]:
# Preview the frame again to check the Clean_title column next to course_title.
df.head()

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,profit,published_date,published_time,year,month,day,Clean_title
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5 hours,2017-01-18T20:58:58Z,Business Finance,429400,2017-01-18,20:58:58Z,2017,1,18,Ultimate Investment Banking Course
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39 hours,2017-03-09T16:34:20Z,Business Finance,209400,2017-03-09,16:34:20Z,2017,3,9,Complete GST Course Certification Grow Practice
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5 hours,2016-12-19T19:26:30Z,Business Finance,97830,2016-12-19,19:26:30Z,2016,12,19,Financial Modeling Business Analysts Consultants
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3 hours,2017-05-30T20:07:24Z,Business Finance,232845,2017-05-30,20:07:24Z,2017,5,30,Beginner Pro Financial Analysis Excel 2017
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2 hours,2016-12-13T14:57:18Z,Business Finance,255200,2016-12-13,14:57:18Z,2016,12,13,Maximize Profits Trading Options


### Vectorize the Clean Title

In [58]:
# Bag-of-words encoding of the cleaned titles: learn the vocabulary, then
# turn every title into a sparse row of token counts.
countvect = CountVectorizer()
countvect.fit(df['Clean_title'])
cvmat = countvect.transform(df['Clean_title'])
cvmat

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 18364 stored elements and shape (3683, 3564)>

### Compute the Cosine Similarity

In [59]:
# Pairwise cosine similarity between every pair of title vectors.
# dense_output=True is the default and is spelled out here because the
# following cells index the result as a dense NumPy array.
cos_sim = cosine_similarity(cvmat, dense_output=True)
cos_sim

array([[1.        , 0.20412415, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.20412415, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.23570226],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.23570226, 0.        ,
        1.        ]])

In [60]:
# Sanity check: one row and one column per course, so the matrix should be square.
cos_sim.shape

(3683, 3683)

### Recommend Course

In [61]:
# Course to base the recommendations on; it is looked up by exact match against course_title.
title = "How To Maximize Your Profits Trading Options"

In [62]:
# Lookup table: course title -> row position in df (and therefore in cos_sim).
#
# Series.drop_duplicates() compares the *values* of the Series. Those are the row
# numbers, which are always unique, so it never removed repeated titles. A title
# that occurs more than once then made course_index[title] return a Series
# instead of a single row number, which breaks the cos_sim lookup downstream.
# De-duplicate on the index labels (the titles) instead, keeping the first row.
course_index = pd.Series(df.index, index=df['course_title'])
course_index = course_index[~course_index.index.duplicated(keep='first')]

In [63]:
# Resolve the query title to its row position via an explicit label lookup.
index = course_index.loc[title]

In [64]:
# Display the row position resolved for the query title.
course_index[title]

np.int64(4)

In [65]:
# Pair every course's row position with its similarity to the query course.
scores = [(row_pos, similarity) for row_pos, similarity in enumerate(cos_sim[index])]

In [66]:
# Rank courses from most to least similar. Sorting a copy keeps `scores`
# untouched; list.sort is stable, so ties keep their original order.
sorted_scores = list(scores)
sorted_scores.sort(key=lambda pair: pair[1], reverse=True)

In [69]:
# Remove the query course itself by its row position rather than assuming it
# is the first entry of the ranking. Slicing with [1:] drops the wrong row when
#   * another course ties with the query at the top score (e.g. identical cleaned
#     titles) and has a smaller row number -- the sort is stable, so that course
#     is ranked ahead of the query; or
#   * the query's cleaned title produced an all-zero vector, so every score
#     (including its own) is 0 and the ranking is just the original row order.
recommended = [(row_pos, score) for row_pos, score in sorted_scores if row_pos != index]
selected_course_index = [row_pos for row_pos, _score in recommended]
selected_course_score = [score for _row_pos, score in recommended]

In [84]:
# Pull the candidate rows in ranked order and attach their similarity scores.
# .assign returns a new frame, so df itself is left unmodified.
rec_df = df.iloc[selected_course_index].assign(
    **{'Similarity Score': selected_course_score}
)

In [86]:
# Keep only candidates whose similarity is strictly above the 0.5 cut-off.
is_similar_enough = rec_df['Similarity Score'].gt(0.5)
final_recommended_courses = rec_df.loc[is_similar_enough]

In [89]:
# Show only the columns that matter when presenting the recommendations.
final_recommended_courses.loc[
    :,
    ['course_title', 'Similarity Score', 'url', 'price', 'num_subscribers'],
]

Unnamed: 0,course_title,Similarity Score,url,price,num_subscribers
410,Trading Options Basics,0.57735,https://www.udemy.com/trading-options-basics/,200,8
43,Options Trading - How to Win with Weekly Options,0.566947,https://www.udemy.com/work-from-home-setup-you...,115,7489
96,Intermediate Options trading concepts for Stoc...,0.53033,https://www.udemy.com/intermediate-options-tra...,40,2000
138,Forex Trading with Fixed 'Risk through Options...,0.53033,https://www.udemy.com/forexoptions/,200,611
195,Trading Options For Consistent Returns: Option...,0.53033,https://www.udemy.com/trading-options-for-income/,0,4077
444,The Advantages of ETF Options and Index Option...,0.53033,https://www.udemy.com/learn-etf-options-and-in...,60,52
803,Options Spreads Bundle- the heart of Options ...,0.53033,https://www.udemy.com/options-spreads-explained/,120,623


In [91]:
# Inspect the distinct price values present in the dataset.
df['price'].unique()

array([200,  75,  45,  95, 150,  65, 195,  30,  20,  50, 175, 140, 115,
       190, 125,  60, 145, 105, 155, 185, 180, 120,  25, 160,  40,   0,
       100,  90,  35,  80,  70,  55, 165, 130,  85, 170, 110, 135])

In [92]:
# Display the price column cast to float.
# NOTE(review): the result is only displayed, not assigned back, so df['price']
# keeps its original dtype -- confirm whether a persistent conversion was intended.
df['price'].astype('float')

0       200.0
1        75.0
2        45.0
3        95.0
4       200.0
        ...  
3678    100.0
3679     25.0
3680     40.0
3681     50.0
3682     45.0
Name: price, Length: 3683, dtype: float64