In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

## Calling the Saved Dataset

In [2]:
course_dataset = pd.read_csv("data/courseData.csv", encoding= 'unicode_escape')
course_dataset.head()

Unnamed: 0.1,Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
0,0,1001,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
1,1,1002,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
2,2,1003,B.E.,Mechanical,MITAOE,['CATIA'],CATIA
3,3,1004,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
4,4,1005,B.E.,Mechanical,MITCOE,['CATIA'],CATIA


In [3]:
user_dataset = pd.read_csv("data/userData.csv", encoding= 'unicode_escape')
user_dataset.head()

Unnamed: 0.1,Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,0,1001,B.E.,Computer Science & Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1,1002,B.E.,Computer Science & Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."
2,2,1003,B.E.,Computer Science & Engineering,['Missing'],['Missing'],Missing,Missing
3,3,1004,B.E.,Computer Science & Engineering,"['English', ' Hindi', ' Marathi', ' Marwari']","['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo..."
4,4,1005,B.E.,Computer Science & Engineering,"['English', 'Hindi ', 'Kashmiri ', ' Urdu']","['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ..."


In [4]:
ratings_df = pd.read_csv("data/ratingData.csv", encoding= 'unicode_escape')
ratings_df.head()

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
1,2001,1002,3
2,2001,1003,1
3,2001,1004,0
4,2001,1005,2


## Content Based Filtering

In [5]:
#https://github.com/ry05/couReco/blob/master/recommender.py
#https://github.com/jalajthanaki/Movie_recommendation_engine/blob/master/Movie_recommendation_engine.ipynb

### Based on Description and Taglines
### https://github.com/jalajthanaki/Movie_recommendation_engine/blob/master/Movie_recommendation_engine.ipynb

In [6]:
course_dataset.head(2)

Unnamed: 0.1,Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
0,0,1001,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
1,1,1002,B.E.,Mechanical,MITCOE,['CATIA'],CATIA


In [7]:
user_dataset.head(2)

Unnamed: 0.1,Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,0,1001,B.E.,Computer Science & Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1,1002,B.E.,Computer Science & Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."


### User Dataset - Key Skills 

In [8]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(user_dataset['key_skills_str'].values.astype('U'))
tfidf_matrix.shape


(1097, 2065)

In [9]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[0]

array([1.        , 0.02408797, 0.        , ..., 0.06121443, 0.01629229,
       0.007909  ])

In [10]:
user_dataset = user_dataset.reset_index()
titles = user_dataset['userid']
indices = pd.Series(user_dataset.index, index=user_dataset['userid'])
indices.head(2)

userid
1001    0
1002    1
dtype: int64

In [11]:
def get_recommendations(title): #this is getting similar users based on their career objectives???
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

get_recommendations("1001").head(10)

894     1847
996     1946
981     1931
256     1231
868     1821
970     1920
1089    2039
714     1667
417     1375
493     1444
Name: userid, dtype: object

In [12]:
get_recommendations("1847").head(10)

996    1946
981    1931
0      1001
417    1375
256    1231
322    1292
714    1667
112    1104
201    1182
521    1473
Name: userid, dtype: object

In [13]:
get_recommendations("1946").head(10)

996    1946
981    1931
0      1001
417    1375
256    1231
322    1292
714    1667
112    1104
201    1182
521    1473
Name: userid, dtype: object

###  User Dataset - Career Objective

In [14]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(user_dataset['career_objective'].values.astype('U'))
tfidf_matrix.shape

(1097, 4673)

In [15]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[0]


array([1.        , 0.        , 0.        , ..., 0.00592012, 0.        ,
       0.0596521 ])

In [16]:
user_dataset = user_dataset.reset_index()
titles = user_dataset['userid']
indices = pd.Series(user_dataset.index, index=user_dataset['userid'])
indices.head(2)

userid
1001    0
1002    1
dtype: int64

In [17]:
def get_recommendations(title): #this is getting similar users based on their career objectives???
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

get_recommendations("1001").head(10)

894     1847
996     1946
93      1087
110     1102
180     1161
20      1021
914     1867
602     1555
801     1754
1022    1972
Name: userid, dtype: object

In [18]:
user_dataset["career_objective"].loc[user_dataset['userid'] == '1001']

0    Computer Engineering student with good technic...
Name: career_objective, dtype: object

In [19]:
user_dataset.loc[user_dataset['userid'] == '1847']

Unnamed: 0.1,level_0,index,Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
894,894,894,842,1847,B.E.,Computer Science & Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."


In [20]:
user_dataset.loc[user_dataset['userid'] == '1087']

Unnamed: 0.1,level_0,index,Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
93,93,93,86,1087,B.E.,Computer Science & Engineering,"['English', ' Marathi ', ' Hindi']","['Java', ' Python', ' Machine Learning', ' CPP...",Dedicated and passionate computer engineering ...,"Java, Python, Machine Learning, CPP, Andro..."


### User Dataset - career objective and key skills


In [21]:
#creating the dataset

cf_dataset = pd.DataFrame()
cf_dataset['userid'] = user_dataset['userid']
cf_dataset['career_objective'] = user_dataset['career_objective']
cf_dataset['key_skills_str'] = user_dataset['key_skills_str']
cf_dataset.head(5)

Unnamed: 0,userid,career_objective,key_skills_str
0,1001,Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1002,Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."
2,1003,Missing,Missing
3,1004,Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo..."
4,1005,To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ..."


In [22]:
s = cf_dataset.apply(lambda x: pd.Series(x['career_objective']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'career_objective'
s = s.value_counts()
s[:5]

Missing                                                                                                                                                                                                                                      570
To secure a position where I can efficiently contribute my skills and abilities to the growth of the organization and build my professional career.                                                                                            7
To pursue a challenging career and be a part of progressive organization that gives a scope to enhance my knowledge and utilizing my skills towards the growth of the organization.                                                            5
Looking for a challenging role in a reputable organization to utilize my technical, database, and management skills for the growth of the organization as well as to enhance my knowledge about new and emerging trends in the IT sector.      4
Computer Engineering student with go

In [23]:
cf_dataset['all'] = cf_dataset['career_objective'] + cf_dataset['key_skills_str']
cf_dataset

Unnamed: 0,userid,career_objective,key_skills_str,all
0,1001,Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel...",Computer Engineering student with good technic...
1,1002,Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ...",Interested in working under company offering A...
2,1003,Missing,Missing,MissingMissing
3,1004,Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo...",Currently a final year student of Computer Eng...
4,1005,To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ...",To have a growth oriented and challenging care...
...,...,...,...,...
1092,2042,i have to be carrer in programming and after t...,"java, database, html, OOPs, Core Java, MySQL",i have to be carrer in programming and after t...
1093,2043,To work in an Industry with a professional wor...,"Ability-to-cope-up-with-different-situation., ...",To work in an Industry with a professional wor...
1094,2044,To pursue a highly challenging and creative ca...,"C, Java, cpp, HTML, Basic-Python, MySQL",To pursue a highly challenging and creative ca...
1095,2045,To prove myself dedicated worthful and energet...,"C, Drupal-(CMS), Bootstrap, Wordpress-(CMS)...",To prove myself dedicated worthful and energet...


In [24]:
#vectorize 
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(cf_dataset['all'].values.astype('U'))

#cosine similarity using linear kernel
cosine_sim = cosine_similarity(count_matrix, count_matrix)

#
cf_dataset = cf_dataset.reset_index()
users = cf_dataset['userid']
#print("Users: " + users)
indices = pd.Series(cf_dataset.index, index=cf_dataset['userid'])
indices.head(10)

#function to get recommendations
def get_recommendations(user): #this is getting similar users based on their career objectives???
    idx = indices[user]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    user_indices = [i[0] for i in sim_scores]
    return users.iloc[user_indices]

In [25]:
indices #gives an index to each id

userid
1001       0
1002       1
1003       2
1004       3
1005       4
        ... 
2042    1092
2043    1093
2044    1094
2045    1095
2046    1096
Length: 1097, dtype: int64

In [26]:
print(indices[indices == 894]) #checking the index and id to confirm it is the same as given in the recommendations below

userid
1847    894
dtype: int64


In [27]:
recs = get_recommendations("1001")
recs.head(10)

894    1847
996    1946
93     1087
256    1231
201    1182
180    1161
91     1085
464    1416
981    1931
165    1146
Name: userid, dtype: object

In [28]:
get_recommendations("1847").head(10)

996    1946
0      1001
93     1087
981    1931
110    1102
256    1231
180    1161
91     1085
165    1146
201    1182
Name: userid, dtype: object

In [29]:
get_recommendations("1946").head(10)

996    1946
0      1001
93     1087
981    1931
110    1102
256    1231
180    1161
91     1085
165    1146
201    1182
Name: userid, dtype: object

In [30]:
cf_dataset["key_skills_str"].loc[cf_dataset['userid'] == '1001']

0    C,  Java,  Keras,  Flask,  Deep Learning,  Sel...
Name: key_skills_str, dtype: object

In [31]:
cf_dataset["all"].loc[cf_dataset['userid'] == '1847']

894    Computer Engineering student with good technic...
Name: all, dtype: object

In [32]:
cf_dataset["all"].loc[cf_dataset['userid'] == '1946']

996    Computer Engineering student with good technic...
Name: all, dtype: object

In [33]:
cf_dataset["all"].loc[cf_dataset['userid'] == '1231']

256    To enhance my expertise in the field of softwa...
Name: all, dtype: object

Upon checking the above, the filter is checking all the users' career objectives and key skill. And returns similar users. 

User 1001 has career objective a and key skills list b. 

User 1847 (the first recommendation for user 1001) has career objective a and key skills list b.

User 1847 (the second recommendation for user 1001) has career objective a and key skills list b.

User 1231 (the fourth recommendation for user 1001) has career objective NEW and key skills list NEW with some matches like Keras.

### Course Dataset -  degree 1, degree 1 specialization and key skills

In [34]:
#creating the dataset
cf_dataset2 = pd.DataFrame()
cf_dataset2['sr_'] = course_dataset['sr_']
cf_dataset2['degree_1'] = course_dataset['degree_1']
cf_dataset2['degree_1_specializations'] = course_dataset['degree_1_specializations']
cf_dataset2['key_skills_str'] = course_dataset['key_skills_str']
cf_dataset2.head(5)

Unnamed: 0,sr_,degree_1,degree_1_specializations,key_skills_str
0,1001,B.E.,Mechanical,CATIA
1,1002,B.E.,Mechanical,CATIA
2,1003,B.E.,Mechanical,CATIA
3,1004,B.E.,Mechanical,CATIA
4,1005,B.E.,Mechanical,CATIA


In [35]:
cf_dataset2['all'] = cf_dataset2['degree_1'] + cf_dataset2['degree_1_specializations'] + cf_dataset2['key_skills_str']
cf_dataset2

Unnamed: 0,sr_,degree_1,degree_1_specializations,key_skills_str,all
0,1001,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
1,1002,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
2,1003,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
3,1004,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
4,1005,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
...,...,...,...,...,...
9995,10996,B.E.,Electronics Telecommunication Engineering,"EmbeddedC, MATLAB, Cprogramming, Keil",B.E.Electronics Telecommunication Engineering...
9996,10997,B.E.,Electronics Telecommunication Engineering,"EmbeddedC, MATLAB, Cprogramming, Keil",B.E.Electronics Telecommunication Engineering...
9997,10998,M TeCh,Electronics Telecommunication Engineering,"EmbeddedC, MATLAB, Cprogramming, Keil",M TeCh Electronics Telecommunication Engineer...
9998,10999,B.E.,Electronics Telecommunication Engineering,"AmazonWebServiCes, C CPP, Arduino, MongoDB, Li...",B.E.Electronics Telecommunication Engineering...


In [36]:
#vectorize 
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(cf_dataset2['all'].values.astype('U'))

#cosine similarity using linear kernel
cosine_sim = cosine_similarity(count_matrix, count_matrix)

#
cf_dataset2 = cf_dataset2.reset_index()
courses = cf_dataset2['sr_']
indices = pd.Series(cf_dataset2.index, index=cf_dataset2['sr_'])


#function to get recommendations
def get_recommendations(course): 
    idx = indices[course]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    course_indices = [i[0] for i in sim_scores]
    return courses.iloc[course_indices]

In [37]:
get_recommendations(1001).head(10)

1     1002
2     1003
3     1004
4     1005
5     1006
6     1007
7     1008
8     1009
9     1010
10    1011
Name: sr_, dtype: int64

In [38]:
get_recommendations(1003).head(10)

1     1002
2     1003
3     1004
4     1005
5     1006
6     1007
7     1008
8     1009
9     1010
10    1011
Name: sr_, dtype: int64

In [39]:
print(cf_dataset2["all"].loc[cf_dataset2['sr_'] == 1001])  #key_skills_str

0    B.E.MechanicalCATIA
Name: all, dtype: object


In [40]:
print(cf_dataset2["all"].loc[cf_dataset2['sr_'] == 1002])
print(cf_dataset2["all"].loc[cf_dataset2['sr_'] == 1003])

1    B.E.MechanicalCATIA
Name: all, dtype: object
2    B.E.MechanicalCATIA
Name: all, dtype: object


In [41]:
#trying a different course
get_recommendations(10996).head(10)

172    1173
209    1210
212    1213
213    1214
214    1215
225    1226
226    1227
230    1231
231    1232
435    1436
Name: sr_, dtype: int64

In [42]:
print(cf_dataset2["all"].loc[cf_dataset2['sr_'] == 10996])
print(cf_dataset2["key_skills_str"].loc[cf_dataset2['sr_'] == 1173])
print(cf_dataset2["key_skills_str"].loc[cf_dataset2['sr_'] == 1210])

9995    B.E.Electronics  Telecommunication Engineering...
Name: all, dtype: object
172    EmbeddedC, MATLAB, Cprogramming, Keil
Name: key_skills_str, dtype: object
209    EmbeddedC, MATLAB, Cprogramming, Keil
Name: key_skills_str, dtype: object


They are also accurate in the course dataset. Only the degree, degree specializations and key skils are basically the same.

Try to implement the removal of the course we are looking for from the recommendations. As seen above in the 1003 course search, the thing we search for is sometimes given back

###  User Dataset but get Course Index

In [43]:
user_dataset.head(2)

Unnamed: 0.1,level_0,index,Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,0,0,0,1001,B.E.,Computer Science & Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1,1,1,1002,B.E.,Computer Science & Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."


In [44]:
#creating a new dataset dataset using the career objetcive, key skills and their user id
cf_user_dataset = pd.DataFrame()
cf_user_dataset['userid'] = user_dataset['userid']
cf_user_dataset['career_objective'] = user_dataset['career_objective']
cf_user_dataset['key_skills_str'] = user_dataset['key_skills_str']
cf_user_dataset.head(5)

#store the career objective and key skills in description
cf_user_dataset['description'] = cf_user_dataset['career_objective'] + cf_user_dataset['key_skills_str']
cf_user_dataset.head(5)

Unnamed: 0,userid,career_objective,key_skills_str,description
0,1001,Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel...",Computer Engineering student with good technic...
1,1002,Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ...",Interested in working under company offering A...
2,1003,Missing,Missing,MissingMissing
3,1004,Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo...",Currently a final year student of Computer Eng...
4,1005,To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ...",To have a growth oriented and challenging care...


In [76]:
#vectorize using countvectorize that converts into a matrix of token counts
user_count = CountVectorizer(stop_words='english', analyzer='word', ngram_range=(1, 2), min_df=0)
user_count_matrix = user_count.fit_transform(cf_user_dataset['description'].values.astype('U'))

#cosine similarity between the the matrix itself (normalized dot product of X and Y)
user_cosine_sim = cosine_similarity(user_count_matrix, user_count_matrix)

#create indices for the user is using series
#cf_user_dataset = cf_user_dataset.reset_index()
user_ids = cf_user_dataset
indices = pd.Series(course_dataset.index, index=course_dataset['sr_'])


#function to get content-filtered recommendations
def get_user_cf_recommendations(user):
    
    #get index of user
    user_id = indices[user]
    
    #find the most similar 30 users using cosine_sim and sorting with highest similarity
    sim_scores = list(enumerate(user_cosine_sim[user_id]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    user_indices = [i[0] for i in sim_scores]
    
    return user_ids.iloc[user_indices]

In [77]:
#getting the similar users recs for user 1001
get_user_cf_recommendations(1001) #the input is a course, the output is similar users

Unnamed: 0,level_0,index,userid,career_objective,key_skills_str,description
894,894,894,1847,Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel...",Computer Engineering student with good technic...
996,996,996,1946,Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel...",Computer Engineering student with good technic...
93,93,93,1087,Dedicated and passionate computer engineering ...,"Java, Python, Machine Learning, CPP, Andro...",Dedicated and passionate computer engineering ...
256,256,256,1231,To enhance my expertise in the field of softwa...,"NLP, Java, Neural Networks, Keras, Python,...",To enhance my expertise in the field of softwa...
201,201,201,1182,Missing,"C, Data Analysis, Java, Neural Networks, D...","MissingC, Data Analysis, Java, Neural Netwo..."
180,180,180,1161,I am a student programmer currently seeking ex...,"Python, Data Science, Artificial Intelligenc...",I am a student programmer currently seeking ex...
91,91,91,1085,Team oriented individual with strong communica...,"C, Statistics, Java, MS Office, Python, D...",Team oriented individual with strong communica...
464,464,464,1416,Missing,"Database, Embedded C, Machine Learning","MissingDatabase, Embedded C, Machine Learning"
981,981,981,1931,Missing,"Deep Learning, Selenium, Cpp","MissingDeep Learning, Selenium, Cpp"
165,165,165,1146,Missing,"Data Structures, Android, Algorithms, Probl...","MissingData Structures, Android, Algorithms,..."


In [47]:
print(cf_user_dataset["userid"].loc[course_dataset['sr_'] == 1001])
#description, key_skills_str, career_objective, userid

0    1001
Name: userid, dtype: object


In [48]:
print(cf_user_dataset["userid"].loc[course_dataset['sr_'] == 1895])


894    1847
Name: userid, dtype: object


In [49]:
print(cf_user_dataset["userid"].loc[course_dataset['sr_'] == 1997])


996    1946
Name: userid, dtype: object


In [50]:
print(cf_user_dataset["userid"].loc[course_dataset['sr_'] == 1094])


93    1087
Name: userid, dtype: object


### Test 

In [51]:
#creating a new dataset dataset using the career objetcive, key skills and their user id
cf_dataset = pd.DataFrame()
cf_dataset['userid'] = user_dataset['userid']
cf_dataset['career_objective'] = user_dataset['career_objective']
cf_dataset['key_skills_str'] = user_dataset['key_skills_str']

cf_dataset['sr_'] = course_dataset['sr_']
cf_dataset['degree_1'] = course_dataset['degree_1']
cf_dataset['degree_1_specializations'] = course_dataset['degree_1_specializations']
cf_dataset['course_key_skills_str'] = course_dataset['key_skills_str']
cf_dataset.head(5)

#store the career objective and key skills in description
cf_dataset['description'] = cf_dataset['career_objective'] + cf_dataset['key_skills_str']
cf_dataset['course_description'] = cf_dataset['degree_1'] + cf_dataset['degree_1_specializations'] + cf_dataset['course_key_skills_str']
cf_dataset.head(5)

Unnamed: 0,userid,career_objective,key_skills_str,sr_,degree_1,degree_1_specializations,course_key_skills_str,description,course_description
0,1001,Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel...",1001,B.E.,Mechanical,CATIA,Computer Engineering student with good technic...,B.E.MechanicalCATIA
1,1002,Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ...",1002,B.E.,Mechanical,CATIA,Interested in working under company offering A...,B.E.MechanicalCATIA
2,1003,Missing,Missing,1003,B.E.,Mechanical,CATIA,MissingMissing,B.E.MechanicalCATIA
3,1004,Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo...",1004,B.E.,Mechanical,CATIA,Currently a final year student of Computer Eng...,B.E.MechanicalCATIA
4,1005,To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ...",1005,B.E.,Mechanical,CATIA,To have a growth oriented and challenging care...,B.E.MechanicalCATIA


In [54]:
#vectorize using countvectorize that converts into a matrix of token counts
course_count = CountVectorizer(stop_words='english', analyzer='word', ngram_range=(1, 2), min_df=0)
course_count_matrix = course_count.fit_transform(cf_dataset['course_description'].values.astype('U'))

#vectorize using countvectorize that converts into a matrix of token counts
user_count = CountVectorizer(stop_words='english', analyzer='word', ngram_range=(1, 2), min_df=0)
user_count_matrix = user_count.fit_transform(cf_dataset['description'].values.astype('U'))


#cosine similarity between the the matrix itself (normalized dot product of X and Y)
course_cosine_sim = cosine_similarity(course_count_matrix, user_count_matrix)

#create indices for the courses is using series
cf_course_dataset = cf_course_dataset.reset_index()
courses = cf_course_dataset['sr_']
indices = pd.Series(cf_course_dataset.index, index=cf_course_dataset['sr_'])

#function to get content-filtered recommendations
def get_course_cf_recommendations(course): 
    
    #get index of course
    course_id = indices[course]
    
    #find the most similar 30 courses using cosine_sim and sorting with highest similarity
    sim_scores = list(enumerate(course_cosine_sim[course_id]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    course_indices = [i[0] for i in sim_scores]
    
    return courses.iloc[course_indices]

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 135 while Y.shape[1] == 6920