In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
import squarify
import seaborn as sns
from surprise import accuracy
from surprise.model_selection.validation import cross_validate
from surprise.dataset import Dataset
from surprise.reader import Reader
from surprise import SVD, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise.model_selection import train_test_split
import random

## Calling the Saved Dataset

In [2]:
course_dataset = pd.read_csv("data/courseData.csv", encoding= 'unicode_escape')
course_dataset.head()

Unnamed: 0.1,Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
0,0,1001,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
1,1,1002,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
2,2,1003,B.E.,Mechanical,MITAOE,['CATIA'],CATIA
3,3,1004,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
4,4,1005,B.E.,Mechanical,MITCOE,['CATIA'],CATIA


In [3]:
user_dataset = pd.read_csv("data/userData.csv", encoding= 'unicode_escape')
user_dataset.head()

Unnamed: 0.1,Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,0,1001,B.E.,Computer Science & Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1,1002,B.E.,Computer Science & Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."
2,2,1003,B.E.,Computer Science & Engineering,['Missing'],['Missing'],Missing,Missing
3,3,1004,B.E.,Computer Science & Engineering,"['English', ' Hindi', ' Marathi', ' Marwari']","['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo..."
4,4,1005,B.E.,Computer Science & Engineering,"['English', 'Hindi ', 'Kashmiri ', ' Urdu']","['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ..."


In [4]:
ratings_df = pd.read_csv("data/ratingData.csv", encoding= 'unicode_escape')
ratings_df.head()

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
1,2001,1002,3
2,2001,1003,1
3,2001,1004,0
4,2001,1005,2


## Content Based Filtering

In [5]:
#https://github.com/ry05/couReco/blob/master/recommender.py
#https://github.com/jalajthanaki/Movie_recommendation_engine/blob/master/Movie_recommendation_engine.ipynb

### Based on Description and Taglines
### https://github.com/jalajthanaki/Movie_recommendation_engine/blob/master/Movie_recommendation_engine.ipynb

In [6]:
course_dataset.head(2)

Unnamed: 0.1,Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
0,0,1001,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
1,1,1002,B.E.,Mechanical,MITCOE,['CATIA'],CATIA


In [7]:
user_dataset.head(2)

Unnamed: 0.1,Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,0,1001,B.E.,Computer Science & Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1,1002,B.E.,Computer Science & Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."


### User Dataset - Key Skills 

In [8]:

tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(user_dataset['key_skills_str'].values.astype('U'))
tfidf_matrix.shape


(1097, 2065)

In [9]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[0]


array([1.        , 0.02408797, 0.        , ..., 0.06121443, 0.01629229,
       0.007909  ])

In [10]:
user_dataset = user_dataset.reset_index()
titles = user_dataset['userid']
indices = pd.Series(user_dataset.index, index=user_dataset['userid'])
indices.head(2)

userid
1001    0
1002    1
dtype: int64

In [11]:
def get_recommendations(title): #this is getting similar users based on their career objectives???
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

get_recommendations("1001").head(10)

894     1847
996     1946
981     1931
256     1231
868     1821
970     1920
1089    2039
714     1667
417     1375
493     1444
Name: userid, dtype: object

In [12]:
get_recommendations("1847").head(10)

996    1946
981    1931
0      1001
417    1375
256    1231
322    1292
714    1667
112    1104
201    1182
521    1473
Name: userid, dtype: object

In [13]:
get_recommendations("1946").head(10)

996    1946
981    1931
0      1001
417    1375
256    1231
322    1292
714    1667
112    1104
201    1182
521    1473
Name: userid, dtype: object

###  User Dataset - Career Objective

In [14]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(user_dataset['career_objective'].values.astype('U'))
tfidf_matrix.shape

(1097, 4673)

In [15]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[0]


array([1.        , 0.        , 0.        , ..., 0.00592012, 0.        ,
       0.0596521 ])

In [16]:
user_dataset = user_dataset.reset_index()
titles = user_dataset['userid']
indices = pd.Series(user_dataset.index, index=user_dataset['userid'])
indices.head(2)

userid
1001    0
1002    1
dtype: int64

In [17]:
def get_recommendations(title): #this is getting similar users based on their career objectives???
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

get_recommendations("1001").head(10)

894     1847
996     1946
93      1087
110     1102
180     1161
20      1021
914     1867
602     1555
801     1754
1022    1972
Name: userid, dtype: object

In [37]:
user_dataset["career_objective"].loc[user_dataset['userid'] == '1001']

0    Computer Engineering student with good technic...
Name: career_objective, dtype: object

In [32]:
user_dataset.loc[user_dataset['userid'] == '1847']

Unnamed: 0.1,level_0,index,Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
894,894,894,842,1847,B.E.,Computer Science & Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."


In [41]:
user_dataset.loc[user_dataset['userid'] == '1087']

Unnamed: 0.1,level_0,index,Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
93,93,93,86,1087,B.E.,Computer Science & Engineering,"['English', ' Marathi ', ' Hindi']","['Java', ' Python', ' Machine Learning', ' CPP...",Dedicated and passionate computer engineering ...,"Java, Python, Machine Learning, CPP, Andro..."


### Course Dataset - degree 1, degree 1 specialization and key skills

In [44]:
course_dataset.head(2)

Unnamed: 0.1,Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
0,0,1001,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
1,1,1002,B.E.,Mechanical,MITCOE,['CATIA'],CATIA


In [43]:
df = pd.DataFrame()
df["details"] = course_dataset["degree_1"] + course_dataset["degree_1_specializations"] + course_dataset["key_skills_str"]

In [50]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(df['details'].values.astype('U'))

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

df = df.reset_index()
titles = course_dataset['sr_']
indices = pd.Series(course_dataset.index, index=course_dataset['sr_'])
indices.head(2)

sr_
1001    0
1002    1
dtype: int64

In [51]:
def get_recommendations(title): #this is getting similar users based on their career objectives???
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

get_recommendations("1002").head(10)

KeyError: '1002'