In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity, cosine_distances
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
import squarify
import seaborn as sns
from surprise import accuracy
from surprise.model_selection.validation import cross_validate
from surprise.dataset import Dataset
from surprise.reader import Reader
from surprise import SVD, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise.model_selection import train_test_split
import random

## Calling the Saved Dataset

In [2]:
course_dataset = pd.read_csv("data/courseData.csv", encoding= 'unicode_escape')
course_dataset.head()

Unnamed: 0.1,Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
0,0,1001,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
1,1,1002,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
2,2,1003,B.E.,Mechanical,MITAOE,['CATIA'],CATIA
3,3,1004,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
4,4,1005,B.E.,Mechanical,MITCOE,['CATIA'],CATIA


In [3]:
user_dataset = pd.read_csv("data/userData.csv", encoding= 'unicode_escape')
user_dataset.head()

Unnamed: 0.1,Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,0,1001,B.E.,Computer Science & Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1,1002,B.E.,Computer Science & Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."
2,2,1003,B.E.,Computer Science & Engineering,['Missing'],['Missing'],Missing,Missing
3,3,1004,B.E.,Computer Science & Engineering,"['English', ' Hindi', ' Marathi', ' Marwari']","['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo..."
4,4,1005,B.E.,Computer Science & Engineering,"['English', 'Hindi ', 'Kashmiri ', ' Urdu']","['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ..."


In [4]:
ratings_df = pd.read_csv("data/ratingData.csv", encoding= 'unicode_escape')
ratings_df.head()

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
1,2001,1002,3
2,2001,1003,1
3,2001,1004,0
4,2001,1005,2


## Collaborative Filtering

In [5]:
#using surprise
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings_df[['user_id','course_id','rating']], reader)

#split into training and test sets using 80:20 ratio
raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)                

ratio = int(len(raw_ratings)*0.8)
train_raw = raw_ratings[:ratio] 
test_raw = raw_ratings[ratio:] 

data.raw_ratings = train_raw       
trainset = data.build_full_trainset() 
testset = data.construct_testset(test_raw)

####  SVD CF Recommender

In [6]:
def svd_cf_recommendations(user):
    
    #usisng the "best" parameters found using the gridsearchcv method from experiments notebook 
    model = SVD(n_factors = 90, n_epochs = 20, lr_all = 0.005, reg_all = 0.2)
    model.fit(trainset)
    
    #building test set and predict the ratings
    testset = trainset.build_anti_testset()
    predictions = model.test(testset)
    
    #save into dataframe
    rating_predictions = pd.DataFrame(predictions)
    
    #find the top 30 predictions for given user
    user_predictions = rating_predictions[rating_predictions['uid'] == user].\
                         sort_values(by="est", ascending = False).head(30)
    
    #create recommendations array and append user predictions as recommendations
    recommendations = []
    recommendations.append(list(user_predictions['iid']))
    recommendations = recommendations[0]
    
    return(recommendations)

## Content Based Filtering

### User Dataset - Based on Career Objective and Key Skills

In [7]:
#creating a new dataset dataset using the career objetcive, key skills and their user id
cf_user_dataset = pd.DataFrame()
cf_user_dataset['userid'] = user_dataset['userid']
cf_user_dataset['career_objective'] = user_dataset['career_objective']
cf_user_dataset['key_skills_str'] = user_dataset['key_skills_str']
cf_user_dataset.head(5)

#store the career objective and key skills in description
cf_user_dataset['description'] = cf_user_dataset['career_objective'] + cf_user_dataset['key_skills_str']
#cf_user_dataset.head(5)

In [8]:
#vectorize using countvectorize that converts into a matrix of token counts
user_count = CountVectorizer(stop_words='english', analyzer='word', ngram_range=(1, 2), min_df=0)
user_count_matrix = user_count.fit_transform(cf_user_dataset['description'].values.astype('U'))

#cosine similarity between the the matrix itself (normalized dot product of X and Y)
user_cosine_sim = cosine_similarity(user_count_matrix, user_count_matrix)

#create indices for the user is using series
cf_user_dataset = cf_user_dataset.reset_index()
user_ids = cf_user_dataset
indices = pd.Series(cf_user_dataset.index, index=cf_user_dataset['userid'])


#function to get content-filtered recommendations
def get_user_cf_recommendations(user):
    
    #get index of user
    user_id = indices[user]
    
    #find the most similar 30 users using cosine_sim and sorting with highest similarity
    sim_scores = list(enumerate(user_cosine_sim[user_id]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    user_indices = [i[0] for i in sim_scores]
    
    return user_ids.iloc[user_indices]

### Course Dataset - Based on Degree 1, Degree 1 Specialization and Key Skills

In [9]:
#creating a new dataset dataset using the degree1, degree1specializations and key skills and the course id
cf_course_dataset = pd.DataFrame()
cf_course_dataset['sr_'] = course_dataset['sr_']
cf_course_dataset['degree_1'] = course_dataset['degree_1']
cf_course_dataset['degree_1_specializations'] = course_dataset['degree_1_specializations']
cf_course_dataset['key_skills_str'] = course_dataset['key_skills_str']
cf_course_dataset.head(5)

#store the career objective and key skills in description
cf_course_dataset['description'] = cf_course_dataset['degree_1'] + cf_course_dataset['degree_1_specializations'] + cf_course_dataset['key_skills_str']
cf_course_dataset.head(5)

Unnamed: 0,sr_,degree_1,degree_1_specializations,key_skills_str,description
0,1001,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
1,1002,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
2,1003,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
3,1004,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA
4,1005,B.E.,Mechanical,CATIA,B.E.MechanicalCATIA


In [10]:
#vectorize using countvectorize that converts into a matrix of token counts
course_count = CountVectorizer(stop_words='english', analyzer='word', ngram_range=(1, 2), min_df=0)
course_count_matrix = course_count.fit_transform(cf_course_dataset['description'].values.astype('U'))

#cosine similarity between the the matrix itself (normalized dot product of X and Y)
course_cosine_sim = cosine_similarity(course_count_matrix, course_count_matrix)

#create indices for the courses is using series
cf_course_dataset = cf_course_dataset.reset_index()
courses = cf_course_dataset
indices = pd.Series(cf_course_dataset.index, index=cf_course_dataset['sr_'])

#function to get content-filtered recommendations
def get_course_cf_recommendations(course): 
    
    #get index of course
    course_id = indices[course]
    
    #find the most similar 30 courses using cosine_sim and sorting with highest similarity
    sim_scores = list(enumerate(course_cosine_sim[course_id]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    course_indices = [i[0] for i in sim_scores]
    
    return courses.iloc[course_indices]

## The Hybrid Recommender 

In [11]:
#svd_cf_recommendations(user) #in: user, out:course
#get_user_cf_recommendations(user) #in user, out: user
#get_course_cf_recommendations(course) #in course, out: course

In [48]:
def hybrid_recommender(user):
    
    svd_recs = [] #pd.DataFrame()
    user_recs = pd.DataFrame()
    course_recs = pd.DataFrame()
    
    #check with svd first 
    svd_recs = svd_cf_recommendations(user)
    
    
    #when svd dataset has no data on a user, recommendations will be made solely on the cf recommenders
    #gets similar users
    user_recs = get_user_cf_recommendations(user)
    
    #cross check courses with cf courser recommender
    
    #CURRENTLY GETTING USER NOT COURSE
    #get a random course?
    course = user#random.randint(1001, 11000)
    course_recs = get_course_cf_recommendations(course) 
    
    #get similarity between the course dataset and user using cosine thing
    #user
    user_count = CountVectorizer(stop_words='english', analyzer='word', ngram_range=(1, 2), min_df=0)
    user_count_matrix = user_count.fit_transform(user_recs['key_skills_str'].values.astype('U'))
    
    #course
    course_count = CountVectorizer(stop_words='english', analyzer='word', ngram_range=(1, 2), min_df=0)
    course_count_matrix = course_count.fit_transform(course_recs['key_skills_str'].values.astype('U'), y = user_count_matrix.shape[1])
    
    print(course_count_matrix.shape)
    print(user_count_matrix.shape)
    
    #to fix matrix shapes so they are equal    
    if(course_count_matrix.shape != user_count_matrix.shape):
        if(course_count_matrix.shape[1] < user_count_matrix.shape[1]):
            
            temp_user_m = user_count_matrix
            new_user_temp = temp_user_m[:, course_count_matrix.shape[1]] 

            new_course_temp = course_count_matrix
            print(new_user_temp.shape)
            print(course_count_matrix.shape)
            print(user_count_matrix.shape)
            
        else:
            
            temp_user_m = course_count_matrix
            new_course_temp = temp_user_m[:, user_count_matrix.shape[1]] 

            new_user_temp = user_count_matrix
            print(new_course_temp.shape)
            print(course_count_matrix.shape)
            print(user_count_matrix.shape)
        
    else:
        new_course_temp = course_count_matrix
        new_user_temp = user_count_matrix
    
    #cosine similarity between the the matrix itself (normalized dot product of X and Y)
    course_cosine_sim = cosine_distances(new_course_temp, new_user_temp)

    #create indices for the courses is using series
    course_recs = course_recs.reset_index()
    courses = course_recs['sr_']
    indices = pd.Series(course_recs.index, index=course_recs['sr_'])
    
    new_course_rec = get_course_cf_recommendations(course)
    
    
    return svd_recs, user_recs[["userid", "career_objective", "key_skills_str", "description"]], course_recs[["sr_", "key_skills_str", "description"]]

In [49]:
svd, cf, ccf = hybrid_recommender(1001)

(30, 1)
(30, 237)
(30, 1)
(30, 1)
(30, 237)


In [50]:
svd

[2180,
 2155,
 2391,
 2102,
 2283,
 2410,
 2202,
 2258,
 2076,
 2398,
 2240,
 2127,
 2086,
 2098,
 2285,
 2266,
 2290,
 2230,
 2261,
 2305,
 2361,
 2025,
 2386,
 2326,
 2162,
 2019,
 2122,
 2423,
 2375,
 2392]

In [51]:
course_dataset[course_dataset["sr_"] == 2090]

Unnamed: 0.1,Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
1089,1089,2090,B.E.,Mechanical,MITCOE,"['AutoCAD, PROE']","AutoCAD, PROE"


## The Predictor 

### Get the course details for the collaborative filtering recs

In [52]:
#gets course details of svd recommendations
def get_course_details(svd_recs): 
    
    #dataframe for final recommendations 
    finalRecs = pd.DataFrame()
    
    #iterate the list
    for i in svd_recs:
        
        #if course found in couse dataset then store details in finalRecs dataframe
        if i in course_dataset["sr_"]:
            course_details = course_dataset[course_dataset["sr_"] == i]
            finalRecs = finalRecs.append(course_details)

            #drop unneccessary columns 
            finalRecs.drop(["Unnamed: 0", "key_skills"], axis = 1, inplace = True)
        
    #return useful information
    return finalRecs[["sr_", "degree_1", "degree_1_specializations", "campus", "key_skills_str"]]

In [53]:
#displaying
#get final recommendations
final = get_course_details(svd)

#rename index and set index to it
final = final.rename(columns = {final.columns[0]: "Course Code"})
final.set_index("Course Code", inplace = True)

#rename other columns
final.rename(columns = {"degree_1":"Degree", "degree_1_specializations": "Degree Specializations", "campus": "Campus", "key_skills": "Key Skills Actual", "key_skills_str": "Key Skills"}, inplace = True)

final.head(10)

Unnamed: 0_level_0,Degree,Degree Specializations,Campus,Key Skills
Course Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2180,B.E.,Civil Engineering,MITCOE,"MSCIT, Leadership, AUTOCAD"
2155,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL"
2391,B.E.,Mechanical,MITAOE,"AutoCAD, PROE"
2102,B.E.,Mechanical,MITCOE,"CAD,CAM"
2283,B.E.,Mechanical,MITCOE,"CAD,CAM"
2410,B.E.,Mechanical,MITAOE,"SOLIDWORKS, AUTOCAD, CREO"
2202,B.E.,Electronics Telecommunication Engineering,"MIT,Pune","C,JAVA"
2258,B.E.,Electronics Telecommunication Engineering,"MIT,Pune","AmazonWebServiCes, C CPP, Arduino, MongoDB, Li..."
2076,B.E.,Mechanical,MITAOE,"SOLIDWORKS, AUTOCAD, CREO"
2398,B.E.,Computer Science & Engineering,MITAOE,"C, Java, CPP, HTML, CMStool, MYSQL"


### Will get the length of the keyskills to return the ratio percentages to calculate the success rate

In [54]:
#percentages of the three items
def get_percentage(length):
    
    if length == 0:
        pdegree = 0
        pspec = 0
        pkeyskills = 0
        print("There are no key skills!")
        
        return pdegree, pspec, pkeyskills
    
    if length == 1 or length == 2:
        pdegree = 30 
        pspec = 30
        pkeyskills = 40 
        
        return pdegree, pspec, pkeyskills
    
    elif length > 2 and length <= 4:
        pdegree = 25 
        pspec = 25
        pkeyskills = 50 
        
        return pdegree, pspec, pkeyskills
    
    elif length > 4 and length <= 9:
        pdegree = 20 
        pspec = 20
        pkeyskills = 60
        
        return pdegree, pspec, pkeyskills
    
    else:
        pdegree = 15 
        pspec = 15
        pkeyskills = 70

        return pdegree, pspec, pkeyskills

### Predictor

In [83]:
success_rate = []

#implement predictor using the user key skills and info and the predicted courses
def calculate_success_rate(user, recommendations):
    
    #get the users required info
    #user_details = user_dataset[user_dataset["userid"] == user]
    user_degree = user_dataset["degree_1"].loc[user_dataset["userid"] == user].values[0]
    user_spec = user_dataset["degree_1_specializations"].loc[user_dataset["userid"] == user].values[0]
    user_keyskills = user_dataset["key_skills_str"].loc[user_dataset["userid"] == user].values[0]

    #turn into list 
    user_degree = user_degree.split("-")
    user_spec = user_spec.split("-")
    user_keyskills = user_keyskills.split(",")
    user_keyskills = (map(lambda x: x.lower().strip(), user_keyskills))
    user_keyskills = list(user_keyskills)
    

    #get the recommmendations
    #compare the user details with the recommendations
    for i, row in recommendations.iterrows():
        #print course code
        #print("Checking Course ", i)
        
        #reset the percentages
        apdegree = 0
        apspec = 0
        apkeyskills = 0
        
        #get current course data
        curr_degree = row[0]
        curr_spec = row[1]
        curr_keyskills = row[3] #row 3 using final, might change to row 2 if we remove campus
        
        #turn into list 
        curr_degree = curr_degree.split("-")
        curr_spec = curr_spec.split("-")
        #curr_keyskills = curr_keyskills.split(",")
        curr_keyskills = curr_keyskills.strip().split(",")
        curr_keyskills = (map(lambda x: x.lower().strip(), curr_keyskills))
        curr_keyskills = list(curr_keyskills)

        #get percentages
        pdegree, pspec, pkeyskills = get_percentage(len(curr_keyskills))
        
        #Actual predictor
        #compare degree
        if (user_degree == curr_degree):
            #print("UD: ", user_degree)
            #print("CD: ", curr_degree)
            apdegree = pdegree
            
        #compare degree specs
        #print("US: ", user_spec)
        #print("CS: ", curr_spec)
        if (user_spec == curr_spec):
            
            apspec = pspec
        
        #compare keyskills 
        ptemp = pkeyskills / len(curr_keyskills) #this will be the percentages of each individual key skill
                
        for currk in curr_keyskills:   
            
            #in the case of missing currk
            if currk == "missing":
                print("There is not enough information!")

            #for case course 2258
            #course 2258 has a keyskill "C CPP" instead of "C" and "CPP". 
            #And its not being counted as a match since it takes them as one thing
            if currk == "c cpp":
                currk = "c"
                curr_keyskills.append("cpp")
                
                #adjust the percentages
                ptemp = pkeyskills / len(curr_keyskills)

            if currk in user_keyskills:
                apkeyskills = apkeyskills + ptemp
                #print("PASS")
                
        total = round(apdegree + apspec + apkeyskills, 2)

        #recommendations = recommendations[recommendations["Success Rate"]].append(success_rate)
        
        print("The Percentages: ", pdegree, pspec, pkeyskills)
        print("Course: ", i, "SuccessRate: ", total)
        success_rate.append(total)
            

In [84]:
calculate_success_rate("1001", final) 

The Percentages:  25 25 50
Course:  2180 SuccessRate:  25
The Percentages:  20 20 60
Course:  2155 SuccessRate:  90.0
The Percentages:  30 30 40
Course:  2391 SuccessRate:  30
The Percentages:  30 30 40
Course:  2102 SuccessRate:  30
The Percentages:  30 30 40
Course:  2283 SuccessRate:  30
The Percentages:  25 25 50
Course:  2410 SuccessRate:  25
The Percentages:  30 30 40
Course:  2202 SuccessRate:  70.0
The Percentages:  15 15 70
Course:  2258 SuccessRate:  25.77
The Percentages:  25 25 50
Course:  2076 SuccessRate:  25
The Percentages:  20 20 60
Course:  2398 SuccessRate:  90.0
The Percentages:  15 15 70
Course:  2240 SuccessRate:  25.77
The Percentages:  30 30 40
Course:  2127 SuccessRate:  50.0
The Percentages:  30 30 40
Course:  2086 SuccessRate:  30
The Percentages:  25 25 50
Course:  2098 SuccessRate:  25
The Percentages:  30 30 40
Course:  2285 SuccessRate:  30
The Percentages:  15 15 70
Course:  2266 SuccessRate:  25.77
The Percentages:  30 30 40
Course:  2290 SuccessRate:  

### Final Thing 

In [114]:
final['Success Rate'] = success_rate #this was checked and everything was saved fine

In [115]:
final.head()

Unnamed: 0_level_0,Degree,Degree Specializations,Campus,Key Skills,Success Rate
Course Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2180,B.E.,Civil Engineering,MITCOE,"MSCIT, Leadership, AUTOCAD",25.0
2155,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",90.0
2391,B.E.,Mechanical,MITAOE,"AutoCAD, PROE",30.0
2102,B.E.,Mechanical,MITCOE,"CAD,CAM",30.0
2283,B.E.,Mechanical,MITCOE,"CAD,CAM",30.0


### Testing

All of the success rate prediction results where checked manually for the user "1001". An image of the course recommendations and the the success rates will be saved for future reference if needed. Everything was calculated as desried

In [59]:
user = "1001"
user_details = user_dataset[user_dataset["userid"] == user]
user_details

Unnamed: 0.1,Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,0,1001,B.E.,Computer Science & Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."


In [113]:
course_dataset[course_dataset["sr_"] == 2392] 

Unnamed: 0.1,Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
1391,1391,2392,B.E.,Mechanical,MITCOE,"['AutoCAD, PROE']","AutoCAD, PROE"


In [None]:
get_percentage(len(curr_keyskills))

In [74]:
user_keyskills = user_dataset["key_skills_str"].loc[user_dataset["userid"] == user].values[0]

user_keyskills = user_keyskills.split(",")
user_keyskills = (map(lambda x: x.lower().strip(), user_keyskills))
user_keyskills = list(user_keyskills)
user_keyskills

['c',
 'java',
 'keras',
 'flask',
 'deep learning',
 'selenium',
 'cpp',
 'django',
 'python',
 'computer vision',
 'html',
 'mysql',
 'tensorflow',
 'machine learning',
 'web development']

In [109]:
curr_keyskills = course_dataset["key_skills_str"].loc[course_dataset["sr_"] == 2019].values[0]

#curr_keyskills = curr_keyskills.split(",")
curr_keyskills = curr_keyskills.strip().split(",")
curr_keyskills = (map(lambda x: x.lower().strip(), curr_keyskills))
curr_keyskills = list(curr_keyskills)
curr_keyskills

['loratechnology',
 'c',
 'mathematicaltoolmatlab',
 'basicsofembeddedcpython',
 'vhdl',
 'platformwindowsandunix',
 'edatoolmentorgraphics',
 'pspice',
 'arm7',
 'python',
 'windows']

In [87]:
print(len(curr_keyskills))
for currk in curr_keyskills:   
    print(currk)
    if currk == "missing":
        print("There is not enough information!")
            
    if currk == "c cpp":
        print("HERE")
        currk = "c"
        curr_keyskills.append("cpp")
        print(len(curr_keyskills))
        #adjust the percentages
        
    if currk in user_keyskills:
        print("IN: ", currk)
        #apkeyskills = apkeyskills + ptemp
        print("PASS")
            
        
    else:
        continue

12
amazonwebservices
c cpp
HERE
13
IN:  c
PASS
arduino
mongodb
linux
golang
microcontrollers
gobot
internetofthings
matlab
sql
php
cpp
IN:  cpp
PASS


In [76]:
for currk in curr_keyskills:   
    print("C: ",currk)
    if currk in user_keyskills:
        print("PASS: ", currk)
    else:
        print("Fail")

C:  c
PASS:  c
C:  java
PASS:  java


In [116]:
success_rate

[25,
 90.0,
 30,
 30,
 30,
 25,
 70.0,
 25.77,
 25,
 90.0,
 25.77,
 50.0,
 30,
 25,
 30,
 25.77,
 30,
 62.5,
 25,
 30,
 30,
 30,
 90.0,
 25,
 90.0,
 27.73,
 30,
 25,
 50.0,
 30]