In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity, cosine_distances
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
import squarify
import seaborn as sns
from surprise import accuracy
from surprise.model_selection.validation import cross_validate
from surprise.dataset import Dataset
from surprise.reader import Reader
from surprise import SVD, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise.model_selection import train_test_split
import random

## Calling the Saved Dataset

In [2]:
course_dataset = pd.read_csv("../data/courseData.csv", encoding= 'unicode_escape')
course_dataset.head()

Unnamed: 0.1,Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
0,0,1001,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
1,1,1002,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
2,2,1003,B.E.,Mechanical,MITAOE,['CATIA'],CATIA
3,3,1004,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
4,4,1005,B.E.,Mechanical,MITCOE,['CATIA'],CATIA


In [3]:
user_dataset = pd.read_csv("../data/userData.csv", encoding= 'unicode_escape')
user_dataset.head()

Unnamed: 0.1,Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,0,1001,B.E.,Computer Science & Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1,1002,B.E.,Computer Science & Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."
2,2,1003,B.E.,Computer Science & Engineering,['Missing'],['Missing'],Missing,Missing
3,3,1004,B.E.,Computer Science & Engineering,"['English', ' Hindi', ' Marathi', ' Marwari']","['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo..."
4,4,1005,B.E.,Computer Science & Engineering,"['English', 'Hindi ', 'Kashmiri ', ' Urdu']","['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ..."


In [4]:
ratings_df = pd.read_csv("../data/ratingData.csv", encoding= 'unicode_escape')
ratings_df.head()

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
1,2001,1002,3
2,2001,1003,1
3,2001,1004,0
4,2001,1005,2


## Collaborative Filtering

In [5]:
#using surprise
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings_df[['user_id','course_id','rating']], reader)

#split into training and test sets using 80:20 ratio
raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)                

ratio = int(len(raw_ratings)*0.8)
train_raw = raw_ratings[:ratio] 
test_raw = raw_ratings[ratio:] 

data.raw_ratings = train_raw       
trainset = data.build_full_trainset() 
testset = data.construct_testset(test_raw)

####  SVD CF Recommender

In [6]:
def svd_cf_recommendations(user):
    
    #usisng the "best" parameters found using the gridsearchcv method from experiments notebook 
    model = SVD(n_factors = 90, n_epochs = 20, lr_all = 0.005, reg_all = 0.2)
    model.fit(trainset)
    
    #building test set and predict the ratings
    testset = trainset.build_anti_testset()
    predictions = model.test(testset)
    
    #save into dataframe
    rating_predictions = pd.DataFrame(predictions)
    
    #find the top 30 predictions for given user
    user_predictions = rating_predictions[rating_predictions['uid'] == user].\
                         sort_values(by="est", ascending = False).head(30)
    
    #create recommendations array and append user predictions as recommendations
    recommendations = []
    recommendations.append(list(user_predictions['iid']))
    recommendations = recommendations[0]
    
    return(recommendations)

## Content Based Filtering

In [7]:
#for user
#creating a new dataset dataset using the degree1, degree1specializations and key skills, career objective and the userid
cf_user_dataset = pd.DataFrame()
cf_user_dataset['userid'] = user_dataset['userid']
cf_user_dataset['degree_1'] = user_dataset['degree_1']
cf_user_dataset['degree_1_specializations'] = user_dataset['degree_1_specializations']
cf_user_dataset['career_objective'] = user_dataset['career_objective']
cf_user_dataset['key_skills_str'] = user_dataset['key_skills_str']
cf_user_dataset.head(5)

#store the career objective and key skills in description
cf_user_dataset['description'] = cf_user_dataset['degree_1'] + cf_user_dataset['degree_1_specializations'] + cf_user_dataset['career_objective'] + cf_user_dataset['key_skills_str']
cf_user_dataset.head(5)

#for course
#creating a new dataset dataset using the degree1, degree1specializations and key skills and the course id
cf_course_dataset = pd.DataFrame()
cf_course_dataset['sr_'] = course_dataset['sr_']
cf_course_dataset['campus'] = course_dataset['campus'] #campus is added so we dont have to add it later on
cf_course_dataset['degree_1'] = course_dataset['degree_1']
cf_course_dataset['degree_1_specializations'] = course_dataset['degree_1_specializations']
cf_course_dataset['key_skills_str'] = course_dataset['key_skills_str']
cf_course_dataset.head(5)

#store the career objective and key skills in description
cf_course_dataset['description'] = cf_course_dataset['degree_1'] + cf_course_dataset['degree_1_specializations'] + cf_course_dataset['key_skills_str']
cf_course_dataset.head(5)

#put them in a combined dataframe
comb = pd.DataFrame()
comb['description'] = cf_user_dataset['description'] + cf_course_dataset['description']

In [8]:
#vectorize using countvectorize that converts into a matrix of token counts
comb_count = CountVectorizer(stop_words='english', analyzer='word', ngram_range=(1, 2), min_df=0)
comb_count_matrix = comb_count.fit_transform(comb['description'].values.astype('U'))

#cosine similarity between the the combined matrix itself (normalized dot product of X and Y)
course_cosine_sim = cosine_similarity(comb_count_matrix, comb_count_matrix)

#create indices for the courses is using series
cf_course_dataset = cf_course_dataset.reset_index()
courses = cf_course_dataset
indices = pd.Series(cf_course_dataset.index, index=cf_course_dataset['sr_'])

#function to get content-filtered recommendations
def get_course_cf_recommendations(user): 
    
    #get index of course
    idx = indices[user]
    
    #find the most similar 30 courses using cosine_sim and sorting with highest similarity
    sim_scores = list(enumerate(course_cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    course_indices = [i[0] for i in sim_scores]
    
    return courses.iloc[course_indices]

## The Predictor 

In [9]:
#implement predictor using the user key skills and info and the predicted courses
def calculate_success_rate(user, recommendations):
    success_rate = []
    
    #get the users required info
    #user_details = user_dataset[user_dataset["userid"] == user]
    user_degree = user_dataset["degree_1"].loc[user_dataset["userid"] == user].values[0]
    user_spec = user_dataset["degree_1_specializations"].loc[user_dataset["userid"] == user].values[0]
    user_keyskills = user_dataset["key_skills_str"].loc[user_dataset["userid"] == user].values[0]

    #turn into list 
    user_degree = user_degree.split("-")
    user_spec = user_spec.split("-")
    user_keyskills = user_keyskills.split(",")
    user_keyskills = (map(lambda x: x.lower().strip(), user_keyskills))
    user_keyskills = list(user_keyskills)
    

    #get the recommmendations
    #compare the user details with the recommendations
    for i, row in recommendations.iterrows():
        #print course code
        #print("Checking Course ", i)
        
        #reset the percentages
        apdegree = 0
        apspec = 0
        apkeyskills = 0
        
        #get current course data
        curr_degree = row[1]
        curr_spec = row[2]
        curr_keyskills = row[4] #row 3 using final, might change to row 2 if we remove campus
        
        #turn into list 
        curr_degree = curr_degree.split("-")
        curr_spec = curr_spec.split("-")
        #curr_keyskills = curr_keyskills.split(",")
        curr_keyskills = curr_keyskills.strip().split(",")
        curr_keyskills = (map(lambda x: x.lower().strip(), curr_keyskills))
        curr_keyskills = list(curr_keyskills)

        #get percentages
        #pdegree, pspec, pkeyskills = get_percentage(len(curr_keyskills))
        
        #the percentages according to the academic success
        pdegree = 15
        pspec = 35
        pkeyskills = 50
        
        #compare degree
        if (user_degree == curr_degree):
            apdegree = pdegree
            
        #compare degree specs
        if (user_spec == curr_spec):
            
            apspec = pspec
        
        #compare keyskills 
        ptemp = pkeyskills / len(curr_keyskills) #this will be the percentages of each individual key skill
                
        for currk in curr_keyskills:   
            
            #in the case of missing currk
            if currk == "missing":
                print("There is not enough information!")

            if currk == "c cpp":
                currk = "c"
                curr_keyskills.append("cpp")
                
                #adjust the percentages
                ptemp = pkeyskills / len(curr_keyskills)
                
            #add new if for cprogrammming same as c
            if currk == "cprogramming":
                currk = "c"

            if currk in user_keyskills:
                apkeyskills = apkeyskills + ptemp
                
        #calculate the total success rate and round to 2dp
        total = round(apdegree + apspec + apkeyskills, 2)
        
        #print("The Percentages: ", pdegree, pspec, pkeyskills)
        #print("Course: ", i, "SuccessRate: ", total)
        success_rate.append(total)
                    
    return success_rate

In [10]:
#gets course details of svd recommendations
def get_course_details(svd_courses): 
    
    #dataframe for final recommendations 
    finalRecs = pd.DataFrame()
    
    #iterate the list
    for i in svd_courses:
        
        #if course found in couse dataset then store details in finalRecs dataframe
        if i in course_dataset["sr_"]:
            course_details = course_dataset[course_dataset["sr_"] == i]
            finalRecs = finalRecs.append(course_details)

            #drop unneccessary columns 
            finalRecs.drop(["Unnamed: 0", "key_skills"], axis = 1, inplace = True)
        
    #return useful information
    return finalRecs[["sr_", "degree_1", "degree_1_specializations", "campus", "key_skills_str"]]

In [11]:
#removes courses that the user has already taken by checking the ratings in the ratings dataframe
def drop_courses_taken(user, cf):
    finalcourserecs = [] #to store the final course recs

    for i, row in cf.iterrows():
        #print(row[1])

        if row[1] in ratings_df["course_id"].values:
            #print("Course is found in the ratings dataframe!")
            #check if rated
            rating_dets = ratings_df[(ratings_df['course_id'] == row[1]) & (ratings_df['user_id'] == user)]
            #print(rating_dets)
            if rating_dets["rating"].values[0] != 0:
                cf.drop([i], inplace=True)
                finalcourserecs.append(row[1])
    return cf

## The Hybrid Recommender 

In [12]:
def hybrid_recommender(user):
    
    svd_courses = [] #pd.DataFrame()
    user_recs = pd.DataFrame()
    course_recs = pd.DataFrame()
    
    #get collaborative filtering recommendations
    svd_courses = svd_cf_recommendations(user)
    
    #get content based filtering recommendations
    cf_recs = get_course_cf_recommendations(user)
    
    #adjust columns as necessary
    cf_recs = cf_recs.loc[:, ["sr_","degree_1","degree_1_specializations","campus", "key_skills_str", "index","description"]]

    #if there are no similar users with ratings for the svd, recommendations will be made solely on content-based filtering
    if len(svd_courses) == 0:
        #there are no svd recs
        have_svd = False
        print("No SVD Recs")
    else:
        #get course details of svd recs
        have_svd = True
        
        #check and drop courses if taken by user 
        cf_recs = drop_courses_taken(user, cf_recs)
        svd_recs = get_course_details(svd_courses)
        
    #making a filter column to show where the recommendations were generated from
    cf_recs["Filter"] = "Content-Based"
    
    #calculate the success rates and save in Success Rate column
    cf_success_rate = calculate_success_rate(str(user), cf_recs)     
    cf_recs['Success Rate'] = cf_success_rate
    
    #sort to get the courses with the highest success rate
    cf_recs = cf_recs.sort_values(by=['Success Rate'], ascending=False) 
    
    #displaying
    if have_svd == True:
        
        svd_recs["Filter"] = "Collaborative"
        
        #calculate svd success rate
        svd_success_rate = calculate_success_rate(str(user), svd_recs) 
        svd_recs['Success Rate'] = svd_success_rate 
        
        svd_recs = svd_recs.sort_values(by=['Success Rate'], ascending=False)
    
        #get final recommendations using concat to get the top 5 recs from both recommenders
        final = pd.concat([cf_recs.iloc[:5], svd_recs.iloc[:5]], ignore_index=True, sort=False)
    else:
        final = cf_recs.head(10)
        
    #rename index and set index to it
    final = final.rename(columns = {final.columns[0]: "Course Code"})
    final.set_index("Course Code", inplace = True)

    #drop unnecessary colummns
    final.drop(["index", "description"], axis = 1, inplace = True)

    #rename other columns
    final.rename(columns = {"degree_1":"Degree", "degree_1_specializations": "Degree Specializations", "campus": "Campus", "key_skills": "Key Skills Actual", "key_skills_str": "Key Skills", "Filter": "Filter"}, inplace = True)

    print("Course Recommmendations for User", user, ":")
    final = final.reset_index()

    return final

In [13]:
final = hybrid_recommender(1001)
final

Course Recommmendations for User 1001 :


Unnamed: 0,Course Code,Degree,Degree Specializations,Campus,Key Skills,Filter,Success Rate
0,1895,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,91.67
1,1394,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,91.67
2,1934,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,91.67
3,1188,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,91.67
4,1957,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Content-Based,91.67
5,2218,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Collaborative,91.67
6,2383,B.E.,Computer Science & Engineering,MITAOE,"C, Java, CPP, HTML, CMStool, MYSQL",Collaborative,91.67
7,2140,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Collaborative,91.67
8,2403,B.E.,Computer Science & Engineering,MITCOE,"C, Java, CPP, HTML, CMStool, MYSQL",Collaborative,91.67
9,2419,B.E.,Computer Science & Engineering,MITCOE,"CPPProgramming, Core JAVA, CProgramming, MySql",Collaborative,75.0


In [14]:
course_dataset[course_dataset["sr_"] == 1895]

Unnamed: 0.1,Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
894,894,1895,B.E.,Computer Science & Engineering,MITCOE,"['C, Java, CPP, HTML, CMStool, MYSQL']","C, Java, CPP, HTML, CMStool, MYSQL"


### Get the course details for the collaborative filtering recs

In [15]:
calculate_success_rate("1001", final) 

[91.67, 91.67, 91.67, 91.67, 91.67, 91.67, 91.67, 91.67, 91.67, 75.0]

All of the success rate prediction results where checked manually for the user "1001". An image of the course recommendations and the the success rates will be saved for future reference if needed. Everything was calculated as desried