In [233]:
%run CB.ipynb

In [4]:
%run CF.ipynb

# Batch Testing #

# Collaborative Filtering Recommender Testing #

## Define Testing Functions ##


In [38]:
def get_hit_ratio(testset, n):
    '''
        Function that calculates the hit ratio for a CF testset. The function does so by gathering n recommendations
        per user and checking which of the user's hidden (actual) courses appear among those recommendations.
        
        Inputs:
            -     testset      :  a pre-defined test-set; the LAST column must hold, for every user, an iterable
                                  of the courses that were hidden from the recommender. All other columns are
                                  passed on to CF_recommender.
            -        n         :  the number of recommendations to extract from the recommender
            
        Outputs:
            - top_n_rec_results:  dict keyed by user with the entries "actual", "recommended" and "hits"
            -    hit ratio     :  hits divided by the total number of hidden courses
                                  (0.0 when the test set contains no hidden courses)
    '''
    
    # The ground-truth column is always the last one; look its label up once
    actual_column = testset.columns[-1]
    
    # Define place-holders
    hits = 0
    n_records = 0
    top_n_rec_results = {}
    
    # Gather recommendations
    for user in testset.index:
        actual = testset.loc[user, actual_column]
        # The recommender only sees the feature columns, never the ground truth
        rec_list = CF_recommender(user, n=n, testset=testset.iloc[:,:-1])
        
        # Count every hidden course individually so that test sets with a
        #  different number of hidden courses per user can be mixed
        hits_per_user = 0
        for course_taken in actual:
            if course_taken in rec_list:
                hits_per_user += 1
            n_records += 1

        # Add to number of hits
        hits += hits_per_user
        
        # Build object
        top_n_rec_results[user] = {"actual": actual,
                                   "recommended": rec_list,
                                   "hits": hits_per_user
                                }
    
    # Process results (guard against an empty test set, which would
    #  otherwise raise ZeroDivisionError)
    hit_ratio = hits/n_records if n_records else 0.0

    return top_n_rec_results, hit_ratio
        
def tabulate_hit_ratio_n_range (rec_results, n_start, n_stop):
    
    '''
        Function that tabulates the hit ratio for recommender results. Because the recommendations are stored
        in ranked order, it is enough to request the highest number of recommendations needed once and
        compute the results for the lower values of n from the first n entries.
        
        Inputs:
        -  rec_results    :  results issued by the recommender; a dict keyed by user whose values hold
                             "actual" (iterable of hidden courses) and "recommended" (dict whose key order
                             is the ranking)
        -  n_start        :  start value for tabulation
        -  n_stop         :  stop value for tabulation (inclusive)
        
        Outputs:
        - printed hit ratio stats (a hit ratio of 0.0 is printed when there are no hidden courses at all)
    '''
    
    separator = "-------------------------------------------------"
    
    # The ranking and the number of hidden courses do not depend on n,
    #  so they are computed once instead of once per n
    ranked = {user: list(rec_results[user]["recommended"].keys()) for user in rec_results}
    course_count = sum(1 for user in rec_results for _ in rec_results[user]["actual"])
    
    for n in range(n_start, n_stop+1):
        print(separator)
        print(f"Statistics for n={n}")
        
        batch_hit_count = 0
        for user in rec_results:
            batch_list = ranked[user][:n]
            for course in rec_results[user]["actual"]:
                if course in batch_list:
                    batch_hit_count += 1
        
        # Guard against an empty result set (would otherwise divide by zero)
        hit_ratio = round(batch_hit_count/course_count, 4) if course_count else 0.0
                
        print(f"Number of hits: {batch_hit_count}")
        print(f"Hit ratio: {hit_ratio}")
    
    print(separator)
            
def get_MRR_at_n (rec_results, n=10):
    '''
        Function that calculates the MRR score for the CF recommender.
        
        Every hidden course of every user contributes one reciprocal rank: 1/rank when the course
        appears among the first n recommendations, 0 otherwise. Users with fewer than n
        recommendations are handled (only the recommendations that exist are searched).
        
        Inputs:
        - rec_results   :  a set of results that have been outputted by the recommender; a dict keyed by
                           user whose values hold "actual" (iterable of hidden courses) and "recommended"
                           (dict whose key order is the ranking)
        -     n         :  the n value for which to calculate the MRR (default is 10)
        
        Outputs:
        -    MRR        :  value of MRR at n (0.0 when there are no hidden courses to score)
    '''
    
    # Define list to hold the reciprocal rank of each hidden course
    RR = []
    
    # Loop through recommendations
    for user in rec_results:
        # Rank (1-based) of each of the first n recommendations; slicing copes
        #  with recommendation lists that are shorter than n
        top_n = list(rec_results[user]["recommended"].keys())[:n]
        rank_of = {course: rank for rank, course in enumerate(top_n, start=1)}
        
        # Loop through each of the completed courses
        for course in rec_results[user]["actual"]:
            if course in rank_of:
                # Record reciprocal of ranked recommendation
                RR.append(1/rank_of[course])
            else:
                # If no relevant result is found, append 0
                RR.append(0)
                    
    # Calculate the mean (nothing to average -> 0.0 instead of ZeroDivisionError)
    if not RR:
        return 0.0
    MRR = sum(RR)/len(RR)
    
    return MRR

## Prepare Test Sets ##

### *Test Set A* ###
Filter the database for all users who have completed exactly one course. The completed course is hidden from the recommender, and the performance of the CF recommender is then measured against the course each of these users actually completed.

In [17]:
# Keep only the users whose "sum" column equals 1; .copy() makes the later
# in-place edits independent of test_pt
testsetA = test_pt[test_pt["sum"]==1].copy()
testsetA

course_id,14.73x,2.01x,3.091x,6.002x,6.00x,7.00x,8.02x,8.MReV,CB22x,CS50x,ER22x,PH207x,PH278x,sum
userid_DI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
H130000071,,,,,,,,,1.0,,,,,1.0
H130000715,,,,,,,,,,1.0,,,,1.0
H130000944,,,,,,,,,,,1.0,,,1.0
H130001577,,,,,,,,,1.0,,,,,1.0
H130002019,,,,,,,,,,,,1.0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
M130595975,,,,1.0,,,,,,,,,,1.0
M130596958,,,,,1.0,,,,,,,,,1.0
M130597250,,,,,1.0,,,,,,,,,1.0
M130597647,1.0,,,,,,,,,,,,,1.0


In [18]:
user_list_testsetA = list(testsetA.index)
# user_list_testsetA

In [19]:
# Store the completed course(s) of every user before the completion flags are hidden.
# `columns[:-1]` skips the trailing "sum" column, so only course columns are searched for a 1.
# NOTE: this cell is not idempotent - once the new column exists, `columns[:-1]` also covers "sum".
testsetA["course_completed_by_user"] = [list(testsetA.columns[np.where(testsetA.loc[user, testsetA.columns[:-1]]==1)[0]]) for user in testsetA.index]

In [20]:
# Change all 1 values to 0
# `columns[:-1]` now excludes only the ground-truth column added above, so the
# "sum" column is reset to 0 together with the course columns.
for col in testsetA.columns[:-1]:
    testsetA[col] = np.where(testsetA[col]==1, 0, testsetA[col])

In [21]:
testsetA

course_id,14.73x,2.01x,3.091x,6.002x,6.00x,7.00x,8.02x,8.MReV,CB22x,CS50x,ER22x,PH207x,PH278x,sum,course_completed_by_user
userid_DI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
H130000071,,,,,,,,,0.0,,,,,0.0,[CB22x]
H130000715,,,,,,,,,,0.0,,,,0.0,[CS50x]
H130000944,,,,,,,,,,,0.0,,,0.0,[ER22x]
H130001577,,,,,,,,,0.0,,,,,0.0,[CB22x]
H130002019,,,,,,,,,,,,0.0,,0.0,[PH207x]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
M130595975,,,,0.0,,,,,,,,,,0.0,[6.002x]
M130596958,,,,,0.0,,,,,,,,,0.0,[6.00x]
M130597250,,,,,0.0,,,,,,,,,0.0,[6.00x]
M130597647,0.0,,,,,,,,,,,,,0.0,[14.73x]


In [22]:
# testsetA

### *Test Set B* ###
Filter the database for all users who have completed two courses (i.e., the maximum number of courses completed by any user in the test) and check predictions in two ways:
1) hide both courses and check the recommendations against the two courses
2) hide one course at a time and check recommendations

In [214]:
# Keep only the users whose "sum" column equals 2; .copy() makes the later
# in-place edits independent of test_pt
testsetB = test_pt[test_pt["sum"]==2].copy()

In [24]:
testsetB_user_list = testsetB.index
# testsetB_user_list

In [25]:
testsetB

course_id,14.73x,2.01x,3.091x,6.002x,6.00x,7.00x,8.02x,8.MReV,CB22x,CS50x,ER22x,PH207x,PH278x,sum
userid_DI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
H130002196,,,,,,,,,,,1.0,,1.0,2.0
H130011763,,,,,,,,,,,1.0,,1.0,2.0
H130020479,,,,,,,,,,,1.0,,1.0,2.0
H130030300,,,,,,,,,,,1.0,,1.0,2.0
H130030879,,,,,,,,,,,1.0,,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
M130556674,,1.0,1.0,,,,,,,,,,,2.0
M130566106,,,,,1.0,,1.0,,,,,,,2.0
M130570542,,,,,,,1.0,1.0,,,,,,2.0
M130573400,,,,1.0,,,1.0,,,,,,,2.0


In [26]:
# Store values before rewriting table
# For every user, collect the labels of the course columns that hold a 1;
# `columns[:-1]` skips the trailing "sum" column.
courses_selected = [list(testsetB.columns[np.where(testsetB.loc[user, testsetB.columns[:-1]]==1)[0]]) for user in testsetB_user_list]

# Transform course information into np.array for ease of processing
# courses_selected = np.array(courses_selected) 

# Check output
# courses_selected

In [27]:
# Add course information to dataframe before modifying values
testsetB["actual_course"] = courses_selected
# testsetB["actual_course_2"] = courses_selected[:, 1]

In [28]:
# Change all values equal to 1 or 2 to 0 (other values, including NaN, are left untouched).
# `columns[:-1]` excludes only the "actual_course" column, so the "sum" column
# (filtered to 2 for this test set) is reset as well.
for col in testsetB.columns[:-1]:
    testsetB[col] = np.where((testsetB[col]==1) | (testsetB[col]==2), 0, testsetB[col])

In [29]:
testsetB

course_id,14.73x,2.01x,3.091x,6.002x,6.00x,7.00x,8.02x,8.MReV,CB22x,CS50x,ER22x,PH207x,PH278x,sum,actual_course
userid_DI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
H130002196,,,,,,,,,,,0.0,,0.0,0.0,"[ER22x, PH278x]"
H130011763,,,,,,,,,,,0.0,,0.0,0.0,"[ER22x, PH278x]"
H130020479,,,,,,,,,,,0.0,,0.0,0.0,"[ER22x, PH278x]"
H130030300,,,,,,,,,,,0.0,,0.0,0.0,"[ER22x, PH278x]"
H130030879,,,,,,,,,,,0.0,,0.0,0.0,"[ER22x, PH278x]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
M130556674,,0.0,0.0,,,,,,,,,,,0.0,"[2.01x, 3.091x]"
M130566106,,,,,0.0,,0.0,,,,,,,0.0,"[6.00x, 8.02x]"
M130570542,,,,,,,0.0,0.0,,,,,,0.0,"[8.02x, 8.MReV]"
M130573400,,,,0.0,,,0.0,,,,,,,0.0,"[6.002x, 8.02x]"



### *Calculate Hit Ratio for Testset A and Tabulate Results* ###

In [31]:
testsetA_full_recs, testsetA_overall_hit_ratio = get_hit_ratio(testsetA, n=10)

In [47]:
get_MRR_at_n(testsetA_full_recs, n=10)

0.5225256679767233

In [48]:
get_MRR_at_n(testsetA_full_recs, n=5)

0.49973128598848454

In [49]:
# NOTE(review): `testsetB_full_recs` is only assigned further down, in the
# "Calculate Hit Ratio for Testset B" section, so on a fresh kernel this cell
# raises NameError when the notebook is run top to bottom. Run that cell first
# (or move this cell below it).
get_MRR_at_n(testsetB_full_recs, n=10)

0.37235989633948846

In [50]:
# NOTE(review): depends on `testsetB_full_recs`, which is only assigned in the
# later "Calculate Hit Ratio for Testset B" section; this cell fails with
# NameError on a fresh top-to-bottom run unless that cell has been executed.
get_MRR_at_n(testsetB_full_recs, n=5)

0.3267857142857144

In [35]:
tabulate_hit_ratio_n_range(testsetA_full_recs, 1,10)

-------------------------------------------------
Statistics for n=1
Number of hits: 864
Hit ratio: 0.3317
-------------------------------------------------
Statistics for n=2
Number of hits: 1340
Hit ratio: 0.5144
-------------------------------------------------
Statistics for n=3
Number of hits: 1655
Hit ratio: 0.6353
-------------------------------------------------
Statistics for n=4
Number of hits: 1879
Hit ratio: 0.7213
-------------------------------------------------
Statistics for n=5
Number of hits: 2073
Hit ratio: 0.7958
-------------------------------------------------
Statistics for n=6
Number of hits: 2212
Hit ratio: 0.8491
-------------------------------------------------
Statistics for n=7
Number of hits: 2316
Hit ratio: 0.8891
-------------------------------------------------
Statistics for n=8
Number of hits: 2408
Hit ratio: 0.9244
-------------------------------------------------
Statistics for n=9
Number of hits: 2458
Hit ratio: 0.9436
-----------------------------

### *Calculate Hit Ratio for Testset B* ###

In [51]:
testsetB_full_recs, testsetB_overall_hit_ratio = get_hit_ratio(testsetB, n=10)

In [52]:
tabulate_hit_ratio_n_range(testsetB_full_recs, 2,10)

-------------------------------------------------
Statistics for n=2
Number of hits: 63
Hit ratio: 0.3214
-------------------------------------------------
Statistics for n=3
Number of hits: 81
Hit ratio: 0.4133
-------------------------------------------------
Statistics for n=4
Number of hits: 94
Hit ratio: 0.4796
-------------------------------------------------
Statistics for n=5
Number of hits: 113
Hit ratio: 0.5765
-------------------------------------------------
Statistics for n=6
Number of hits: 127
Hit ratio: 0.648
-------------------------------------------------
Statistics for n=7
Number of hits: 145
Hit ratio: 0.7398
-------------------------------------------------
Statistics for n=8
Number of hits: 163
Hit ratio: 0.8316
-------------------------------------------------
Statistics for n=9
Number of hits: 170
Hit ratio: 0.8673
-------------------------------------------------
Statistics for n=10
Number of hits: 180
Hit ratio: 0.9184
----------------------------------------

In [215]:
test_pt[test_pt["sum"]==0]

course_id,14.73x,2.01x,3.091x,6.002x,6.00x,7.00x,8.02x,8.MReV,CB22x,CS50x,ER22x,PH207x,PH278x,sum
userid_DI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
H130000016,,,,,,,,,0.0,,,,,0.0
H130000021,,,,,,,,,0.0,,,,,0.0
H130000032,,,,,,,,,,0.0,,,,0.0
H130000035,,,,,,,,,,,,,0.0,0.0
H130000059,,,,,,,,,,,,0.0,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
M130597645,0.0,,,,,0.0,,,,,,,,0.0
M130597652,,,,0.0,,,,,,,,,,0.0
M130597662,,,,0.0,,,,,,,,,,0.0
M130597666,,,,0.0,,,,,,,,,,0.0


# Content-Based Recommender Testing #

<u>Preceding Steps (CB.ipynb)</u>:
* CB dataframe has been split into CB_train_df and CB_test_df
* TF-IDF vectorizer has been initiated
* vectorizer was fit to the keywords in CB_train_df and TF-IDF matrix was created
* extracted keywords in the test set

### *Define Testing Function* ###

In [56]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

def CB_testing (encoded_course_matrix=tfidf, test_data=test_keywords, vectorizer=vectorizer):
    '''
        Define function for testing the CB recommender given a set of test keywords. The function first 
        transforms test keywords into TF-IDF vectors and then calculates the cosine similarity between
        each training vector and each test vector, storing the results in the shape of a matrix.

        Inputs:
            - encoded_course_matrix : matrix created by fitting and transforming the "training set"
                                        (default is the matrix based on the transformed CB_train_df in CB.ipynb)
            - test_data             : a sequence of (identifier, keywords) records, one per test item; only
                                      the keywords (second element) are encoded
                                        (default is test_keywords, which have been extracted from CB_test_df)
            - vectorizer            : fitted vectorizer with which to transform the test data
                                        (default is the TfidfVectorizer from CB.ipynb)
        
        Outputs:
            - results      : array of shape (number of training vectors, number of test records) where
                             results[i][j] is the cosine similarity between training vector i and
                             test record j
    '''
    
    # Define dimensions of the results matrix based on the size of the 
    #  training and test sets. Use the test_data argument (not the global
    #  test_keywords) so that a caller-supplied test set is honoured.
    m = encoded_course_matrix.shape[0]
    n = len(test_data)

    # Nothing to compare: return an empty matrix of the right shape
    if m == 0 or n == 0:
        return np.empty((m, n))

    # Transform the keywords of every test record (second element of each
    #  tuple) into one matrix with a row per test record
    test_matrix = vectorizer.transform([record[1] for record in test_data])

    # Calculate the cosine similarity between each training and test vector
    #  in a single call; entry [i][j] pairs training row i with test row j
    results = cosine_similarity(encoded_course_matrix, test_matrix)
            
    return results

# Alternate testing function
# def test_cosine_similarity (test_set=test_keywords, encoded_course_matrix=tfidf):
   
#     # Create matrix to store results
#     results = np.empty((tfidf.shape[0], len(test_set)))
    
#     # Encode the test set
#     test_vecs = [encode_query(keywords[1]) for keywords in  test_set] # encode the second element of the tuple which contains the keywords themselves
    
#     # Populate results matrix in accordance with pre-set structure
#     for i, course in enumerate(encoded_course_matrix):
#         for j,testvec in enumerate(test_vecs):
#             results[i][j] = cosine_similarity(course,testvec)[0][0]
    
#     return results

def CB_tabulate_results(results, test_set = test_keywords, training_set = training_keywords, encoded_course_matrix=tfidf, n=10):
    '''
        Print, for every test item, the item itself followed by its n most similar training items
        so that the recommendations can be inspected by eye.
        
        Inputs:
            - results               : similarity matrix with one row per training item and one column
                                      per test item
            - test_set              : test records; the first element of a record is used as the row
                                      label into CB_test_df
            - training_set          : training records, indexed by row position of `results`
            - encoded_course_matrix : not used by this function
            - n                     : number of recommendations to print per test item
        
        Outputs:
            - printed listing only
    '''
    n_queries = results.shape[1]

    for query_idx in range(n_queries):
        
        # Row positions of the n highest similarities, best first
        ranked_positions = np.argsort(results[:, query_idx])[::-1][:n]
        
        query = test_set[query_idx]
        query_label = query[0]
        
        # Header for this test item
        print("-------------------------------------------------------------------------")
        print(f"Test_keywords {query_idx}: ")
        print(query, CB_test_df.loc[query_label, "Course Number"], CB_test_df.loc[query_label, "Institution"])
        
        print("\nRecommendations:")
        for position in ranked_positions:
            # Second column of CB_train_df is printed next to the keywords
            print(training_set[position], CB_train_df.iloc[position, 1])
                                                                                           

def CB_subject_accuracy_at_k(results, test_set=test_keywords, training_set=training_keywords, n=5):
    '''
        For every test item, measure which fraction of its top-n recommendations shares the test
        item's subject. The subject is taken to be the last keyword of a record.
        
        Inputs:
            -   results      :  similarity matrix (rows: training items, columns: test items)
            -   test_set     :  test records; record[1] is the keyword list
            -   training_set :  training records; record[1] is the keyword list
            -       n        :  number of recommendations to consider
        
        Outputs: 
            -   accuracy_per_sample : dict of the form {n: {test index: matching recommendations / n}}
    '''
    per_sample = {}
    accuracy_per_sample = {n: per_sample}
    
    for sample_idx in range(results.shape[1]):
        # Positions of the n most similar training items, best first
        ranked_positions = np.argsort(results[:, sample_idx])[::-1][:n]
        
        # Last keyword of the test record acts as the subject tag
        wanted_subject = test_set[sample_idx][1][-1]
        
        # Count recommendations whose last keyword equals the wanted subject
        same_subject = sum(
            1 for position in ranked_positions
            if training_set[position][1][-1] == wanted_subject
        )

        per_sample[sample_idx] = same_subject / n
        
    return accuracy_per_sample

def CB_precision_at_k(results, k=5, test_set=None, training_set=None):
    '''
        This function adapts the precision @ k measure in order to provide a measure of the relevance of each recommendation
        by calculating the precision of individual keywords and then averaging it out across all recommendations. In short, 
        this function measures the relevance of each recommendation by computing the number of relevant keywords it contains
        in relation to the query and then averaging out this measure across the k recommendations of that query.
        
        Inputs:
            -   results      :  similarity matrix (rows: training items, columns: test items)
            -      k         :  number of recommendations to consider per test item
            -   test_set     :  test records whose second element is the keyword list
                                (defaults to the notebook-level test_keywords)
            -   training_set :  training records whose second element is the keyword list
                                (defaults to the notebook-level training_keywords)
        
        Outputs:
            -   avg_p_at_k   :  dict mapping each test index to its keyword precision averaged over k
    '''
    # Fall back to the notebook-level keyword sets when none are supplied,
    #  mirroring the defaults of the other CB testing functions
    if test_set is None:
        test_set = test_keywords
    if training_set is None:
        training_set = training_keywords

    avg_p_at_k = {}

    # Loop through results
    for i in range(results.shape[1]):

        # Positions of the k most similar training items, best first
        top_k_recs = np.argsort(results[:,i])[:-k-1:-1]
        
        # Extract keywords in query
        test_words = set(test_set[i][1])

        # Share of each recommendation's keywords that also occur in the query
        word_precisions = []
        for rec in top_k_recs:
            rec_words = training_set[rec][1]
            if rec_words:
                word_precisions.append(len(test_words & set(rec_words))/len(rec_words))
            else:
                # A recommendation without keywords has nothing relevant to
                #  offer; score it 0 rather than dividing by zero
                word_precisions.append(0.0)
        
        # Average out this measure across k recommendations
        avg_p_at_k[i] = sum(word_precisions)/k
   
    return avg_p_at_k



### *Process Test Set* ###

In [42]:
# Process similarity of test keywords to known keywords
results = CB_testing()

### *Calculate Percentage of Correct Subject Attribution* ###
Because the courses in the test and training sets are different, the keywords for one course will never match another in their entirety unless a course has been accidentally miscategorized using a different course code or unless its course code has been modified over time. (There are very few instances in this particular dataset where it appears that the course providers decided to make a slight modification to the course code of a subsequent offering.) However, it seems to me to be nevertheless useful to check the extent to which the keyword search yields courses in the same category because it gives us a measure not only of the extent to which the algorithm works to narrow down recommendations but also the way in which this algorithm includes elements of serendipity.

In [43]:
# Evaluate subject accuracy for n = 1..10 recommendations
for i in range(10):
    n=i+1
    # Returns {n: {test index: accuracy}} for this value of n
    accuracy = CB_subject_accuracy_at_k(results, n=n)

#     print(accuracy)
    # Mean of the per-sample accuracies
    print(f"Average accuracy at {n}: {sum(accuracy[n].values())/len(accuracy[n])}")

Average accuracy at 1: 1.0
Average accuracy at 2: 0.9736842105263158
Average accuracy at 3: 0.9473684210526315
Average accuracy at 4: 0.9210526315789473
Average accuracy at 5: 0.9263157894736843
Average accuracy at 6: 0.9210526315789473
Average accuracy at 7: 0.9172932330827067
Average accuracy at 8: 0.9013157894736842
Average accuracy at 9: 0.9005847953216374
Average accuracy at 10: 0.9052631578947368


The decrease in this accuracy measure tells us, in effect, that the correct categories emerge within the first few recommendations and that opening up the recommender to more options can make it perform less optimally. Another way to phrase this would be to say that as the number of recommendations we ask it to generate increases, the system introduces more serendipity.

### *Precision at K* ###
Because a CB subsystem focused on finding proximity of keywords returns items that are relevant on a scale, I have decided to use precision as a way to measure the relevance of what is returned, but to do so by calculating the precision of the words returned by every recommendation and to average that out over *k* recommendations.

In [64]:
# Report keyword precision for k = 1..10 recommendations
for k in range(10):    
    print("-------------------------------------------------------------------------")
    print(f"Precision at {k+1} for each sample:")
    # Dict mapping each test index to its averaged keyword precision
    p = CB_precision_at_k(results, k+1)
    
    print(p,'\n')
    # Mean over all test samples
    avg_p = sum(p.values())/len(p.values())
    print(f"Average precision at k={k+1}: {avg_p}")
    

-------------------------------------------------------------------------
Precision at 1 for each sample:
{0: 0.8571428571428571, 1: 0.8, 2: 1.0, 3: 0.4, 4: 1.0, 5: 0.75, 6: 0.75, 7: 0.7777777777777778, 8: 1.0, 9: 0.5, 10: 0.4444444444444444, 11: 1.0, 12: 0.4444444444444444, 13: 0.3333333333333333, 14: 0.6666666666666666, 15: 0.5454545454545454, 16: 1.0, 17: 0.5, 18: 1.0} 

Average precision at k=1: 0.7246981089086352
-------------------------------------------------------------------------
Precision at 2 for each sample:
{0: 0.8571428571428571, 1: 0.6857142857142857, 2: 0.8333333333333333, 3: 0.3666666666666667, 4: 0.8333333333333333, 5: 0.5892857142857143, 6: 0.7083333333333333, 7: 0.7638888888888888, 8: 0.6666666666666666, 9: 0.5, 10: 0.4222222222222222, 11: 1.0, 12: 0.4222222222222222, 13: 0.3333333333333333, 14: 0.6666666666666666, 15: 0.5852272727272727, 16: 1.0, 17: 0.35, 18: 1.0} 

Average precision at k=2: 0.6623177261335156
----------------------------------------------------

### Full Result Printout ###

In [46]:
CB_tabulate_results(results, n=5)

-------------------------------------------------------------------------
Test_keywords 0: 
(183, ['religions', 'scriptures', 'hinduism', 'through', 'their', 'world', 'hhdre']) HDS3221.5x HarvardX

Recommendations:
(166, ['religions', 'christianity', 'scriptures', 'through', 'their', 'world', 'hhdre']) HDS3221.2x
(173, ['religions', 'scriptures', 'through', 'their', 'world', 'buddhism', 'hhdre']) HDS3221.3x
(178, ['religions', 'islam', 'scriptures', 'through', 'their', 'world', 'hhdre']) HDS3221.4x
(162, ['religions', 'traditions', 'religious', 'literacy', 'scriptures', 'through', 'their', 'world', 'hhdre']) HDS3221.1x
(55, ['justice', 'hhdre']) ER22x
-------------------------------------------------------------------------
Test_keywords 1: 
(174, ['solids', 'applications', 'nature', 'cellular', 'stem']) 3.054.3x MITx

Recommendations:
(168, ['solids', 'medicine', 'applications', 'cellular', 'stem']) 3.054.2x
(159, ['properties', 'solids', 'applications', 'structures', 'engineering', '

# Recommender Interface #

In [325]:
# User template
# Shape of the object passed to access_recommender: one section per recommender.
# An empty section means the user supplied no information for that recommender.
user_template = {
    
    # CF Component
    #  optional key "userid": id looked up by the CF recommender
    "CF" : {    
#             // userid that is not in the system
            },
    # CB Component
    #  optional keys "subject" (string) and "keywords" (list of strings)
    "CB":{
#             "subject": "",
#             "keywords": []

        }
}

### *Define 3 Sample Users* ###

In [326]:
# No user id and no content preferences: both sections are empty
test_user1 = {
    "CF":{},
    "CB":{}
}

# User id plus a subject, but no keywords
test_user2 = {
    "CF":{
      "userid": "M130597645"
    },
    "CB":{
          "subject":"ghss",
        "keywords": []
    }
}

# User id plus a subject and keywords
test_user3 = {
    "CF":{
          "userid": "M130597666"
    },
    "CB":{
          "subject":"cs",
        "keywords": ["programming", "python"]
    }
}

In [318]:
def access_recommender(user_object):
    '''
        Function that can access both elements of the hybrid recommender and does so 
        based on which information is provided by the user.
        
        Inputs:
        - user_object: information about the user; a dict with an optional "CB" section
                       ("subject" and "keywords") and an optional "CF" section ("userid").
                       A missing or empty section means no information was supplied.
        
        Outputs:
        - printout of recommendations
    '''
    
    # Shallow copy so that replacing the "CB" section below does not alter
    #  the caller's object
    user_object = user_object.copy()
    
    cb_preferences = user_object.get("CB") or {}
    
    # User has stated no preference
    if not cb_preferences:
        # Supply blank subject and keyword categories in order
        #   to elicit recommendations based on stats
        user_object["CB"] = {
            "subject": "",
            "keywords": []
        }
        # Query the recommender once and reuse the result for both printouts
        top_courses = CB_recommender(user_object['CB'])
        print(f"Courses with highest completion rates on our system: {top_courses}")
        for course in top_courses:
            print(CB_train_df.loc[CB_train_df["Course Number"]==course, CB_train_df.columns[1:3]])
  
    # Subject only: the recommender result is indexed like a DataFrame here
    elif cb_preferences["keywords"]==[]:
        courses = CB_recommender(cb_preferences)
        print(f"Courses with highest completeion rates in your chosen subject: {courses[courses.columns[1:3]]}")
    
    # Keywords supplied: the recommender result is used as row positions
    #  into CB_train_df
    else:
        idxs = CB_recommender(cb_preferences)
        print("Courses that might be of interest:")
        print(CB_train_df.iloc[idxs, 1:3])
        
    
    # Cold-start scenario: without CF information no personalized
    #  recommendations are printed
    cf_preferences = user_object.get("CF") or {}
    if cf_preferences:
        print(f"Your personalized course recommendations are as follows: {CF_recommender(cf_preferences['userid'])}")

        

### *Run the Recommender for the 3 Sample Users* ###

In [330]:
access_recommender(test_user1)

Courses with highest completion rates on our system: {'HUM1.7x': None, '1368.3x': None, 'HUM2.3x': None, '1368.4x': None, 'GOV1368.3x': None}
    Course Number                                       Course Title
133       HUM1.7x  History of the Book: Monasteries, Schools, and...
    Course Number                                       Course Title
104       1368.3x  Saving Schools: History, Politics, and Policy ...
   Course Number                                       Course Title
97       HUM2.3x  The Ancient Greek Hero in 24 Hours (Hours 12-1...
    Course Number                                       Course Title
108       1368.4x  Saving Schools: History, Politics, and Policy ...
    Course Number                                       Course Title
142    GOV1368.3x  Saving Schools: History, Politics, and Policy ...


In [331]:
access_recommender(test_user2)

Courses with highest completeion rates in your chosen subject:     Course Number                                       Course Title
153        PH557x   Lessons from Ebola: Preventing the Next Pandemic
93         PH555x  Improving Global Health: Focusing on Quality a...
76         PH201x                                 Health and Society
61          SW25x  Global Health: Case Studies from a Biosocial P...
86       MAS.S69x                        Big Data and Social Physics
Your personalized course recommendations are as follows: {'ER22x': (44.0, 1, 0.22797927461139897), '6.00x': (37.0, 2, 0.19170984455958548), '14.73x': (35.0, 3, 0.18134715025906736), 'CB22x': (14.0, 4, 0.07253886010362694), '3.091x': (14.0, 5, 0.07253886010362694)}


In [332]:
access_recommender(test_user3)

tokesn 1: ['programming', 'python']
tokens: ['programming', 'python', 'cs']
Courses that might be of interest:
    Course Number                                      Course Title
0         6.00.1x  Introduction to Computer Science and Programming
4           CS50x                  Introduction to Computer Science
14       21W.789x                       Building Mobile Experiences
150      6.004.2x     Computation Structures: Computer Architecture
49       6.004.1x          Computation Structures: Digital Circuits
Your personalized course recommendations are as follows: {'PH207x': (41.0, 1, 0.23563218390804597), '6.002x': (38.0, 2, 0.21839080459770116), '14.73x': (23.0, 3, 0.13218390804597702), 'ER22x': (18.0, 4, 0.10344827586206896), 'CS50x': (9.0, 5, 0.05172413793103448)}
