In [1]:
# TODO: create a user object that holds all available user information
# TODO: define a function that separates out the user information and provides recommendations based on what is available
# TODO: perhaps calibrate n for CB depending on what CF returns
# categories...?

In [2]:
# Execute CB.ipynb in this kernel so that the names it defines are available here.
# The testing section below relies on tfidf, vectorizer, test_keywords, CB_train_df and
# CB_test_df (and presumably training_keywords and np -- confirm) coming from that notebook.
%run CB.ipynb

In [47]:
# Execute CF.ipynb in this kernel.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, i.e. the run was
# interrupted, so whatever state CF.ipynb is meant to provide may be incomplete -- confirm.
%run CF.ipynb

Percentage of entries that lack gender information:  5.57 %
951 228


KeyboardInterrupt: 

KeyboardInterrupt: 

# Content-Based Recommender Testing #

<u>Preceding Steps (CB.ipynb)</u>:
* CB dataframe has been split into CB_train_df and CB_test_df
* TF-IDF vectorizer has been initiated
* vectorizer was fit to the keywords in CB_train_df and TF-IDF matrix was created
* extracted keywords in the test set

### *Define Testing Function* ###

In [44]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

def CB_testing (encoded_course_matrix=tfidf, test_data=test_keywords, vectorizer=vectorizer):
    '''
        Test the CB recommender on a set of test keywords. The function transforms the
        test keywords into TF-IDF vectors and then calculates the cosine similarity between
        each training vector and each test vector, storing the results in a matrix.

        Inputs:
            - encoded_course_matrix : matrix created by fitting and transforming the "training set"
                               (supplied input is the matrix based on the transformed CB_train_df in CB.ipynb)
            - test_data    : a sequence of records whose second element (record[1]) holds the
                               keywords of one test item
                               (supplied input is test_keywords, which have been extracted from CB_test_df)
            - vectorizer   : fitted vectorizer with which to transform the test data
                               (supplied vectorizer is the TfidfVectorizer from CB.ipynb)

        Outputs:
            - results      : matrix of shape (number of training vectors, number of test records);
                             results[i][j] is the cosine similarity between training vector i
                             and test vector j
    '''

    # Dimensions of the results matrix follow from the training matrix and
    # from the test data that was actually passed in.
    n_train = encoded_course_matrix.shape[0]

    # Only the second element of each record (the keywords themselves) is encoded.
    # The test_data argument is used here, so a caller-supplied test set is honoured
    # instead of always falling back to the notebook-level test_keywords.
    keyword_docs = [record[1] for record in test_data]
    n_test = len(keyword_docs)

    # Nothing to compare: return an empty matrix of the right shape rather than
    # asking the vectorizer / similarity routine to process zero samples.
    if n_train == 0 or n_test == 0:
        return np.empty((n_train, n_test))

    # Transform all test records in a single call; each row is one test vector.
    test_matrix = vectorizer.transform(keyword_docs)

    # One pairwise call computes every (training, test) similarity at once,
    # replacing n_train * n_test separate cosine_similarity calls.
    results = cosine_similarity(encoded_course_matrix, test_matrix)

    return results

# Alternate testing function
# def test_cosine_similarity (test_set=test_keywords, encoded_course_matrix=tfidf):
   
#     # Create matrix to store results
#     results = np.empty((tfidf.shape[0], len(test_set)))
    
#     # Encode the test set
#     test_vecs = [encode_query(keywords[1]) for keywords in  test_set] # encode the second element of the tuple which contains the keywords themselves
    
#     # Populate results matrix in accordance with pre-set structure
#     for i, course in enumerate(encoded_course_matrix):
#         for j,testvec in enumerate(test_vecs):
#             results[i][j] = cosine_similarity(course,testvec)[0][0]
    
#     return results

def CB_tabulate_results(results, test_set = test_keywords, training_set = training_keywords, encoded_course_matrix=tfidf, n=10):
    '''
        Print, for every test item, the item itself followed by the n training items
        with the highest similarity scores, in a quasi-tabular layout for inspection.

        Inputs:
            - results      : 2-D array of similarity scores; column j holds the scores of
                             test item j against every training item
            - test_set     : records of the test items; record[0] is used as a label into
                             CB_test_df to look up "Course Number" and "Institution"
            - training_set : records of the training items, indexed by row position in results
            - encoded_course_matrix : not used by this function
            - n            : number of recommendations to print per test item

        Outputs:
            - None (everything is printed)
    '''
    for col in range(results.shape[1]):

        # Row positions of the n highest scores in this column, best first
        best_rows = np.argsort(results[:, col])[::-1][:n]

        # Header for this test item
        print("-------------------------------------------------------------------------")
        print(f"Test_keywords {col}: ")
        test_record = test_set[col]
        print(
            test_record,
            CB_test_df.loc[test_record[0], "Course Number"],
            CB_test_df.loc[test_record[0], "Institution"],
        )

        # One line per recommended training item: its record and the value in
        # the second column of CB_train_df at the same row position
        print("\nRecommendations:")
        for row in best_rows:
            print(training_set[row], CB_train_df.iloc[row, 1])
                                                                                           

def CB_subject_accuracy_at_k(results, test_set=test_keywords, training_set=training_keywords, n=5):
    '''
        Calculate, for every test item, the fraction of its top-n recommendations whose
        last keyword equals the last keyword of the test item (the last keyword is
        treated as the subject).

        Inputs:
            - results      : 2-D array of similarity scores; column j holds the scores of
                             test item j against every training item
            - test_set     : records of the test items; record[1] is the keyword list
            - training_set : records of the training items; record[1] is the keyword list
            - n            : number of top recommendations considered per test item

        Outputs:
            - dictionary of the form {n: {test index: matches / n}}
    '''
    per_sample = {}

    for sample in range(results.shape[1]):
        # Row positions of the n highest scores for this test item, best first
        best_rows = np.argsort(results[:, sample])[::-1][:n]

        # Subject of the test item = last entry of its keyword list
        wanted_subject = test_set[sample][1][-1]

        # Count the recommendations that carry the same subject
        matches = sum(
            1 for row in best_rows
            if training_set[row][1][-1] == wanted_subject
        )

        per_sample[sample] = matches / n

    return {n: per_sample}

def CB_precision_at_k(results, k=5, test_set=None, training_set=None, verbose=True):
    '''
        Calculate a keyword-overlap precision for the top-k recommendations of every test item.

        For one recommendation, precision is the number of distinct keywords it shares with
        the test item divided by the length of the recommendation's keyword list. The value
        stored per test item is the sum of these precisions over its top-k recommendations
        divided by k.

        Inputs:
            - results      : 2-D array of similarity scores; column j holds the scores of
                             test item j against every training item
            - k            : number of top recommendations to evaluate (must be >= 1)
            - test_set     : records of the test items; record[1] is the keyword list
                             (defaults to the notebook-level test_keywords)
            - training_set : records of the training items; record[1] is the keyword list
                             (defaults to the notebook-level training_keywords)
            - verbose      : when True, print the intermediate values for inspection

        Outputs:
            - dictionary {test index: average precision over the top-k recommendations}

        Raises:
            - ValueError if k is smaller than 1
    '''
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    # Fall back to the notebook-level keyword lists when none are supplied
    if test_set is None:
        test_set = test_keywords
    if training_set is None:
        training_set = training_keywords

    avg_p_at_k = {}

    for i in range(results.shape[1]):

        # Row positions of the k highest scores for this test item, best first.
        # The slice depends on k so that it agrees with the divisor used below.
        top_k_recs = np.argsort(results[:, i])[::-1][:k]
        if verbose:
            print(top_k_recs)

        test_words = test_set[i][1]
        test_word_set = set(test_words)
        if verbose:
            print(f"Test subject: {test_words}, {len(test_words)}")

        precisions = []
        for rec_idx in top_k_recs:
            rec = training_set[rec_idx][1]
            shared = test_word_set & set(rec)
            if verbose:
                print(f"Set: {shared}")
                print(rec)
            # A recommendation without keywords cannot match anything: score it 0
            # instead of dividing by zero.
            precisions.append(len(shared) / len(rec) if len(rec) else 0.0)

        # Divide by k (not by the number of rows found) so that a training set with
        # fewer than k items is scored as if the missing recommendations were misses.
        avg_p_at_k[i] = sum(precisions) / k

    return avg_p_at_k
#             print("-------------------------------------------------------------------------------------------------------")
#             # Calculate precision
#             count = [1 if test_subj==rec_subj[i] else 0 for i in range(len(top_k_recs))]

#             stats_at_k[i] = sum(count)



def CB_recall_at_k(results, k=5, test_set=None, training_set=None, verbose=True):
    '''
        Calculate a keyword-overlap recall for the top-k recommendations of every test item.

        For one recommendation, recall is the number of distinct keywords it shares with
        the test item divided by the length of the test item's keyword list. The value
        stored per test item is the sum of these recalls over its top-k recommendations
        divided by k.

        Inputs:
            - results      : 2-D array of similarity scores; column j holds the scores of
                             test item j against every training item
            - k            : number of top recommendations to evaluate (must be >= 1)
            - test_set     : records of the test items; record[1] is the keyword list
                             (defaults to the notebook-level test_keywords)
            - training_set : records of the training items; record[1] is the keyword list
                             (defaults to the notebook-level training_keywords)
            - verbose      : when True, print the intermediate values for inspection

        Outputs:
            - dictionary {test index: average recall over the top-k recommendations}

        Raises:
            - ValueError if k is smaller than 1
    '''
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    # Fall back to the notebook-level keyword lists when none are supplied
    if test_set is None:
        test_set = test_keywords
    if training_set is None:
        training_set = training_keywords

    avg_r_at_k = {}

    for i in range(results.shape[1]):

        # Row positions of the k highest scores for this test item, best first.
        # The caller's k is honoured here and in the divisor below.
        top_k_recs = np.argsort(results[:, i])[::-1][:k]

        test_words = test_set[i][1]
        test_word_set = set(test_words)
        n_test_words = len(test_words)
        if verbose:
            print(f"Test subject: {test_words}, {n_test_words}")

        recalls = []
        for rec_idx in top_k_recs:
            n_shared = len(test_word_set & set(training_set[rec_idx][1]))
            if verbose:
                print(n_shared)
            # A test item without keywords has nothing to recall: score it 0
            # instead of dividing by zero.
            recalls.append(n_shared / n_test_words if n_test_words else 0.0)

        # Divide by k (not by the number of rows found) so that a training set with
        # fewer than k items is scored as if the missing recommendations were misses.
        avg_r_at_k[i] = sum(recalls) / k

    return avg_r_at_k
            
        
#         print("-------------------------------------------------------------------------------------------------------")
#         # Calculate accuracy
# #         count = [1 if test_subj==rec_subj[i] else 0 for i in range(len(top_k_recs))]
        
# #         stats_at_k[i] = sum(count)
    



In [13]:
# Build the similarity matrix (training vectors x test records) with CB_testing's default arguments
results = CB_testing()

In [43]:
# Per-test-item keyword-overlap precision over the top-5 recommendations (default k=5)
CB_precision_at_k(results)

[ 97 142  82 139  10]
Test subject: ['hinduism', 'through', 'world', 'their', 'religions', 'scriptures', 'hhdre'], 7
Set: {'through', 'world', 'their', 'hhdre', 'religions', 'scriptures'}
['through', 'world', 'their', 'christianity', 'religions', 'scriptures', 'hhdre']
Set: {'through', 'world', 'their', 'hhdre', 'religions', 'scriptures'}
['through', 'world', 'their', 'religions', 'buddhism', 'scriptures', 'hhdre']
Set: {'through', 'world', 'their', 'hhdre', 'religions', 'scriptures'}
['through', 'world', 'their', 'religions', 'islam', 'scriptures', 'hhdre']
Set: {'through', 'world', 'their', 'hhdre', 'religions', 'scriptures'}
['literacy', 'through', 'world', 'traditions', 'their', 'religious', 'religions', 'scriptures', 'hhdre']
Set: {'hhdre'}
['justice', 'hhdre']
[ 86  17  42 101 166]
Test subject: ['cellular', 'applications', 'nature', 'solids', 'stem'], 5
Set: {'cellular', 'applications', 'stem', 'solids'}
['cellular', 'applications', 'solids', 'medicine', 'stem']
Set: {'cellular'

{0: 0.7476190476190475,
 1: 0.46952380952380957,
 2: 0.6533333333333332,
 3: 0.44666666666666666,
 4: 0.5071428571428571,
 5: 0.5023809523809525,
 6: 0.5033333333333332,
 7: 0.8388888888888889,
 8: 0.45,
 9: 0.3833333333333333,
 10: 0.4688888888888889,
 11: 0.9199999999999999,
 12: 0.569047619047619,
 13: 0.3333333333333333,
 14: 0.4666666666666666,
 15: 0.6874242424242424,
 16: 0.6599999999999999,
 17: 0.37333333333333335,
 18: 0.73}

In [46]:
# Per-test-item keyword-overlap recall over the top-5 recommendations (default k=5)
CB_recall_at_k(results)

Test subject: ['hinduism', 'through', 'world', 'their', 'religions', 'scriptures', 'hhdre'], 7
6
6
6
6
1
Test subject: ['cellular', 'applications', 'nature', 'solids', 'stem'], 5
4
4
2
1
1
Test subject: ['cellular', 'transformations', 'mechanical', 'behavior', 'materials', 'solids', 'columns', 'beams', 'stress', 'stem'], 10
4
4
3
4
3
Test subject: ['economy', 'empire', 'global', 'hhdre'], 4
2
2
2
1
1
Test subject: ['business', 'transforming', 'society', 'self', 'ghss'], 5
5
2
2
1
1
Test subject: ['symphonie', 'nights', 'fantastique', 'first', 'hhdre'], 5
3
3
1
1
1
Test subject: ['roadmap', 'practical', 'care', 'results', 'science', 'improvement', 'getting', 'health', 'ghss'], 9
3
2
2
2
2
Test subject: ['6-11', 'signs', 'epic', '24', 'greek', 'hours', 'ancient', 'iconography', 'hero', 'hhdre'], 10
7
6
4
4
6
Test subject: ['entrepreneur', 'becoming', 'ghss'], 3
3
1
1
1
1
Test subject: ['revolution', 'einstein', 'hhdre'], 3
1
1
1
1
1
Test subject: ['customer', 'who', 'entrepreneurship', '

{0: 0.7142857142857142,
 1: 0.4800000000000001,
 2: 0.36,
 3: 0.4,
 4: 0.43999999999999995,
 5: 0.36,
 6: 0.24444444444444446,
 7: 0.5399999999999999,
 8: 0.4666666666666666,
 9: 0.3333333333333333,
 10: 0.45714285714285713,
 11: 0.8,
 12: 0.37142857142857144,
 13: 0.3333333333333333,
 14: 0.2333333333333333,
 15: 0.3818181818181819,
 16: 0.6399999999999999,
 17: 0.4,
 18: 0.5}

In [35]:
# Sweep n from 1 to 10: print the per-sample subject accuracy and its mean for each n
for n in range(1, 11):
    accuracy = CB_subject_accuracy_at_k(results, n=n)
    print(accuracy)
    print(f"Average accuracy at {n}: {sum(accuracy[n].values())/len(accuracy[n])}")

{1: {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0, 8: 1.0, 9: 1.0, 10: 1.0, 11: 1.0, 12: 1.0, 13: 1.0, 14: 1.0, 15: 1.0, 16: 1.0, 17: 1.0, 18: 1.0}}
Average accuracy at 1: 1.0
{2: {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0, 8: 1.0, 9: 1.0, 10: 1.0, 11: 1.0, 12: 1.0, 13: 1.0, 14: 1.0, 15: 1.0, 16: 1.0, 17: 0.5, 18: 1.0}}
Average accuracy at 2: 0.9736842105263158
{3: {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 0.6666666666666666, 7: 1.0, 8: 1.0, 9: 1.0, 10: 1.0, 11: 1.0, 12: 1.0, 13: 1.0, 14: 1.0, 15: 1.0, 16: 1.0, 17: 0.3333333333333333, 18: 1.0}}
Average accuracy at 3: 0.9473684210526315
{4: {0: 1.0, 1: 0.75, 2: 1.0, 3: 0.75, 4: 1.0, 5: 1.0, 6: 0.75, 7: 1.0, 8: 1.0, 9: 1.0, 10: 1.0, 11: 1.0, 12: 1.0, 13: 1.0, 14: 1.0, 15: 1.0, 16: 1.0, 17: 0.5, 18: 1.0}}
Average accuracy at 4: 0.9342105263157895
{5: {0: 1.0, 1: 0.8, 2: 1.0, 3: 0.6, 4: 1.0, 5: 1.0, 6: 0.8, 7: 1.0, 8: 1.0, 9: 1.0, 10: 0.8, 11: 1.0, 12: 1.0, 13: 1.0, 14: 1.0, 15: 1.0, 16: 0.8

In [36]:
# Print every test item followed by its 5 highest-scoring training items
CB_tabulate_results(results, n=5)

-------------------------------------------------------------------------
Test_keywords 0: 
(183, ['hinduism', 'through', 'world', 'their', 'religions', 'scriptures', 'hhdre']) HDS3221.5x HarvardX

Recommendations:
(166, ['through', 'world', 'their', 'christianity', 'religions', 'scriptures', 'hhdre']) HDS3221.2x
(173, ['through', 'world', 'their', 'religions', 'buddhism', 'scriptures', 'hhdre']) HDS3221.3x
(178, ['through', 'world', 'their', 'religions', 'islam', 'scriptures', 'hhdre']) HDS3221.4x
(162, ['literacy', 'through', 'world', 'traditions', 'their', 'religious', 'religions', 'scriptures', 'hhdre']) HDS3221.1x
(31, ['justice', 'hhdre']) ER22x
-------------------------------------------------------------------------
Test_keywords 1: 
(174, ['cellular', 'applications', 'nature', 'solids', 'stem']) 3.054.3x MITx

Recommendations:
(168, ['cellular', 'applications', 'solids', 'medicine', 'stem']) 3.054.2x
(159, ['cellular', 'engineering', 'properties', 'applications', 'structures',