In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Import Data #

In [2]:
# Load data and output first five rows
df = pd.read_csv("../Datasets/courses.csv")
df.head()

Unnamed: 0,Institution,Course Number,Launch Date,Course Title,Instructors,Course Subject,Year,Honor Code Certificates,Participants (Course Content Accessed),Audited (> 50% Course Content Accessed),...,% Certified of > 50% Course Content Accessed,% Played Video,% Posted in Forum,% Grade Higher Than Zero,Total Course Hours (Thousands),Median Hours for Certification,Median Age,% Male,% Female,% Bachelor's Degree or Higher
0,MITx,6.002x,09/05/2012,Circuits and Electronics,Khurram Afridi,"Science, Technology, Engineering, and Mathematics",1,1,36105,5431,...,54.98,83.2,8.17,28.97,418.94,64.45,26.0,88.28,11.72,60.68
1,MITx,6.00x,09/26/2012,Introduction to Computer Science and Programming,"Eric Grimson, John Guttag, Chris Terman",Computer Science,1,1,62709,8949,...,64.05,89.14,14.38,39.5,884.04,78.53,28.0,83.5,16.5,63.04
2,MITx,3.091x,10/09/2012,Introduction to Solid State Chemistry,Michael Cima,"Science, Technology, Engineering, and Mathematics",1,1,16663,2855,...,72.85,87.49,14.42,34.89,227.55,61.28,27.0,70.32,29.68,58.76
3,HarvardX,CS50x,10/15/2012,Introduction to Computer Science,"David Malan, Nate Hardison, Rob Bowden, Tommy ...",Computer Science,1,1,129400,12888,...,11.11,0.0,0.0,1.11,220.9,0.0,28.0,80.02,19.98,58.78
4,HarvardX,PH207x,10/15/2012,Health in Numbers: Quantitative Methods in Cli...,"Earl Francis Cook, Marcello Pagano","Government, Health, and Social Science",1,1,52521,10729,...,47.12,77.45,15.98,32.52,804.41,76.1,32.0,56.78,43.22,88.33


# Data Cleaning #

In [3]:
# Preserve data integrity by creating a copy before modifying it
#  and keep track of the shape of the dataframe
df_original = df.copy(deep = True)
df.shape

(290, 23)

In [4]:
df.drop(["Honor Code Certificates","% Certified of > 50% Course Content Accessed", "Total Course Hours (Thousands)"], axis = 1, inplace=True)

In [5]:
df.shape

(290, 20)

### *Remove non-float value on account of error thrown* ###
The error itself is not visible in the current file but became evident as part of my preliminary analysis. This instance is the only anomaly in the entire dataset.

In [6]:
# Find non-float value based on error thrown ('---')
df[df["% Played Video"] == '---']

Unnamed: 0,Institution,Course Number,Launch Date,Course Title,Instructors,Course Subject,Year,Participants (Course Content Accessed),Audited (> 50% Course Content Accessed),Certified,% Audited,% Certified,% Played Video,% Posted in Forum,% Grade Higher Than Zero,Median Hours for Certification,Median Age,% Male,% Female,% Bachelor's Degree or Higher
9,HarvardX,ER22x,03/02/2013,Justice,Michael Sandel,"Humanities, History, Design, Religion, and Edu...",1,58779,9425,5438,16.05,9.26,---,21.86,20.98,13.67,30.0,60.42,39.58,69.78


In [7]:
# Remove non-float value
df.at[9, "% Played Video"] = 0.

In [8]:
# Check column types
# df.astype({"% Played Video":"float64"}).dtypes

In [9]:
# Recast "% Played Video" column as float to ensure consistency in processing
df["% Played Video"] = [float(val) for val in df["% Played Video"]]

### *Check for duplicate course entries and for null instructor values values* ###
**N.B.** In this section I am primarily trying to find the number of unique courses in the dataset based on course codes.

In [10]:
# Continue tracking shape of the dataframe
df.shape

(290, 20)

In [11]:
# Check the number of unique courses
len(set(df["Course Number"]))

188

In [12]:
df[df["Instructors"].isnull()]

Unnamed: 0,Institution,Course Number,Launch Date,Course Title,Instructors,Course Subject,Year,Participants (Course Content Accessed),Audited (> 50% Course Content Accessed),Certified,% Audited,% Certified,% Played Video,% Posted in Forum,% Grade Higher Than Zero,Median Hours for Certification,Median Age,% Male,% Female,% Bachelor's Degree or Higher
221,HarvardX,CS50x,01/01/2016,Introduction to Computer Science (2016),,Computer Science,4,168335,10245,417,6.09,0.25,69.02,0.0,0.0,0.0,23.0,77.44,22.56,49.51


### *Handle NaN value in Instructors' list* ###

In [13]:
df.at[221, "Instructors"] = ""

# Feature Engineering #

### *Combine records that have the same course code.* ###

Combined records contain:

* A consolidated list of instructors
* A combined list of participants and combined certified number
* An averaging of percentage-based statistics (such as percentage of interactions with course materials or percetage of completions, as well as demographic-based percentages)

Course launch dates are taken from the first record and the year count from the last one.


In [14]:
# Compute the number of instances of each course
course_occurences = df["Course Number"].value_counts()
course_occurences

6.00.1x       9
3.091x        7
6.002x        5
6.00.2x       5
14.73x        4
             ..
8.421.4x      1
HUM1.4x       1
GOV1368.3x    1
HDS3221.2x    1
AMPOx.3       1
Name: Course Number, Length: 188, dtype: int64

In [15]:
# Define function to combine duplicate records

# test_course = "6.00.1x"

def combine_course_instances(course, df=df):
    '''
        Combines multiple instances of the same course preserving original launch dates, combini
        the list of all instructors, and either summing or averaging course statistics, as appropriate.
        
        Inputs:
            - course: course code attached to multiple entries
            - df: dataframe in which the course exists
    
    '''
    instances = df[df["Course Number"]==course]
    #print( instances["Instructors"])
#     print(instances)
    
    # Keep first record of launch date and course title
    institution = list(instances["Institution"])[0]
    course_title = list(instances["Course Title"])[0]
    launch_date = list(instances["Launch Date"])[0]
    
    row = instances.groupby("Course Number").agg({
                                            "Institution": lambda x: institution,
                                            "Course Number": lambda x: course,
                                            "Launch Date": lambda x: launch_date,
                                            "Course Title": lambda x: course_title,
                                            "Instructors": lambda x: get_instructors(instances["Instructors"]),
                                            "Course Subject": lambda x :list(instances["Course Subject"])[0],
                                            "Year":"max",
                                            "Participants (Course Content Accessed)": "sum",
                                            "Certified": "sum",
                                            "Audited (> 50% Course Content Accessed)": "mean",
                                            "Certified": "sum",
                                            "% Audited": "mean",
                                            "% Certified": "mean",
                                            "% Played Video": "mean",
                                            "% Posted in Forum": "mean",
                                            "% Grade Higher Than Zero": "mean",
                                            "Median Hours for Certification": "mean",
                                            "Median Age": "mean",
                                            "% Male": "mean",
                                            "% Female": "mean",
                                            "% Bachelor's Degree or Higher": "mean"
    })
#     print(row)
    return row

def get_instructors(combined_instructors):
    
    '''
        Converts a list of all instructor records into a list of individually separated,
        unique instructor names.
        
        Input:
            - combined_instructors: list containing combined instructor names for one or more
                                    instances of a course
        
        Output:
            - instructors_list: list of unique instructor names across all instances of a course
    '''
    
    # Define empty list to store individual instructors
    instructors_list = []
    
    # Build complete list of instructors
    for cluster in combined_instructors:
        # Store instructors temporarily
        temp_list = cluster.split(',')
        
        # Remove possible spaces at the beginning and
        #    add to list if name not already included
        for instructor in temp_list:
            inst = re.sub(r"(^\s?)", "", instructor)    # remove leading space
            # Append value only if it does not already exist
            if inst not in instructors_list:
                instructors_list.append(inst)
                
#     print(full_list)
    return instructors_list

### *Designate Duplicate Courses and Consolidate Records* ###
This process involves: 
1) splitting unique and duplicate courses in the dataframe 
2) consolidating duplicated records
3) adding the consolidated records to a finalized version of the dataframe

In [16]:
# Create a list of duplicate courses
duplicate_courses = [course for course in course_occurences.keys() if course_occurences[course] > 1]

# Check the size of the list in comparison with previous tally
len(duplicate_courses)

66

In [17]:
# Create an empty dataframe with the appropriate columns that will be used to recombine records
final_df = pd.DataFrame(columns=df.columns)
final_df

Unnamed: 0,Institution,Course Number,Launch Date,Course Title,Instructors,Course Subject,Year,Participants (Course Content Accessed),Audited (> 50% Course Content Accessed),Certified,% Audited,% Certified,% Played Video,% Posted in Forum,% Grade Higher Than Zero,Median Hours for Certification,Median Age,% Male,% Female,% Bachelor's Degree or Higher


In [18]:
# Traverse list of duplicate courses, combine records, and add row to the dataframe
for course in duplicate_courses:
    new_row = combine_course_instances(course)                      # combine records
    final_df = pd.concat([final_df, new_row], ignore_index=True)    # add combined record to the dataframe

In [19]:
# Verify shape of dataframe after records have been added
final_df.shape

(66, 20)

In [20]:
# Print out sample rows to ensure that columns have been populated appropriately
final_df

Unnamed: 0,Institution,Course Number,Launch Date,Course Title,Instructors,Course Subject,Year,Participants (Course Content Accessed),Audited (> 50% Course Content Accessed),Certified,% Audited,% Certified,% Played Video,% Posted in Forum,% Grade Higher Than Zero,Median Hours for Certification,Median Age,% Male,% Female,% Bachelor's Degree or Higher
0,MITx,6.00.1x,10/16/2013,Introduction to Computer Science and Programming,"[Eric Grimson, John Guttag, Ana Bell]",Computer Science,4,364099,6584.666667,29412,17.018889,8.252222,72.924444,8.640000,44.654444,60.734444,26.333333,81.753333,18.246667,63.575556
1,MITx,3.091x,10/09/2012,Introduction to Solid State Chemistry,[Michael Cima],"Science, Technology, Engineering, and Mathematics",4,56421,847.000000,3608,8.805714,4.840000,73.691429,8.097143,22.082857,95.955714,26.142857,74.007143,25.992857,58.424286
2,MITx,6.002x,09/05/2012,Circuits and Electronics,"[Khurram Afridi, Tania Khanna, Anant Agarwal, ...","Science, Technology, Engineering, and Mathematics",3,116335,2131.000000,5169,8.432000,4.010000,71.438000,4.884000,16.932000,82.980000,25.200000,88.522000,11.478000,57.666000
3,MITx,6.00.2x,03/05/2014,Introduction to Computational Thinking and Dat...,"[Eric Grimson, John Guttag, Ana Bell]",Computer Science,4,66102,2372.400000,6446,18.022000,9.842000,69.172000,6.984000,30.702000,56.604000,28.000000,86.250000,13.750000,74.190000
4,MITx,14.73x,02/12/2013,The Challenges of Global Poverty,"[Esther Duflo, Abhijit Banerjee, Duflo, Banerj...","Government, Health, and Social Science",4,52060,2434.250000,6670,15.580000,10.285000,71.565000,9.397500,31.895000,57.370000,28.500000,52.722500,47.277500,79.430000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,MITx,JPAL101x,10/05/2015,Evaluating Social Programs,"[Rachel Glennerster, Marc Shotland]","Government, Health, and Social Science",4,8317,752.500000,356,18.110000,4.280000,57.575000,6.305000,24.470000,42.920000,30.000000,50.130000,49.870000,89.620000
62,MITx,CTL.SC1x,05/27/2015,Supply Chain and Logistics Fundamentals,[Chris Caplice],"Government, Health, and Social Science",4,38682,4162.500000,3863,21.665000,10.090000,75.015000,18.235000,27.060000,144.525000,29.000000,76.590000,23.410000,82.675000
63,HarvardX,USW30x,06/02/2014,Tangible Things,[Laurel Ulrich],"Humanities, History, Design, Religion, and Edu...",3,15531,1723.500000,1443,21.245000,8.765000,70.635000,25.675000,29.380000,9.190000,33.000000,35.845000,64.155000,75.100000
64,MITx,CTL.SC2x,09/30/2015,Supply Chain Design,"[Chris Caplice, Yossi Sheffi, James Blayney Ri...","Government, Health, and Social Science",4,27134,3325.000000,3358,24.360000,12.355000,75.790000,15.015000,25.380000,129.300000,29.500000,77.700000,22.300000,85.415000


### *Designate Unique Courses and Add to Finalized Dataframe* ###

In [21]:
# Designate unique courses
unique_courses = [course for course in course_occurences.keys() if course_occurences[course]==1]

# Check that list accords with previous count
len(unique_courses)

122

In [22]:
# Create a filtered dataframe containing only unique course instances
unique_courses_df = df.loc[df["Course Number"].isin(unique_courses)]

# Verify that length of the dataframe accords with previous count
len(unique_courses_df)

122

In [23]:
# Verify that the dataframe contains only unique course numbers
len(set(unique_courses_df["Course Number"]))

122

In [24]:
# Sample printout of unique courses
unique_courses_df

Unnamed: 0,Institution,Course Number,Launch Date,Course Title,Instructors,Course Subject,Year,Participants (Course Content Accessed),Audited (> 50% Course Content Accessed),Certified,% Audited,% Certified,% Played Video,% Posted in Forum,% Grade Higher Than Zero,Median Hours for Certification,Median Age,% Male,% Female,% Bachelor's Degree or Higher
4,HarvardX,PH207x,10/15/2012,Health in Numbers: Quantitative Methods in Cli...,"Earl Francis Cook, Marcello Pagano","Government, Health, and Social Science",1,52521,10729,5058,20.44,9.64,77.45,15.98,32.52,76.10,32.0,56.78,43.22,88.33
8,MITx,8.02x,02/18/2013,Electricity and Magnetism,"Walter Lewin, John Belcher, Peter Dourmashkin,...","Science, Technology, Engineering, and Mathematics",1,39178,3543,1722,9.04,4.40,85.30,5.86,16.04,107.88,26.0,85.42,14.58,56.97
12,HarvardX,CB22x,03/13/2013,The Ancient Greek Hero,"Greg Nagy, L.Muellner","Humanities, History, Design, Religion, and Edu...",1,25873,1803,1395,6.99,5.41,77.05,13.76,18.05,46.26,32.0,53.31,46.69,71.95
14,HarvardX,PH278x,05/15/2013,Human Health and Global Environmental Change,"Aaron Bernstein, Jack Spengler","Government, Health, and Social Science",1,23179,3727,2743,16.10,11.85,85.35,14.48,29.01,21.69,30.0,51.15,48.85,75.19
17,HarvardX,CB22.1x,09/03/2013,The Ancient Greek Hero,Greg Nagy,"Humanities, History, Design, Religion, and Edu...",2,17604,1260,728,7.18,4.15,68.77,6.59,14.96,22.10,31.0,52.11,47.89,68.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,HarvardX,HDS3221.5x,07/05/2016,World Religions Through their Scriptures: Hind...,Neelima Shukla-Bhatt,"Humanities, History, Design, Religion, and Edu...",4,4553,1051,70,23.08,1.54,73.34,16.65,1.54,26.74,31.0,56.97,43.03,74.39
285,HarvardX,MUS24.4x,07/21/2016,First Nights: Symphonie Fantastique,Tom Kelly,"Humanities, History, Design, Religion, and Edu...",4,615,305,20,49.59,3.25,80.81,8.78,3.25,5.93,38.0,56.82,43.18,74.66
286,HarvardX,GSE4x,07/25/2016,Introduction to Family Engagement in Education,Karen Mapp,"Humanities, History, Design, Religion, and Edu...",4,2871,267,20,9.30,0.70,70.11,0.00,0.70,11.33,34.0,25.24,74.76,82.31
288,MITx,6.302.1x,08/01/2016,Introduction to State Space Control,"Jacob White, Joe Steinmeyer","Science, Technology, Engineering, and Mathematics",4,1431,208,8,14.54,0.56,0.00,3.84,5.73,62.38,25.0,93.44,6.56,72.31


In [25]:
# unique_courses_df.reset_index()
# unique_courses_df

In [26]:
# unique_courses_df.index

In [27]:
# Reformat "Instructors" column by looping through all of 
#  unique records and re-assigning reformatted instructors list
for i in range(len(unique_courses_df)):
    unique_courses_df.at[unique_courses_df.index[i], "Instructors"] = get_instructors([unique_courses_df.iloc[i]["Instructors"]])

In [28]:
unique_courses_df

Unnamed: 0,Institution,Course Number,Launch Date,Course Title,Instructors,Course Subject,Year,Participants (Course Content Accessed),Audited (> 50% Course Content Accessed),Certified,% Audited,% Certified,% Played Video,% Posted in Forum,% Grade Higher Than Zero,Median Hours for Certification,Median Age,% Male,% Female,% Bachelor's Degree or Higher
4,HarvardX,PH207x,10/15/2012,Health in Numbers: Quantitative Methods in Cli...,"[Earl Francis Cook, Marcello Pagano]","Government, Health, and Social Science",1,52521,10729,5058,20.44,9.64,77.45,15.98,32.52,76.10,32.0,56.78,43.22,88.33
8,MITx,8.02x,02/18/2013,Electricity and Magnetism,"[Walter Lewin, John Belcher, Peter Dourmashkin...","Science, Technology, Engineering, and Mathematics",1,39178,3543,1722,9.04,4.40,85.30,5.86,16.04,107.88,26.0,85.42,14.58,56.97
12,HarvardX,CB22x,03/13/2013,The Ancient Greek Hero,"[Greg Nagy, L.Muellner]","Humanities, History, Design, Religion, and Edu...",1,25873,1803,1395,6.99,5.41,77.05,13.76,18.05,46.26,32.0,53.31,46.69,71.95
14,HarvardX,PH278x,05/15/2013,Human Health and Global Environmental Change,"[Aaron Bernstein, Jack Spengler]","Government, Health, and Social Science",1,23179,3727,2743,16.10,11.85,85.35,14.48,29.01,21.69,30.0,51.15,48.85,75.19
17,HarvardX,CB22.1x,09/03/2013,The Ancient Greek Hero,[Greg Nagy],"Humanities, History, Design, Religion, and Edu...",2,17604,1260,728,7.18,4.15,68.77,6.59,14.96,22.10,31.0,52.11,47.89,68.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,HarvardX,HDS3221.5x,07/05/2016,World Religions Through their Scriptures: Hind...,[Neelima Shukla-Bhatt],"Humanities, History, Design, Religion, and Edu...",4,4553,1051,70,23.08,1.54,73.34,16.65,1.54,26.74,31.0,56.97,43.03,74.39
285,HarvardX,MUS24.4x,07/21/2016,First Nights: Symphonie Fantastique,[Tom Kelly],"Humanities, History, Design, Religion, and Edu...",4,615,305,20,49.59,3.25,80.81,8.78,3.25,5.93,38.0,56.82,43.18,74.66
286,HarvardX,GSE4x,07/25/2016,Introduction to Family Engagement in Education,[Karen Mapp],"Humanities, History, Design, Religion, and Edu...",4,2871,267,20,9.30,0.70,70.11,0.00,0.70,11.33,34.0,25.24,74.76,82.31
288,MITx,6.302.1x,08/01/2016,Introduction to State Space Control,"[Jacob White, Joe Steinmeyer]","Science, Technology, Engineering, and Mathematics",4,1431,208,8,14.54,0.56,0.00,3.84,5.73,62.38,25.0,93.44,6.56,72.31


### Combine Unique and Duplicate Courses ###

In [29]:
# Add unique courses to final_df
final_df = pd.concat([final_df, unique_courses_df])

In [30]:
# Reset index for consistent further processing
final_df = final_df.reset_index()

In [31]:
# final_df

# Text Pre-Processing #

In [32]:
# Import text processing libraries
import nltk
import re

In [33]:
# Define a subset of the dataframe that contains only necessary columns
CB_df = final_df[["Institution","Course Number", "Course Title", "Instructors", "Course Subject", "% Certified"]].copy()

In [34]:
CB_df.head()

Unnamed: 0,Institution,Course Number,Course Title,Instructors,Course Subject,% Certified
0,MITx,6.00.1x,Introduction to Computer Science and Programming,"[Eric Grimson, John Guttag, Ana Bell]",Computer Science,8.252222
1,MITx,3.091x,Introduction to Solid State Chemistry,[Michael Cima],"Science, Technology, Engineering, and Mathematics",4.84
2,MITx,6.002x,Circuits and Electronics,"[Khurram Afridi, Tania Khanna, Anant Agarwal, ...","Science, Technology, Engineering, and Mathematics",4.01
3,MITx,6.00.2x,Introduction to Computational Thinking and Dat...,"[Eric Grimson, John Guttag, Ana Bell]",Computer Science,9.842
4,MITx,14.73x,The Challenges of Global Poverty,"[Esther Duflo, Abhijit Banerjee, Duflo, Banerj...","Government, Health, and Social Science",10.285


### *Define Functions for Text Processing* ###

In [35]:
# DEFINE FUNCTIONS FOR TEXT PRE-PROCESSING

# Stop words list
stopwords_list = ['the','a','an', 'and','or','for','so','as','either','or', 'to', 
                  'of', 'in', 'on', 'i','me','its', 'they', 'them', 'he', 'she''after',
                  'where', 'unless','whether','because','since','until','before','when',
                  'while', 'which', 'whose', 'this','that', 'those','these', 'with', 
                  'at','by','down','during', 'for','from','in','into','of','off','on','over', 'past']


# Remove punctuation/special characters
def clean_text(text):
    '''
        Function that removes punctuation and special characters
        
        Input: 
             String of text
        
        Output: 
            String of text with punctuation and special characters removed
    '''

    
    # Define punctuation
    hyph = r"(\s+\-+|\–+\s+)"   # remove dashes only if accompanied by spaces; account for different lengths of dashes
    punct = r"[:;,\.!?]" 
 
    
    sp_char = r"[%\^\&\$#@\*\+\[\]\(\)\\\/\_\"\'\’]"
    
    # Set to lower case
    text = text.lower()

    # Remove dashes and replace with space
    cleaned_text = re.sub(hyph, " ", text)
    
    # Remove punctuation
    cleaned_text = re.sub(punct, "", cleaned_text)
    
    # Remove special characters
    cleaned_text = re.sub(sp_char, "", cleaned_text)

    
    return cleaned_text
    

def remove_stopwords(tokens, stopwords=stopwords_list):
    '''
        Function that removes stop words from a tokenized text.
        
        Input:
            List of tokens
            
        Output:
            List of tokens with stopwords removed
            
    '''
    
    filtered_tokens = [token for token in tokens if token not in stopwords]
    
    return list(set(filtered_tokens))
    
# #clean_text('the ancient greek hero in 24 hours (hours 12-15): cult of heroes')
# clean_text('atomic and optical physics: atom-light interactions 1 s-- matrix elements and quantized field')

In [36]:
# combine genre and title: https://ybshankar010.medium.com/from-genres-to-conversations-next-gen-movie-recommendations-with-llms-6041aab6defa
subj_codes = {
    'cs': 'Computer Science',
    'ghss': 'Government, Health, and Social Science',
    'hhdre': 'Humanities, History, Design, Religion, and Education',
    'stem': 'Science, Technology, Engineering, and Mathematics'
}


def assign_subject_code (subj, codes_dict = subj_codes):
    '''
        Assign codes to subject
        
        Inputs:
            - subj: specific subject listed in database
            - codes_dict: dictionary containing subject codes and full subject names
            
        Outputs:
            - course code
    
    '''
    
    for code, subject in codes_dict.items():
        if subject == subj:
            return code

### *Convert course titles to keyword tokens* ###

In [37]:
# Isolate titles in dataframe
titles = CB_df["Course Title"]
# titles

In [38]:
# Remove punctuation and special characters
clean_titles = [clean_text(title) for title in titles]
#clean_titles

In [39]:
# Tokenize title strings
tokenized_titles = [nltk.word_tokenize(title) for title in clean_titles]
#tokenized_titles

In [40]:
# Filter stopwords
filtered_tokens = [remove_stopwords(title) for title in tokenized_titles]
#filtered_tokens

### *Assign subject codes as unique tokens* ###

In [41]:
CB_df["subj_codes"] = [assign_subject_code(subject) for subject in CB_df["Course Subject"]]
CB_df.head()

Unnamed: 0,Institution,Course Number,Course Title,Instructors,Course Subject,% Certified,subj_codes
0,MITx,6.00.1x,Introduction to Computer Science and Programming,"[Eric Grimson, John Guttag, Ana Bell]",Computer Science,8.252222,cs
1,MITx,3.091x,Introduction to Solid State Chemistry,[Michael Cima],"Science, Technology, Engineering, and Mathematics",4.84,stem
2,MITx,6.002x,Circuits and Electronics,"[Khurram Afridi, Tania Khanna, Anant Agarwal, ...","Science, Technology, Engineering, and Mathematics",4.01,stem
3,MITx,6.00.2x,Introduction to Computational Thinking and Dat...,"[Eric Grimson, John Guttag, Ana Bell]",Computer Science,9.842,cs
4,MITx,14.73x,The Challenges of Global Poverty,"[Esther Duflo, Abhijit Banerjee, Duflo, Banerj...","Government, Health, and Social Science",10.285,ghss


### Combine subject codes and titles for keyword list ###

In [42]:
# Zip titles and subject codes and flatten zip
# Reference consulted for flattening a zipped list: https://stackoverflow.com/questions/40709321/python-flattening-a-zip
CB_df["title_and_subject"]= [list((*a, b)) for a, b in zip(filtered_tokens, list(CB_df['subj_codes']))]

# Check results
CB_df.head()

Unnamed: 0,Institution,Course Number,Course Title,Instructors,Course Subject,% Certified,subj_codes,title_and_subject
0,MITx,6.00.1x,Introduction to Computer Science and Programming,"[Eric Grimson, John Guttag, Ana Bell]",Computer Science,8.252222,cs,"[computer, science, programming, introduction,..."
1,MITx,3.091x,Introduction to Solid State Chemistry,[Michael Cima],"Science, Technology, Engineering, and Mathematics",4.84,stem,"[state, chemistry, solid, introduction, stem]"
2,MITx,6.002x,Circuits and Electronics,"[Khurram Afridi, Tania Khanna, Anant Agarwal, ...","Science, Technology, Engineering, and Mathematics",4.01,stem,"[circuits, electronics, stem]"
3,MITx,6.00.2x,Introduction to Computational Thinking and Dat...,"[Eric Grimson, John Guttag, Ana Bell]",Computer Science,9.842,cs,"[thinking, computational, introduction, data, ..."
4,MITx,14.73x,The Challenges of Global Poverty,"[Esther Duflo, Abhijit Banerjee, Duflo, Banerj...","Government, Health, and Social Science",10.285,ghss,"[global, challenges, poverty, ghss]"


# Process Keywords for the Content-Based Filter #

In [43]:
# Import necessary packages
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

### *Apportion a test set from the CB dataframe for future testing* ###

In [44]:
CB_train_df, CB_test_df = train_test_split(CB_df, test_size=0.1, random_state=7,stratify=CB_df["subj_codes"])

### *Create TF-IDF vectors from training data* ###

In [45]:
# Create a dummy vectorizer function in order to avoid the in-built tokenization process
#  of the TfidfVectorizer
def dummy(t):
    return t

# Define a TF-IDF Vectorizer
vectorizer = TfidfVectorizer(analyzer=dummy)

In [46]:
# Extract keywords and preserve attachment to index numbers
training_keywords = [(i, row["title_and_subject"]) for (i, row) in CB_train_df.iterrows()]
#training_keywords

In [47]:
# Extract keywords only for TF-IDF vectorization
term_list = [term[1] for term in training_keywords]
# term_list

In [48]:
# Transform training set
tfidf = vectorizer.fit_transform(term_list)

# Check vocabulary
#vectorizer.get_feature_names() # vectorizer.get_feature_names_out()

### *Prepare Test Data* ###
This stage consists in extracting keywords and respective indices from the test set and vectorizing the keywords.

In [49]:
# Collect test keywords along with their respective indices
test_keywords = [(i, row["title_and_subject"]) for (i, row) in CB_test_df.iterrows()]
# test_keywords

In [50]:
# Vectorize the keywords for further processing
test_vecs = [vectorizer.transform([terms[1]]) for terms in  test_keywords]
#test_vecs

# Content-Based Recommender Pipeline #
The CB recommender pipeline is based on a function that takes a set of keywords, be it part of the test set or directly inputted by a user, encodes it using a TF-IDF vectorizer that was fit to the dataset (in this case, the training subset), and returns a list of recommendations depending upon the information available from the user. This pipeline envisions 3 different tiers of useres:
1) Users whose preference is utterly unknown
2) Users who include only a subject of interest
3) Users who define a set of keywords

In [120]:
# Define functions for the CB Recommender Pipeline

def CB_recommender(user, df=CB_df, n=5):
    
    '''
        
    '''
    
    # Case 1: No information from user
    if (user["subject"]=="" and len(user["keywords"])==0):
               
        # Filter dataframe in accordance with completion rates
        filtered_df = df.sort_values(by="% Certified", ascending=False)[:n]
    
        return filtered_df

    # Case 2: Subject Information Only
    elif (user["subject"]!="" and len(user["keywords"])==0):
        # consider including try statement
        filtered_df = df.loc[[user["subject"] in keywords for keywords in df["title_and_subject"]]]
        
        return filtered_df.sort_values(by="% Certified", ascending=False)[:n]
    
    # Case 3: Subject and Keywords
    else:
        
        # Combine keywords and subject and run search on combined list
        tokens = user["keywords"]
        if (user["subject"]!=""):
            tokens.append(user["subject"])
        
        
        cosine_similarities = keyword_search(tokens)
#         print(cosine_similarities)
        # encode and measure cosine similarity
        return np.argsort(cosine_similarities)[:(-n-1):-1]

    
def keyword_search (query, encoded_course_matrix=tfidf):
    
    query_vec = encode_query(query)
#     print(query_vec)
    
    res_vec = np.zeros(encoded_course_matrix.shape[0])
    
    for i,course_vec in enumerate(encoded_course_matrix):
        res_vec[i] = cosine_similarity(course_vec, query_vec)[0][0] # store value only 
    
    return res_vec

def encode_query (keywords, vectorizer = vectorizer):
    
    return vectorizer.transform([keywords])

In [52]:
# test_user1 = {
#     "subject": "",
#     "keywords": []
# }

# test_user2 = {
#     "subject": "hhdre",
#     "keywords": []
# }

# test_user3 = {
#     "subject": "cs",
#     "keywords": ["python", "programming", "introduction"]
# }


# Testing #


In [53]:
# vectorizer.transform(test_keywords[0][1]).toarray()

In [54]:
# To see output
#test_vecs[i].toarray()

In [55]:
# tfidf.shape

In [56]:
# results = np.empty((tfidf.shape[0], len(test_vecs)))


# for i, train_vec in enumerate(tfidf):
#     for j, test_vec in enumerate(test_vecs):
#         results[i,j] = cosine_similarity(train_vec,test_vec)[0][0]
# # results

In [57]:
# cosine_similarity(tfidf[0],test_vecs[2])

In [58]:
# cosine_similarity(tfidf[1],test_vecs[8])

In [59]:
# results[:6,:10]

In [60]:
# results[0,2]

In [61]:
# results = np.empty((tfidf.shape[0], len(test_vecs)))


# for i, train_vec in enumerate(tfidf[:6]):
#     for j, test_vec in enumerate(test_vecs[:3]):
#         results[i,j] = cosine_similarity(train_vec,test_vec)[0][0]

In [62]:
# m = []

# for i in range(5):
#     m.append([])
#     for j in range(6, 10):
#         m[i].append(j)
# m

In [63]:
# REFER BACK TO INDEX IN ORDER TO SORT BY SIMILARITY

# results = np.empty((tfidf.shape[0], len(test_vecs)))

# for i, train_vec in enumerate(tfidf):
#     for j,test_vec in enumerate(test_vecs):
#         results[i,j] = cosine_similarity(train_vec, test_vec)[0][0] # store value only
    
# results.shape

In [64]:
# results[:10, :10]

In [65]:
# RETURN TO THIS!!!!! PERHAPS INCLUDE IN EVAUATION SECTION !!!
# plt.figure(figsize = (10, 10))
# ax = sns.heatmap(results)

## Sort Recommendations for Each Test Vector ##

In [66]:
# https://stackoverflow.com/questions/66679020/python-quickest-way-to-sort-list-and-keep-indexes
#list(enumerate(results[:,0]))

In [67]:
# recommendations = np.argsort(results[:,5])[:-10:-1]
# recommendations

In [68]:
# test_keywords[5]

In [69]:
# rec_list = []
# for rec in recommendations:
#     print(training_keywords[rec])
#     rec_list.append(training_keywords[rec])

In [70]:
# test_df.loc[179]

In [71]:
#[train_df.loc[rec[0]] for rec in rec_list]

In [72]:
# test_keywords[0]

In [73]:
# test_df.loc[176]

In [74]:
# recommendations = np.argsort(results[:,0])[:-10:-1]
# recommendations

In [75]:
# rec_list = []
# for rec in recommendations:
#     print(training_keywords[rec], train_df.loc[training_keywords[rec][0], "Course Number"])
#     rec_list.append(training_keywords[rec])

In [76]:
# x = np.zeros(tfidf.shape[0])
# x

In [77]:
# tfidf.shape

In [78]:
# CB_df.columns

In [79]:
# CB_recommender(test_user1, df)

In [80]:
# # filter df based on whether subject is in list
# CB_df.loc[["hhdre" in keywords for keywords in CB_df["title_and_subject"]]]

In [81]:
# CB_recommender(test_user2)

In [82]:
#CB_recommender(test_user3)

In [83]:
#recs = CB_recommender(test_user3)
#recs

In [84]:
# for rec in recs:
#     print(training_keywords[rec], train_df.loc[training_keywords[rec][0], "Course Number"])

In [85]:
# test_user4 = {
#     "subject": "",
#     "keywords": ["inroduction", "mobile", "computation"]
# }
# recs = CB_recommender(test_user4)
# recs

In [86]:
# for rec in recs:
#     print(training_keywords[rec], train_df.loc[training_keywords[rec][0], "Course Number"])

In [87]:
# test_user5 = {
#     "subject": "",
#     "keywords": ["python", "programming", "education"]
# }
# recs = CB_recommender(test_user4)
# recs

In [88]:
# for rec in recs:
#     print(training_keywords[rec], train_df.loc[training_keywords[rec][0], "Course Number"])

# DEFINE INITIAL TESTING FUNCTION AND INSPECT RESULTS #

In [89]:
# train_df.iloc[145]

In [90]:
# cbres1 = CB_testing()

In [91]:
# sns.heatmap(cbres1)

In [92]:
# test_df.loc[test_keywords[0][0], "Course Number"]

In [93]:
# resuts = test_cosine_similarity()
# results[:5, :5]

In [94]:
# results.shape

In [95]:
# results = test_cosine_similarity()

In [96]:
# tabulate_results(results)

In [97]:
# train_df.iloc[125]

In [98]:
# (30, ['aerodynamics', 'vehicle', 'flight', 'stem'])
# (5, ['life', 'biology', 'introduction', 'secret', 'stem'])
# (14, ['uncertainty', 'probability', 'introduction', 'science', 'stem'])
# (118, ['state', 'control', 'introduction', 'space', 'stem'])
# (57, ['feedback', 'theory', 'control', 'introduction', 'stem'])
# (31, ['bioconductor', 'introduction', 'ghss'])
# (1, ['state', 'chemistry', 'solid', 'introduction', 'stem'])
# (7, ['science', 'introduction', 'computer', 'cs'])
# (33, ['design', 'introduction', 'game', 'hhdre'])

In [99]:
# train_df.iloc[145]

In [100]:
# train_df.loc[145]

# ANALYSIS OF TEST RESULTS #
* There is still and issue with course-codes, so perhaps the pre-processing stage must be chagned
* Because recommendations are based on title and subject keywords, not all recommendations are direclty relevant
    * however, approaching the CB recommender in this way allows and element of serendipity, which is sometimes desirable
    * one way to improve recommendations would be to include course descriptions and thend o a keyword search based on those
* instructor information and other stats coudl probably be used to complement hte CF recommender

## PRECISION AND RECALL @K ##

In [101]:
# def tabulate_results(results, test_set = test_keywords, training_set = training_keywords, encoded_course_matrix=tfidf):

In [102]:
# test_keywords[:5]

In [103]:
# training_keywords[:2]

In [104]:
# results.shape[1]

In [105]:
# k = 5
# stats_at_k = {}

# for i in range(results.shape[1]):
        
#         # Isolate the measured closeness to all vectors in the course matrix
#         top_k_recs = np.argsort(results[:,i])[:-6:-1]
# #         print(top_k_recs)
        
#         test_subj = test_keywords[i][1][-1]
#         print(f"Test subject: {test_subj}")
        
#         rec_subj = [training_keywords[rec][1][-1] for rec in top_k_recs]
#         print(f"Rec subject: {rec_subj}")
        
#         # Calculate accuracy
#         count = [1 if test_subj==rec_subj[i] else 0 for i in range(len(top_k_recs))]
        
#         stats_at_k[i] = sum(count)
        

In [106]:
# stats_at_k

In [107]:
# https://krishnapullak.medium.com/understanding-precision-recall-and-f-score-at-k-in-recommender-systems-7146a0dce68e

# Precision@K: proportion of relevant items among top K recs
# p@k = # relevant items in top k / k

# Recall@K: ability of recommender to identify all relevant items within top K recs; system comprehensiveness
# r@k = # relevant items in top k / total number of relevant items


# https://neptune.ai/blog/recommender-systems-metrics
# *** https://www.evidentlyai.com/ranking-metrics/precision-recall-at-k ***

In [108]:
# precision_at_k = {}

# for i in range(len(stats_at_k)):
#     precision_at_k[i] = stats_at_k[i]/k
    
# precision_at_k

In [109]:
# avg_precision_at_k = sum(precision_at_k.values())/len(precision_at_k)
# avg_precision_at_k

In [110]:
# k = 5
# word_stats_p_at_k = {}
# avg_p_at_k = {}

# for i in range(results.shape[1]):
        
#         # Isolate the measured closeness to all vectors in the course matrix
#         top_k_recs = np.argsort(results[:,i])[:-6:-1]
#         print(top_k_recs)
        
#         test_words = test_keywords[i][1]
#         print(f"Test subject: {test_words}, {len(test_words)}")
        
#         rec_words = [training_keywords[rec][1] for rec in top_k_recs]
# #         print(f"Rec subject: {rec_words}")
        
#         word_stats_p_at_k[i] = []
#         # https://stackoverflow.com/questions/1388818/how-can-i-compare-two-lists-in-python-and-return-matches
#         for rec in rec_words:
#             print(f"Set: {set(test_words) & set(rec)}")
#             print(rec)
#             word_stats_p_at_k[i].append(len(set(test_words) & set(rec))/len(rec))
            
#         avg_p_at_k[i] = sum(word_stats_p_at_k[i])/5
        
            
        
#         print("-------------------------------------------------------------------------------------------------------")
#         # Calculate accuracy
# #         count = [1 if test_subj==rec_subj[i] else 0 for i in range(len(top_k_recs))]
        
# #         stats_at_k[i] = sum(count)
        

In [111]:
# word_stats_p_at_k

In [112]:
# avg_p_at_k

In [113]:
# sum(avg_p_at_k.values())/len(avg_p_at_k)

In [114]:
# Recall@K: ability of recommender to identify all relevant items within top K recs; system comprehensiveness
# r@k = # relevant items in top k / total number of relevant items
# total relevant items: all words in test string
# relevant items in top k = number of common words
# divide by 5 (k) for avg

In [115]:
# k = 5
# word_stats_r_at_k = {}
# avg_r_at_k = {}

# for i in range(results.shape[1]):
        
#         # Isolate the measured closeness to all vectors in the course matrix
#         top_k_recs = np.argsort(results[:,i])[:-6:-1]
# #         print(top_k_recs)
        
#         test_words = test_keywords[i][1]
#         print(f"Test subject: {test_words}, {len(test_words)}")
        
#         rec_words = [training_keywords[rec][1] for rec in top_k_recs]
# #         print(f"Rec subject: {rec_words}")
        
#         word_stats_r_at_k[i] = []
#         # https://stackoverflow.com/questions/1388818/how-can-i-compare-two-lists-in-python-and-return-matches
#         for rec in rec_words:
#             print(len(set(test_words) & set(rec)))
#             word_stats_r_at_k[i].append(len(set(test_words) & set(rec))/len(test_words))
            
#         avg_r_at_k[i] = sum(word_stats_r_at_k[i])/5
# #             print(rec)
            
        
#         print("-------------------------------------------------------------------------------------------------------")
#         # Calculate accuracy
# #         count = [1 if test_subj==rec_subj[i] else 0 for i in range(len(top_k_recs))]
        
# #         stats_at_k[i] = sum(count)
        

In [116]:
# word_stats_r_at_k

In [117]:
# avg_r_at_k

In [118]:
# sum(avg_r_at_k.values())/len(avg_r_at_k)

In [119]:
# tfidf.toarray()