In [553]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

from collections import Counter, OrderedDict

# Load library
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# download the set of stop words the first time
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')
import re

import math

import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

%env GOOGLE_APPLICATION_CREDENTIALS=/Users/cesleemontgomery/masters/cs6460/CourseRec/courserec-adfe3-firebase-adminsdk-pm32q-6cd7a11195.json

env: GOOGLE_APPLICATION_CREDENTIALS=/Users/cesleemontgomery/masters/cs6460/CourseRec/courserec-adfe3-firebase-adminsdk-pm32q-6cd7a11195.json


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cesleemontgomery/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cesleemontgomery/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/cesleemontgomery/nltk_data...
[nltk_data]   Package words is already up-to-date!


# 2.3 - Modeling - content_based_model - Investigate the viability of the Mooney et al. text categorization ML approach.

### 1. Extract Data from Corpuses: Courses - Video Subtitles (Udacity), User Interests - Academic-related Articles (Wikipedia)

In [530]:
course_subs_DF = pd.read_csv('../../data/raw/videosubtitles_udacity/course_video_subtitles_udacity.csv')
course_subs_DF = course_subs_DF[pd.notnull(course_subs_DF.course_name)]
wiki_page_path_DF = pd.read_csv('../../data/raw/articles_wikipedia/academic_outline_wikipedia_pages.csv')

In [531]:
print("Data Shapes: \n")
print("Course - Subtitles: ", course_subs_DF.shape)
print("User - Academic Articles", wiki_page_path_DF.shape)

Data Shapes: 

Course - Subtitles:  (28, 6)
User - Academic Articles (985, 9)


### 2. TODO - Pre-Processing

### 3. TODO - Calculate TF-IDF to see if keywords emerge.

### 4. Naive Bayes implementation

#### 4.1 Build table of P( word | Course ) words in Vocabulary x Courses*

*\*prior calculation not required since it won't affect rank of results.*

In [524]:
# Building function for easy comparison
def get_counter_from_list(templist, words):

    #lowercase,remove punctuation and non-alpha, split whitespace
    templist = " ".join(w for w in nltk.wordpunct_tokenize(str(templist)) \
         if w.lower() in words).lower()
    templist = re.sub("[^a-zA-Z\s]+", "", templist).split()

    # Load stop words, lemmatizer
    stop_words = stopwords.words('english')
    lemma = nltk.wordnet.WordNetLemmatizer()

    # Remove stop words, lowercase, lemmatize
    counter = Counter([lemma.lemmatize(word.lower()) for word in templist if word not in stop_words])
    return counter

In [532]:
words = set(nltk.corpus.words.words())

#create vocab from course subtitles
vocabulary = set()

work_df = course_subs_DF.copy()
work_df['subtitles_word_freq'] = pd.Series(dtype=object)

#1. Calc word frequencies for each course
for idx, course in work_df.iterrows():
    print(course.course_name)
    #add Counter to df
    word_freqs = get_counter_from_list(course.subtitles, words)
    print(word_freqs.most_common(10), '\n\n')
    
    vocabulary.update(list(word_freqs.keys()))
    work_df.at[idx,'subtitles_word_freq'] = word_freqs
    
#calc likelihoods for each word | course
NB_model_features_DF = pd.DataFrame(index=sorted(vocabulary), columns=course_subs_DF.course_name)

#2. log likelihood = log of rel. word frequency
    # count of occurences + 1 / total words + Vocab (laplace smoothing)
for idx, course in work_df.iterrows():
    word_freqs = course.subtitles_word_freq
    for word, count in word_freqs.items():
        logLikelihood = math.log(count + 1 / float(sum(word_freqs.values()) + len(vocabulary)))
        NB_model_features_DF.at[word, course.course_name] = logLikelihood

Artificial Intelligence for Robotics
[('de', 2895), ('la', 2231), ('e', 1711), ('x', 1243), ('el', 1079), ('en', 850), ('robot', 787), ('un', 678), ('se', 656), ('para', 594)] 


Intro to High-Performance Computing
[('one', 601), ('time', 523), ('n', 470), ('algorithm', 375), ('two', 374), ('p', 302), ('want', 282), ('first', 268), ('vertex', 265), ('memory', 262)] 


Machine Learning
[('de', 6329), ('e', 4809), ('um', 2830), ('right', 2600), ('going', 2438), ('one', 2274), ('x', 2161), ('se', 2113), ('thats', 2005), ('para', 1941)] 


Database Systems Concepts and Design
[('user', 599), ('one', 445), ('look', 277), ('data', 267), ('going', 217), ('table', 214), ('regular', 213), ('first', 210), ('two', 207), ('result', 204)] 


Knowledge-Based Artificial Intelligence: Cognitive Systems
[('one', 660), ('knowledge', 660), ('problem', 655), ('example', 642), ('well', 501), ('u', 494), ('case', 441), ('agent', 441), ('reasoning', 424), ('learning', 404)] 


Educational Technology
[('reall

In [534]:
NB_model_features_DF.tail(20)

course_name,Artificial Intelligence for Robotics,Intro to High-Performance Computing,Machine Learning,Database Systems Concepts and Design,Knowledge-Based Artificial Intelligence: Cognitive Systems,Educational Technology,Introduction to Operating Systems,Computer Networks,Special Topics: Big Data for Health Informatics,Advanced Operating Systems,...,Introduction to Information Security,Embedded Systems Optimization,Computational Photography,Machine Learning for Trading,Reinforcement Learning,Human-Computer Interaction,Software Development Process,Intro to Cyber Physical Systems Security,Intro to Graduate Algorithms,Software Architecture and Design
za,,,,,,,,,,,...,,,,,,,,,,
zag,,,,,1.49024e-05,,,,,,...,,,,,,,,,,
zaman,,,,,,,,,,,...,,,,,,,,,,
zar,,,,,,,,,,,...,,,,,,,,,,
zat,,,,,,,,,,,...,,,,,,,,,,
zebra,,,,,,,,,,0.693152,...,,,1.79176,,,,,,,
zel,,,,,,,,,,,...,,,,,,,,,,
zero,4.26268,4.11087,6.44095,1.09862,1.3863,,3.80666,3.4012,1.79176,4.43082,...,4.17439,3.63759,4.45435,3.66356,5.97635,0.69316,3.49651,2.63906,5.54518,3.4012
zeta,1.48345e-05,,,,,,,,,,...,,,,,,,,,,
zig,,,,,1.49024e-05,,,,,,...,,,,2.1206e-05,,,,,,


#### 4.2 Classifier func given feature table

In [535]:
def classifier(documents, model, transpose_and_sort=True):
    course_posteriors_DF = []
    
    if documents:
        for doc_name, doc_words in documents.items():
            model_subset_words = model[model.index.isin(doc_words)]
            logPosteriors = model_subset_words.sum(0)
            logPosteriors.rename(doc_name, inplace=True)
            course_posteriors_DF.append(logPosteriors)
        if transpose_and_sort:
            return pd.concat(course_posteriors_DF).transpose().sort_values(ascending=False)
        else:
            return pd.concat(course_posteriors_DF)

In [536]:
model = NB_model_features_DF
tempdict = get_counter_from_list(wiki_page_path_DF[wiki_page_path_DF['category_h2_1'] == 'Formal Sciences'].page_text.values, words)

print('Formal Sciences Matches: \n', classifier({'Formal Sciences':list(tempdict.keys())}, model))

Formal Sciences Matches: 
 course_name
Computer Vision                                               6738.634173
Machine Learning                                              6491.038016
Reinforcement Learning                                        6378.698348
Advanced Operating Systems                                    6250.490691
Software Architecture and Design                              5028.860042
Introduction to Information Security                          4992.859305
Introduction to Operating Systems                             4803.783507
Knowledge-Based Artificial Intelligence: Cognitive Systems    4568.264932
Embedded Systems Optimization                                 4550.793962
Computational Photography                                     4538.386986
High Performance Computer Architecture                        4480.226026
Artificial Intelligence                                       4451.344107
Intro to Graduate Algorithms                                  3868.429671

Scary believable :DDDD

#### 4.3 Pre-compute table of P( Course | Interests ) 

In [537]:
work_list = []

model = NB_model_features_DF

print('Pre-computing course rankings for each interest.')
for idx, interest in wiki_page_path_DF.iterrows():
    work_dict = get_counter_from_list(wiki_page_path_DF.iloc[idx].page_text, words)
    print(interest.page_path_label)
    work_list.append(classifier({interest.page_path_label:list(work_dict.keys())}, model))

NB_model_recommendations_DF = pd.concat(work_list, axis=1); #display(NB_model_recommendations_DF.transpose())

Pre-computing course rankings for each interest.
Humanities
Humanities > Arts
Humanities > Arts > Performing arts
Humanities > Arts > Performing arts > Music
Humanities > Arts > Performing arts > Music > Accompanying
Humanities > Arts > Performing arts > Music > Chamber music
Humanities > Arts > Performing arts > Music > Church music
Humanities > Arts > Performing arts > Music > Conducting
Humanities > Arts > Performing arts > Music > Early music
Humanities > Arts > Performing arts > Music > Jazz studies
Humanities > Arts > Performing arts > Music > Musical composition
Humanities > Arts > Performing arts > Music > Music education
Humanities > Arts > Performing arts > Music > Music history
Humanities > Arts > Performing arts > Music > Musicology
Humanities > Arts > Performing arts > Music > Ethnomusicology
Humanities > Arts > Performing arts > Music > Music theory
Humanities > Arts > Performing arts > Music > Orchestral studies
Humanities > Arts > Performing arts > Music > Organology
Hu

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # This is added back by InteractiveShellApp.init_path()


In [538]:
NB_model_recommendations_DF.transpose().tail(20)

Unnamed: 0,Advanced Operating Systems,Artificial Intelligence,Artificial Intelligence for Robotics,Compilers - Theory and Practice,Computational Photography,Computer Networks,Computer Vision,Data and Visual Analytics,Database Systems Concepts and Design,Educational Technology,...,Introduction to Operating Systems,Knowledge-Based Artificial Intelligence: Cognitive Systems,Machine Learning,Machine Learning for Trading,Network Security,Reinforcement Learning,Software Analysis,Software Architecture and Design,Software Development Process,Special Topics: Big Data for Health Informatics
Applied Sciences > Medicine and health > Radiology,1215.32311,875.092659,738.367832,851.135578,1062.892081,741.841726,1321.135164,779.458184,634.293415,728.262546,...,1058.922186,1014.362411,1181.988501,782.199425,556.815788,1155.222479,790.074043,1069.334969,878.850772,656.198341
Applied Sciences > Medicine and health > Recreational therapy,558.505185,388.227082,306.052538,370.08892,391.313894,330.659961,494.179277,308.279636,290.810159,333.620286,...,483.606905,457.113821,479.71796,336.036988,253.387513,495.015252,339.76465,482.131067,398.738017,254.506365
Applied Sciences > Medicine and health > Rehabilitation medicine,430.141937,351.589409,266.93764,324.09478,335.742262,269.187078,425.739458,293.90918,242.895999,276.471835,...,387.426274,378.051816,439.52242,272.471969,233.763976,440.977288,278.356959,380.802042,333.401957,281.183989
Applied Sciences > Medicine and health > Respiratory therapy,947.09575,629.342321,525.211746,661.545919,671.432759,557.16076,888.146179,548.074382,494.560913,624.549086,...,882.653107,770.673391,855.850581,550.945867,447.36316,887.472102,564.037756,866.193028,719.31602,492.464145
Applied Sciences > Medicine and health > Sleep medicine,1142.238156,827.802252,683.126579,806.772024,901.592729,693.51926,1153.863899,726.209974,619.977724,706.951296,...,1010.956046,965.34127,1123.985915,725.075947,552.451974,1114.777262,742.37377,1067.983452,866.772102,617.887022
Applied Sciences > Medicine and health > Speech-language pathology,956.123519,643.817359,524.759636,634.241232,635.245467,545.660049,834.679042,574.977621,488.377856,597.311204,...,830.459326,761.241748,827.672968,540.418063,453.213003,837.67768,568.91105,829.890246,689.645002,505.053878
Applied Sciences > Medicine and health > Sports medicine,366.026428,255.382203,187.366832,237.691825,272.99125,227.607757,342.783671,211.121081,210.985574,250.033463,...,315.832602,304.301611,325.196914,225.905343,178.693818,322.370092,200.087032,335.158236,282.420319,227.275687
Applied Sciences > Medicine and health > Surgery,1584.878625,1167.321798,987.757952,1177.387995,1267.14725,951.850705,1639.078349,1018.84823,822.358791,944.1926,...,1376.582999,1342.982403,1550.948605,989.792685,742.283091,1523.579136,1007.082369,1440.038617,1178.762697,846.494721
Applied Sciences > Medicine and health > Surgery > Bariatric surgery,1127.507057,892.135946,739.992562,842.626045,918.844707,733.088624,1204.083017,787.289043,599.589884,683.997972,...,1036.933857,963.277787,1152.505168,768.226596,528.849214,1157.911076,755.974831,1020.201827,835.013398,633.321012
Applied Sciences > Medicine and health > Surgery > Cardiothoracic surgery,1114.924314,919.446464,741.397572,823.058302,898.663057,717.718084,1195.75082,755.257527,626.371345,681.941125,...,1010.485997,910.584082,1138.079372,736.820048,522.877635,1132.960836,728.530395,1002.122597,835.775439,642.125961


#### 4.4 Extract Explanations

### 5 Offline Evaluation (via manually generated scenarios)

This is the closest we'll come to evaluation since we don't have historical user data.  Let's compare manually to other sane baselines.

In [539]:
course_key = {
    #Hand-picked as more related to social/humanities disciplines
    'Human-Computer Interaction': 'red',
    'Intro to Health Informatics': 'red',
    'Educational Technology': 'red',

    #Hand-picked as more related to engineering/systems disciplines
    'Database Systems Concepts and Design': 'green',
    'Computer Networks': 'green',
    'Compilers - Theory and Practice': 'green',

    #Hand-picked as more related to math/highly quantitative disciplines
    'Machine Learning': 'blue',
    'Reinforcement Learning': 'blue',
    'Data and Visual Analytics': 'blue'
}

def color_hand_classified_courses(course, key=course_key):
    """
    Takes a course (string) and colors specific courses 
    according to a hand-created classification key, designed
    to compare broad sub-categories of Computer Science 
    across academic interests.
    """
    if course in key.keys():
        color = key[course]
    else: color = 'gainsboro'
    return 'background-color: %s' % color

In [542]:
#1. write Rank x Categories (values=Course)
def build_comparison_DF(df): 
    work_df_ranked = pd.DataFrame(index=pd.Index(range(1,29), name='rank'))
    work_df = df
    for interest in work_df:
        tempdf = work_df[interest].rank(ascending=False).astype('int').rename('rank').reset_index().set_index('rank')
    #     print(work_df[interest].rank(ascending=False).sort_values())
    #     break
        tempdf.columns = [interest]
        work_df_ranked = work_df_ranked.join(tempdf)
    return work_df_ranked

#### 5.1 Categories Scenarios (w/Explanations) (Comparison to Random or TF-IDF)

In [546]:
#1. write Rank x Categories (values=Course) ; get categories (will serve as spectrum from social to mathy/engr-y [])
categories_DF = pd.DataFrame(NB_model_recommendations_DF.loc[:,wiki_page_path_DF.category_h2_1.unique()])

#2. Declare course key, scan table for course key and color with course.color
(build_comparison_DF(categories_DF).style
  .applymap(color_hand_classified_courses)
  .set_caption('Course Rank by Academic Discipline'))

Unnamed: 0_level_0,Humanities,Social sciences,Natural Sciences,Formal Sciences,Applied Sciences
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Computer Vision,Computer Vision,Computer Vision,Advanced Operating Systems,Advanced Operating Systems
2,Reinforcement Learning,Machine Learning,Machine Learning,Computer Vision,Computer Vision
3,Machine Learning,Reinforcement Learning,Reinforcement Learning,Machine Learning,Machine Learning
4,Advanced Operating Systems,Advanced Operating Systems,Advanced Operating Systems,Reinforcement Learning,Reinforcement Learning
5,Software Architecture and Design,Software Architecture and Design,Software Architecture and Design,Software Architecture and Design,Software Architecture and Design
6,Introduction to Information Security,Knowledge-Based Artificial Intelligence: Cognitive Systems,Knowledge-Based Artificial Intelligence: Cognitive Systems,Introduction to Operating Systems,Introduction to Information Security
7,Knowledge-Based Artificial Intelligence: Cognitive Systems,Introduction to Information Security,Introduction to Information Security,Introduction to Information Security,Introduction to Operating Systems
8,Introduction to Operating Systems,Introduction to Operating Systems,Introduction to Operating Systems,Knowledge-Based Artificial Intelligence: Cognitive Systems,Knowledge-Based Artificial Intelligence: Cognitive Systems
9,Computational Photography,Computational Photography,Computational Photography,Embedded Systems Optimization,Intro to Cyber Physical Systems Security
10,Embedded Systems Optimization,Embedded Systems Optimization,Embedded Systems Optimization,Computational Photography,Embedded Systems Optimization


#### 5.2 Computer Science vs Computer Science classes

In [544]:
computer_science_interests_courses_keys = {
    'Formal Sciences > Computer Science > Logic in computer science':[],
       'Formal Sciences > Computer Science > Algorithms':['Intro to Graduate Algorithms'],
       'Formal Sciences > Computer Science > Artificial intelligence':['Artificial Intelligence', 'Artificial Intelligence for Robotics', 'Machine Learning', 'Knowledge-Based Artificial Intelligence: Cognitive Systems', 'Computer Vision', 'Computational Photography', 'Machine Learning for Trading', 'Reinforcement Learning'],
       'Formal Sciences > Computer Science > Data structures':['Database Systems Concepts and Design', 'Data and Visual Analytics'],
       'Formal Sciences > Computer Science > Computer architecture':['Intro to High-Performance Computing', 'High Performance Computer Architecture'],
       'Formal Sciences > Computer Science > Computer graphics':['Intro to High-Performance Computing', 'High Performance Computer Architecture'],
       'Formal Sciences > Computer Science > Computer communications (networks)':['Computer Networks', 'Network Security', 'Intro to Cyber Physical Systems Security'],
       'Formal Sciences > Computer Science > Computer security and reliability':['Network Security', 'Introduction to Information Security'],
       'Formal Sciences > Computer Science > Computing in mathematics, natural sciences, engineering, and medicine':['Data and Visual Analytics', 'Machine Learning', 'Intro to Health Informatics', 'Special Topics: Big Data for Health Informatics', 'Computational Photography'],
       'Formal Sciences > Computer Science > Computing in social sciences, arts, humanities, and professions':['Data and Visual Analytics', 'Educational Technology', 'Machine Learning for Trading'],
       'Formal Sciences > Computer Science > Distributed computing':['Intro to High-Performance Computing', 'High Performance Computer Architecture'],
       'Formal Sciences > Computer Science > Human-computer interaction':['Human-Computer Interaction'],
       'Formal Sciences > Computer Science > Operating systems':['Introduction to Operating Systems', 'Advanced Operating Systems', 'Embedded Systems Optimization'],
       'Formal Sciences > Computer Science > Parallel computing':['Intro to High-Performance Computing', 'High Performance Computer Architecture'],
       'Formal Sciences > Computer Science > Programming languages':['Compilers - Theory and Practice'],
       'Formal Sciences > Computer Science > Quantum computing':['Intro to High-Performance Computing', 'High Performance Computer Architecture'],
       'Formal Sciences > Computer Science > Software engineering':['Database Systems Concepts and Design', 'Software Analysis', 'Software Development Process', 'Software Architecture and Design'],
       'Formal Sciences > Computer Science > Theory of computation':['Intro to Graduate Algorithms'],
       'Formal Sciences > Computer Science > VLSI design':[]
}

def color_course_match(course):
    if course:
        color = 'blue'
    else: color = 'gainsboro'
    return 'background-color: %s' % color

def color_hand_classified_CS_course(series, course_key=computer_science_interests_courses_keys):
    """
    Takes a course and colors specific courses 
    according to a hand-created classification key, designed
    to compare broad sub-categories of Computer Science 
    across academic interests.
    """
    return series.apply(lambda course: color_course_match(course in course_key[series.name]))


In [545]:
#1. write Rank x Categories (values=Course) ; get computer science courses
tempdf = (wiki_page_path_DF.loc[(wiki_page_path_DF['sub_category_h3_2'] == 'Computer Science') 
                                    & (pd.notnull(wiki_page_path_DF['topic_3']))
                                    & (pd.isnull(wiki_page_path_DF['subtopic_4']))
                                    & (pd.isnull(wiki_page_path_DF['subtopic_5'])), ['page_path_label', 'sub_category_h3_2']])
tempdf = NB_model_recommendations_DF.transpose().join(tempdf.set_index('page_path_label'), how='inner')

(build_comparison_DF(tempdf.iloc[:, :-1].transpose()).style
  .apply(color_hand_classified_CS_course, axis=0))

Unnamed: 0_level_0,Formal Sciences > Computer Science > Logic in computer science,Formal Sciences > Computer Science > Algorithms,Formal Sciences > Computer Science > Artificial intelligence,Formal Sciences > Computer Science > Data structures,Formal Sciences > Computer Science > Computer architecture,Formal Sciences > Computer Science > Computer graphics,Formal Sciences > Computer Science > Computer communications (networks),Formal Sciences > Computer Science > Computer security and reliability,"Formal Sciences > Computer Science > Computing in mathematics, natural sciences, engineering, and medicine","Formal Sciences > Computer Science > Computing in social sciences, arts, humanities, and professions",Formal Sciences > Computer Science > Distributed computing,Formal Sciences > Computer Science > Human-computer interaction,Formal Sciences > Computer Science > Operating systems,Formal Sciences > Computer Science > Parallel computing,Formal Sciences > Computer Science > Programming languages,Formal Sciences > Computer Science > Quantum computing,Formal Sciences > Computer Science > Software engineering,Formal Sciences > Computer Science > Theory of computation,Formal Sciences > Computer Science > VLSI design
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,Computer Vision,Machine Learning,Machine Learning,Advanced Operating Systems,Advanced Operating Systems,Computer Vision,Advanced Operating Systems,Advanced Operating Systems,Computer Vision,Computer Vision,Advanced Operating Systems,Advanced Operating Systems,Advanced Operating Systems,Advanced Operating Systems,Advanced Operating Systems,Computer Vision,Advanced Operating Systems,Machine Learning,Advanced Operating Systems
2,Machine Learning,Reinforcement Learning,Computer Vision,Introduction to Operating Systems,Computer Vision,Machine Learning,Introduction to Information Security,Introduction to Information Security,Machine Learning,Reinforcement Learning,Machine Learning,Computer Vision,Introduction to Operating Systems,Introduction to Operating Systems,Computer Vision,Machine Learning,Machine Learning,Reinforcement Learning,Computer Vision
3,Advanced Operating Systems,Computer Vision,Reinforcement Learning,Embedded Systems Optimization,Machine Learning,Reinforcement Learning,Computer Vision,Computer Vision,Reinforcement Learning,Machine Learning,Introduction to Operating Systems,Machine Learning,Introduction to Information Security,Computer Vision,Machine Learning,Advanced Operating Systems,Computer Vision,Computer Vision,Reinforcement Learning
4,Reinforcement Learning,Advanced Operating Systems,Advanced Operating Systems,Introduction to Information Security,Introduction to Operating Systems,Advanced Operating Systems,Introduction to Operating Systems,Machine Learning,Advanced Operating Systems,Advanced Operating Systems,Computer Vision,Reinforcement Learning,Computer Vision,Machine Learning,Software Architecture and Design,Reinforcement Learning,Reinforcement Learning,Advanced Operating Systems,High Performance Computer Architecture
5,Software Architecture and Design,Software Architecture and Design,Software Architecture and Design,Software Architecture and Design,High Performance Computer Architecture,Computational Photography,Machine Learning,Reinforcement Learning,Software Architecture and Design,Software Architecture and Design,Reinforcement Learning,Introduction to Operating Systems,Reinforcement Learning,Reinforcement Learning,Reinforcement Learning,Introduction to Information Security,Software Architecture and Design,Software Architecture and Design,Introduction to Operating Systems
6,Knowledge-Based Artificial Intelligence: Cognitive Systems,Introduction to Operating Systems,Introduction to Information Security,Computer Vision,Software Architecture and Design,Software Architecture and Design,Software Architecture and Design,Introduction to Operating Systems,Introduction to Information Security,Introduction to Information Security,Software Architecture and Design,Software Architecture and Design,Software Architecture and Design,High Performance Computer Architecture,Introduction to Operating Systems,Introduction to Operating Systems,Introduction to Information Security,Introduction to Operating Systems,Machine Learning
7,Introduction to Operating Systems,Embedded Systems Optimization,Knowledge-Based Artificial Intelligence: Cognitive Systems,Machine Learning,Reinforcement Learning,Introduction to Operating Systems,Reinforcement Learning,Software Architecture and Design,Knowledge-Based Artificial Intelligence: Cognitive Systems,Knowledge-Based Artificial Intelligence: Cognitive Systems,Introduction to Information Security,Introduction to Information Security,Machine Learning,Embedded Systems Optimization,Embedded Systems Optimization,Software Architecture and Design,Introduction to Operating Systems,Embedded Systems Optimization,Software Architecture and Design
8,Embedded Systems Optimization,Introduction to Information Security,Introduction to Operating Systems,Reinforcement Learning,Embedded Systems Optimization,Introduction to Information Security,High Performance Computer Architecture,High Performance Computer Architecture,Introduction to Operating Systems,Introduction to Operating Systems,Embedded Systems Optimization,Knowledge-Based Artificial Intelligence: Cognitive Systems,High Performance Computer Architecture,Software Architecture and Design,Introduction to Information Security,High Performance Computer Architecture,Software Development Process,Knowledge-Based Artificial Intelligence: Cognitive Systems,Embedded Systems Optimization
9,Introduction to Information Security,High Performance Computer Architecture,Computational Photography,High Performance Computer Architecture,Introduction to Information Security,Knowledge-Based Artificial Intelligence: Cognitive Systems,Embedded Systems Optimization,Intro to Cyber Physical Systems Security,Computational Photography,Computational Photography,High Performance Computer Architecture,Embedded Systems Optimization,Embedded Systems Optimization,Introduction to Information Security,High Performance Computer Architecture,Embedded Systems Optimization,Knowledge-Based Artificial Intelligence: Cognitive Systems,Compilers - Theory and Practice,Introduction to Information Security
10,Computational Photography,Knowledge-Based Artificial Intelligence: Cognitive Systems,Embedded Systems Optimization,Knowledge-Based Artificial Intelligence: Cognitive Systems,Knowledge-Based Artificial Intelligence: Cognitive Systems,Embedded Systems Optimization,Intro to Cyber Physical Systems Security,Embedded Systems Optimization,Embedded Systems Optimization,Embedded Systems Optimization,Knowledge-Based Artificial Intelligence: Cognitive Systems,Computational Photography,Knowledge-Based Artificial Intelligence: Cognitive Systems,Knowledge-Based Artificial Intelligence: Cognitive Systems,Compilers - Theory and Practice,Computational Photography,Embedded Systems Optimization,Introduction to Information Security,Knowledge-Based Artificial Intelligence: Cognitive Systems


### 6. Write to Firebase

In [565]:
NB_model_recommendations_DF.transpose().head()

Unnamed: 0,Advanced Operating Systems,Artificial Intelligence,Artificial Intelligence for Robotics,Compilers - Theory and Practice,Computational Photography,Computer Networks,Computer Vision,Data and Visual Analytics,Database Systems Concepts and Design,Educational Technology,...,Introduction to Operating Systems,Knowledge-Based Artificial Intelligence: Cognitive Systems,Machine Learning,Machine Learning for Trading,Network Security,Reinforcement Learning,Software Analysis,Software Architecture and Design,Software Development Process,Special Topics: Big Data for Health Informatics
Humanities,1722.122438,1251.437164,1037.181303,1229.673446,1435.274995,960.84118,1830.556682,1067.802884,872.778582,1107.401541,...,1449.831169,1493.285576,1749.614649,1083.116541,753.298401,1776.572551,1093.725937,1557.892282,1313.753108,825.041119
Humanities > Arts,1058.910115,708.274581,608.897499,739.667432,893.883162,608.054494,1111.23625,656.733216,572.013101,628.171349,...,878.64799,926.198122,1036.827434,624.984002,460.606751,1024.224636,667.928542,1008.670863,824.236759,524.759268
Humanities > Arts > Performing arts,1100.364319,854.051057,724.917759,793.458862,955.95969,629.607292,1195.001138,684.366248,572.519395,695.954905,...,916.772941,922.964844,1118.488228,697.753329,459.693847,1131.341669,698.01529,997.32448,829.48901,518.92323
Humanities > Arts > Performing arts > Music,2564.681676,1899.612527,1588.712557,1756.736859,2087.81828,1418.689472,2795.027216,1592.51832,1282.302824,1564.224838,...,2148.393741,2125.636376,2684.365897,1580.067638,1064.251656,2649.869952,1524.593478,2333.355633,1846.315912,1194.308608
Humanities > Arts > Performing arts > Music > Accompanying,400.373039,336.19209,258.911451,309.215035,331.653023,258.674577,414.225716,283.160684,235.569805,243.518261,...,354.246901,341.459584,426.970381,271.159244,202.680089,421.801035,287.107557,380.144514,312.987283,228.547703


In [613]:
UI_interests = OrderedDict({
    'Computer Science': {'Formal Sciences':['Computer Science']},
    'Math & Statistics': {'Formal Sciences':['Mathematics', 'Statistics']},
    'Engineering & Tech': {'Applied Sciences':['Engineering and technology']}, 
    'Business': 'Business',
    'Medicine & Health': {'Applied Sciences':['Medicine and health']},
    'Natural Sciences': 'Natural Sciences',
    'Social Sciences': 'Social sciences',
    'Humanities': 'Humanities'
})

worklist = []
# for i, (key, value) in enumerate(d.items())
for i, (interest_label, interest_id) in enumerate(UI_interests.items()):
    if isinstance(interest_id, dict):
        category_id = list(interest_id.values())[0]
        tempdf = pd.DataFrame(wiki_page_path_DF.loc[(wiki_page_path_DF.sub_category_h3_2.isin(list(interest_id.values())[0])), 'page_path_label'])
        course_recommendations = pd.merge(tempdf, NB_model_recommendations_DF.transpose(), left_on='page_path_label', right_index=True).drop(columns=['page_path_label']).mean().to_dict()
    elif interest_label == 'Business':
        category_id = [interest_id]
        tempdf = pd.DataFrame(wiki_page_path_DF.loc[(wiki_page_path_DF.sub_category_h3_2 == interest_id), 'page_path_label'])
        course_recommendations = pd.merge(tempdf, NB_model_recommendations_DF.transpose(), left_on='page_path_label', right_index=True).drop(columns=['page_path_label']).to_dict('records')[0]
    else:
        category_id = [interest_id]
        course_recommendations = NB_model_recommendations_DF.transpose().loc[category_id[0]].to_dict()
    #categoryLabel, categoryIDs, course_recommendations:{category:prediction}
    worklist.append([interest_label, category_id, course_recommendations])
    
workdf = pd.DataFrame(worklist, columns=['interest_label', 'category_id', 'course_recommendations'])
workdf

Unnamed: 0,interest_label,category_id,course_recommendations
0,Computer Science,[Computer Science],{'Advanced Operating Systems': 1417.8807057582...
1,Math & Statistics,"[Mathematics, Statistics]",{'Advanced Operating Systems': 1127.5880502859...
2,Engineering & Tech,[Engineering and technology],{'Advanced Operating Systems': 855.61499333125...
3,Business,[Business],{'Advanced Operating Systems': 1352.5196873264...
4,Medicine & Health,[Medicine and health],{'Advanced Operating Systems': 996.38773437711...
5,Natural Sciences,[Natural Sciences],{'Advanced Operating Systems': 2770.3182191087...
6,Social Sciences,[Social sciences],{'Advanced Operating Systems': 1902.0110827340...
7,Humanities,[Humanities],{'Advanced Operating Systems': 1722.1224383791...


In [None]:
# Some help from: https://medium.com/@hmurari/cloud-firestore-batch-transactions-how-to-migrate-a-large-amounts-of-data-336e61efbe7c
# Use the application default credentials
cred = credentials.ApplicationDefault()
firebase_admin.initialize_app(cred, {
  'projectId': 'courserec-adfe3',
})

db = firestore.client()

In [615]:
recommendations_collection = db.collection(u'recommendations')

#for batching
total = len(workdf)
counter = 0

# Start a batch
batch = db.batch()
for idx, record in workdf.iterrows():
    # Commit the batch at every 500th record.
    if counter % 500 == 0:
        if counter > 0:
            print('Committing..')
            batch.commit()

        # Start a new batch for the next iteration.
        batch = db.batch()
    counter += 1
    print(str(counter) + str('/') + str(total) + ': ' + str(record.interest_label))
    record_id = record.interest_label
    record_ref = recommendations_collection.document(str(record_id))
    # Include current record in batch
    batch.set(record_ref, 
        {
            "categoryLabel": record.interest_label,
            "courseRecommendations": record.course_recommendations
    })

# Include current record in batch
if counter % 500 != 0:
    print('Committing..')
    batch.commit()

1/8: Computer Science
2/8: Math & Statistics
3/8: Engineering & Tech
4/8: Business
5/8: Medicine & Health
6/8: Natural Sciences
7/8: Social Sciences
8/8: Humanities
Committing..


### 7. Future Directions
- Add superset categories to documents bag of words to increase coverage?
- Turn into model 
- Limit unrelated interests
- Use MMR to evaluate (can normalize)

In [None]:
class MN_NaiveBayes:

    """
    Constructor for MN_NaiveBayes.
    Initializes overall counts of positive, negative, and neutral classes.
    Initializes overall document count for use in a priori class probability
    calculation.
    Initializes pos, neg, and neutral feature count dictionaries.
    """
    def __init__(self, pos, neg):

    """
    An implementation of Jurafsky's MN Bayes Network
    algorithm.
    """
    def train(self):