In [58]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

from collections import Counter

# Load library
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# download the set of stop words the first time
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')
import re

import math

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cesleemontgomery/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cesleemontgomery/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/cesleemontgomery/nltk_data...
[nltk_data]   Package words is already up-to-date!


# 2.3 - Modeling - content_based_model - Investigate the viability of the Mooney et al. text categorization ML approach.

### 1. Extract Data from Corpuses: Courses - Video Subtitles (Udacity), User Interests - Academic-related Articles (Wikipedia)

In [3]:
course_subs_DF = pd.read_csv('../../data/raw/videosubtitles_udacity/course_video_subtitles_udacity.csv')
wiki_page_path_DF = pd.read_csv('../../data/raw/articles_wikipedia/academic_outline_wikipedia_pages.csv')

In [7]:
print("Data Shapes: \n")
print("Course - Subtitles: ", course_subs_DF.shape)
print("User - Academic Articles", wiki_page_path_DF.shape)

Data Shapes: 

Course - Subtitles:  (30, 2)
User - Academic Articles (985, 7)


### 2. Pre-Processing

### 3. Calculate TF-IDF to see if keywords emerge.

### 4. Naive Bayes implementation

#### 4.1 Build table of P( word | Course ) words in Vocabulary x Courses*

*\*prior calculation not required since it won't affect rank of results.*

In [22]:
# Building function for easy comparison
def get_counter_from_list(templist, words):

    #lowercase,remove punctuation and non-alpha, split whitespace
    templist = " ".join(w for w in nltk.wordpunct_tokenize(str(templist)) \
         if w.lower() in words).lower()
    templist = re.sub("[^a-zA-Z\s]+", "", templist).split()

    # Load stop words, lemmatizer
    stop_words = stopwords.words('english')
    lemma = nltk.wordnet.WordNetLemmatizer()

    # Remove stop words, lowercase, lemmatize
    counter = Counter([lemma.lemmatize(word.lower()) for word in templist if word not in stop_words])
    return counter

In [50]:
words = set(nltk.corpus.words.words())

#create vocab from course subtitles
vocabulary = set()

work_df = course_subs_DF.copy()
work_df['subtitles_word_freq'] = pd.Series(dtype=object)

#1. Calc word frequencies for each course
for idx, course in work_df.iterrows():
    print(course.course_slug)
    #add Counter to df
    word_freqs = get_counter_from_list(course.subtitles, words)
    print(word_freqs.most_common(10), '\n\n')
    
    vocabulary.update(list(word_freqs.keys()))
    work_df.at[idx,'subtitles_word_freq'] = word_freqs
    
#calc likelihoods for each word | course
NB_model_features_DF = pd.DataFrame(index=sorted(vocabulary), columns=course_subs_DF.course_slug)

#2. log likelihood = log of rel. word frequency
    # count of occurences + 1 / total words + Vocab (laplace smoothing)
for idx, course in work_df.iterrows():
    word_freqs = course.subtitles_word_freq
    for word, count in word_freqs.items():
        logLikelihood = math.log(count + 1 / float(sum(word_freqs.values()) + len(vocabulary)))
        NB_model_features_DF.at[word, course.course_slug] = logLikelihood

Artificial+Intelligence+for+Robotics+
[('de', 2895), ('la', 2231), ('e', 1711), ('x', 1243), ('el', 1079), ('en', 850), ('robot', 787), ('un', 678), ('se', 656), ('para', 594)] 


High Performance Computing
[('one', 601), ('time', 523), ('n', 470), ('algorithm', 375), ('two', 374), ('p', 302), ('want', 282), ('first', 268), ('vertex', 265), ('memory', 262)] 


Computability, Complexity & Algorithms
[('one', 571), ('well', 450), ('vertex', 345), ('time', 339), ('machine', 338), ('two', 305), ('problem', 305), ('x', 301), ('first', 286), ('set', 284)] 


Machine Learning
[('de', 6329), ('e', 4809), ('um', 2830), ('right', 2600), ('going', 2438), ('one', 2274), ('x', 2161), ('se', 2113), ('thats', 2005), ('para', 1941)] 


Database+Systems+Concepts+&+Design+
[('user', 599), ('one', 445), ('look', 277), ('data', 267), ('going', 217), ('table', 214), ('regular', 213), ('first', 210), ('two', 207), ('result', 204)] 


Knowledge-Based AI_ Cognitive Systems
[('one', 660), ('knowledge', 660), (

In [73]:
NB_model_features_DF.head(20)

course_slug,Artificial+Intelligence+for+Robotics+,High Performance Computing,"Computability, Complexity & Algorithms",Machine Learning,Database+Systems+Concepts+&+Design+,Knowledge-Based AI_ Cognitive Systems,Educational Technology,Introduction to Operating Systems,Computer Networking,CSE+8803+Special+Topics_+Big+Data+,...,GT - Embedded Systems,Computational Photography,Machine Learning for Trading,Reinforcement Learning,Machine Learning_ Unsupervised Learning,Human-Computer Interaction,Software Development Process,Cyber-Physical Systems Security,Introduction to Graduate Algorithms,Software Architecture & Design
aa,,,0.693158,1.60944,,1.09862,,,,,...,,,,0.69315,,,,,,
aba,,,,5.41326e-06,,,,,,,...,,,,,,,,,,
abandon,,,,,,1.3863,,,,,...,,,,,,,,,,
abandoned,1.48167e-05,,,,,,,,,,...,,,,,,,,,,
abb,,,,,,,,,,,...,,,,,,,,,,
abbreviate,,,,,,,,,,,...,,,,,,,,,,1.43136e-05
abbreviation,,,,0.69315,,,,,,,...,,,,,,,,,,1.3863
abdomen,,,,,,,,,,,...,,,,,,,,,,
abduce,,,,,,0.693155,,,,,...,,,,,,,,,,
abduction,,,,,,3.04452,,,,,...,,,,,,,,,,


#### 4.2 Classifier func given feature table

In [174]:
def classifier(documents, model, transpose_and_sort=True):
    course_posteriors_DF = []
    
    if documents:
        for doc_name, doc_words in documents.items():
            model_subset_words = model[model.index.isin(doc_words)]
            logPosteriors = model_subset_words.sum(0)
            logPosteriors.rename(doc_name, inplace=True)
            course_posteriors_DF.append(logPosteriors)
        if transpose_and_sort:
            return pd.concat(course_posteriors_DF).transpose().sort_values(ascending=False)
        else:
            return pd.concat(course_posteriors_DF)

In [131]:
model = NB_model_features_DF
tempdict = get_counter_from_list(wiki_page_path_DF[wiki_page_path_DF['category_h2_1'] == 'Formal Sciences'].page_text.values, words)

print('Formal Sciences Matches: \n', classifier({'Formal Sciences':list(tempdict.keys())}, model))

Formal Sciences Matches: 
 course_slug
Introduction to Computer Vision            6737.941016
Machine Learning                           6491.038013
Reinforcement Learning                     6378.698345
Advanced Operating Systems                 6247.851620
Software Architecture & Design             5028.859991
Intro to Information Security              4991.067519
Introduction to Operating Systems          4803.783494
Knowledge-Based AI_ Cognitive Systems      4568.264911
GT - Embedded Systems                      4549.407639
Computational Photography                  4538.386974
High Performance Computer Architecture     4480.226009
Artificial Intelligence                    4446.548302
Introduction to Graduate Algorithms        3868.429664
Compilers_ Theory and Practice             3809.439464
Software Development Process               3753.643541
High Performance Computing                 3686.854293
Cyber-Physical Systems Security            3623.517004
Artificial+Intelligence+fo

Scary believable :DDDD

#### 4.3 Pre-compute table of P( Course | Interests ) 

In [180]:
work_list = []

model = NB_model_features_DF

print('Pre-computing course rankings for each interest.')
for idx, interest in wiki_page_path_DF.iterrows():
    workdict = get_counter_from_list(wiki_page_path_DF.iloc[idx].page_text, words)
    interest_name = " > ".join(list(filter(pd.notnull, [interest.category_h2_1, interest.sub_category_h3_2, interest.topic_3, interest.subtopic_4,interest.subtopic_5])))
    print(interest_name)
    work_list.append(classifier({interest_name:list(workdict.keys())}, model))

NB_model_recommendations_DF = pd.concat(work_list, axis=1); #display(NB_model_recommendations_DF.transpose())

Pre-computing course rankings for each interest.
Humanities
Humanities > Arts
Humanities > Arts > Performing arts
Humanities > Arts > Performing arts > Music
Humanities > Arts > Performing arts > Music > Accompanying
Humanities > Arts > Performing arts > Music > Chamber music
Humanities > Arts > Performing arts > Music > Church music
Humanities > Arts > Performing arts > Music > Conducting
Humanities > Arts > Performing arts > Music > Early music
Humanities > Arts > Performing arts > Music > Jazz studies
Humanities > Arts > Performing arts > Music > Musical composition
Humanities > Arts > Performing arts > Music > Music education
Humanities > Arts > Performing arts > Music > Music history
Humanities > Arts > Performing arts > Music > Musicology
Humanities > Arts > Performing arts > Music > Ethnomusicology
Humanities > Arts > Performing arts > Music > Music theory
Humanities > Arts > Performing arts > Music > Orchestral studies
Humanities > Arts > Performing arts > Music > Organology
Hu

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if sys.path[0] == '':


In [182]:
NB_model_recommendations_DF.transpose().tail(20)

Unnamed: 0,Advanced Operating Systems,Artificial Intelligence,Artificial+Intelligence+for+Robotics+,CS 6340_ Software Analysis & Testing,CSE+8803+Special+Topics_+Big+Data+,Compilers_ Theory and Practice,"Computability, Complexity & Algorithms",Computational Photography,Computer Networking,Cyber-Physical Systems Security,...,Introduction to Graduate Algorithms,Introduction to Health Informatics,Introduction to Operating Systems,Knowledge-Based AI_ Cognitive Systems,Machine Learning,Machine Learning for Trading,Machine Learning_ Unsupervised Learning,Reinforcement Learning,Software Architecture & Design,Software Development Process
Applied Sciences > Medicine and health > Radiology,1215.323109,875.092658,738.36783,790.074039,656.198331,851.135576,750.730699,1062.89208,741.84172,942.305354,...,775.204298,490.036969,1058.922184,1014.362409,1181.988501,782.199421,622.644546,1155.222479,1069.334967,878.850769
Applied Sciences > Medicine and health > Recreational therapy,558.505184,388.227082,306.052537,339.764649,254.506361,370.088919,337.574141,391.313894,330.659959,417.92519,...,357.968097,221.972107,483.606905,457.113821,479.71796,336.036987,243.985641,495.015252,482.131066,398.738016
Applied Sciences > Medicine and health > Rehabilitation medicine,430.141937,351.589408,266.937639,278.356958,281.183987,324.094779,290.304766,335.742262,269.187077,328.225081,...,304.161398,215.171,387.426274,378.051816,439.52242,272.471967,256.061875,440.977288,380.802041,333.401956
Applied Sciences > Medicine and health > Respiratory therapy,947.095749,629.34232,525.211744,564.037753,492.464138,661.545918,581.693522,671.432758,557.160755,739.363004,...,582.835017,430.466629,882.653106,770.673389,855.850581,550.945864,446.230018,887.472102,866.193027,719.316017
Applied Sciences > Medicine and health > Sleep medicine,1142.238155,827.802251,683.126577,742.373766,617.887013,806.772023,718.892462,901.592728,693.519254,842.002714,...,770.069207,497.079801,1010.956045,965.341268,1123.985914,725.075944,601.644423,1114.777262,1067.983451,866.772099
Applied Sciences > Medicine and health > Speech-language pathology,956.123519,643.817358,524.759634,568.911047,505.053871,634.241231,536.681923,635.245466,545.660044,690.779952,...,530.514929,380.850271,830.459325,761.241747,827.672968,540.41806,424.142339,837.67768,829.890245,689.644999
Applied Sciences > Medicine and health > Sports medicine,366.026428,255.382203,187.366831,200.087031,227.275683,237.691824,199.84948,272.99125,227.607755,303.462767,...,201.79586,212.220592,315.832601,304.30161,325.196914,225.905341,162.212581,322.370092,335.158236,282.420318
Applied Sciences > Medicine and health > Surgery,1584.878624,1167.321797,987.757949,1007.082363,846.494709,1177.387993,982.667579,1267.147248,951.850696,1153.711204,...,1042.006352,625.522819,1376.582997,1342.9824,1550.948604,989.79268,796.038865,1523.579136,1440.038614,1178.762692
Applied Sciences > Medicine and health > Surgery > Bariatric surgery,1127.507056,892.135945,739.99256,755.974827,633.321003,842.626043,748.683767,918.844706,733.088618,835.69923,...,826.843473,471.211615,1036.933856,963.277785,1152.505168,768.226593,602.350808,1157.911075,1020.201825,835.013395
Applied Sciences > Medicine and health > Surgery > Cardiothoracic surgery,1114.924314,919.446464,741.39757,728.530392,642.125953,823.058301,731.485473,898.663056,717.718079,810.23494,...,805.56204,448.530202,1010.485996,910.584081,1138.079371,736.820045,598.22577,1132.960836,1002.122596,835.775437


#### 4.4 Extract Explanations

#### 4.5 Scenarios

This is the closest we'll come to evaluation since we don't have historical user data.  Let's compare manually to other sane baselines.

#### 4.5.1 Categories Scenarios (w/Explanations) (Comparison to Random or TF-IDF)
#### 4.5.2 Computer Science vs Computer Science classes
### 5. Write to Firebase

### 6. Future Directions
- Add superset categories to documents bag of words to increase coverage?
- Turn into model 
- Limit unrelated interests

In [None]:
class MN_NaiveBayes:

    """
    Constructor for MN_NaiveBayes.
    Initializes overall counts of positive, negative, and neutral classes.
    Initializes overall document count for use in a priori class probability
    calculation.
    Initializes pos, neg, and neutral feature count dictionaries.
    """
    def __init__(self, pos, neg):

    """
    An implementation of Jurafsky's MN Bayes Network
    algorithm.
    """
    def train(self):