In [58]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

from collections import Counter

# Load library
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# download the set of stop words the first time
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')
import re

import math

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cesleemontgomery/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cesleemontgomery/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/cesleemontgomery/nltk_data...
[nltk_data]   Package words is already up-to-date!


# 2.3 - Modeling - content_based_model - Investigate the viability of the Mooney et al. text categorization ML approach.

### 1. Extract Data from Corpuses: Courses - Video Subtitles (Udacity), User Interests - Academic-related Articles (Wikipedia)

In [3]:
course_subs_DF = pd.read_csv('../../data/raw/videosubtitles_udacity/course_video_subtitles_udacity.csv')
wiki_page_path_DF = pd.read_csv('../../data/raw/articles_wikipedia/academic_outline_wikipedia_pages.csv')

In [7]:
print("Data Shapes: \n")
print("Course - Subtitles: ", course_subs_DF.shape)
print("User - Academic Articles", wiki_page_path_DF.shape)

Data Shapes: 

Course - Subtitles:  (30, 2)
User - Academic Articles (985, 7)


### 2. Pre-Processing

### 3. Calculate TF-IDF to see if keywords emerge.

### 4. Naive Bayes implementation

#### 4.1 Build table of P( word | Course ) words in Vocabulary x Courses*

*\*prior calculation not required since it won't affect rank of results.*

In [22]:
# Building function for easy comparison
def get_counter_from_list(templist, words):

    #lowercase,remove punctuation and non-alpha, split whitespace
    templist = " ".join(w for w in nltk.wordpunct_tokenize(str(templist)) \
         if w.lower() in words).lower()
    templist = re.sub("[^a-zA-Z\s]+", "", templist).split()

    # Load stop words, lemmatizer
    stop_words = stopwords.words('english')
    lemma = nltk.wordnet.WordNetLemmatizer()

    # Remove stop words, lowercase, lemmatize
    counter = Counter([lemma.lemmatize(word.lower()) for word in templist if word not in stop_words])
    return counter

In [50]:
words = set(nltk.corpus.words.words())

#create vocab from course subtitles
vocabulary = set()

work_df = course_subs_DF.copy()
work_df['subtitles_word_freq'] = pd.Series(dtype=object)

#1. Calc word frequencies for each course
for idx, course in work_df.iterrows():
    print(course.course_slug)
    #add Counter to df
    word_freqs = get_counter_from_list(course.subtitles, words)
    print(word_freqs.most_common(10), '\n\n')
    
    vocabulary.update(list(word_freqs.keys()))
    work_df.at[idx,'subtitles_word_freq'] = word_freqs
    
#calc likelihoods for each word | course
NB_model_features_DF = pd.DataFrame(index=sorted(vocabulary), columns=course_subs_DF.course_slug)

#2. log likelihood = log of rel. word frequency
    # count of occurences + 1 / total words + Vocab (laplace smoothing)
for idx, course in work_df.iterrows():
    word_freqs = course.subtitles_word_freq
    for word, count in word_freqs.items():
        logLikelihood = math.log(count + 1 / float(sum(word_freqs.values()) + len(vocabulary)))
        NB_model_features_DF.at[word, course.course_slug] = logLikelihood

Artificial+Intelligence+for+Robotics+
[('de', 2895), ('la', 2231), ('e', 1711), ('x', 1243), ('el', 1079), ('en', 850), ('robot', 787), ('un', 678), ('se', 656), ('para', 594)] 


High Performance Computing
[('one', 601), ('time', 523), ('n', 470), ('algorithm', 375), ('two', 374), ('p', 302), ('want', 282), ('first', 268), ('vertex', 265), ('memory', 262)] 


Computability, Complexity & Algorithms
[('one', 571), ('well', 450), ('vertex', 345), ('time', 339), ('machine', 338), ('two', 305), ('problem', 305), ('x', 301), ('first', 286), ('set', 284)] 


Machine Learning
[('de', 6329), ('e', 4809), ('um', 2830), ('right', 2600), ('going', 2438), ('one', 2274), ('x', 2161), ('se', 2113), ('thats', 2005), ('para', 1941)] 


Database+Systems+Concepts+&+Design+
[('user', 599), ('one', 445), ('look', 277), ('data', 267), ('going', 217), ('table', 214), ('regular', 213), ('first', 210), ('two', 207), ('result', 204)] 


Knowledge-Based AI_ Cognitive Systems
[('one', 660), ('knowledge', 660), (

In [73]:
NB_model_features_DF.head(20)

course_slug,Artificial+Intelligence+for+Robotics+,High Performance Computing,"Computability, Complexity & Algorithms",Machine Learning,Database+Systems+Concepts+&+Design+,Knowledge-Based AI_ Cognitive Systems,Educational Technology,Introduction to Operating Systems,Computer Networking,CSE+8803+Special+Topics_+Big+Data+,...,GT - Embedded Systems,Computational Photography,Machine Learning for Trading,Reinforcement Learning,Machine Learning_ Unsupervised Learning,Human-Computer Interaction,Software Development Process,Cyber-Physical Systems Security,Introduction to Graduate Algorithms,Software Architecture & Design
aa,,,0.693158,1.60944,,1.09862,,,,,...,,,,0.69315,,,,,,
aba,,,,5.41326e-06,,,,,,,...,,,,,,,,,,
abandon,,,,,,1.3863,,,,,...,,,,,,,,,,
abandoned,1.48167e-05,,,,,,,,,,...,,,,,,,,,,
abb,,,,,,,,,,,...,,,,,,,,,,
abbreviate,,,,,,,,,,,...,,,,,,,,,,1.43136e-05
abbreviation,,,,0.69315,,,,,,,...,,,,,,,,,,1.3863
abdomen,,,,,,,,,,,...,,,,,,,,,,
abduce,,,,,,0.693155,,,,,...,,,,,,,,,,
abduction,,,,,,3.04452,,,,,...,,,,,,,,,,


#### 4.2 Classifier func given feature table

In [129]:
def classifier(documents, model):
    course_posteriors_DF = []
    
    if documents:
        for doc_name, doc_words in documents.items():
            model_subset_words = model[model.index.isin(doc_words)]
            logPosteriors = model_subset_words.sum(0)
            logPosteriors.rename(doc_name, inplace=True)
            course_posteriors_DF.append(logPosteriors)
        return pd.concat(course_posteriors_DF).transpose().sort_values(ascending=False)

In [131]:
model = NB_model_features_DF
tempdict = get_counter_from_list(wiki_page_path_DF[wiki_page_path_DF['category_h2_1'] == 'Formal Sciences'].page_text.values, words)

print('Formal Sciences Matches: \n', classifier({'Formal Sciences':list(tempdict.keys())}, model))

Formal Sciences Matches: 
 course_slug
Introduction to Computer Vision            6737.941016
Machine Learning                           6491.038013
Reinforcement Learning                     6378.698345
Advanced Operating Systems                 6247.851620
Software Architecture & Design             5028.859991
Intro to Information Security              4991.067519
Introduction to Operating Systems          4803.783494
Knowledge-Based AI_ Cognitive Systems      4568.264911
GT - Embedded Systems                      4549.407639
Computational Photography                  4538.386974
High Performance Computer Architecture     4480.226009
Artificial Intelligence                    4446.548302
Introduction to Graduate Algorithms        3868.429664
Compilers_ Theory and Practice             3809.439464
Software Development Process               3753.643541
High Performance Computing                 3686.854293
Cyber-Physical Systems Security            3623.517004
Artificial+Intelligence+fo

Scary believable :DDDD

#### 4.3 Pre-compute table of P( Course | Interests ) 

### 5. Future Directions
- Add superset categories to documents bag of words to increase coverage?
- Turn into model 

In [None]:
class MN_NaiveBayes:

    """
    Constructor for MN_NaiveBayes.
    Initializes overall counts of positive, negative, and neutral classes.
    Initializes overall document count for use in a priori class probability
    calculation.
    Initializes pos, neg, and neutral feature count dictionaries.
    """
    def __init__(self, pos, neg):

    """
    An implementation of Jurafsky's MN Bayes Network
    algorithm.
    """
    def train(self):

In [None]:
print(course.category_h2_1,'>', course.sub_category_h3_2,'>', interest.topic_3,'>', interest.subtopic_4,'>', interest.subtopic_5)

