# 2.4.v1 - Modeling - content_based_model - TF-IDF based approach with n-grams and Entity Recognition

In [204]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse.csr import csr_matrix #need this if you want to save tfidf_matrix
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics.pairwise import linear_kernel

from collections import Counter, OrderedDict
import re
import ast
import math
import requests, json

# Load library
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# download the set of stop words the first time
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cesleemontgomery/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cesleemontgomery/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/cesleemontgomery/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/cesleemontgomery/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/cesleemontgomery/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/cesleemontgomery/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [267]:
wiki_page_path_DF = pd.read_csv('../../data/processed/academic_outline_wikipedia_pages.csv')
course_subs_DF = pd.read_csv('../../data/processed/course_data_udacity.csv')

In [3]:
words = set(nltk.corpus.words.words())

# Building function for easy comparison
def get_counter_from_list(templist, words=words):

    #lowercase,remove punctuation and non-alpha, split whitespace
    templist = " ".join(w for w in nltk.wordpunct_tokenize(str(templist)) \
         if w.lower() in words).lower()
    templist = re.sub("[^a-zA-Z\s]+", "", templist).split()

    # Load stop words, lemmatizer
    stop_words = stopwords.words('english')
    lemma = nltk.wordnet.WordNetLemmatizer()

    # Remove stop words, lowercase, lemmatize
    counter = Counter([lemma.lemmatize(word.lower()) for word in templist if word not in stop_words])
    return counter

### 1.0 - N-grams and Entity Recognition

Let's see if N-grams and wikipedia-based Entity Recognition can help our cause.

We want to transform this list of top occuring keywords into an ordered list of **meaningful** keywords.

In [4]:
tempname = 'Applied Sciences > Medicine and health'
tempdf = wiki_page_path_DF[wiki_page_path_DF.page_path_label == tempname].page_text.values

print("Most Common Words in '{}' course: \n".format(tempname))
for i in get_counter_from_list(tempdf, words).most_common(200): print(i)


Most Common Words in 'Applied Sciences > Medicine and health' course: 

('medicine', 229)
('medical', 116)
('health', 80)
('n', 69)
('care', 67)
('branch', 60)
('study', 38)
('treatment', 33)
('system', 33)
('disease', 31)
('surgery', 30)
('practice', 29)
('patient', 27)
('may', 26)
('diagnosis', 25)
('also', 25)
('science', 23)
('modern', 22)
('general', 20)
('knowledge', 19)
('history', 19)
('many', 19)
('concerned', 19)
('prevention', 18)
('traditional', 17)
('include', 17)
('physician', 16)
('based', 16)
('often', 15)
('first', 15)
('human', 15)
('clinical', 14)
('surgical', 14)
('used', 13)
('evidence', 13)
('examination', 13)
('one', 13)
('body', 13)
('research', 12)
('example', 12)
('world', 12)
('known', 11)
('e', 11)
('united', 11)
('emergency', 11)
('work', 11)
('training', 10)
('doctor', 10)
('development', 10)
('well', 10)
('laboratory', 10)
('primary', 10)
('function', 10)
('treat', 9)
('blood', 9)
('provided', 9)
('access', 9)
('life', 9)
('technology', 8)
('apply', 8)
('

### 1.1 - Get List of N-grams up to 4.

In [5]:
#1. get user model candidates
text = tempdf[0]
ngrams_vec = CountVectorizer(ngram_range=(1,4))
analyzer = ngrams_vec.build_analyzer()
ngrams = analyzer(text)
print(ngrams[0:100])

['medicine', 'is', 'the', 'science', 'and', 'practice', 'of', 'establishing', 'the', 'diagnosis', 'prognosis', 'treatment', 'and', 'prevention', 'of', 'disease', 'medicine', 'encompasses', 'variety', 'of', 'health', 'care', 'practices', 'evolved', 'to', 'maintain', 'and', 'restore', 'health', 'by', 'the', 'prevention', 'and', 'treatment', 'of', 'illness', 'contemporary', 'medicine', 'applies', 'biomedical', 'sciences', 'biomedical', 'research', 'genetics', 'and', 'medical', 'technology', 'to', 'diagnose', 'treat', 'and', 'prevent', 'injury', 'and', 'disease', 'typically', 'through', 'pharmaceuticals', 'or', 'surgery', 'but', 'also', 'through', 'therapies', 'as', 'diverse', 'as', 'psychotherapy', 'external', 'splints', 'and', 'traction', 'medical', 'devices', 'biologics', 'and', 'ionizing', 'radiation', 'amongst', 'others', 'medicine', 'has', 'been', 'around', 'for', 'thousands', 'of', 'years', 'during', 'most', 'of', 'which', 'it', 'was', 'an', 'art', 'an', 'area', 'of', 'skill']


In [105]:
len(ngrams)

36190

### 1.2 - Filter through wikipedia-based entity map

For time's sake a more efficient/quality way to check whether a word is an entity is to check it against Wikipedia titles.  This hack is likely to be a better bet because Wikipedia covers highly specific academic topics that might be hard to pull from existing named entity corpuses.

In [127]:
#1.1 some pre-processing

"""remove punctuation and non-alpha, split whitespace, lowercase, lemmatize"""
def preprocess_token(token):
    
        # Load stop words, lemmatizer
    stop_words = stopwords.words('english')
    lemma = nltk.wordnet.WordNetLemmatizer()

    processed_token = lemma.lemmatize( \
                            re.sub("[^a-zA-Z\s]+", "", token) \
                        .lower())
    if processed_token not in stop_words:
        return processed_token
    else:
        return None

ngrams_cleaned = set(filter(None, map(preprocess_token, ngrams)))
print(len(ngrams_cleaned), next(iter(ngrams_cleaned)))

23971 but the


In [10]:
#2. get wiki NER corpus
wiki_titles = set(token.lower() for token in open("../../data/raw/articletitles_wikipedia/enwiki-latest-all-titles-in-ns0.txt", "r") \
                .read().splitlines())
len(wiki_titles)

13870304

In [195]:
def is_not_type_disambiguation_page(title):
    url = """https://en.wikipedia.org/api/rest_v1/page/summary/{title}""".format(title=title)

    response = requests.get(url)
    if response.status_code == 200:
        data = json.loads(response.text)
        if data['type'] != 'disambiguation':
            return True
        else:
            return False

# tempset = set(map(lambda token: token.replace(" ", "_"), ngrams_cleaned))
# ngrams_search_patterns = tempset | set('{token}_(computing)'.format(token=g) for g in ngrams_search_patterns)
# ngrams_search_patterns_matches = set.intersection(ngrams_search_patterns, wiki_titles)
ngrams_search_patterns_matches_nondisambiguated = filter(lambda title: is_not_type_disambiguation_page(title), ngrams_search_patterns_matches)
user_model_entities = set(map(lambda token: token.replace("_(computing)", "").replace("_", " "), ngrams_search_patterns_matches_nondisambiguated))

Let's see how we did by filtering out ngrams given identification as a wikipedia-based entity...

In [141]:
len(ngrams_search_patterns_matches)

2750

In [156]:
ngrams_search_patterns_matches

{'ethnography',
 'prescribing',
 'great',
 'mixed',
 'tension',
 'visual arts',
 'developed',
 'bc',
 'visceral',
 'take care',
 'separation',
 'sens',
 'herbal medicines',
 'pharmacopoeia',
 'hobby',
 'from one',
 'for example',
 'whole',
 'action',
 'indicator',
 'family members',
 'highly',
 'design',
 'certain',
 'textbook',
 'to each',
 'said',
 'application',
 'healthcare provider',
 'continue',
 'vascular',
 'podiatric',
 'sampling',
 'cardiology',
 'lutz',
 'the canon of medicine',
 'for all',
 'mechanic',
 'luria',
 'the study',
 'weight',
 'endoscopy',
 'flying',
 'nafis',
 'the digestive system',
 'bimaristan',
 'medieval islamic medicine',
 'the black',
 'word',
 'emission computed tomography',
 'gain',
 'science and technology',
 'all ages',
 'possession',
 'impacted',
 'humani',
 'given',
 'evolve',
 'unnecessary health care',
 'religion',
 'overtone',
 'history',
 'ordering',
 'radiopharmaceutical',
 'the skin',
 'evolution',
 'modern era',
 'weapon',
 'plant',
 'elderly

Not bad... I can tell there are still some pages that don't look right like "whole"... can we remove these likely disambiguation pages?

In [193]:
len(ngrams_search_patterns_matches_nondisambiguated)

1705

In [196]:
ngrams_search_patterns_matches_nondisambiguated

{'ethnography',
 'visual arts',
 'developed',
 'visceral',
 'sens',
 'herbal medicines',
 'pharmacopoeia',
 'hobby',
 'family members',
 'design',
 'certain',
 'textbook',
 'said',
 'application',
 'healthcare provider',
 'vascular',
 'podiatric',
 'cardiology',
 'lutz',
 'for all',
 'mechanic',
 'weight',
 'endoscopy',
 'nafis',
 'bimaristan',
 'medieval islamic medicine',
 'word',
 'emission computed tomography',
 'science and technology',
 'all ages',
 'unnecessary health care',
 'religion',
 'overtone',
 'history',
 'radiopharmaceutical',
 'evolution',
 'modern era',
 'weapon',
 'plant',
 'elderly',
 'chemical components',
 'and how',
 'biopsy',
 'defines',
 'thinking',
 'there is',
 'burn',
 'nation',
 'philosophical',
 'hyperbaric medicine',
 'usually',
 'medica',
 'royal college',
 'van',
 'climate',
 'crisis',
 'prevention of disease',
 'intensive care',
 'brazil',
 'emotional',
 'epistemological',
 'well',
 'take',
 'surgical',
 'decade',
 'intentionally',
 'program',
 'learni

I feel pretty good about this!! Removes a lot of words that users might consider random if they showed up alongside recommendations.  Again this is important in order to gain their confidence in the system's recommendations.

Let's move on.

### 2.0 - TF-IDF of user model wiki-based entities

In [199]:
len(user_model_entities)

1693

In [224]:
#1. use vocabulary user_model_entities to get counts for all user model documents (academic interests)
vectorizer = CountVectorizer(vocabulary=list(user_model_entities), ngram_range=(1,4))
ngram_counts = vectorizer.fit_transform(wiki_page_path_DF.page_text.astype('U'))
#2. get TF-IDF weights
transformer = TfidfTransformer()
tfidf_matrix = transformer.fit_transform(ngram_counts)

In [252]:
tfidif_df = pd.DataFrame(tfidf_matrix.toarray(), index=wiki_page_path_DF.page_path_label, columns = vectorizer.get_feature_names())
tfidif_df.head()

Unnamed: 0_level_0,ethnography,visual arts,developed,visceral,sens,herbal medicines,pharmacopoeia,hobby,family members,design,...,reproductive medicine,office,world health organization,ankle,hand,healthcare system,the wild,chain,conducted,fungi
page_path_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Humanities,0.0,0.044866,0.009191,0.0,0.0,0.0,0.0,0.0,0.0,0.007141,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Humanities > Arts,0.0,0.166435,0.017047,0.0,0.0,0.0,0.0,0.0,0.0,0.132454,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Humanities > Arts > Performing arts,0.0,0.043364,0.039974,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.018858,0.0,0.0,0.0,0.0,0.0
Humanities > Arts > Performing arts > Music,0.0,0.0,0.034976,0.0,0.0,0.0,0.0,0.019613,0.0,0.0,...,0.0,0.0,0.0,0.0,0.028286,0.0,0.0,0.011266,0.0,0.0
Humanities > Arts > Performing arts > Music > Accompanying,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.068263,0.0,0.0,0.0,0.0,0.0


In [256]:
np.shape(tfidf_matrix)

(985, 1693)

In [266]:
test_index = 893
cosine_similarities = linear_kernel(tfidf_matrix[test_index:test_index+1], tfidf_matrix).flatten()

In [263]:
related_docs_indices = cosine_similarities.argsort()[:-10:-1];related_docs_indices

array([893, 732, 926, 981, 894, 971, 689, 284, 443])

In [264]:
cosine_similarities[related_docs_indices]

array([1.        , 0.69430095, 0.57065178, 0.56012659, 0.55979972,
       0.50258978, 0.49716443, 0.44439938, 0.44340904])

In [265]:
df.index[related_docs_indices]

Index(['Applied Sciences > Medicine and health',
       'Formal Sciences > Computer Science > Computing in mathematics, natural sciences, engineering, and medicine',
       'Applied Sciences > Medicine and health > Internal medicine',
       'Applied Sciences > Medicine and health > Traditional medicine',
       'Applied Sciences > Medicine and health > Alternative medicine',
       'Applied Sciences > Medicine and health > Sports medicine',
       'Natural Sciences > Physics > Medical physics',
       'Social sciences > Human geography > Health geography',
       'Social sciences > Sociology > Medical sociology'],
      dtype='object', name='page_path_label')

Cool beans! There's some sense to this TF-IDF because it figues that this top ten list of interests (wikipedia articles) are related! 

Now very importantly, what about courses? Let's try to find matches for our test interest = **'Applied Sciences > Medicine and health'**

In [283]:
templist = [wiki_page_path_DF[wiki_page_path_DF.page_path_label == tempname].page_text.values[0]]
templist.extend(course_subs_DF.subtitles.astype('U').values)

In [287]:
#1. use vocabulary user_model_entities to get counts for all user model documents (academic interests)
vectorizer = CountVectorizer(vocabulary=list(user_model_entities), ngram_range=(1,4))
ngram_counts = vectorizer.fit_transform(templist)
#2. get TF-IDF weights
transformer = TfidfTransformer()
tfidf_matrix = transformer.fit_transform(ngram_counts)

In [298]:
test_index = 0
cosine_similarities = linear_kernel(tfidf_matrix[test_index:test_index+1], tfidf_matrix).flatten()
related_docs_indices = cosine_similarities.argsort()[:-5:-1];related_docs_indices
scores = cosine_similarities[related_docs_indices[1:]]
course_labels = course_subs_DF.loc[[idx-1 for idx in related_docs_indices[1:]], 'course_name']
print(list(zip(course_labels, scores)))

[('Intro to Health Informatics', 0.3625228519408706), ('Special Topics: Big Data for Health Informatics', 0.19956110155206086), ('Software Development Process', 0.08483674228283665)]


Holy crap!!!  That seems to have worked sort of well.  Let's clean this up and run it for real.