In [None]:
# Not re-inventing the wheel.
# Copied from edison project.

In [1]:
import os
import spacy

from spacy import displacy

In [2]:
# Load the 'en_core_web_lg' spaCy model once; `nlp` is the pipeline reused by the cells below.
nlp = spacy.load('en_core_web_lg')

## Objective

Take something like:

1. Learn Computer Science With JavaScript: Part 4, Functions
1. Create Interactive Charts Using Plotly.js, Part 1: Getting Started

And come out with keywords:

1. Computer Science
1. JavaScript
1. Functions
1. Interactive Charts
1. Plotly.js

## Acceptance Criteria

It is okay for the extractor to extract things that are not exactly concepts. For example, in the case above, it might also come up with:

1. Getting Started

Another classifier can use Word2Vec to exclude generic terms, but it's not in the scope of this extractor, because we're only looking at the grammatical structure.

## Core idea

Encode every token inside a title as:

1. The tag of the parent
1. The dependency to the parent
1. ...

### Exploration

In [3]:
# Parse one example title so its tokens and dependencies can be inspected below.
doc = nlp('Create Interactive Charts Using Plotly.js, Part 1: Getting Started')
# Draw the parse with displaCy directly in the notebook output (jupyter=True).
displacy.render(doc, jupyter=True)

  "__main__", mod_spec)
  "__main__", mod_spec)


In [5]:
def test_features(token):
    print("Token:", token)
    print("Token tag:", token.tag_)
    print("Token parent dep:", token.dep_)
    print("Token parent tag:", token.head.tag_)
    print("Children:")

In [6]:
# Inspect the token at index 1 of the parsed title ("Interactive").
test_features(doc[1])

Token: Interactive
Token tag: NNP
Token parent dep: compound
Token parent tag: NNS
Children:


In [12]:
# Inspect the token at index 2 of the parsed title ("Charts").
test_features(doc[2])

Token: Charts
Token tag: NNS
Token parent dep: dobj
Token parent tag: VB


#### ---------------- End of the concept extraction logic ---------------- ####

### The following logic extracts concepts from the HTML files ------

In [8]:
# For every noun chunk in the title, show its text, the dependency label of
# its root token, and the word that root attaches to.
for chunk in doc.noun_chunks:
    chunk_root = chunk.root
    print(chunk.text, chunk_root.dep_, chunk_root.head.text)

Interactive Charts dobj Create
Plotly.js dobj Using
Part appos Charts


In [54]:
import os
### Step 1: Import necessary packages here ###

import csv
import pandas
import numpy

# feedparser helps convert XML feeds to a hash (dict-like structure)
# Install: conda install feedparser
import feedparser

# BeautifulSoup helps to grab text out of html
# Install: conda install beautifulsoup4
from bs4 import BeautifulSoup

import json
import urllib3

from collections import Counter

import newspaper
from newspaper import Article
from pandas import read_csv
from lxml import html
import requests

In [62]:
def clean_me(html):
    """Return the text content of an HTML document as a single string.

    The markup is parsed with BeautifulSoup's "lxml" parser, every
    ``<script>`` and ``<style>`` element is removed, and the remaining
    whitespace-stripped text fragments are joined with single spaces.

    Args:
        html: The HTML markup to clean.

    Returns:
        The space-joined text fragments of the document.
    """
    parsed = BeautifulSoup(html, "lxml")
    # Script and style bodies are code, not prose, so drop them before reading text.
    for unwanted in parsed.find_all(['script', 'style']):
        unwanted.decompose()
    text_fragments = list(parsed.stripped_strings)
    return ' '.join(text_fragments)

In [63]:

# Directory that holds the downloaded article HTML files.
ARTICLES_DIR = "Machine Learning Mastery articles"

# Dependency labels a noun chunk's root must carry for the chunk to be kept.
KEYWORD_DEPS = ('pobj', 'nsubj', 'compound')

# os.listdir() returns entries in arbitrary order; sort so that the "first"
# file is the same one on every run and on every platform.
html_files = sorted(os.listdir(ARTICLES_DIR))
html_file = html_files[0]

keywords = list()

# NOTE(review): `html` rebinds the name brought in by `from lxml import html`;
# the name is kept in case later cells read it -- confirm, then rename.
with open(os.path.join(ARTICLES_DIR, html_file)) as f:
    html = f.read()

# Parse after the file is closed so the handle is not held during the NLP pass.
scrubbed_content = clean_me(html)
current_doc = nlp(scrubbed_content)

for chunk in current_doc.noun_chunks:
    root = chunk.root

    # Keep chunks whose root is a proper noun in one of the accepted roles.
    # (A dobj / NN / NOUN rule was also tried and is currently disabled.)
    if root.dep_ in KEYWORD_DEPS and root.tag_ == 'NNP' and root.pos_ == 'PROPN':
        keywords.append(chunk.text)

    # Trace every chunk, kept or not, to help tune the rule above.
    print(chunk.text + " dep: " + root.dep_ + " with tag: " + root.tag_ + " root.head.text: " + root.head.text + " POS: " + root.pos_)

Introduction dep: ROOT with tag: NN root.head.text: Introduction POS: NOUN
the Python Deep Learning Library TensorFlow - Machine Learning Mastery Navigation Machine Learning Mastery dep: pobj with tag: NNP root.head.text: to POS: PROPN
developers dep: nsubj with tag: NNS root.head.text: awesome POS: NOUN
Blog Books dep: nsubj with tag: NNP root.head.text: Need POS: PROPN
Contact dep: pobj with tag: NNP root.head.text: About POS: PROPN
help dep: dobj with tag: NN root.head.text: Need POS: NOUN
Deep Learning dep: pobj with tag: NNP root.head.text: with POS: PROPN
the FREE Mini-Course Home Empty Menu Return dep: dobj with tag: NNP root.head.text: Take POS: PROPN
Content Introduction dep: pobj with tag: NNP root.head.text: to POS: PROPN
the Python Deep Learning Library TensorFlow dep: pobj with tag: NNP root.head.text: to POS: PROPN
Jason Brownlee dep: pobj with tag: NNP root.head.text: By POS: PROPN
May dep: pobj with tag: NNP root.head.text: on POS: PROPN
Deep Learning Share dep: pobj wi

In [64]:
keywords

['the Python Deep Learning Library TensorFlow - Machine Learning Mastery Navigation Machine Learning Mastery',
 'Blog Books',
 'Contact',
 'Deep Learning',
 'Content Introduction',
 'the Python Deep Learning Library TensorFlow',
 'Jason Brownlee',
 'May',
 'Deep Learning Share',
 'Twitter Tweet Share',
 'Google Plus Share TensorFlow',
 'Google',
 'TensorFlow',
 'Deep Learning',
 'the Python Deep Learning Library TensorFlow Photo',
 'Nicolas Raymond',
 'TensorFlow',
 'TensorFlow',
 'Google',
 'Deep Learning',
 'Theano',
 'Install TensorFlow Installation',
 'TensorFlow',
 'TensorFlow',
 'Python',
 'Installation',
 'the Download',
 'the GPU',
 'TensorFlow Computation',
 'TensorFlow',
 'tf',
 'TensorFlow',
 'TensorFlow',
 'tf',
 'np',
 'NumPy',
 'W',
 'Tensorflow',
 'xrange(201',
 'NumPy',
 'W',
 'Tensorflow',
 'xrange',
 'TensorFlow',
 'Deep Learning',
 'Python',
 'Udacity TensorFlow',
 'TensorFlow',
 'Your Progress',
 'Deep Learning',
 'Minutes',
 'Python Discover',
 'my new Ebook',
 'Py