In [None]:
# Not re-inventing the wheel.
# Copied from edison project.

In [1]:
import os
import spacy

from spacy import displacy

In [2]:
nlp = spacy.load('en_core_web_lg')

## Objective

Take something like:

1. Learn Computer Science With JavaScript: Part 4, Functions
1. Create Interactive Charts Using Plotly.js, Part 1: Getting Started

And come out with keywords:

1. Computer Science
1. JavaScript
1. Functions
1. Interactive Charts
1. Plotly.js

## Acceptance Criteria

It is okay for the extractor to extract things that are not exactly concepts. For example, in the case above, it might also come up with:

1. Getting Started

Another classifier can use Word2Vec to exclude generic terms, but it's not in the scope of this extractor, because we're only looking at the grammatical structure.

## Core idea

Encode every token inside a title as:

1. The tag of the parent
1. The dependency to the parent
1. ...

### Exploration

In [3]:
doc = nlp('Create Interactive Charts Using Plotly.js, Part 1: Getting Started')
displacy.render(doc, jupyter=True)

  "__main__", mod_spec)
  "__main__", mod_spec)


In [4]:
def test_features(token):
    print("Token:", token)
    print("Token tag:", token.tag_)
    print("Token parent dep:", token.dep_)
    print("Token parent tag:", token.head.tag_)
    print("Children:")

In [5]:
test_features(doc[1])

Token: Interactive
Token tag: NNP
Token parent dep: compound
Token parent tag: NNS
Children:


In [6]:
test_features(doc[2])

Token: Charts
Token tag: NNS
Token parent dep: dobj
Token parent tag: VB
Children:


#### ---------------- Here, it ends the concept extraction login ---------------- ####

### Following is the logic to extract concepts from all the html files ------

In [7]:
# Playing with #noun_chunks
for np in doc.noun_chunks:
    print(np.text, np.root.dep_, np.root.head.text)

Interactive Charts dobj Create
Plotly.js dobj Using
Part appos Charts


In [8]:
from bs4 import BeautifulSoup

In [9]:
# Get rid of HTML tags
def clean_me(html):
    soup = BeautifulSoup(html, "lxml")
    for s in soup(['script', 'style']):
        s.decompose()
    return ' '.join(soup.stripped_strings)

In [12]:
# Read all HTML files from Machine Learning Mastery articles folder.
html_files = os.listdir("Machine Learning Mastery articles")

# Initialise a list to capture all keywords
keywords = dict()

for html_file in html_files:
    try:
        with open("Machine Learning Mastery articles/" + html_file) as f:
            html = f.read()
            scrubbed_content = clean_me(html)
            current_doc = nlp(scrubbed_content)

            for np in current_doc.noun_chunks:
                if np.root.dep_ in ['pobj', 'nsubj','compound'] and np.root.tag_ == 'NNP' and np.root.pos_ == 'PROPN':
                    if np.text in keywords:
                        keywords[np.text] += 1
                    else:
                        keywords[np.text] = 0

                    # Handling words differently
            #         if np.root.dep_ == 'nsubj' and np.root.tag_ == 'NNP' and np.root.pos_ == 'PROPN':
            #             keywords.append(np.text)
            #         if np.root.dep_ == 'compound' and np.root.tag_ == 'NNP' and np.root.pos_ == 'PROPN':
            #             keywords.append(np.text)
            #         if np.root.dep_ == 'dobj' and np.root.tag_ == 'NN' and np.root.pos_ == 'NOUN':
            #             keywords.append(np.text)
            #         print(np.text + " dep: " + np.root.dep_ + " with tag: " + np.root.tag_ + " root.head.text: " + np.root.head.text + " POS: " + np.root.pos_)
    except:
        print(html_file)

.ipynb_checkpoints


In [13]:
keywords

{'the Python Deep Learning Library TensorFlow - Machine Learning Mastery Navigation Machine Learning Mastery': 0,
 'Blog Books': 514,
 'Contact': 497,
 'Deep Learning': 500,
 'Content Introduction': 4,
 'the Python Deep Learning Library TensorFlow': 2,
 'Jason Brownlee': 1207,
 'May': 108,
 'Deep Learning Share': 46,
 'Twitter Tweet Share': 1093,
 'Google Plus Share TensorFlow': 0,
 'Google': 36,
 'TensorFlow': 46,
 'the Python Deep Learning Library TensorFlow Photo': 0,
 'Nicolas Raymond': 1,
 'Theano': 68,
 'Install TensorFlow Installation': 0,
 'Python': 4391,
 'Installation': 0,
 'the Download': 0,
 'the GPU': 10,
 'TensorFlow Computation': 0,
 'tf': 5,
 'np': 10,
 'NumPy': 20,
 'W': 1,
 'Tensorflow': 22,
 'xrange(201': 0,
 'xrange': 0,
 'Udacity TensorFlow': 0,
 'Your Progress': 84,
 'Minutes': 109,
 'Python Discover': 63,
 'my new Ebook': 395,
 'Google Plus Share': 768,
 'Jason Brownlee Dr. Jason Brownlee': 543,
 'Machine Learning Developers Introduction': 0,
 'Machine Learning':

In [15]:
### Step 8: Write JSON data in text format in `scrubbed_machine_learning_mastery.json` ###

# write scrubbed JSON to `scrubbed_machine_learning_mastery.json`
import json
with open('extract_keywords.json', 'w') as outfile:
     json.dump(keywords, outfile, sort_keys = True, indent = 4,
               ensure_ascii = False)
print('Done writing to file!!!')

Done writing to file!!!


------ Below are the experimentations with textacy

In [None]:
#!pip install textacy

# !pip uninstall -y textacy[all]

In [1]:
import os
### Step 1: Import necessary packages here ###

import csv
import pandas
import numpy

# feedparser helps to xml to hash
# Install: conda install feedparser
import feedparser

# BeautifulSoup helps to grab text out of html
# Install: conda install beautifulsoup4
from bs4 import BeautifulSoup

import json
import urllib3

from collections import Counter

import newspaper
from newspaper import Article
from pandas import read_csv
from lxml import html
import requests

import textacy
import textacy.datasets

In [3]:
content = '''
     The apparent symmetry between the quark and lepton families of
     the Standard Model (SM) are, at the very least, suggestive of
     a more fundamental relationship between them. In some Beyond the
     Standard Model theories, such interactions are mediated by
     leptoquarks (LQs): hypothetical color-triplet bosons with both
     lepton and baryon number and fractional electric charge.'''
metadata = {
     'title': 'A Search for 2nd-generation Leptoquarks at √s = 7 TeV',
     'author': 'Burton DeWilde',
     'pub_date': '2012-08-01'}
doc = textacy.Doc(content)
# print(doc)

doc.to_bag_of_terms(ngrams=2, named_entities=True,lemmatize=True, as_strings=True)

{'': 4,
 'apparent symmetry': 1,
 'baryon numb': 1,
 'electric charge': 1,
 'fractional electric': 1,
 'fundamental relationship': 1,
 'hypothetical color': 1,
 'lepton family': 1,
 'model theory': 1,
 'standard model': 2,
 'the apparent': 1,
 'triplet boson': 1}

In [9]:
#### with textacy:


html_files = os.listdir("Machine Learning Mastery articles")
html_file  = html_files[0]

html_file

keywords = list()

with open("Machine Learning Mastery articles/" + html_file) as f:
    html = f.read()

scrubbed_content = clean_me(html)
current_doc = textacy.Doc(scrubbed_content)
current_doc.to_bag_of_terms(ngrams=2, named_entities=True,lemmatize=True, as_strings=True)

{'': 35,
 '# create 100': 2,
 '+ 0.3': 4,
 '+ b': 5,
 '-PRON- be': 1,
 '-PRON- could': 1,
 '-PRON- cover': 1,
 '-PRON- first': 3,
 '-PRON- first neural network': 1,
 '-PRON- free': 1,
 '-PRON- goal': 1,
 '-PRON- intend': 1,
 '-PRON- know': 2,
 '-PRON- learn': 1,
 '-PRON- own': 1,
 '-PRON- progress': 1,
 '-PRON- show': 1,
 '-PRON- tensorflow': 1,
 '-PRON- think': 2,
 '0': 3,
 '0.1': 8,
 '0.1 +': 4,
 '0.1000006': 2,
 '0.10000207': 2,
 '0.10000713': 2,
 '0.10002445': 2,
 '0.10008363': 2,
 '0.10028629': 2,
 '0.1009799': 2,
 '0.10335406': 2,
 '0.11148042': 2,
 '0.13929555': 2,
 '0.2629351': 2,
 '0.27992988': 2,
 '0.28697217': 2,
 '0.2941364': 2,
 '0.29828694': 2,
 '0.29995731': 2,
 '0.29999638': 2,
 '0.29999897': 2,
 '0.29999971': 2,
 '0.3': 4,
 '0.3 x_data': 1,
 '0.5': 1,
 '1': 4,
 '1 /usr': 1,
 '1 2': 3,
 '1 2 3 4 5': 1,
 '1 42': 1,
 '1 python': 1,
 '1.0': 2,
 '10': 2,
 '10 11': 2,
 '10 response': 1,
 '100': 3,
 '100 phony': 2,
 '11 12': 1,
 '12 13': 1,
 '120': 2,
 '12:00 pm': 1,
 '13': 1