# Topic Modeling 

What are the most recent tech innovations and subjects people write about on the web?

We will build a dataset from rss feeds of O'Reilly radar, ArsTechnica and IEEE Spectrum websites and with topic modeling find the main topics that these sites talk about.




# 1. Create or load the dataset

Run the code below. This will create a list of 995 documents.
**Note**: You can edit the list of rss feeds below.

First install feedparser with

        pip install feedparser
        
You can also load the data with pickle

        import pickle
        file = open("../data/techno_feed.txt",'rb')
        docs = pickle.load(file)


# 2. Parse and clean

Remove 
* html tags with Beautifulsoup
* All ponctuation (watch out for weird characters like \xa0)
* Stopwords with NLTK

And tokenize with NLTK

# 3. TfIdf and LDA

With Gensim

* Create a tf-Idf representation of the documents
* Apply an LDA model to the corpus. 
    * try different number of topics
    
* Use LDAVis to visualize your topics




In [1]:
# Code to create the documents

import feedparser
# http://spectrum.ieee.org/static/rss

feeds = [ 'http://spectrum.ieee.org/rss/blog/energywise/fulltext',
'http://spectrum.ieee.org/rss/blog/cars-that-think/fulltext',
'http://spectrum.ieee.org/rss/blog/the-human-os/fulltext',
'http://spectrum.ieee.org/rss/blog/riskfactor/fulltext',
'http://spectrum.ieee.org/rss/blog/nanoclast/fulltext',
'http://spectrum.ieee.org/rss/blog/tech-talk/fulltext',
'http://spectrum.ieee.org/rss/blog/view-from-the-valley/fulltext',
'http://spectrum.ieee.org/rss/aerospace/fulltext',
'http://spectrum.ieee.org/rss/at-work/fulltext',
'http://spectrum.ieee.org/rss/blog/automaton/fulltext',
'http://strata.oreilly.com/feed',
'http://feeds.arstechnica.com/arstechnica/technology-lab',
'http://feeds.arstechnica.com/arstechnica/gadgets',
'http://feeds.arstechnica.com/arstechnica/business',
'http://feeds.arstechnica.com/arstechnica/security',
'http://feeds.arstechnica.com/arstechnica/tech-policy',
'http://feeds.arstechnica.com/arstechnica/apple',
'http://feeds.arstechnica.com/arstechnica/gaming',
'http://feeds.arstechnica.com/arstechnica/science',
'http://feeds.arstechnica.com/arstechnica/multiverse',
'http://feeds.arstechnica.com/arstechnica/cars',
'http://feeds.arstechnica.com/arstechnica/staff-blogs',
]

documents = []

for rss in feeds:
    llog = feedparser.parse(rss)
    print("%d entries for %s" % (len(llog.entries), rss) )
    for entry in llog.entries:
        content = entry.content[0].value
        documents.append(content)



75 entries for http://spectrum.ieee.org/rss/blog/energywise/fulltext
75 entries for http://spectrum.ieee.org/rss/blog/cars-that-think/fulltext
75 entries for http://spectrum.ieee.org/rss/blog/the-human-os/fulltext
40 entries for http://spectrum.ieee.org/rss/blog/riskfactor/fulltext
75 entries for http://spectrum.ieee.org/rss/blog/nanoclast/fulltext
75 entries for http://spectrum.ieee.org/rss/blog/tech-talk/fulltext
75 entries for http://spectrum.ieee.org/rss/blog/view-from-the-valley/fulltext
75 entries for http://spectrum.ieee.org/rss/aerospace/fulltext
75 entries for http://spectrum.ieee.org/rss/at-work/fulltext
75 entries for http://spectrum.ieee.org/rss/blog/automaton/fulltext
60 entries for http://strata.oreilly.com/feed
20 entries for http://feeds.arstechnica.com/arstechnica/technology-lab
20 entries for http://feeds.arstechnica.com/arstechnica/gadgets
20 entries for http://feeds.arstechnica.com/arstechnica/business
20 entries for http://feeds.arstechnica.com/arstechnica/security

In [54]:
import numpy as np
import pandas as pd
import re
from sklearn.datasets import fetch_20newsgroups
from gensim import corpora, models, similarities

from nltk import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

import string


In [55]:
import pickle
file = open("../data/techno_feed.txt",'rb')
docs = pickle.load(file)



In [57]:
stop = set(stopwords.words('english'))
stop.update('also', 'might','much','says','it','its')
# list of Punctuation characters
punctuation_chars = list(string.punctuation)+["“,’"]

def cleanup(raw):
    # lowercase
    raw = raw.lower()
    # @ and dot in emails => keep recipients and domain names as words
    raw = re.sub('[@.]', ' ', raw)    
    # ponctuation
    raw = ''.join([ch for ch in raw if ch not in punctuation_chars])
    # numbers
    raw = re.sub('[0-9]+', '', raw)
    # tokenize
    raw = word_tokenize(raw)
    # stop words
    raw = [w for w in raw if w not in stop]
    # at least 3 letters
    raw = [w for w in raw if len(w) > 2]

    return raw



In [58]:
from bs4 import BeautifulSoup

clean_documents = []
for document in docs: clean_documents.append(BeautifulSoup(document, "html.parser").get_text())

In [59]:
tokenized = [ cleanup(raw) for raw in clean_documents ]


In [60]:
tok_doc = tokenized[0]
tok_doc.sort()
tok_doc

['abundance',
 'abundant',
 'acid',
 'active',
 'adding',
 'addition',
 'adds',
 'adds',
 'adoption',
 'advances',
 'allows',
 'also',
 'also',
 'also',
 'aluminum',
 'aluminum',
 'aluminum',
 'aluminum',
 'aluminumbased',
 'aluminums',
 'amounts',
 'amperehours',
 'anode',
 'anode',
 'anode',
 'anode',
 'anodes',
 'archer',
 'archer',
 'archer',
 'archer',
 'archer',
 'archer',
 'archer',
 'archer',
 'bridging',
 'capture',
 'capture',
 'capture',
 'captured',
 'carbon',
 'carbon',
 'carbon',
 'carbon',
 'carbon',
 'carbon',
 'carbon',
 'carbon',
 'carbon',
 'carbon',
 'carbon',
 'carbon',
 'carbon',
 'carbonates',
 'carbonrich',
 'catalyst',
 'cathode',
 'cathode',
 'cathode',
 'cathode',
 'cathode',
 'ccus',
 'cell',
 'cell',
 'cells',
 'cells',
 'cells',
 'cheap',
 'chemical',
 'chemical',
 'chemically',
 'chemicals',
 'chemicals',
 'chemicals',
 'colleague',
 'colleagues',
 'compounds',
 'compounds',
 'compounds',
 'conductive',
 'consists',
 'consists',
 'convert',
 'convert',
 '

In [66]:
stop.update(['like', 'dont', 'one', 'would', 'new', 'get', 'also', 'could', 'might','much','says','it','its'])
punctuation_chars = list(string.punctuation)+["“,’"]


print()
print("rm HTML")
from bs4 import BeautifulSoup

clean_documents = []
for document in docs: clean_documents.append(BeautifulSoup(document, "html.parser").get_text())

print()
print("Tokenize")
tokenized = list(map(cleanup, clean_documents))

print()
print("Dictionary")
dictionary = corpora.Dictionary(tokenized)
dictionary.filter_extremes(no_below=5, no_above=0.90)
print(dictionary)

print()
print("TfIdf")
corpus = [dictionary.doc2bow(text) for text in tokenized]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]




rm HTML

Tokenize

Dictionary
Dictionary(7862 unique tokens: ['anthropomorphic', 'tired', 'onetime', 'response', 'start']...)

TfIdf

LDA


[(0,
  '0.006*system + 0.006*data + 0.004*said + 0.004*time + 0.004*first + 0.003*million + 0.003*people + 0.003*information + 0.003*company + 0.003*software + 0.003*customers + 0.002*two + 0.002*make + 0.002*robot + 0.002*year'),
 (1,
  '0.005*time + 0.005*data + 0.004*used + 0.004*research + 0.003*make + 0.003*photo + 0.003*researchers + 0.003*robot + 0.003*first + 0.003*company + 0.003*technology + 0.003*work + 0.003*two + 0.002*many + 0.002*better'),
 (2,
  '0.004*first + 0.004*researchers + 0.004*it’s + 0.003*two + 0.003*car + 0.003*power + 0.003*system + 0.003*time + 0.003*make + 0.003*robot + 0.003*cars + 0.003*even + 0.003*energy + 0.003*years + 0.003*research'),
 (3,
  '0.007*robot + 0.006*data + 0.004*it’s + 0.003*system + 0.003*time + 0.003*way + 0.003*well + 0.003*company + 0.003*use + 0.003*two + 0.003*first + 0.003*work + 0.002*using + 0.002*used + 0.002*even'),
 (4,
  '0.005*robot + 0.004*it’s + 0.004*time + 0.003*even + 0.003*data + 0.003*system + 0.003*people + 0.003*m

In [68]:
print()
print("LDA")
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=50, alpha='auto', iterations = 250)
corpus_lda = lda[corpus]
lda.print_topics(50, num_words = 10)



LDA


[(0,
  '0.005*researchers + 0.005*people + 0.004*research + 0.004*photo + 0.004*data + 0.004*used + 0.004*say + 0.004*apple + 0.003*technology + 0.003*using'),
 (1,
  '0.010*cars + 0.008*car + 0.007*people + 0.006*women + 0.005*make + 0.004*may + 0.004*autonomous + 0.004*it’s + 0.004*selfdriving + 0.004*even'),
 (2,
  '0.006*time + 0.005*data + 0.005*graphene + 0.005*researchers + 0.003*research + 0.003*people + 0.003*technology + 0.003*said + 0.003*make + 0.003*system'),
 (3,
  '0.007*data + 0.006*credit + 0.005*ios + 0.005*customers + 0.005*lithium + 0.005*material + 0.005*system + 0.004*nuclear + 0.004*release + 0.004*battery'),
 (4,
  '0.007*andrew + 0.005*two + 0.004*system + 0.004*company + 0.004*time + 0.003*data + 0.003*first + 0.003*around + 0.003*team + 0.003*using'),
 (5,
  '0.005*apple + 0.004*company + 0.004*security + 0.004*said + 0.004*solar + 0.003*it’s + 0.003*team + 0.003*used + 0.003*make + 0.003*time'),
 (6,
  '0.004*system + 0.004*it’s + 0.003*researchers + 0.003*t

In [67]:
import pyLDAvis.gensim

import matplotlib.pyplot as plt
%matplotlib inline

ldavis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.display(ldavis)

  spec = inspect.getargspec(func)
  spec = inspect.getargspec(func)
  spec = inspect.getargspec(func)
  spec = inspect.getargspec(func)
  spec = inspect.getargspec(func)
  spec = inspect.getargspec(func)
  inline backend."""
  'retina', 'jpeg', 'svg', 'pdf'.""")
  use `figure_formats` instead)""")
  """
  """)
  def _config_changed(self, name, old, new):
