In [239]:
import feedparser
import re

# Returns title and dictionary of word counts for an RSS Feed.
def get_word_counts(url):
    # Parse the feed.
    d = feedparser.parse(url)

    # The title of the feed.
    title = d.feed.title

    wc = {}
    
    # Loop over all the entries.
    for e in d.entries:
        if 'summary' in e: summary = e.summary
        else: summary = e.description
        
        # Extract a list of words.
        words = get_words(f'{title} {summary}')
        for word in words:
            wc.setdefault(word, 0)
            wc[word] += 1
    return title, wc

In [240]:
def get_words(html):
    # Remove all html tags.
    txt = re.compile(r'<[^>]+>').sub('', html)
    
    # Split words by all non-alpha characters.
    words = re.compile(r'[^A-Z^a-z]+').split(txt)
    
    # Convert to lowercase.
    return [word.lower() for word in words if word != '']

In [241]:
feedlist = '''http://feeds.feedburner.com/37signals/beMH
http://feeds.feedburner.com/blogspot/bRuz
http://feeds.feedburner.com/JohnBattellesSearchblog
https://guykawasaki.com/feed/
http://blog.outer-court.com/rss.xml
https://searchenginewatch.com/tag/rss-feed/feed/
http://www.topix.com//rss/news/blogs
http://feeds.abcnews.com/abcnews/blotterheadlines
https://gigaom.com/feed/
http://gizmodo.com/index.xml
http://gofugyourself.typepad.com/go_fug_yourself/index.rdf
http://googleblog.blogspot.com/rss.xml
http://feeds.feedburner.com/GoogleOperatingSystem
http://headrush.typepad.com/creating_passionate_users/index.rdf
http://feeds.feedburner.com/pjmedia/instapundit
https://blog.zawodny.com/feed/
http://joi.ito.com/index.rdf
http://feeds.feedburner.com/Mashable
http://michellemalkin.com/feed/
http://moblogsmoproblems.blogspot.com/rss.xml
http://newsbusters.org/node/feed
http://feeds.feedburner.com/paulstamatiou
http://feeds.feedburner.com/powerlineblog/livefeed
http://radar.oreilly.com/index.rdf
http://scienceblogs.com/pharyngula/feed/
http://scobleizer.wordpress.com/feed/
http://sethgodin.typepad.com/seths_blog/index.rdf
http://rss.slashdot.org/Slashdot/slashdot
http://thinkprogress.org/feed/
http://feeds.feedburner.com/andrewsullivan/rApM
http://wilwheaton.typepad.com/wwdnbackup/index.rdf
http://www.43folders.com/feed/
http://www.456bereastreet.com/feed.xml
http://www.autoblog.com/rss.xml
http://www.bloggersblog.com/rss.xml
http://www.blogmaverick.com/rss.xml
http://www.boingboing.net/index.rdf
http://www.buzzmachine.com/index.xml
http://www.captainsquartersblog.com/mt/index.rdf
http://feeds.coolhunting.com/ch
http://feeds.copyblogger.com/Copyblogger
http://feeds.feedburner.com/crooksandliars/YaCP
http://feeds.dailykos.com/dailykos/index.xml
http://www.deadspin.com/index.xml
http://www.huffingtonpost.com/feeds/verticals/technology/index.xml
http://www.engadget.com/rss.xml
https://www.gapingvoid.com/feed/
http://www.gothamist.com/index.rdf
http://www.huffingtonpost.com/raw_feed_index.rdf
http://www.hyperorg.com/blogger/index.rdf
http://www.joelonsoftware.com/rss.xml
http://www.kotaku.com/index.xml
http://feeds.kottke.org/main
http://www.lifehack.org/feed/
http://www.lifehacker.com/index.xml
http://site2.littlegreenfootballs.com/feed
http://makezine.com/feed/
http://www.mattcutts.com/blog/feed/
http://xml.metafilter.com/rss.xml
http://www.mezzoblue.com/rss/index.xml
http://www.neilgaiman.com/journal/feed/rss.xml
http://www.oilman.ca/feed/
http://www.perezhilton.com/index.xml
http://www.plasticbag.org/index.rdf
http://www.powazek.com/rss.xml
http://www.problogger.net/feed/
http://feeds.feedburner.com/QuickOnlineTips
http://readwrite.com/feed/
http://www.schneier.com/blog/index.rdf
http://scienceblogs.com/feed/
http://www.seroundtable.com/index.rdf
http://www.shoemoney.com/feed/
http://www.sifry.com/alerts?format=RSS
http://simplebits.com/feed.xml
http://feeds.feedburner.com/Spikedhumor
http://www.stevepavlina.com/blog/feed
https://talkingpointsmemo.com/feed/all
http://www.tbray.org/ongoing/ongoing.rss
http://feeds.feedburner.com/TechCrunch
http://www.techdirt.com/techdirt_rss.xml
http://www.techeblog.com/elephant/?mode=atom
http://www.thesuperficial.com/feed
http://www.tmz.com/rss.xml
https://www.treehugger.com/feeds/latest/
http://feeds.gawker.com/gizmodo/full
http://we-make-money-not-art.com/feed/
http://www.wired.com/rss/index.xml
https://wonkette.com/feed'''.split('\n')

In [242]:
import os

if not os.path.exists('feed.pickle'):
    # The number of blogs each word appeared.
    appeared_counts = {}

    # The word count for each blog.
    word_counts = {}

    for url in feedlist:
        try:
            title, wc = get_word_counts(url)
            word_counts[title] = wc
            for word, count in wc.items():
                appeared_counts.setdefault(word, 0)
                if count > 1:
                    appeared_counts[word] += 1
        except Exception as e:
            print(e)

True

True

In [243]:
import pickle

try:
    with open('feed.pickle', 'rb') as f:
        data = pickle.load(f)
        print('loaded', len(data))
        word_counts = data['word_counts']
        appeared_counts = data['appeared_counts']
except EOFError as e:
    with open('feed.pickle', 'wb') as f:
        data = {'appeared_counts': appeared_counts,
                    'word_counts': word_counts}
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

loaded 2


In [244]:
wordlist = []
for w, bc in appeared_counts.items():
    frac = float(bc) / len(feedlist)
    # Skip the ones that are less than 10% and more than 50%.
    if frac > 0.1 and frac < 0.5: wordlist.append(w)

In [245]:
with open('blogdata.txt', 'w') as f:
    # First line is the header.
    f.write('Blog')
    for word in wordlist: f.write(f'\t{word}')
    f.write('\n')
    
    # Subsequent lines are the body.
    for blog, wc in word_counts.items():
        if blog.strip() == '':
            continue
        f.write(blog)
        for word in wordlist:
            if word in wc: f.write(f'\t{wc[word]}')
            else: f.write(f'\t{0}')
        f.write('\n')
True

True

In [246]:
def read_file(filename):
    lines = [line for line in open(filename)]
    
    # First line is the column title.
    cols = lines[0].strip().split('\t')[1:]
    rows = []
    data = []
    for line in lines[1:]:
        p = line.strip().split('\t')
        title = p[0]
        if title.strip() == '': continue
        
        # First column in each row is the row name.
        rows.append(p[0])
        
        # The data for this row is the remainder of the row.
        data.append([float(x) for x in p[1:]])
    return rows, cols, data

In [247]:
rows, cols, data = read_file('blogdata.txt')

In [248]:
from math import sqrt

def pearsonr(v1, v2):
    """Pearson correlation determines how similar two items are."""
    
    n = min(len(v1), len(v2))
    v1, v2 = v1[:n], v2[:n]
    # Simple sums.
    sum1 = sum(v1)
    sum2 = sum(v2)
    
    
    # Sum of squares.
    sum1_square = sum(pow(v, 2) for v in v1)
    sum2_square = sum(pow(v, 2) for v in v2)
    
    # Sum of products.
    sum_products = sum([v1[i] * v2[i] for i in range(n)])
    
    # Calculate the pearson score.
    num = sum_products - (sum1 * sum2 / n)
    den = sqrt((sum1_square - pow(sum1, 2) / n) * (sum2_square - pow(sum2, 2) / n))
    if den == 0: return 0
    return num / den

In [249]:
class bicluster:
    def __init__(self, vec, left = None, right = None, distance = 0.0, id = None):
        self.left = left
        self.right = right
        self.vec = vec
        self.distance = distance
        self.id = id

In [254]:
def hcluster(rows, distance = pearsonr):
    # Distances is the cache of the distance calculation.
    distances = {}
    current_cluster_id = -1
    
    # Clusters are initially just the rows.
    cluster = [bicluster(rows[i], id = i) for i in range(len(rows))]
    
    while len(cluster) > 1:
        lowest_pair = (0, 1)
        closest = distance(cluster[0].vec, cluster[1].vec)
        
        # Loop through every pair looking for the smallest distance.
        for i in range(len(cluster)):
            for j in range(i + 1, len(cluster)):
                key = (cluster[i].id, cluster[j].id)
                if key not in distances:
                    distances[key] = distance(cluster[i].vec, cluster[j].vec)
                d = distances[key]
                if d < closest:
                    closest = d
                    lowest_pair = (i, j)
        (x, y) = lowest_pair

        # Calculate the average of the two clusters.
        merge_vectors = [(cluster[x].vec[i] + cluster[y].vec[i])/2.0 
                         for i in range(len(cluster[0].vec))]
        
        # Create the new cluster.
        new_cluster = bicluster(merge_vectors, 
                                left=cluster[x],
                                right=cluster[y],
                                distance=closest,
                                id=current_cluster_id)
        
        # Cluster ids that weren't in the original set are negative.
        current_cluster_id -= 1
        del cluster[y]
        del cluster[x]
        cluster.append(new_cluster)
    return cluster[0]

In [255]:
blognames, words, data = read_file('blogdata.txt')
cluster = hcluster(data)

In [285]:
def print_cluster(cluster, labels = None, n = 0):
    # Indent to make a hierachy layout.
    for i in range(n): print(' ', end='')
    if cluster.id < 0:
        # Negative id means that this is branch.
        print('-')
    else:
        # Positive id means that this is an endpoint.
        if labels == None: print(cluster.id),
        else: print(labels[cluster.id]),
    
    # Now print the right and left branches.
    if cluster.left != None: print_cluster(cluster.left, labels = labels, n = n + 1)
    if cluster.right != None: print_cluster(cluster.right, labels = labels, n= n + 1)

In [286]:
print_cluster(cluster, blognames)

-
 -
  -
   -
    The Official Google Blog
    -
     Neil Gaiman's Journal
     -
      COOL HUNTING
      -
       Schneier on Security
       Talking Points Memo
   -
    Wonkette
    -
     456 Berea Street
     -
      -
       Seth Godin's Blog on marketing, tribes and respect
       -
        Power LinePower Line
        -
         PaulStamatiou.com - Technology, Design and Photography
         Engadget RSS Feed
      -
       Matt Cutts: Gadgets, Google, and SEO
       ReadWrite
  -
   -
    Slashdot
    Derek Powazek
   -
    -
     Eschaton
     -
      Mashable
      -
       Google Blogoscoped
       -
        Captain's Quarters
        -
         -
          Autoblog
          Lifehack - Feed
         -
          O'Reilly Radar
          Tech
    -
     The Viral Garden
     -
      -
       The Write News
       Sifry's Alerts - David Sifry
      -
       -
        Instapundit
        -
         Google Operating System
         BuzzMachine
       -
        The Dish
      