In [52]:
# corpus 
# https://blog.bufferapp.com/the-most-popular-words-in-most-viral-headlines

# markov inspiration
# http://agiliq.com/blog/2009/06/generating-pseudo-random-text-with-markov-chains-u/

In [12]:
# TODO:
# scrape? http://www.clickhole.com/
# scrape? http://www.buzzfeed.com/
#         http://www.jeffbullas.com/2015/01/16/22-headlines-that-went-viral-have-these-marketers-cracked-the-code/
# scrape? businessinsider
# scrape? feedly

In [110]:
import pandas as pd
import nltk
import ftfy

### Load the headlines into a pandas DataFrame

In [20]:
# Workbook sheets to load; the 'Wimp' and 'Feedly' sheets are currently left out.
sheetnames = [
    'Buzzfeed',
    'ViralNova',
    'Upworthy',
]

In [43]:
# Read every sheet, tag its rows with the sheet they came from, and combine
# them with a single concat. Concatenating inside the loop would re-copy the
# accumulated frame on every iteration.
frames = []
for sheetname in sheetnames:
    print(sheetname)
    # NOTE(review): newer pandas releases spell this keyword `sheet_name`;
    # switch it if this notebook is run against a recent pandas.
    dfs = pd.read_excel( r'data/Viral-Title-Analysis-ripenn.xlsx', sheetname=sheetname )
    dfs['sheetname'] = sheetname
    frames.append(dfs)
# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat( frames, ignore_index=True ) if frames else pd.DataFrame()

Buzzfeed
ViralNova
Upworthy


In [44]:
# Inspect the column names of the combined frame.
df.columns

Index([               u'+1s',         u'CHAR COUNT',          u'Delicious',
                    u'Diggs',        u'FB Comments',           u'FB Likes',
                u'FB Shares',           u'FB Total',       u'FIRST PERSON',
          u'LinkedIn Shares',           u'NEGATIVE',             u'NUMBER',
                     u'Pins',           u'QUESTION',             u'Reddit',
       u'SEXUAL ORIENTATION',        u'StumbleUpon',              u'TITLE',
                   u'Tweets',                u'URL',                u'WHY',
                u'sheetname'],
      dtype='object')

In [45]:
# Keep only the headline text and the sheet each row came from.
df = df.loc[:, ['TITLE', 'sheetname']]

In [47]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,TITLE,sheetname
0,22 Messages From Creationists To People Who Be...,Buzzfeed
1,10 Pictures That Prove Bruno Mars Is Actually ...,Buzzfeed
2,23 Things All Servers Will Understand,Buzzfeed
3,10 Ways Canada Has Already Won The Winter Olym...,Buzzfeed
4,27 Weird And Creepy Vintage Valentine’s Day Cards,Buzzfeed


### Extract the headlines into a list and clean them

In [101]:
# Pull the TITLE column out of the frame as a plain Python list.
headlines = df.loc[:, 'TITLE'].tolist()

In [111]:
# Run every headline through ftfy.fix_text to repair its text.
headlines = list(map(ftfy.fix_text, headlines))

### Create a class for the Markov chain

In [137]:
import random

class Markov(object):
    """Second-order Markov chain text generator built from a list of sentences.

    Every sentence is split into tokens, and each pair of consecutive tokens
    is mapped to the list of tokens that were seen directly after that pair.
    Repeated followers are kept in the list, so ``random.choice`` picks a
    follower in proportion to how often it occurred.

    Attributes:
        sentences: list of token lists, one per input sentence.
        cache: dict mapping ``(token, token)`` -> list of following tokens.
    """

    def __init__(self, lst):
        """Tokenize ``lst`` (an iterable of sentence strings) and build the lookup table."""
        self.cache = {}
        self.sentences = self.tokenize_sentences( lst )
        self.create_cache()

    def tokenize_sentences(self, lst):
        """Return one token list per sentence, produced by ``nltk.word_tokenize``."""
        return [nltk.word_tokenize(s) for s in lst]

    def create_cache(self):
        """(Re)build ``self.cache`` from ``self.sentences``.

        The table is rebuilt from scratch, so calling this method more than
        once does not duplicate followers.
        """
        cache = {}
        for tokens in self.sentences:
            # Slide a three-token window over the sentence; sentences with
            # fewer than three tokens contribute nothing.
            for first, second, follower in zip(tokens, tokens[1:], tokens[2:]):
                cache.setdefault((first, second), []).append(follower)
        self.cache = cache

    def generate(self, size=10, verbose=True):
        """Generate text by walking the chain from a random seed pair.

        A sentence is picked at random, then a random pair of adjacent tokens
        in it is used as the seed. Up to ``size`` steps are taken; the walk
        stops early when the current pair has no recorded follower (for
        example when the pair only ever appeared at the end of a sentence).

        Args:
            size: maximum number of steps; the result holds at most
                ``size + 1`` tokens.
            verbose: when true (the default, matching the historical
                behaviour) print a trace of every lookup.

        Returns:
            The generated tokens joined by single spaces.

        Raises:
            ValueError: if no sentence has at least two tokens to seed from.
        """
        # A seed needs two adjacent tokens, so single-token sentences are skipped.
        seedable = [s for s in self.sentences if len(s) >= 2]
        if not seedable:
            raise ValueError('no sentence with at least two tokens to seed from')

        # random.choice / random.randrange exclude the upper bound, unlike
        # random.randint, so the sentence index can never run off the end.
        sentence = random.choice(seedable)
        start = random.randrange(len(sentence) - 1)
        w1, w2 = sentence[start], sentence[start + 1]
        if verbose:
            print(sentence)
            print('original seed:')
            print(w1)
            print(w2)
            print('- - - - - - -')

        outcome = []
        for _ in range(size):
            outcome.append(w1)
            followers = self.cache.get((w1, w2))
            if verbose:
                print('lookup:')
                print((w1, w2))
            if not followers:
                # Dead end: nothing was ever seen after this pair.
                if verbose:
                    print('no continuation; stopping')
                break
            if verbose:
                print('pick from:')
                print(followers)
            w1, w2 = w2, random.choice(followers)
            if verbose:
                print('new pair:')
                print((w1, w2))
                print('')
        outcome.append(w2)
        return ' '.join(outcome)

In [138]:
# Tokenize the cleaned headlines and build the word-pair lookup table.
m = Markov( headlines )

In [139]:
# Inspect the lookup table: (word, word) pair -> list of words seen after it.
m.cache

{(u'Guide', u'To'): [u'Figuring', u'Drawing', u'Becoming', u'Middle'],
 (u'He', u'Killed'): [u'Himself'],
 (u'Me', u'Out'): [u'Of', u'Of'],
 (u'Photo', u'Should'): [u'Be'],
 (u'The', u'Drugs'): [u'Kids'],
 (u'In', u'1990'): [u'.'],
 (u'Both', u'Disturbing'): [u'And'],
 (u'Thing', u'So'): [u'Bad'],
 (u'16', u'Cats'): [u'Who'],
 (u'Letter', u'\u2013'): [u'And'],
 (u'4', u'Insanely'): [u'Important'],
 (u'Coca-Cola', u"'s"): [u'Controversial'],
 (u'Boat', u','): [u'And'],
 (u'He', u'Turned'): [u'It'],
 (u'Without', u'A'): [u'Condom'],
 (u'35', u'Life'): [u'Hacks'],
 (u'His', u"'Revenge"): [u"'"],
 (u'Was', u'Doomed'): [u'After'],
 (u'These', u'Celebrities'): [u'Hate'],
 (u'Humiliate', u'A'): [u'Hate'],
 (u'Did', u'The'): [u'Unimaginable'],
 (u'26', u'Dumbest'): [u'Things'],
 (u'Trap', u'Door'): [u'In', u'In'],
 (u'So', u'Horrifying'): [u','],
 (u'Woman', u'Render'): [u'Jon'],
 (u'State', u'Dinner'): [u'Dress', u'Tonight'],
 (u'Harry', u'Potter'): [u'Addict', u'World', u'Has', u'And'],
 (u'

In [140]:
# Generate one headline using the default size.
m.generate()

[u'You', u"'ll", u'Have', u'No', u'Idea', u'What', u'You', u"'re", u'Seeing', u'In', u'These', u'20', u'Photos', u'.', u'But', u'Look', u'A', u'Little', u'Closer\u2026']
original seed:
Look
A
- - - - - - -
lookup:
(u'Look', u'A')
pick from:
[u'Little', u'Little', u'Little', u'Whole']
new pair:
(u'A', u'Whole')

lookup:
(u'A', u'Whole')
pick from:
[u'New', u'Lot', u'New', u'Lot']
new pair:
(u'Whole', u'Lot')

lookup:
(u'Whole', u'Lot')
pick from:
[u'Better', u'Easier']
new pair:
(u'Lot', u'Better')

lookup:
(u'Lot', u'Better')
pick from:
[u'Than', u'.']
new pair:
(u'Better', u'Than')

lookup:
(u'Better', u'Than')
pick from:
[u'Your', u'The', u'A', u'Ranch', u'Yours', u'UNC', u'A', u'These', u'You', u'The', u'Food']
new pair:
(u'Than', u'A')

lookup:
(u'Than', u'A')
pick from:
[u'Boyfriend', u'Crow', u'Bully', u'Photo']
new pair:
(u'A', u'Bully')

lookup:
(u'A', u'Bully')
pick from:
[u'Getting']
new pair:
(u'Bully', u'Getting')

lookup:
(u'Bully', u'Getting')
pick from:
[u'What']
new pai

u'Look A Whole Lot Better Than A Bully Getting What Did'