# Spark demo : Part II #

In [0]:
# Echo `sc` so the cell output confirms it exists before it is used to read the data.
# NOTE(review): presumably the SparkContext injected by the PySpark kernel/shell - confirm.
sc

In [0]:
import re

def is_post_row(line):
    """Return True when `line` starts with the two-space-indented `<row Id=` opening tag."""
    row_prefix = '  <row Id='
    return re.match(row_prefix, line) is not None

# Read the Posts.xml dump line by line, then keep only the lines that open a <row> element.
posts_lines = sc.textFile('./data/ai.stackexchange.com/Posts.xml')
row_rdd = posts_lines.filter(is_post_row)

In [0]:
import HTMLParser
h = HTMLParser.HTMLParser()

def is_first_post(line):
    """Return True when `line` is a row whose PostTypeId is "1".

    The line must start with two spaces, then `<row Id="<digits>" PostTypeId="1"`.

    Args:
        line: one raw text line from the posts file.

    Returns:
        bool: True on a match, False otherwise (same convention as `is_post_row`).
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence on modern Python.
    return re.match(r'  <row Id="\d+" PostTypeId="1"', line) is not None

def extract_body(line):
    """Return the HTML-unescaped value of the first `Body="..."` attribute in `line`.

    Args:
        line: one raw row line from the posts file.

    Returns:
        The attribute text with HTML entities decoded by the module-level parser `h`,
        or an empty string when the line carries no `Body` attribute.
    """
    found = re.search('Body="(.*?)"', line)
    if found is None:
        # A row without a body used to raise IndexError and abort the whole Spark job;
        # an empty string simply contributes no text downstream.
        return ''
    return h.unescape(found.group(1))

# Sample: the decoded bodies of the first five rows with PostTypeId="1".
questions = (
    row_rdd
    .filter(is_first_post)
    .map(extract_body)
    .take(5)
)

# Join first, then print. The old `print (...).join(questions)` only worked as a
# Python 2 print statement; on Python 3 it would call .join on print()'s None result.
separator = "\n" + ("=" * 80) + "\n\n"
print(separator.join(questions))


In [0]:
# Count how often each lower-cased word occurs across the bodies of all post rows,
# then keep the 50 most frequent (word, count) pairs.
# Lambdas index into the pair instead of unpacking it in the parameter list, which
# Python 3 no longer allows; sortBy replaces the swap / sortByKey / swap-back dance.
word_counts = (
    row_rdd
    .map(extract_body)
    .flatMap(lambda body: [word.lower() for word in re.findall(r'\w+', body)])
    .map(lambda word: (word, 1))
    .reduceByKey(lambda x, y: x + y)
    .sortBy(lambda word_count: word_count[1], ascending=False)
    .take(50)
)

# Only ranks 11-50 are shown; the ten most frequent entries are skipped.
for wc in word_counts[10:]:
    print(wc)

In [0]:
def extract_favorite_count(row):
    """Return the integer value of the first `FavoriteCount="<digits>"` attribute in `row`.

    Args:
        row: one raw row line from the posts file.

    Returns:
        int: the parsed count, or 0 when the attribute is absent.
    """
    # Raw string avoids the invalid '\d' escape; search stops at the first hit
    # instead of collecting every match just to read element 0.
    found = re.search(r'FavoriteCount="(\d+)"', row)
    return int(found.group(1)) if found else 0

# Histogram over question rows: how many questions have each FavoriteCount value.
# The cell's result is a list of (favorite_count, number_of_questions), lowest count first.
favorite_counts = row_rdd.filter(is_first_post).map(extract_favorite_count)
questions_per_count = favorite_counts.map(lambda n: (n, 1)).reduceByKey(lambda a, b: a + b)
questions_per_count.sortByKey(ascending=True).collect()

In [0]:
def get_rdd_word_count(rdd):
    """Count lower-cased words across the post bodies found in `rdd`.

    Args:
        rdd: an RDD of raw row lines; each element is passed through `extract_body`.

    Returns:
        An RDD of (word, count) pairs ordered by count, highest first.
    """
    # Parenthesised chain: no trailing backslash that depends on a blank line after it.
    # Pair elements are indexed rather than unpacked in the lambda signature, which
    # Python 3 rejects.
    return (
        rdd
        .map(extract_body)
        .flatMap(lambda body: [word.lower() for word in re.findall(r'\w+', body)])
        .map(lambda word: (word, 1))
        .reduceByKey(lambda x, y: x + y)
        .sortBy(lambda word_count: word_count[1], ascending=False)
    )

# Narrow the rows to questions, then split them on whether FavoriteCount is positive.
question_rows = row_rdd.filter(is_first_post)
favorited_questions = question_rows.filter(lambda row: extract_favorite_count(row) > 0)
nonfavorited_questions = question_rows.filter(lambda row: extract_favorite_count(row) == 0)

# Per-group (word, count) RDDs, each sorted by descending count.
word_counts_from_favorited_posts = get_rdd_word_count(favorited_questions)
word_counts_from_nonfavorited_posts = get_rdd_word_count(nonfavorited_questions)

# Bare expressions: only the last one is echoed as the cell's output.
word_counts_from_favorited_posts
word_counts_from_nonfavorited_posts

In [0]:
# Peek at the first ten (word, count) pairs for favorited questions.
word_counts_from_favorited_posts.take(10)

In [0]:
# Total word occurrences in each group: the sum of the per-word counts.
total_words_from_favorited_posts = (
    word_counts_from_favorited_posts
    .map(lambda word_count: word_count[1])
    .reduce(lambda x, y: x + y)
)

total_words_from_nonfavorited_posts = (
    word_counts_from_nonfavorited_posts
    .map(lambda word_count: word_count[1])
    .reduce(lambda x, y: x + y)
)

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(total_words_from_favorited_posts)
print(total_words_from_nonfavorited_posts)

In [0]:
from math import log
def _to_log_frequencies(word_counts, total_words):
    """Turn (word, count) pairs into (word, log(count) - log(total_words)) pairs.

    Args:
        word_counts: RDD of (word, count) pairs.
        total_words: sum of all counts in `word_counts`.

    Returns:
        RDD of (word, log relative frequency) pairs.
    """
    # Taken once here rather than once per record inside the lambda.
    log_total = log(total_words)
    return word_counts.map(lambda word_count: (word_count[0], log(word_count[1]) - log_total))

log_freq_from_favorite_posts = _to_log_frequencies(
    word_counts_from_favorited_posts, total_words_from_favorited_posts)
log_freq_from_nonfavorite_posts = _to_log_frequencies(
    word_counts_from_nonfavorited_posts, total_words_from_nonfavorited_posts)
log_freq_from_favorite_posts.take(10)

In [0]:
# Rank words by how much more frequent they are in favorited questions than in
# non-favorited ones (difference of log relative frequencies), keeping the top 20.
# Only words present in both groups survive the join.
most_favorited_words = (
    log_freq_from_favorite_posts
    .join(log_freq_from_nonfavorite_posts)
    # Each joined record is (word, (log_freq_favorited, log_freq_nonfavorited)).
    # Emit (log_diff, word) so sortByKey orders by the difference; indexing replaces
    # the tuple-parameter lambda, which is a SyntaxError on Python 3.
    .map(lambda joined: (joined[1][0] - joined[1][1], joined[0]))
    .sortByKey(ascending=False)
    .take(20)
)

most_favorited_words

### Exercise 1 ###

What are the 20 most common words in each of these sets?
- Write an RDD to generate counts for all the words that are at least 5 characters long.
- Filter this RDD to only include favorited posts.
- Filter this RDD to only include posts *without* any favorites.



### Exercise 2 ###

The logic above creates two separate RDDs for posts with and without favorites, then joins them.

There's another way to accomplish the same thing in a single RDD. Hint: you'll need a tuple with (word, count, has_favorites)

Generate word counts for favorited posts and non-favorited posts.

### Bonus Exercise ###

Which post in this data set has the most favorites?