In [1]:
from __future__ import division
from pyspark import SparkConf, SparkContext

# getOrCreate() reuses the already-running context when this cell is executed
# again; calling the SparkContext constructor a second time in the same kernel
# fails because only one active context is allowed per process.
sc = SparkContext.getOrCreate(conf=SparkConf().setAppName("MyApp").setMaster("local"))

In [12]:
import re
from math import log

try:
    _text_type = unicode  # Python 2: the builtin text type
except NameError:
    _text_type = str  # Python 3: str is already the text type


def parse_article(line):
    """Tokenize one line of the article dump into a list of words.

    The line is expected to hold an article id, a tab character and the
    article text. The id is discarded. Non-word characters are stripped from
    both ends of the text, and the text is then split on whitespace together
    with any non-word characters adjacent to that whitespace.

    Args:
        line: one raw input line (text string).

    Returns:
        A list of non-empty word strings, in order of appearance. An empty
        list is returned when the line has no tab-separated text part or when
        the text contains no word characters at all.
    """
    try:
        _article_id, text = _text_type(line.rstrip()).split('\t', 1)
    except ValueError:
        # Fewer than two tab-separated fields (or, on Python 2, bytes that
        # cannot be decoded to text): treat the line as having no words.
        return []

    # Raw strings keep \W and \s as regex classes instead of relying on
    # Python passing unknown string escapes through unchanged.
    text = re.sub(r"^\W+|\W+$", "", text, flags=re.UNICODE)
    words = re.split(r"\W*\s+\W*", text, flags=re.UNICODE)

    # Splitting an empty string yields [''], so drop empty tokens to avoid
    # counting a phantom word for punctuation-only articles.
    return [word for word in words if word]

def PMI(probability_a, probability_b, probability_ab):
    """Return the pointwise mutual information of two events.

    Computed as the natural logarithm of the joint probability divided by
    the product of the two marginal probabilities.

    Args:
        probability_a: marginal probability of the first event.
        probability_b: marginal probability of the second event.
        probability_ab: joint probability of both events.

    Returns:
        The PMI value as a float.
    """
    product_of_marginals = probability_a * probability_b
    ratio = probability_ab / product_of_marginals
    return log(ratio)
               
def _split_pair(pair, words_frequency_dict):
    """Split an ``a_b`` bigram key into its two words.

    A word may itself contain an underscore, so every underscore is tried as
    the separator and the first split whose halves are both known words wins.

    Raises:
        ValueError: if ``pair`` contains no underscore.
        KeyError: if no split yields two words present in the dictionary.
    """
    cut = pair.find('_')
    if cut == -1:
        raise ValueError("bigram key has no '_' separator: %r" % (pair,))
    while cut != -1:
        first, second = pair[:cut], pair[cut + 1:]
        if first in words_frequency_dict and second in words_frequency_dict:
            return first, second
        cut = pair.find('_', cut + 1)
    raise KeyError(pair)


def NPMI(pair, count_pair, count_all_pairs, count_all_words, words_frequency_dict):
    """Compute the normalized PMI of a bigram.

    NPMI is PMI(a, b) divided by -ln(p(a, b)).

    Args:
        pair: bigram key made of two words joined by an underscore.
        count_pair: number of occurrences of the bigram.
        count_all_pairs: total number of bigram occurrences.
        count_all_words: total number of word occurrences.
        words_frequency_dict: mapping from word to its occurrence count; must
            contain both words of the bigram.

    Returns:
        A ``(pair, npmi)`` tuple.

    Raises:
        ValueError: if ``pair`` has no underscore separator.
        KeyError: if the words of the bigram are not in the dictionary.
        ZeroDivisionError: if ``count_pair == count_all_pairs`` (the
            normalizer -ln(1) is zero).
    """
    a, b = _split_pair(pair, words_frequency_dict)
    # float() keeps these true divisions even when the module-level
    # `from __future__ import division` is not in effect on Python 2.
    probability_a = float(words_frequency_dict[a]) / count_all_words
    probability_b = float(words_frequency_dict[b]) / count_all_words
    probability_ab = float(count_pair) / count_all_pairs
    npmi = PMI(probability_a, probability_b, probability_ab) / -log(probability_ab)
    return (pair, npmi)
               
def bigrams(text):
    """Build ``(key, 1)`` records for every adjacent pair of words.

    Args:
        text: sequence of word strings.

    Returns:
        A list with one ``("<word>_<next word>", 1)`` tuple per adjacent
        pair, in order; empty when there are fewer than two words.
    """
    records = []
    for position in range(len(text) - 1):
        key = text[position] + '_' + text[position + 1]
        records.append((key, 1))
    return records

In [13]:
# Load the whitespace-separated stop-word list into a set, which is used for
# membership tests when filtering article tokens.
with open('/datasets/stop_words_en.txt', 'r') as f:
    stop_words = set()
    stop_words.update(f.read().split())

In [14]:
# One record per input line: each line is turned into its list of words by
# parse_article. The second textFile argument (16) is the partition hint.
wiki = (
    sc.textFile("/data/wiki/en_articles_part/articles-part", 16)
      .map(parse_article)
)

In [15]:
# Normalize every article: lower-case each token, then drop stop words.
# The result is consumed by several independent Spark jobs (bigram counts,
# word total, pair total, word frequencies), so cache it instead of
# re-reading and re-tokenizing the input for each of them.
wiki_filter = wiki.map(lambda words: [x.lower() for x in words])\
                  .map(lambda words: [x for x in words if x not in stop_words])\
                  .cache()

# Count every adjacent-word bigram across all articles.
wiki_bigrams = wiki_filter.flatMap(bigrams).reduceByKey(lambda a, b: a + b)

# Keep only bigrams seen at least 500 times. The record is indexed rather
# than unpacked in the lambda signature, because tuple parameters are a
# SyntaxError on Python 3.
wiki_top_bigrams = wiki_bigrams.filter(lambda pair_count: pair_count[1] >= 500)

In [16]:
# Per-article token counts, summed into the corpus-wide number of tokens.
all_words = wiki_filter.map(len)
count_all_words = all_words.reduce(lambda left, right: left + right)

In [17]:
# An article with n tokens contributes n - 1 adjacent pairs. Clamp at zero so
# that empty token lists (unparsable lines, or articles made only of stop
# words) contribute 0 instead of -1 to the total.
all_pairs = wiki_filter.map(lambda words: max(len(words) - 1, 0))
count_all_pairs = all_pairs.reduce(lambda a, b: a + b)

In [18]:
# Occurrence count of every word, keeping only words seen at least 500 times.
# The (word, count) record is indexed instead of unpacked in the lambda
# signature, because tuple parameters are a SyntaxError on Python 3.
words_frequency = wiki_filter.flatMap(lambda words: [(x, 1) for x in words])\
                           .reduceByKey(lambda a, b: a + b)\
                           .filter(lambda word_count: word_count[1] >= 500)\
                           .collect()


# word -> occurrence count, built directly from the collected pairs.
words_frequency_dict = dict(words_frequency)

In [19]:
# Score every frequent bigram with NPMI and keep the 39 highest-scoring ones.
# Records are indexed instead of unpacked in the lambda signatures (tuple
# parameters are a SyntaxError on Python 3), and sortBy orders the
# (pair, npmi) records by score directly, so no key/value swapping is needed.
pairs_npmi = wiki_top_bigrams.map(lambda pair_count: NPMI(pair_count[0], pair_count[1], count_all_pairs,
                                                          count_all_words, words_frequency_dict))\
    .sortBy(lambda pair_npmi: pair_npmi[1], ascending=False)\
    .take(39)

# print(...) with a single argument behaves the same on Python 2 and 3.
for pair, npmi in pairs_npmi:
    print(pair)

los_angeles
external_links
united_states
prime_minister
san_francisco
et_al
new_york
supreme_court
19th_century
20th_century
references_external
soviet_union
air_force
baseball_player
university_press
roman_catholic
united_kingdom
references_reading
notes_references
award_best
north_america
new_zealand
civil_war
catholic_church
world_war
war_ii
south_africa
took_place
roman_empire
united_nations
american_singer-songwriter
high_school
american_actor
american_actress
american_baseball
york_city
american_football
years_later
north_american


In [None]:
# Top 39 bigrams by NPMI, copied from the output printed by the cell above:
# los_angeles
# external_links
# united_states
# prime_minister
# san_francisco
# et_al
# new_york
# supreme_court
# 19th_century
# 20th_century
# references_external
# soviet_union
# air_force
# baseball_player
# university_press
# roman_catholic
# united_kingdom
# references_reading
# notes_references
# award_best
# north_america
# new_zealand
# civil_war
# catholic_church
# world_war
# war_ii
# south_africa
# took_place
# roman_empire
# united_nations
# american_singer-songwriter
# high_school
# american_actor
# american_actress
# american_baseball
# york_city
# american_football
# years_later
# north_american