In [7]:
from pyspark import SparkConf, SparkContext
from math import log
# Obtain the notebook's SparkContext via getOrCreate, passing a configuration
# with application name "MyApp" and master "local".
sc = SparkContext.getOrCreate(conf=SparkConf().setAppName("MyApp").setMaster("local"))

import re

def parse_article(line):
    """Split one article line into lower-cased word tokens.

    The line is expected to hold an article id and the article text separated
    by a single tab character. Leading/trailing non-word characters are
    stripped from the text, the text is split on whitespace (swallowing any
    punctuation adjacent to that whitespace), every token is lower-cased and
    the sentinel ``"END"`` is appended so callers can detect article
    boundaries.

    Args:
        line: one raw input line, as text or as UTF-8 encoded bytes.

    Returns:
        A list of lower-cased tokens followed by ``"END"``, or an empty list
        when the line has no tab separator or cannot be decoded as UTF-8.
    """
    try:
        # Decode explicitly as UTF-8. The previous unicode(...) call only
        # exists on Python 2 and used the ASCII codec there, so every line
        # containing a non-ASCII byte was silently discarded.
        if isinstance(line, bytes):
            line = line.decode("utf-8")
        _article_id, text = line.rstrip().split('\t', 1)
    except ValueError:
        # Raised by the tuple unpacking when there is no tab in the line and
        # by the decode above (UnicodeDecodeError is a ValueError subclass).
        return []

    text = re.sub(r"^\W+|\W+$", "", text, flags=re.UNICODE)
    # `if token` drops the empty string re.split returns for text that
    # consisted solely of punctuation; it is not a word.
    words = [
        token.lower()
        for token in re.split(r"\W*\s+\W*", text, flags=re.UNICODE)
        if token
    ]
    words.append("END")
    return words
    
    
# Pull the stop-word file to the driver as a list of its lines.
stopWords = sc.textFile("/datasets/stop_words_en.txt").collect()

# Ship the stop words to the executors once, as a frozenset: the filter below
# runs a membership test for every token of the corpus, which is a hash lookup
# on a set but a full linear scan on a list.
stop = sc.broadcast(frozenset(stopWords))

# Token stream of the whole corpus (16 minimum partitions) with stop words
# removed. parse_article lower-cases real words and appends an upper-case
# "END" after each article, so the sentinel is distinct from any real token.
wiki = (
    sc.textFile("/data/wiki/en_articles_part/articles-part", 16)
    .flatMap(parse_article)
    .filter(lambda token: token not in stop.value)
)
wiki.cache()

# Same stream without the article-boundary sentinels: the actual words.
words = wiki.filter(lambda token: token != "END")

In [8]:
# Total number of word tokens; the denominator for the unigram probabilities.
words_count = words.count()

# (word, relative frequency of the word among all word tokens)
words_probability = (
    words.map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
    .mapValues(lambda occurrences: 1.0 * occurrences / words_count)
)

In [3]:
# (position, token): every token keyed by its index in the stream.
indexed = wiki.zipWithIndex().map(lambda token_pos: (token_pos[1], token_pos[0]))
indexed.cache()

# Same tokens keyed by position + 1, so that joining on the key lines every
# token up with the one that directly follows it.
shifted = indexed.map(lambda pos_token: (pos_token[0] + 1, pos_token[1]))

# (previous token, next token) for every adjacent position, dropping any pair
# that touches the "END" sentinel.
pairs = (
    shifted.join(indexed)
    .map(lambda joined: joined[1])
    .filter(lambda bigram: not (bigram[0] == "END" or bigram[1] == "END"))
)
pairs.cache()

# Total number of bigrams; the denominator for the bigram probabilities.
pairs_count = pairs.count()

In [4]:
# (bigram, relative frequency) for the bigrams seen more than 500 times.
# The count threshold is applied before normalising by the total bigram count.
pairs_probabiltity = (
    pairs.map(lambda bigram: (bigram, 1))
    .reduceByKey(lambda left, right: left + right)
    .filter(lambda counted: counted[1] > 500)
    .map(lambda counted: (counted[0], 1.0 * counted[1] / pairs_count))
)
pairs_probabiltity.cache()

PythonRDD[25] at RDD at PythonRDD.scala:48

In [5]:
def _with_word_probability(position):
    """Attach the probability of one member of each bigram.

    Keys every (bigram, p_bigram) entry by the word at `position` (0 or 1)
    of the bigram, joins with `words_probability`, and re-keys by the bigram.

    Returns an RDD of (bigram, (p_bigram, p_word_at_position)).
    """
    keyed_by_word = pairs_probabiltity.map(
        lambda entry: (entry[0][position], entry)
    )
    return keyed_by_word.join(words_probability).map(
        lambda joined: (joined[1][0][0], (joined[1][0][1], joined[1][1]))
    )


first_word = _with_word_probability(0)

second_word = _with_word_probability(1)

# (bigram, (p_bigram, p_first_word, p_second_word))
probabilities = first_word.join(second_word).map(
    lambda joined: (
        joined[0],
        (joined[1][0][0], joined[1][0][1], joined[1][1][1]),
    )
)

In [9]:
# Score each bigram as log(p_ab / (p_a * p_b)) / -log(p_ab), order by
# descending score, and render each bigram as "first_second".
result = (
    probabilities
    .mapValues(
        lambda probs: -1.0 * log(probs[0] / (probs[1] * probs[2])) / log(probs[0])
    )
    .sortBy(lambda scored: -1 * scored[1])
    .map(lambda scored: "%s_%s" % (scored[0][0], scored[0][1]))
)

# Show the 39 highest-scoring bigrams, one per line.
for word in result.take(39):
    print(word)


los_angeles
external_links
united_states
prime_minister
san_francisco
et_al
new_york
supreme_court
19th_century
20th_century
references_external
soviet_union
air_force
baseball_player
university_press
roman_catholic
united_kingdom
references_reading
notes_references
award_best
north_america
new_zealand
civil_war
catholic_church
world_war
war_ii
south_africa
took_place
roman_empire
united_nations
american_singer-songwriter
high_school
american_actor
american_actress
american_baseball
york_city
american_football
years_later
north_american
