In [1]:
# import pyspark module
from pyspark import SparkContext
# Reuse the already-running SparkContext if there is one, so that re-running
# this cell in a live kernel does not fail with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

In [2]:
# Display the SparkContext to confirm it was created.
sc

In [3]:
# Data file at https://www.cse.ust.hk/msbd5003/data
# Load the adjective-noun pairs file as an RDD of text lines.
lines = sc.textFile('data/adj_noun_pairs.txt')

In [4]:
# Count the raw input lines.
lines.count()

3162692

In [5]:
# Check how many partitions the text file was split into.
lines.getNumPartitions()

2

In [6]:
# Inspect the first five raw lines.
lines.take(5)

['early radical',
 'french revolution',
 'pejorative way',
 'violent means',
 'positive label']

In [7]:
# Split every line on whitespace and turn the words into a tuple.
# The data is dirty: some lines do not contain exactly two words,
# so only the tuples of length 2 are kept.
pairs = (
    lines
    .map(lambda line: tuple(line.split()))
    .filter(lambda words: len(words) == 2)
)
# Keep the parsed pairs in memory; they are reused by several later cells.
pairs.cache()

PythonRDD[4] at RDD at PythonRDD.scala:53

In [14]:
# Test code: compare the raw line count with the number of lines that
# parse into exactly two words.
print(lines.count())
test = (
    lines
    .map(lambda line: tuple(line.split()))
    .filter(lambda words: len(words) == 2)
)
print(test.count())

3162674

In [8]:
# Inspect the first five parsed word pairs.
pairs.take(5)

[('early', 'radical'),
 ('french', 'revolution'),
 ('pejorative', 'way'),
 ('violent', 'means'),
 ('positive', 'label')]

In [9]:
# Total number of two-word pairs; pmi_score uses N to scale raw counts.
N = pairs.count()

In [10]:
# Show the total pair count.
N

3162674

In [11]:
# Count how often each pair occurs, then ignore the pairs
# that are not frequent enough (fewer than 100 occurrences).
pair_freqs = (
    pairs
    .map(lambda pair: (pair, 1))
    .reduceByKey(lambda left, right: left + right)
    .filter(lambda pair_count: pair_count[1] >= 100)
)

In [12]:
# Inspect five of the frequent pairs together with their counts.
pair_freqs.take(5)

[(('political', 'philosophy'), 160),
 (('human', 'society'), 154),
 (('16th', 'century'), 950),
 (('first', 'man'), 166),
 (('same', 'time'), 2744)]

In [13]:
# Per-word totals over all pairs: the adjective is the first word of a pair,
# the noun is the second.
a_freqs = (
    pairs
    .map(lambda pair: (pair[0], 1))
    .reduceByKey(lambda left, right: left + right)
)
n_freqs = (
    pairs
    .map(lambda pair: (pair[1], 1))
    .reduceByKey(lambda left, right: left + right)
)

In [14]:
# Inspect five adjectives together with their total counts.
a_freqs.take(5)

[('violent', 1191),
 ('positive', 2302),
 ('self-defined', 3),
 ('political', 15935),
 ('differ', 381)]

In [15]:
# Number of distinct nouns (reduceByKey leaves one entry per noun).
n_freqs.count()

106333

In [16]:
# Broadcasting the adjective and noun frequencies.
# Each frequency RDD is collected to the driver as a plain dict and then
# broadcast to all the worker nodes, so pmi_score can look words up through
# the broadcast variable's `.value`.

n_dict = sc.broadcast(n_freqs.collectAsMap())
a_dict = sc.broadcast(a_freqs.collectAsMap())
# Spot-check one adjective's frequency through the broadcast variable.
a_dict.value['violent']

1191

## PMI (pointwise mutual information, 点互信息)

机器学习相关文献中，可以看到使用PMI衡量两个变量之间的相关性，比如两个词，两个句子。原理公式为：

$$ \mathrm{pmi}(x, y) = \log\frac{p(x, y)}{p(x)\,p(y)} = \log\frac{p(x \mid y)}{p(x)} = \log\frac{p(y \mid x)}{p(y)} $$

在概率论中，如果x和y相互独立，则p(x,y)=p(x)p(y)；x和y越相关，p(x,y)与p(x)p(y)的比值就越大。用后两个条件概率的形式可能更好理解：在y出现的条件下x出现的概率p(x|y)，除以x单独出现的概率p(x)，这个值越大表示x和y越相关。

log取自信息论中对概率的量化转换：对单个概率p(x)取对数得到的是负数，乘以-1后即为信息量 -log p(x)。而PMI是两个概率之比的对数，本身可正可负：大于0表示x和y共同出现的次数多于相互独立时的期望，等于0表示相互独立，小于0表示少于期望，因此不需要再乘以-1或取绝对值。

In [17]:
from math import *

# Computing the PMI for a pair.
def pmi_score(pair_freq):
    """Return the base-2 PMI of one word pair together with the pair itself.

    Args:
        pair_freq: a ``((w1, w2), f)`` record, where ``f`` is the number of
            times the pair ``(w1, w2)`` occurs.

    Returns:
        ``(pmi, (w1, w2))`` with
        ``pmi = log2(f * N / (a_dict.value[w1] * n_dict.value[w2]))``.
        The score comes first so that records sort by PMI.

    Reads the globals ``N`` (total pair count) and the broadcast variables
    ``a_dict`` / ``n_dict`` (per-word frequencies of first and second words).
    """
    first_word, second_word = pair_freq[0]
    # p(x,y) / (p(x) * p(y)) with every probability written as count / N
    # simplifies to f * N / (count(x) * count(y)).
    scaled_pair_count = float(pair_freq[1]) * N
    independent_count = a_dict.value[first_word] * n_dict.value[second_word]
    score = log(scaled_pair_count / independent_count, 2)
    return score, (first_word, second_word)

In [18]:
# Computing the PMI for all frequent pairs; each record becomes (pmi, (w1, w2)).
scored_pairs = pair_freqs.map(pmi_score)

In [19]:
# Printing the most strongly associated pairs.
# top(10) returns the ten largest records; since each record is
# (pmi, (w1, w2)), they are ordered by PMI first.
scored_pairs.top(10)

[(14.41018838546462, ('magna', 'carta')),
 (13.071365888694997, ('polish-lithuanian', 'Commonwealth')),
 (12.990597616733414, ('nitrous', 'oxide')),
 (12.64972604311254, ('latter-day', 'Saints')),
 (12.50658937509916, ('stainless', 'steel')),
 (12.482331020687814, ('pave', 'runway')),
 (12.19140721768055, ('corporal', 'punishment')),
 (12.183248694293388, ('capital', 'punishment')),
 (12.147015483562537, ('rush', 'yard')),
 (12.109945794428935, ('globular', 'cluster'))]