In [1]:
# findspark.init() is called before the pyspark import below; keep this order.
import findspark
findspark.init()

# Obtain the notebook's SparkSession (app name "project-similarity") via getOrCreate().
from pyspark.sql import SparkSession, SQLContext
spark = SparkSession.builder.appName("project-similarity").getOrCreate()

In [2]:
# SparkContext of the session; used below for RDD-level file loading.
sc = spark.sparkContext

In [3]:
# Display the session object as the cell output.
spark

In [4]:
# Load the extracted-Wikipedia file as an RDD; downstream cells read the file
# text from element [1] of each record.
# NOTE(review): the S3 path is hard-coded — consider a config constant.
wiki = sc.wholeTextFiles("s3a://zihe-public/articles/AA/wiki_00")

In [5]:
import re

# Header tag of one article: <doc id="N" ... title="T">. Group 3 is the title.
# Written as a raw string so the regex escapes (\s, \d) are not treated as
# (invalid) Python string escapes, and compiled once instead of on every call.
_DOC_TITLE_RE = re.compile(r'<doc\sid="(\d+)"(.*)title="(.*)">')


def get_title(x):
    """Return the title from the first ``<doc ...>`` header found in ``x``.

    Parameters
    ----------
    x : str
        A chunk of text that may contain a header of the form
        ``<doc id="<digits>" ... title="<title>">``.

    Returns
    -------
    str
        The text inside ``title="..."``, or ``""`` when no header is found.
    """
    match = _DOC_TITLE_RE.search(x)
    if match is None:
        return ""
    return match.group(3)

In [6]:
# Article preamble: the <doc id="N" ... title="T"> header line, one following
# line (lazy group 4), then two consecutive newlines. Written as a raw string
# so \s, \d and \n reach the regex engine unchanged, and compiled once.
_DOC_PREAMBLE_RE = re.compile(r'<doc\sid="(\d+)"(.*)title="(.*)">\n(.*?)\n{2}')


def get_content(x):
    """Return the text of ``x`` that follows the article preamble.

    The preamble is the ``<doc ...>`` header line, the single line after it,
    and the blank line that follows; everything after that point is returned
    unchanged.

    Parameters
    ----------
    x : str
        A chunk of text that may contain an article header and body.

    Returns
    -------
    str
        The remainder of ``x`` after the preamble, or ``""`` when the
        preamble pattern does not match.
    """
    match = _DOC_PREAMBLE_RE.search(x)
    if match is None:
        return ""
    # Slice from the end of the whole match so the body is returned verbatim.
    return x[match.end():]

In [7]:
# Cut each file's text into per-article chunks at the closing </doc> tag,
# then extract one title per chunk.
titles = (
    wiki
    .flatMap(lambda record: record[1].split('</doc>'))
    .map(get_title)
)

In [8]:
# Materialise the titles RDD on the driver as a Python list.
titles2 = titles.collect()

In [9]:
# Peek at the first three titles.
titles2[0:3]

['Anarchism', 'Autism', 'Albedo']

In [10]:
# Number of chunks produced by the </doc> split (one title entry per chunk).
len(titles2)

38

In [11]:
# Same per-article split as for the titles, but keep the body text of each chunk.
pages = (
    wiki
    .flatMap(lambda record: record[1].split('</doc>'))
    .map(get_content)
)

In [12]:
# Materialise the article bodies on the driver as a Python list.
pages2 = pages.collect()

In [13]:
# Peek at the first three article bodies (output is long).
pages2[0:3]

['Anarchism is a political philosophy and movement that rejects all involuntary, coercive forms of hierarchy. It radically calls for the abolition of the state which it holds to be undesirable, unnecessary and harmful.\n\nThe timeline of anarchism stretches back to prehistory when people lived in anarchistic societies long before the establishment of formal states, kingdoms or empires. With the rise of organised hierarchical bodies, skepticism towards authority also rose, but it was not until the 19th century that a self-conscious political movement was formed. During the latest half of 19th and the first decades of 20th century, the anarchist movement flourished in most parts of the world and had a significant role in worker\'s struggles for emancipation. Various branches of anarchism were espoused during those times. Anarchists took part in several revolutions, most notably in the Spanish Civil War, where they were crushed by the fascist forces in 1939, marking the end of the classic

In [14]:
import gensim
from gensim import corpora, models, similarities

In [15]:
# Whitespace-tokenise every article body: one token list per page.
pages3 = list(map(str.split, pages2))

In [34]:
# Token count of the first page.
len(pages3[0])

5067

In [16]:
# Query phrase to compare against every page.
keyword = "human behavior"

# Vocabulary built from the tokenised pages, and each page as a bag-of-words vector.
dictionary = corpora.Dictionary(pages3)
corpus = list(map(dictionary.doc2bow, pages3))

# Bag-of-words vector for the query, tokenised on whitespace like the pages.
kw_vector = dictionary.doc2bow(keyword.split())

# TF-IDF model fitted on the corpus; feature_count is the vocabulary size.
tfidf = models.TfidfModel(corpus)
feature_count = len(dictionary.token2id)

In [35]:
# Vocabulary size as reported by the dictionary itself.
len(dictionary)

31214

In [36]:
# Same count taken from the token -> id mapping (matches len(dictionary) in the output).
len(dictionary.token2id)

31214

In [24]:
import pandas as pd
import pyspark.sql.functions as func

# Exploratory peek: load the bag-of-words corpus into pandas to eyeball its layout.
# One row per document; each populated cell is a pair from doc2bow, and shorter
# documents leave their trailing columns empty.
mydf = pd.DataFrame(corpus)
mydf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4461,4462,4463,4464,4465,4466,4467,4468,4469,4470
0,"(0, 1)","(1, 1)","(2, 1)","(3, 1)","(4, 1)","(5, 1)","(6, 1)","(7, 1)","(8, 2)","(9, 1)",...,,,,,,,,,,
1,"(70, 1)","(77, 12)","(79, 1)","(85, 2)","(98, 1)","(102, 3)","(122, 1)","(123, 1)","(152, 1)","(155, 2)",...,,,,,,,,,,
2,"(35, 1)","(45, 1)","(47, 1)","(48, 6)","(77, 3)","(98, 2)","(102, 3)","(105, 2)","(122, 1)","(179, 4)",...,,,,,,,,,,
3,"(52, 1)","(77, 6)","(78, 1)","(83, 1)","(96, 1)","(105, 1)","(122, 1)","(123, 2)","(154, 1)","(161, 7)",...,,,,,,,,,,
4,"(4, 1)","(35, 1)","(45, 2)","(50, 1)","(63, 1)","(65, 2)","(70, 3)","(71, 1)","(73, 1)","(74, 4)",...,,,,,,,,,,


In [44]:
# SQLContext built on top of the existing SparkContext.
# NOTE(review): SQLContext is a legacy entry point; the SparkSession `spark`
# exposes createDataFrame directly — confirm whether this wrapper is still needed.
sqlContext = SQLContext(sc)

In [27]:
# Similarity index over the TF-IDF-weighted corpus, sized to the vocabulary.
index = similarities.SparseMatrixSimilarity(
    tfidf[corpus],
    num_features=feature_count,
)

# Score the TF-IDF-weighted query against every indexed document.
sim = index[tfidf[kw_vector]]

In [28]:
# Report the score of every document, numbering documents from 1.
for i, score in enumerate(sim):
    print('keyword is similar to text%d: %.4f' % (i + 1, score))

keyword is similar to text1: 0.0047
keyword is similar to text2: 0.0429
keyword is similar to text3: 0.0000
keyword is similar to text4: 0.0000
keyword is similar to text5: 0.0000
keyword is similar to text6: 0.0000
keyword is similar to text7: 0.0019
keyword is similar to text8: 0.0081
keyword is similar to text9: 0.0000
keyword is similar to text10: 0.0000
keyword is similar to text11: 0.0000
keyword is similar to text12: 0.0000
keyword is similar to text13: 0.0000
keyword is similar to text14: 0.0000
keyword is similar to text15: 0.0476
keyword is similar to text16: 0.0107
keyword is similar to text17: 0.0000
keyword is similar to text18: 0.0000
keyword is similar to text19: 0.0084
keyword is similar to text20: 0.0042
keyword is similar to text21: 0.0512
keyword is similar to text22: 0.0000
keyword is similar to text23: 0.0068
keyword is similar to text24: 0.0000
keyword is similar to text25: 0.0000
keyword is similar to text26: 0.0000
keyword is similar to text27: 0.0000
keyword is

In [30]:
# Ten highest similarity scores, largest first (scores only, no titles).
sorted(sim, reverse=True)[0:10]

[0.05123809,
 0.04759818,
 0.042934716,
 0.010661388,
 0.008412062,
 0.008132495,
 0.006835735,
 0.0046801227,
 0.0042036446,
 0.002890593]

In [37]:
# Column-wise inputs: positional id, article title, similarity score.
l = [range(len(titles2)), titles2, sim]

# Transpose the three columns into one [id, title, similarity] row per article.
lt = [list(row) for row in zip(*l)]

pdf = pd.DataFrame(data=lt, columns=['id', 'title', 'similarity'])
pdf.head()

Unnamed: 0,id,title,similarity
0,0,Anarchism,0.00468
1,1,Autism,0.042935
2,2,Albedo,0.0
3,3,A,0.0
4,4,Alabama,0.0


In [43]:
from pyspark.sql.functions import col

# Convert the pandas result table to a Spark DataFrame through the active
# SparkSession. This avoids relying on a separately created SQLContext object
# being present in the kernel when this cell runs.
df = spark.createDataFrame(pdf)

# Order by score, best match first. The original author's warning is kept:
# a global sort is only acceptable on this small sample.
df = df.sort(col("similarity").desc())  #### DO NOT SORT ON THE BIG DATASET ####

In [44]:
# Keep id and title as they are; round the similarity score to four decimals
# under the same column name.
df = df.select(
    "id",
    "title",
    func.round(col("similarity"), 4).alias("similarity"),
)

In [46]:
# Print the ranked table; truncate=False keeps long titles fully visible.
df.show(truncate = False)

+---+----------------------------------------+----------+
|id |title                                   |similarity|
+---+----------------------------------------+----------+
|20 |Anthropology                            |0.0512    |
|14 |Altruism                                |0.0476    |
|1  |Autism                                  |0.0429    |
|15 |Ayn Rand                                |0.0107    |
|18 |Algeria                                 |0.0084    |
|7  |Aristotle                               |0.0081    |
|22 |Alchemy                                 |0.0068    |
|0  |Anarchism                               |0.0047    |
|19 |List of Atlas Shrugged characters       |0.0042    |
|32 |Andorra                                 |0.0029    |
|28 |Apollo                                  |0.0022    |
|35 |Animal Farm                             |0.0019    |
|6  |Abraham Lincoln                         |0.0019    |
|17 |Allan Dwan                              |0.0       |
|4  |Alabama  