### TFIDF Similarity with Spark

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession, SQLContext
spark = SparkSession.builder.appName("spark-nlp") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5")\
    .getOrCreate()

In [2]:
sc = spark.sparkContext
sqlContext = SQLContext(sc)
spark

In [6]:
wiki = sc.wholeTextFiles("s3a://zihe-public/articles/AA/wiki_00")

In [7]:
import re

def get_title(x):
    pattern = "\<doc\sid\=\"(\d+)\"(.*)title\=\"(.*)\"\>"
    pattern_re = re.compile(pattern)
    matches = pattern_re.search(x)
    if not matches:
        return ""
    return matches[3]

In [8]:
def get_content(x):
    pattern = "\<doc\sid\=\"(\d+)\"(.*)title\=\"(.*)\"\>\\n(.*?)\\n{2}"
    pattern_re = re.compile(pattern)
    matches = pattern_re.search(x)
    if not matches:
        return ""
    idx = matches.end(0)
    return x[idx:]

In [10]:
titles = wiki.flatMap(lambda x :(x[1].split('</doc>'))).map(lambda x : get_title(x))

In [11]:
titles2 = titles.collect()

In [12]:
titles2[0:3]

['Anarchism', 'Autism', 'Albedo']

In [13]:
pages = wiki.flatMap(lambda x :(x[1].split('</doc>'))).map(lambda x : get_content(x))

In [14]:
pages2 = pages.collect()

In [15]:
pages3 = [d.split() for d in pages2]

In [16]:
lst = [titles2, pages2]
len(lst[0])

38

In [17]:
l = list(zip(titles2,pages2))
len(l)

38

In [25]:
rawRDD = sc.parallelize(l)
rawRDD.top(1)

[('List of Atlas Shrugged characters',
  'This is a list of characters in Ayn Rand\'s 1957 novel "Atlas Shrugged."\n\nThe following are major characters from the novel.\n\nDagny Taggart is the protagonist of the novel. She is Vice-President in Charge of Operations for Taggart Transcontinental, under her brother, James Taggart. Given James\' incompetence, Dagny is responsible for all the workings of the railroad.\n\nFrancisco d\'Anconia is one of the central characters in "Atlas Shrugged", an owner by inheritance of the world\'s largest copper mining operation. He is a childhood friend, and the first love, of Dagny Taggart. A child prodigy of exceptional talents, Francisco was dubbed the "climax" of the d\'Anconia line, an already prestigious family of skilled industrialists. He was a classmate of John Galt and Ragnar Danneskjöld and student of both Hugh Akston and Robert Stadler. He began working while still in school, proving that he could have made a fortune without the aid of his fa

In [26]:
rawRDD2 = rawRDD.map(lambda line:(line[0],list(set(line[1].split(" ")))))
uniquetitles = rawRDD2.map(lambda x:x[0]).distinct().collect()

In [27]:
wordIDs = rawRDD2.flatMap(lambda x:x[1]).distinct().zipWithIndex().collectAsMap()
wordIDs = sc.broadcast(wordIDs)

In [28]:
## Experiment Spark Gensim
def parseCorpus(line):
    A = [(wordIDs.value[el],1) for el in line[1]]
    return A

corpse = rawRDD2.map(parseCorpus).collect()

In [29]:
import gensim
from gensim import corpora,models,similarities

dictionary = corpora.Dictionary(rawRDD2.map(lambda x:x[1]).collect())

In [31]:
keyword = "human behavior"
tfidf = models.TfidfModel(corpse) 
feature_count = len(dictionary.token2id)
kw_vector = dictionary.doc2bow(keyword.split())
index = similarities.SparseMatrixSimilarity(tfidf[corpse], num_features = feature_count)
sim = index[tfidf[kw_vector]]

In [32]:
for i in range(len(sim)):
    print('keyword is similar to text%d: %.4f' % (i + 1, sim[i]))

keyword is similar to text1: 0.0026
keyword is similar to text2: 0.0023
keyword is similar to text3: 0.0034
keyword is similar to text4: 0.0056
keyword is similar to text5: 0.0000
keyword is similar to text6: 0.0000
keyword is similar to text7: 0.0000
keyword is similar to text8: 0.0000
keyword is similar to text9: 0.0000
keyword is similar to text10: 0.0000
keyword is similar to text11: 0.0023
keyword is similar to text12: 0.0000
keyword is similar to text13: 0.0000
keyword is similar to text14: 0.0000
keyword is similar to text15: 0.0000
keyword is similar to text16: 0.0000
keyword is similar to text17: 0.0000
keyword is similar to text18: 0.0000
keyword is similar to text19: 0.0017
keyword is similar to text20: 0.0000
keyword is similar to text21: 0.0000
keyword is similar to text22: 0.0000
keyword is similar to text23: 0.0000
keyword is similar to text24: 0.0000
keyword is similar to text25: 0.0000
keyword is similar to text26: 0.0025
keyword is similar to text27: 0.0000
keyword is

In [92]:
import pandas as pd

l = [range(len(titles2)),titles2, sim]
lt = list(map(list, zip(*l)))
pdf = pd.DataFrame(lt, columns = ['id','title','similarity'])
pdf.head()

Unnamed: 0,id,title,similarity
0,0,Anarchism,0.003152
1,1,Autism,0.002772
2,2,Albedo,0.004141
3,3,A,0.006719
4,4,Alabama,0.0


In [93]:
from pyspark.sql.functions import col

df = sqlContext.createDataFrame(pdf)
df = df.sort(col("similarity").desc())  #### DO NOT SORT ON THE BIG DATASET ####

  PyArrow >= 0.8.0 must be installed; however, it was not found.
Attempting non-optimization as 'spark.sql.execution.arrow.fallback.enabled' is set to true.


In [95]:
import pyspark.sql.functions as func

df = df.select(col("id"),col("title"), 
                func.round(df["similarity"],4).alias("similarity"))

In [96]:
df.show(truncate = False)

+---+----------------------------+----------+
|id |title                       |similarity|
+---+----------------------------+----------+
|30 |Austroasiatic languages     |0.0322    |
|6  |Abraham Lincoln             |0.0148    |
|33 |Arithmetic mean             |0.007     |
|3  |A                           |0.0067    |
|2  |Albedo                      |0.0041    |
|35 |Animal Farm                 |0.0033    |
|27 |Animation                   |0.0032    |
|0  |Anarchism                   |0.0032    |
|25 |ASCII                       |0.0031    |
|1  |Autism                      |0.0028    |
|10 |Academy Awards              |0.0028    |
|32 |Andorra                     |0.0024    |
|36 |Amphibian                   |0.0021    |
|18 |Algeria                     |0.0021    |
|28 |Apollo                      |0.0018    |
|26 |Austin (disambiguation)     |0.0       |
|34 |American Football Conference|0.0       |
|22 |Alchemy                     |0.0       |
|24 |Astronomer                  |