In [3]:
import sys
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from pyspark.sql.types import *

In [5]:
def split_file(x):
    value=x.value.split('\t')
    return (value[0], value[1].split(' '))

In [6]:
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
df = spark.read.text('cleaned.txt')
rdd = df.rdd.map(lambda x: split_file(x))
df2 = rdd.toDF().withColumnRenamed('_2', 'content').withColumnRenamed('_1', 'id')

hashingTF = HashingTF(inputCol="content", outputCol='features')
hashingTF.setNumFeatures(1000)

tf = hashingTF.transform(df2)

idf = IDF()
idf.setInputCol('features')
idf.setOutputCol('idf')
model = idf.fit(tf)
tf_idf = model.transform(tf)

22/03/18 08:37:09 WARN Utils: Your hostname, LAPTOP-MOK83Q9I resolves to a loopback address: 127.0.1.1; using 172.26.70.148 instead (on interface eth0)
22/03/18 08:37:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/18 08:37:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [7]:
tf_idf.rdd.cartesian()

DataFrame[id: string, content: array<string>, features: vector, idf: vector]

In [7]:
import pyspark.sql.functions as psf

In [11]:
dot_udf = psf.udf(lambda x,y: float(x.dot(y)/(x.norm(2) * y.norm(2))), DoubleType())
result = tf_idf.alias("i").join(tf_idf.alias("j"), psf.col("i.id") < psf.col("j.id"))\
    .select(
        psf.col("i.id").alias("i"), 
        psf.col("j.id").alias("j"), 
        dot_udf("i.idf", "j.idf").alias("dot"))\
    .sort("i", "j")

result.show()


[Stage 4:>                                                          (0 + 1) / 1]

+-----------+-------------+-------------------+
|          i|            j|                dot|
+-----------+-------------+-------------------+
|"0801.1913"|  "0806.3537"| 0.1413471072354091|
|"0801.1913"|  "0807.5065"|0.14126193913291646|
|"0801.1913"|  "0809.0691"|0.16296139678695296|
|"0801.1913"|  "0811.2070"|0.12773448168976206|
|"0801.1913"|  "0907.5423"| 0.1542216397172733|
|"0801.1913"|  "0908.1812"|0.08835149491310022|
|"0801.1913"|  "0909.1602"|0.10423572180671294|
|"0801.1913"|  "0911.1170"|0.13923590897626348|
|"0801.1913"|  "1001.0199"| 0.1435384027466791|
|"0801.1913"|  "1004.5347"| 0.1301095462729346|
|"0801.1913"|  "1009.3123"|0.12919244660378096|
|"0801.1913"|"1009.3123-1"|0.12919244660378096|
|"0801.1913"|  "1111.4135"|0.17119358828601075|
|"0801.1913"|  "1202.0294"|0.16028872885185458|
|"0801.1913"|  "1212.0086"| 0.2421147626224124|
|"0801.1913"|  "1307.2735"|0.12886586905122766|
|"0801.1913"|  "1309.3865"|0.14166832031039753|
|"0801.1913"|  "1311.0649"|0.22637302810

                                                                                

In [110]:
columns = ['id', 'sim_score']


In [116]:
data = []
for i, x in enumerate(tf_idf.take(5)):
    sim_score_list = []
    for y in tf_idf.take(5):
        sim_score = x.idf.dot(y.idf) / (x.idf.norm(2) * y.idf.norm(2))
        sim_score_list.append(float(sim_score))
    data.append((x.id, sim_score_list))



In [118]:
sim_df = spark.createDataFrame(data).toDF(*columns)
sim_df.take(1)

[Row(id='"hep-ph0205344"', sim_score=[1.0, 0.12423407925986085, 0.24118939728349553, 0.24687892577787154, 0.11215675867134679])]

In [119]:
tf_idf

DataFrame[id: string, content: array<string>, features: vector, idf: vector]

In [123]:
final = tf_idf.join(sim_df,["id"])
final.toPandas()

                                                                                

Unnamed: 0,id,content,features,idf,sim_score
0,"""hep-ph0205344""","[""the, standard, model, is, greatly, successfu...","(0.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 0.0, ...","(0.0, 0.2947995402206448, 1.1420974006078486, ...","[1.0, 0.12423407925986085, 0.24118939728349553..."
1,"""astro-ph0612210""","[""the, study, of, the, formation, and, evoluti...","(0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, ...","(0.0, 0.8843986206619344, 0.0, 0.0, 0.0, 0.0, ...","[0.12423407925986085, 1.0, 0.11745402368601676..."
2,"""1401.4918""","[""the, dynamical, density, matrix, renormaliza...","(0.0, 8.0, 0.0, 4.0, 17.0, 0.0, 1.0, 7.0, 2.0,...","(0.0, 2.3583963217651585, 0.0, 1.6646415888996...","[0.24118939728349553, 0.11745402368601676, 0.9..."
3,"""hep-ph9602267""","[""this, paper, explores, the, phenomenology, o...","(0.0, 3.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, ...","(0.0, 0.8843986206619344, 0.0, 0.4161603972249...","[0.24687892577787154, 0.08912597029620804, 0.1..."
4,"""1307.2735""","[""the, classical, method, of, adding, two, int...","(0.0, 1.0, 5.0, 0.0, 0.0, 0.0, 3.0, 1.0, 0.0, ...","(0.0, 0.2947995402206448, 5.710487003039243, 0...","[0.11215675867134679, 0.10707027723770982, 0.1..."
