In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import NGram, BucketedRandomProjectionLSH
from pyspark.ml.feature import CountVectorizer
from pyspark.sql.types import StructType,StructField, StringType, ArrayType
from pyspark.sql.functions import udf, col

In [None]:
# Build a local Spark session that uses every available core ("local[*]"),
# with the driver and executor memory settings both set to 8g.
session_builder = (
    SparkSession.builder
    .master("local[*]")
    .appName("lsh")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
)
# getOrCreate() reuses an already-running session if one exists.
spark = session_builder.getOrCreate()
sc = spark.sparkContext

# Bucketing

In [None]:
# Parse the corpus: the first tab-separated field of each line is the document
# id, the second is the article text, which is split on single spaces into words.
rdd = (sc.textFile('cleaned.txt')
        .map(lambda line: line.split('\t'))
        # Skip lines that have no second tab-separated field (e.g. blank lines);
        # indexing r[1] on them would raise IndexError and abort the whole job.
        .filter(lambda r: len(r) >= 2)
        .map(lambda r: (r[0], r[1].split(" "))))

schema = StructType([
        StructField('id', StringType()),
        StructField('words', ArrayType(elementType=StringType()))
])

df = spark.createDataFrame(rdd, schema)

# Build word bigrams (n=2) from each document's word array.
ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
ngram_df = ngram.transform(df)

# Turn the bigrams into sparse count vectors. minDF=2 drops bigrams that occur
# in fewer than two documents; vocabSize caps the vocabulary at 1,000,000 terms.
cv = CountVectorizer(inputCol="ngrams", outputCol="features", vocabSize=1_000_000, minDF=2)
cv_model = cv.fit(ngram_df)
cv_df = cv_model.transform(ngram_df)

# Euclidean-distance LSH over the count vectors. The seed is fixed so that the
# random projections (and therefore the neighbours found) are reproducible
# across kernel restarts.
# NOTE(review): bucketLength=1_000_000 is very large compared with typical
# guidance for this estimator -- confirm it is intentional.
brp = BucketedRandomProjectionLSH(inputCol="features", outputCol="hashes", bucketLength=1_000_000,
                                  numHashTables=100, seed=42)
model = brp.fit(cv_df)

In [None]:
# Load the query document from review.txt. The context manager closes the file
# handle, and UTF-8 is given explicitly to match how Spark's textFile reads the
# corpus instead of relying on the platform default encoding.
with open('review.txt', 'r', encoding='utf-8') as review_file:
    # Strip the trailing line terminator so the last word is not "word\n",
    # which would never match a vocabulary bigram.
    text = review_file.read().rstrip('\r\n').split(" ")

# Wrap the word list in a one-row DataFrame so the fitted transformers apply.
text_df = spark.createDataFrame([(text, )], ['words'])
# Same bigram and count-vector transforms that were applied to the corpus.
text_ngram = ngram.transform(text_df)
text_cv = cv_model.transform(text_ngram)
# The feature vector of the single row is the query key.
key = text_cv.first()["features"]

# Approximate 10 nearest corpus documents to the review.
res = model.approxNearestNeighbors(cv_df, key, 10)

res.select(["id", "distCol"]).show()

In [None]:
# Join corpus documents with the review where the Euclidean distance between
# their count vectors is below 1.5.
# text_cv was built from a frame that only has a 'words' column, so it carries
# no "id"; selecting datasetB.id on it fails at analysis time. Project the
# query down to a literal id plus the feature vector before joining.
query_df = text_cv.selectExpr("'review.txt' AS id", "features")

(model.approxSimilarityJoin(cv_df, query_df, 1.5, distCol="EuclideanDistance")
    .select(col("datasetA.id").alias("idA"),
            col("datasetB.id").alias("idB"),
            col("EuclideanDistance"))
    .show())