In [1]:
import findspark

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [3]:
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col

In [4]:
from pyspark.sql import *
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import udf

In [5]:
# Create (or reuse) the Spark session for this notebook. Executor memory is
# configured on the builder because settings such as spark.executor.memory
# have to be in place before the underlying SparkContext is instantiated.
spark = (
    SparkSession.builder
    .appName("take_home")
    .config("spark.executor.memory", "6g")
    .getOrCreate()
)

# Using a smaller subset of the cookbook data, because the full data set would not run on my local machine.

In [6]:
# Read every file under "cook_trial" as a single record; each record is a pair
# whose first element is used as the file name and second as the file text.
data = spark.sparkContext.wholeTextFiles(path="cook_trial")

In [7]:
# Space-separated names of the two DataFrame columns ("name" and "content");
# the string is split into one schema field per name further down.
columns = "name content"

In [8]:
# Build one nullable string field for every whitespace-separated column name.
fields = [
    StructField(name=column_name, dataType=StringType(), nullable=True)
    for column_name in columns.split()
]

In [9]:
# Request 6 GB of executor memory through a JVM system property.
# NOTE(review): the PySpark docs state that setSystemProperty must be invoked
# before the SparkContext is instantiated; the session already exists at this
# point, so this call probably has no effect — confirm.
spark.sparkContext.setSystemProperty('spark.executor.memory', '6g')

In [10]:
# Combine the string fields into the schema used for the documents DataFrame.
schema = StructType(fields=fields)

In [11]:
# Turn each (path, text) pair into a Row: the path becomes `name`, and the
# text, stripped of leading/trailing whitespace, becomes `content`.
data_map = data.map(
    lambda path_and_text: Row(
        name=path_and_text[0],
        content=path_and_text[1].strip(),
    )
)

In [12]:
# Create the documents DataFrame from the mapped rows, using the explicit schema.
df = spark.createDataFrame(data_map, schema=schema)

In [13]:
# Preview the first two rows of the DataFrame.
df.show(n=2)

+--------------------+--------------------+
|                name|             content|
+--------------------+--------------------+
|file:/Users/ammar...|Henriette Davidis...|
|file:/Users/ammar...|Cooking in old Cr...|
+--------------------+--------------------+
only showing top 2 rows



In [14]:
# Now let's define the get_shingles function, which takes the text and the character n-gram length (char_ngram).
def get_shingles(text, char_ngram=5):
    """Create the distinct overlapping character n-grams (shingles) of a text.

    Only full-length character n-grams are created: the first n-gram is the
    first `char_ngram` characters of `text`, the last one ends on the final
    character of `text`, and no padding is applied.

    Consecutive n-grams start exactly one character apart.

    Parameters
    ----------

    text: str or None
        The string from which the character n-grams are created. ``None``
        (for example a null column value) is treated like an empty string.

    char_ngram: int (default 5)
        Length of each character n-gram. Must be at least 1.

    Returns
    -------

    list of str
        The distinct n-grams, in arbitrary order. The list is empty when
        `text` is ``None`` or shorter than `char_ngram`.

    Raises
    ------

    ValueError
        If `char_ngram` is smaller than 1.
    """
    if char_ngram < 1:
        raise ValueError("char_ngram must be at least 1")
    if text is None:
        return []
    # A text of length n contains n - char_ngram + 1 full windows; the "+ 1"
    # keeps the final n-gram, which ends on the last character. A negative
    # count (text shorter than one window) simply yields an empty range.
    window_count = len(text) - char_ngram + 1
    return list({text[head:head + char_ngram] for head in range(window_count)})

In [15]:
# now lets create a udf to handle our contents of the data frame and create shingles from the content
udf = udf(lambda y : get_shingles(y),ArrayType(StringType()))

In [16]:
# now we will create a column shingles in our data frame using the udf above
df = df.withColumn("shingles",udf(df.content))


In [17]:
# Preview two rows, now including the new "shingles" column.
df.show(n=2)

+--------------------+--------------------+--------------------+
|                name|             content|            shingles|
+--------------------+--------------------+--------------------+
|file:/Users/ammar...|Henriette Davidis...|[k, bo, ted m, s-...|
|file:/Users/ammar...|Cooking in old Cr...|[k, bo, vin d, RN...|
+--------------------+--------------------+--------------------+
only showing top 2 rows



In [18]:
# Keep only the file name and its shingles; the raw "content" column is not
# needed for the remaining steps.
df_new = df.select("name", "shingles")

In [19]:
# Preview three rows of the reduced DataFrame.
df_new.show(n=3)

+--------------------+--------------------+
|                name|            shingles|
+--------------------+--------------------+
|file:/Users/ammar...|[k, bo, ted m, s-...|
|file:/Users/ammar...|[k, bo, vin d, RN...|
|file:/Users/ammar...|[ted m,  Litt, s ...|
+--------------------+--------------------+
only showing top 3 rows



In [20]:
# Now let's use CountVectorizer to get the term-frequency vectors.
from pyspark.ml.feature import CountVectorizer

In [21]:
from pyspark.mllib.feature import HashingTF

In [22]:
from pyspark.mllib.feature import IDF

In [23]:
# Earlier attempt: removing the words that appear in fewer than 5 documents
#cv = CountVectorizer(inputCol="shingles", outputCol="features", vocabSize=50000, minDF=5.0)

In [36]:
# Fit a CountVectorizer on the shingles (vocabulary capped at 50000 terms,
# minDF of 10.0) and turn each document's shingles into a "features" vector.
cv = CountVectorizer(
    inputCol="shingles",
    outputCol="features",
    vocabSize=50000,
    minDF=10.0,
)
model = cv.fit(df_new)
result = model.transform(df_new)

In [37]:
from pyspark.ml.feature import MinHashLSH

In [38]:
 # Configure MinHash LSH over the "features" vectors; the fixed seed makes the
 # hashing repeatable. No number of hash tables is given here.
 mh = MinHashLSH(
     inputCol="features",
     outputCol="hashes",
     seed=12345,
 )

In [39]:
# Fit the MinHash LSH model on the vectorized documents. Note that this rebinds
# `model`, which until now referred to the fitted CountVectorizer model.
model = mh.fit(dataset=result)

In [40]:
# Apply the fitted MinHash model to the vectorized documents; this adds the
# "hashes" column that is used below to find documents with similar content.
final = model.transform(dataset=result)

In [41]:
# Expose the hashed documents to Spark SQL, then list every distinct
# (file name, hashes) pair ordered by hashes, so that documents sharing the
# same hashes end up next to each other.
final.createOrReplaceTempView(name="final")
grouped = spark.sql(
    sqlQuery="select a.name as fileName,a.hashes as hashcode from final a  group by a.hashes,a.name order by a.hashes"
)
 

In [42]:
# Display the grouped file names with their hashes; show() is called without
# arguments, and the rendered output below is limited to the top 20 rows.
grouped.show()

+--------------------+-----------+
|            fileName|   hashcode|
+--------------------+-----------+
|file:/Users/ammar...|[[32321.0]]|
|file:/Users/ammar...|[[32321.0]]|
|file:/Users/ammar...|[[32321.0]]|
|file:/Users/ammar...|[[32321.0]]|
|file:/Users/ammar...|[[32321.0]]|
|file:/Users/ammar...|[[32321.0]]|
|file:/Users/ammar...|[[32321.0]]|
|file:/Users/ammar...|[[32321.0]]|
|file:/Users/ammar...|[[32321.0]]|
|file:/Users/ammar...|[[32321.0]]|
|file:/Users/ammar...|[[32321.0]]|
|file:/Users/ammar...|[[32321.0]]|
|file:/Users/ammar...|[[32321.0]]|
|file:/Users/ammar...|[[32321.0]]|
|file:/Users/ammar...|[[32321.0]]|
|file:/Users/ammar...|[[32321.0]]|
|file:/Users/ammar...|[[32321.0]]|
|file:/Users/ammar...|[[32321.0]]|
|file:/Users/ammar...|[[32321.0]]|
|file:/Users/ammar...|[[32321.0]]|
+--------------------+-----------+
only showing top 20 rows

