In [1]:
# env : pixlake
# we focuing on pyspark dataframe processing
# documentation https://spark.apache.org/docs/2.4.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame
%load_ext autoreload
%autoreload 2

In [2]:
# make you auto compeletion faster
# https://stackoverflow.com/questions/40536560/ipython-and-jupyter-autocomplete-not-working
%config Completer.use_jedi = False

In [3]:
import os
import sys

def get_workstation_spark_path(where_are_you : str) -> str:
    if where_are_you == 'titan':
        return '/home/data/ryanchao2012/lib'
    elif where_are_you == 'thor':
        return '/opt/spark/versions'
    else:
        raise ValueError("wrong work station name")

spark_path = get_workstation_spark_path('thor')

print('You have pyspark version : ', os.listdir(spark_path))
os.environ['PYSPARK_PYTHON'] = sys.executable
# spark-2.3, spark-2.4
os.environ['SPARK_HOME'] = os.path.join(spark_path,'spark-2.3')

You have pyspark version :  ['spark-2.3', 'spark-3.1.2-bin-hadoop2.7', 'spark-3.0', 'spark-3.0.1-bin-hadoop2.7', 'spark-2.3.4-bin-hadoop2.7', 'spark-2.4.7-bin-hadoop2.7', 'spark-2.4', 'spark-3.1']


In [4]:
os.environ['SPARK_HOME']

'/opt/spark/versions/spark-2.3'

In [5]:
from os.path import join
import pandas as pd
from pyspark.sql import SparkSession as Session
from pyspark.sql import DataFrame
from pyspark import SparkConf as Conf
from pyspark.sql import functions as F, Window as W, types as T
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
C = F.col

In [6]:
conf = (Conf()
    .set('spark.sql.sources.partitionOverwriteMode', 'dynamic')
    .set('spark.driver.memory', '4g')
    .set('spark.driver.maxResultSize', '1g')
   )

In [7]:
spark = (Session
     .builder
     .appName('pyspark-challenge')
     .master('local[2]')
     .config(conf=conf)
     .getOrCreate())

In [8]:
spark

# Docs

[Introduction](https://spark.apache.org/docs/2.2.3/ml-features.html#minhash-for-jaccard-distance)

[api doc](https://spark.apache.org/docs/2.3.4/api/python/pyspark.ml.html#pyspark.ml.feature.MinHashLSH)


LSH used in clustering, approximate mearest neighbor search and outlier detection with large datasets

 # Min Hash for Jaccard Distance
 
 * Input data - shingling boolean vector
 * fit a model - build sig matrix
     * you can find similarity pairs by send a query or apply a join
     

In [9]:
from pyspark.ml.feature import MinHashLSH, BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors

In [10]:
# Vectors.sparse??

# Vector.sparse(sparse, [0,1,2], [1.0, 1.0, 1.0]) 
# means a 6 dimension vector [1.0, 1.0, 1.0, 0, 0, 0]
# you can give a two list or a dict to create Vector.sparse

MinHashLSH??

# inputCol=None, outputCol=None, seed=None, numHashTables=1
# where's the b and r ?
#Key means the query

In [11]:
#

dataA = [(0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),),
         (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),),
         (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),)]
dfA = spark.createDataFrame(dataA, ["id", "features"])

dataB = [(3, Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),),
         (4, Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),),
         (5, Vectors.sparse(6, [1, 2, 4], [1.0, 1.0, 1.0]),)]
dfB = spark.createDataFrame(dataB, ["id", "features"])

key = Vectors.sparse(6, [1, 3], [1.0, 1.0])

# key = Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0])

mh = MinHashLSH(inputCol="features",
                outputCol="hashes",
                seed = 42,
                numHashTables=50)
model = mh.fit(dfA)

# Feature Transformation
# print("The hashed dataset where hashed values are stored in the column 'hashes':")
# model.transform(dfA).show(vertical=True,
#                           truncate=False)

# Compute the locality sensitive hashes for the input rows, then perform approximate
# similarity join.
# We could avoid computing hashes by passing in the already-transformed dataset, e.g.
# `model.approxSimilarityJoin(transformedA, transformedB, 0.6)`
print("Approximately joining dfA and dfB on distance smaller than 0.6:")
model.approxSimilarityJoin(dfA, dfB, 0.6, distCol="JaccardDistance")\
    .select(C("datasetA.id").alias("idA"),
            C("datasetB.id").alias("idB"),
            C("JaccardDistance")).show(vertical=True,truncate=False)

# Compute the locality sensitive hashes for the input rows, then perform approximate nearest
# neighbor search.
# We could avoid computing hashes by passing in the already-transformed dataset, e.g.
# `model.approxNearestNeighbors(transformedA, key, 2)`
# It may return less than 2 rows when not enough approximate near-neighbor candidates are
# found.
print("Approximately searching dfA for 5 nearest neighbors of the key:")
model.approxNearestNeighbors(dfA, key, 5).show()

Approximately joining dfA and dfB on distance smaller than 0.6:
-RECORD 0--------------
 idA             | 1   
 idB             | 5   
 JaccardDistance | 0.5 
-RECORD 1--------------
 idA             | 0   
 idB             | 5   
 JaccardDistance | 0.5 
-RECORD 2--------------
 idA             | 1   
 idB             | 4   
 JaccardDistance | 0.5 
-RECORD 3--------------
 idA             | 2   
 idB             | 5   
 JaccardDistance | 0.5 

Approximately searching dfA for 5 nearest neighbors of the key:
+---+--------------------+--------------------+-------+
| id|            features|              hashes|distCol|
+---+--------------------+--------------------+-------+
|  0|(6,[0,1,2],[1.0,1...|[[-1.052712271E9]...|   0.75|
|  1|(6,[2,3,4],[1.0,1...|[[-6.60386174E8],...|   0.75|
+---+--------------------+--------------------+-------+



In [12]:
print(dir(mh))
print()
print(dir(model))
print()

print(mh.explainParams())
print(model.explainParams())


['__class__', '__del__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__metaclass__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_call_java', '_clear', '_copyValues', '_copy_params', '_create_from_java_class', '_create_model', '_create_params_from_java', '_defaultParamMap', '_dummy', '_empty_java_param_map', '_fit', '_fit_java', '_from_java', '_input_kwargs', '_java_obj', '_make_java_param_pair', '_new_java_array', '_new_java_obj', '_paramMap', '_params', '_randomUID', '_resetUid', '_resolveParam', '_set', '_setDefault', '_shouldOwn', '_to_java', '_transfer_param_map_from_java', '_transfer_param_map_to_java', '_transfer_params_from_java', '_transfer_params_to_java', 'copy', 'explainParam', 'explainParams', 'extractParamMap', 'fit', 'fitMultiple'

## Bucketed Random Projection for L2

In [13]:
data = [(0, Vectors.dense([-1.0, -1.0 ]),),
        (1, Vectors.dense([-1.0, 1.0 ]),),
        (2, Vectors.dense([1.0, -1.0 ]),),
        (3, Vectors.dense([1.0, 1.0]),)]

df = spark.createDataFrame(data, ["id", "features"])


brp = BucketedRandomProjectionLSH(
    inputCol="features",
    outputCol="hashes",
    seed=12345,
    bucketLength = 1.0,
    numHashTables = 10)


model = brp.fit(df)

# model.getBucketLength()


# model.transform(df).show()

data2 = [(4, Vectors.dense([2.0, 2.0 ]),),
         (5, Vectors.dense([2.0, 3.0 ]),),
         (6, Vectors.dense([3.0, 2.0 ]),),
         (7, Vectors.dense([3.0, 3.0]),)]


df2 = spark.createDataFrame(data2, ["id", "features"])
model.approxNearestNeighbors(df2, Vectors.dense([1.0, 2.0]), 1).collect()

df.printSchema()
df2.printSchema()

model.approxSimilarityJoin(df, df2, 100, distCol="EuclideanDistance").select(
    C("datasetA.id").alias("idA"),
    C("datasetB.id").alias("idB"),
    C("EuclideanDistance")).show()


root
 |-- id: long (nullable = true)
 |-- features: vector (nullable = true)

root
 |-- id: long (nullable = true)
 |-- features: vector (nullable = true)

+---+---+------------------+
|idA|idB| EuclideanDistance|
+---+---+------------------+
|  1|  4|3.1622776601683795|
|  3|  5|  2.23606797749979|
|  3|  7|2.8284271247461903|
|  1|  7|  4.47213595499958|
|  2|  5| 4.123105625617661|
|  3|  6|  2.23606797749979|
|  3|  4|1.4142135623730951|
|  2|  6| 3.605551275463989|
|  0|  6|               5.0|
|  2|  4|3.1622776601683795|
|  0|  5|               5.0|
|  2|  7|  4.47213595499958|
+---+---+------------------+



# Scalibility