In [1]:
# env : pixlake
# we are focusing on pyspark dataframe processing
# documentation https://spark.apache.org/docs/2.4.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame
%load_ext autoreload
%autoreload 2

In [2]:
# make your auto-completion faster
# https://stackoverflow.com/questions/40536560/ipython-and-jupyter-autocomplete-not-working
%config Completer.use_jedi = False

In [3]:
import os
import sys

def get_workstation_spark_path(where_are_you : str) -> str:
    """Look up the directory path associated with a workstation name.

    The caller lists this directory to see the installed Spark versions and
    joins a version folder onto it to build SPARK_HOME.

    Args:
        where_are_you: Workstation name; 'titan' and 'thor' are recognised.

    Returns:
        The path string registered for that workstation.

    Raises:
        ValueError: If the name is not one of the recognised workstations.
    """
    if where_are_you == 'thor':
        return '/opt/spark/versions'
    if where_are_you == 'titan':
        return '/home/data/ryanchao2012/lib'
    raise ValueError("wrong work station name")

# Locate the Spark installations on the workstation this notebook runs on
# and list what is available there.
spark_path = get_workstation_spark_path('thor')
installed_versions = os.listdir(spark_path)
print('You have pyspark version : ', installed_versions)

# Use the interpreter running this kernel for PySpark, and select the Spark
# build to load (spark-2.3 and spark-2.4 are the candidates noted here).
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'SPARK_HOME': os.path.join(spark_path, 'spark-2.3'),
})

You have pyspark version :  ['spark-2.3', 'spark-3.1.2-bin-hadoop2.7', 'spark-3.0', 'spark-3.0.1-bin-hadoop2.7', 'spark-2.3.4-bin-hadoop2.7', 'spark-2.4.7-bin-hadoop2.7', 'spark-2.4', 'spark-3.1']


In [4]:
# Echo SPARK_HOME to confirm which Spark installation was selected.
os.environ['SPARK_HOME']

'/opt/spark/versions/spark-2.3'

In [5]:
from os.path import join
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession as Session
from pyspark.sql import DataFrame
from pyspark import SparkConf as Conf
from pyspark.sql import functions as F, Window as W, types as T
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Short alias for building column expressions: C("x") is F.col("x").
C = F.col

# Presumably the shared random seed for cells that need reproducible results.
SEED = 42


In [6]:
# Spark settings for this notebook: dynamic partition overwrite for writes,
# a 4g driver, and at most 1g of results returned to the driver.
conf = Conf().setAll([
    ('spark.sql.sources.partitionOverwriteMode', 'dynamic'),
    ('spark.driver.memory', '4g'),
    ('spark.driver.maxResultSize', '1g'),
])

In [7]:
# Build (or reuse) a local two-thread session named 'ml-utils' that carries
# the settings collected in `conf`.
session_builder = Session.builder.appName('ml-utils').master('local[2]')
spark = session_builder.config(conf=conf).getOrCreate()

In [8]:
# Show the active session object as the cell output.
spark

# Numbers in column

In [9]:
# L2

In [10]:
# L2 norm of each (x, y) pair, i.e. sqrt(x^2 + y^2), via the built-in hypot

columns = ["x", "y"]
data = [(1, 3), (2, 4), (3, 5)]

points = spark.createDataFrame(data, columns)
points.withColumn('distance', F.hypot(C("x"), C("y"))).show()

+---+---+------------------+
|  x|  y|          distance|
+---+---+------------------+
|  1|  3|3.1622776601683795|
|  2|  4|  4.47213595499958|
|  3|  5| 5.830951894845301|
+---+---+------------------+



# Vectors

In [11]:
from pyspark.ml.linalg import Vectors

In [12]:
# 1 create sparse vectors, once from index/value lists and once from a dict
cols = ['id','features']
data = [
    (0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0])),   # indices list + values list
    (1, Vectors.sparse(6, {3: 1.0, 4: 1.0, 5: 1.0})),     # {index: value} dict
]

df = spark.createDataFrame(data, cols)

# vertical, untruncated layout keeps the whole vector readable
df.show(n=5, vertical=True, truncate=False)
df.printSchema()

-RECORD 0-----------------------------
 id       | 0                         
 features | (6,[0,1,2],[1.0,1.0,1.0]) 
-RECORD 1-----------------------------
 id       | 1                         
 features | (6,[3,4,5],[1.0,1.0,1.0]) 

root
 |-- id: long (nullable = true)
 |-- features: vector (nullable = true)



In [13]:
# 2 create dense vectors from python lists and try arithmetic on them

v = Vectors.dense([1, 2])
u = Vectors.dense([3, 4])

# each operator is applied element by element (see the printed results)
arithmetic_results = [
    v + u,
    2 - v,
    v / 2,
    v * u,
    u / v,
    u % 2,
    -v,
]
print(*arithmetic_results, sep='\n')

[4.0,6.0]
[1.0,0.0]
[0.5,1.0]
[3.0,8.0]
[3.0,2.0]
[1.0,0.0]
[-1.0,-2.0]


In [14]:
# 3 convert vector into np.array and vice versa
# https://spark.apache.org/docs/2.3.4/api/python/pyspark.ml.html#module-pyspark.ml.linalg

# MLlib works with NumPy arrays for dense vectors (and scipy.sparse for sparse ones)
v, u = (Vectors.dense(np.array(values)) for values in ([1, 2], [3, 4]))

# toArray() goes the other way, back to a NumPy array
np_v, np_u = v.toArray(), u.toArray()

print(u, v, np_v, np_u, type(np_u))
    

[3.0,4.0] [1.0,2.0] [1. 2.] [3. 4.] <class 'numpy.ndarray'>


In [15]:
# 3 Converting embedding vector between python dict / pandas dataframe / pyspark dataframe
# (pandas is skipped on purpose: an np.array per cell does not fit a pandas dataframe well)
embedding_size = 256

# Draw from a generator seeded with SEED so the embeddings, and therefore the
# cell output, are the same on every run.
rng = np.random.RandomState(SEED)

image_info = {
    'http://url_1.jpg' : rng.random_sample(size=embedding_size),
    'http://url_2.jpg' : rng.random_sample(size=embedding_size),
    'http://url_3.jpg' : rng.random_sample(size=embedding_size)
}

############# python dict to pyspark dataframe #################

data = [(url, Vectors.dense(vec)) for url, vec in image_info.items()]

col = ['url','embedding']

sdf = spark.createDataFrame(data, col)

sdf.printSchema()
sdf.show()

######### pyspark data frame to python dict #####################

# collect() brings every row (and its vector) back to the driver
convert_image_info = {
    row.url : row.embedding.toArray()
    for row in sdf.collect()
}

# the round trip must give back the same urls and the same embeddings
assert convert_image_info.keys() == image_info.keys()
for url, embedding in convert_image_info.items():
    print(np.allclose(embedding, image_info[url]))


root
 |-- url: string (nullable = true)
 |-- embedding: vector (nullable = true)

+----------------+--------------------+
|             url|           embedding|
+----------------+--------------------+
|http://url_1.jpg|[0.25034416553269...|
|http://url_2.jpg|[0.20125242047691...|
|http://url_3.jpg|[0.64865693634310...|
+----------------+--------------------+

True
True
True


In [16]:
# 4 convert sparse vector from scipy / np into pyspark dataframe
# sparse vector
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html#scipy.sparse.csr_matrix
# from scipy.sparse import csr_matrix, coo_matrix

# row = np.array([0, 0, 1, 2, 2, 2])
# col = np.array([0, 2, 2, 0, 1, 2])
# data = np.array([1, 2, 3, 4, 5, 6])

# csr = csr_matrix((data, (row, col)), shape=(3, 3))

# coo = coo_matrix(csr)

# for i,j,v in zip(coo.row, coo.col, coo.data):
#     print( "(%d, %d), %s" % (i,j,v))


In [17]:
# 5 Convert sparse vector to dense vector?
# 30 mins
# frequencyDenseVectors = frequencyVectors.map(lambda vector: DenseVector(vector.toArray()))


In [18]:
# 6 access the element in a column of vector

embedding_size = 3

# Seeded generator, so the vectors displayed below are the same on every run.
rng = np.random.RandomState(SEED)

image_info = {
    'http://url_1.jpg' : rng.random_sample(size=embedding_size),
    'http://url_2.jpg' : rng.random_sample(size=embedding_size),
    'http://url_3.jpg' : rng.random_sample(size=embedding_size)
}

############# python dict to pyspark dataframe #################

data = [(url, Vectors.dense(vec)) for url, vec in image_info.items()]

col = ['url','embedding']

sdf = spark.createDataFrame(data, col)

sdf.printSchema()
sdf.show(truncate=False)

##### sol 1: a python udf, since the pyspark vector api is awkward for element access ###########

def ith_(vector, i):
    """Return element ``i`` of ``vector`` as a float, or None when unavailable.

    Meant to be wrapped as a Spark UDF, so it yields None (a null cell)
    instead of raising when the row holds no vector or the index is out of
    range.

    Args:
        vector: An indexable sequence of numbers (e.g. a pyspark ml vector),
            or None for a null cell.
        i: Position of the element to extract.

    Returns:
        The element converted to ``float``, or None.
    """
    if vector is None:
        return None
    try:
        return float(vector[i])
    except (ValueError, IndexError):
        # Out-of-range access raises IndexError for NumPy-backed vectors and
        # plain sequences; ValueError stays covered for vector types that
        # report it that way.
        return None

# wrap the python helper as a column function that returns doubles
ith = F.udf(ith_, T.DoubleType())

##### sol 2 VectorSlicer ##########

from pyspark.ml.feature import VectorSlicer

# keep only position 0 of every embedding, written to the 'by_slicer' column
slicer = VectorSlicer(inputCol='embedding', outputCol='by_slicer', indices=[0])
o_sdf = slicer.transform(sdf)

# show the udf result first, then the slicer result
first_by_udf = ith("embedding", F.lit(0))
sdf.withColumn('by_udf', first_by_udf).show(truncate=False)
o_sdf.show(truncate=False)

root
 |-- url: string (nullable = true)
 |-- embedding: vector (nullable = true)

+----------------+----------------------------------------------------------+
|url             |embedding                                                 |
+----------------+----------------------------------------------------------+
|http://url_1.jpg|[0.4538502149329343,0.6324346121623683,0.8467809749877175]|
|http://url_2.jpg|[0.863370284935342,0.8533806976561991,0.42447456330736744]|
|http://url_3.jpg|[0.5709444120698343,0.9080112046349297,0.8900165318600829]|
+----------------+----------------------------------------------------------+

+----------------+----------------------------------------------------------+------------------+
|url             |embedding                                                 |by_udf            |
+----------------+----------------------------------------------------------+------------------+
|http://url_1.jpg|[0.4538502149329343,0.6324346121623683,0.8467809749877175]|0.4

In [19]:
# 7 
# Spark sparse vector 2 scipy csr matrix
# https://stackoverflow.com/questions/40557577/pyspark-sparse-vectors-to-scipy-sparse-matrix

from pyspark.ml.linalg import SparseVector
from pyspark.rdd import PipelinedRDD
from operator import attrgetter

# three sparse vectors of size 5, each wrapped as a one-field record
sparse_rows = [
    SparseVector(5, [0, 2], [1.0, 3.0]),
    SparseVector(5, [1], [4.0]),
    SparseVector(5, [2], [1.0]),
]
records = [(vec,) for vec in sparse_rows]

df = spark.sparkContext.parallelize(records).toDF(["features"])
df.show(truncate=False)

from scipy.sparse import vstack, csr_matrix
import numpy as np

def as_matrix(vec : SparseVector) -> csr_matrix:
    """Turn one sparse vector into a single-row scipy CSR matrix.

    The vector's values and indices become the row's data and column
    indices, the row pointer spans all stored values, and the resulting
    shape is (1, vec.size).
    """
    row_pointer = np.array([0, vec.values.size])
    return csr_matrix((vec.values, vec.indices, row_pointer), (1, vec.size))


# sanity-check the helper on a single vector first
test_sparse_matrix = as_matrix(SparseVector(5, [0, 2], [1.0, 3.0]))
print('test the typing : ', test_sparse_matrix.todense())
print()

# one single-row CSR matrix per dataframe record ...
features : PipelinedRDD = df.rdd.map(attrgetter("features"))
mats : PipelinedRDD = features.map(as_matrix)

# ... stacked pairwise into one matrix
mat : csr_matrix = mats.reduce(lambda upper, lower: vstack([upper, lower]))

print(type(features), type(mats), type(mat), mat.todense(), sep='\n')



+-------------------+
|features           |
+-------------------+
|(5,[0,2],[1.0,3.0])|
|(5,[1],[4.0])      |
|(5,[2],[1.0])      |
+-------------------+

test the typing :  [[1. 0. 3. 0. 0.]]

<class 'pyspark.rdd.PipelinedRDD'>
<class 'pyspark.rdd.PipelinedRDD'>
<class 'scipy.sparse.csr.csr_matrix'>
[[1. 0. 3. 0. 0.]
 [0. 4. 0. 0. 0.]
 [0. 0. 1. 0. 0.]]


In [20]:
# 8 create onehot sparse vector


# Sampling

In [30]:
#  Stratified sampling in Spark

data = [
    ('a1','美味食記'),
    ('a2','美味食記'),
    ('a3','美味食記'),
    ('a4','美味食記'),
    ('a5','國內旅遊'),
    ('a6','國內旅遊'),
    ('a7','不設分類'),
    ('a8','不設分類'),
    ('a9','不設分類'),
]

sdf = spark.createDataFrame(data, ['link','category'])

# Take 60% of each category into the training set.
# This is an approximate solution: the share actually drawn per category
# can differ from the requested fraction.
train_fraction = 0.6
fractions = {
    category: train_fraction
    for category in ('美味食記', '國內旅遊', '不設分類')
}
trn = sdf.sampleBy("category", fractions=fractions, seed=2)

# Rows that were not drawn for training form the test set.
tst = sdf.subtract(trn)

trn.show()
tst.show()

+----+--------+
|link|category|
+----+--------+
|  a1|    美味食記|
|  a2|    美味食記|
|  a3|    美味食記|
|  a4|    美味食記|
|  a5|    國內旅遊|
|  a7|    不設分類|
|  a8|    不設分類|
+----+--------+

+----+--------+
|link|category|
+----+--------+
|  a6|    國內旅遊|
|  a9|    不設分類|
+----+--------+

