In [1]:
# env : pixlake
# this notebook focuses on pyspark dataframe processing
# documentation https://spark.apache.org/docs/2.4.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame
%load_ext autoreload
%autoreload 2

In [2]:
# make auto-completion faster
# https://stackoverflow.com/questions/40536560/ipython-and-jupyter-autocomplete-not-working
%config Completer.use_jedi = False

In [3]:
import os
import sys

def get_workstation_spark_path(where_are_you : str) -> str:
    """Return the directory used as the Spark installation root on a workstation.

    Parameters
    ----------
    where_are_you : str
        Workstation name; 'titan' and 'thor' are the only names recognised.

    Returns
    -------
    str
        Absolute path configured for that workstation.

    Raises
    ------
    ValueError
        If the name is not one of the recognised workstations.
    """
    known_workstations = (
        ('titan', '/home/data/ryanchao2012/lib'),
        ('thor', '/opt/spark/versions'),
    )
    # Walk the table and hand back the first path whose name matches.
    for name, install_root in known_workstations:
        if where_are_you == name:
            return install_root
    raise ValueError("wrong work station name")

# Installation root for the workstation this notebook runs on.
spark_path = get_workstation_spark_path('thor')

# List what is installed under that root so a version can be picked below.
print('You have pyspark version : ', os.listdir(spark_path))

# PYSPARK_PYTHON -> the interpreter running this kernel.
# SPARK_HOME     -> the chosen install under spark_path (e.g. spark-2.3, spark-2.4).
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    SPARK_HOME=os.path.join(spark_path, 'spark-2.3'),
)

You have pyspark version :  ['spark-2.3', 'spark-3.0', 'spark-3.0.1-bin-hadoop2.7', 'spark-2.3.4-bin-hadoop2.7', 'spark-2.4.7-bin-hadoop2.7', 'spark-2.4']


In [4]:
# Display the SPARK_HOME value that was just written to the environment.
os.environ['SPARK_HOME']

'/opt/spark/versions/spark-2.3'

In [18]:
from os.path import join
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession as Session
from pyspark.sql import DataFrame
from pyspark import SparkConf as Conf
from pyspark.sql import functions as F, Window as W, types as T
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Shorthand so a column reference can be written as C('name') instead of F.col('name').
C = F.col

# Fixed seed constant for any random draws that need to be reproducible.
SEED = 42


In [6]:
# Spark settings handed to the session builder; applied in the order listed.
conf = Conf().setAll([
    ('spark.sql.sources.partitionOverwriteMode', 'dynamic'),
    ('spark.driver.memory', '4g'),
    ('spark.driver.maxResultSize', '1g'),
])

In [7]:
# Create the session named 'ml-utils' on the local[2] master, or reuse one that
# already exists, using the settings collected in `conf`.
spark = (
    Session.builder.appName('ml-utils').master('local[2]')
    .config(conf=conf).getOrCreate()
)

In [8]:
# Display the session object to confirm it was created.
spark

# Vectors

In [9]:
from pyspark.ml.linalg import Vectors

In [10]:
# 1 create sparse vector from dict and list
# Each sparse vector is described by its size plus the positions and values of the listed entries.
data = [
    # size 6; positions 0, 1, 2 hold 1.0 (parallel index list and value list)
    (0, Vectors.sparse(6, [0, 1, 2],[1.0, 1.0, 1.0])),
    # size 6; positions 3, 4, 5 hold 1.0 (dict of position -> value)
    (1, Vectors.sparse(6, {3:1.0, 4:1.0, 5:1.0}))
]

# column names for the (id, vector) tuples above
cols = ['id','features']

df = spark.createDataFrame(data, cols)
# vertical=True prints one record per block; truncate=False keeps the full vector text
df.show(n=5, vertical=True, truncate=False)
# the schema reports the features column with the `vector` type
df.printSchema()

-RECORD 0-----------------------------
 id       | 0                         
 features | (6,[0,1,2],[1.0,1.0,1.0]) 
-RECORD 1-----------------------------
 id       | 1                         
 features | (6,[3,4,5],[1.0,1.0,1.0]) 

root
 |-- id: long (nullable = true)
 |-- features: vector (nullable = true)



In [13]:
# 2 create dense vector from python list
# (the integer inputs are printed back as floats)

v = Vectors.dense([1, 2])
u = Vectors.dense([3, 4])

# Arithmetic on dense vectors is applied element by element; one result per output line.
print(
    v + u,   # vector + vector
    2 - v,   # scalar - vector
    v / 2,   # vector / scalar
    v * u,   # element-wise product (not a dot product)
    u / v,   # element-wise division
    u % 2,   # element-wise modulo
    -v,      # negation
    sep='\n'
)

[4.0,6.0]
[1.0,0.0]
[0.5,1.0]
[3.0,8.0]
[3.0,2.0]
[1.0,0.0]
[-1.0,-2.0]


In [49]:
# 3 convert vector into np.array and vice versa
# https://spark.apache.org/docs/2.3.4/api/python/pyspark.ml.html#module-pyspark.ml.linalg

# np.array -> dense vector goes through Vectors.dense,
# dense vector -> np.array goes through toArray().
# (per the linked docs: NumPy arrays back dense vectors, scipy.sparse can be used for sparse ones)
v, u = (Vectors.dense(np.array(values)) for values in ([1, 2], [3, 4]))
np_v, np_u = (vec.toArray() for vec in (v, u))

# vectors first, then their ndarray counterparts, then the ndarray type
print(u, v, np_v, np_u, type(np_u))
    

[3.0,4.0] [1.0,2.0] [1. 2.] [3. 4.] <class 'numpy.ndarray'>


In [50]:
# 3 Converting embedding vector between python dict / pandas dataframe / pyspark dataframe
# pandas dataframe is left out here: an np.array does not fit well in a dataframe cell
embedding_size = 256

# Draw from a generator seeded with the notebook-level SEED so the embeddings
# (and the table printed below) are identical on every fresh run.
rng = np.random.RandomState(SEED)

image_info = {
    'http://url_1.jpg' : rng.random_sample(size=embedding_size),
    'http://url_2.jpg' : rng.random_sample(size=embedding_size),
    'http://url_3.jpg' : rng.random_sample(size=embedding_size)
}

############# python dict to pyspark dataframe #################

# one (url, dense vector) row per dict entry
data = [(url, Vectors.dense(vec)) for url, vec in image_info.items()]

col = ['url','embedding']

sdf = spark.createDataFrame(data, col)

sdf.printSchema()
sdf.show()

######### pyspark data frame to python dict #####################

# of course we need to collect the vectors to the driver

convert_image_info = {
    row.url : row.embedding.toArray()
    for row in sdf.collect()
}

# round-trip check: every recovered embedding should match the one we started with
for url in convert_image_info:
    print(np.isclose(convert_image_info[url], image_info[url]).all())


root
 |-- url: string (nullable = true)
 |-- embedding: vector (nullable = true)

+----------------+--------------------+
|             url|           embedding|
+----------------+--------------------+
|http://url_1.jpg|[0.38572632715821...|
|http://url_2.jpg|[0.24131721949581...|
|http://url_3.jpg|[0.29871468668232...|
+----------------+--------------------+

True
True
True


In [None]:
# 4 convert sparse vector from scipy / np into pyspark dataframe
# sparse vector
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html#scipy.sparse.csr_matrix
# from scipy.sparse import csr_matrix, coo_matrix

# row = np.array([0, 0, 1, 2, 2, 2])
# col = np.array([0, 2, 2, 0, 1, 2])
# data = np.array([1, 2, 3, 4, 5, 6])

# csr = csr_matrix((data, (row, col)), shape=(3, 3))

# coo = coo_matrix(csr)

# for i,j,v in zip(coo.row, coo.col, coo.data):
#     print( "(%d, %d), %s" % (i,j,v))


In [11]:
# 5 Convert sparse vector to dense vector?
# 30 mins
# frequencyDenseVectors = frequencyVectors.map(lambda vector: DenseVector(vector.toArray()))


In [60]:
# 6 access the element in a column of vector

embedding_size = 3

# Draw from a generator seeded with the notebook-level SEED so the embeddings
# shown in the tables below are identical on every fresh run.
rng = np.random.RandomState(SEED)

image_info = {
    'http://url_1.jpg' : rng.random_sample(size=embedding_size),
    'http://url_2.jpg' : rng.random_sample(size=embedding_size),
    'http://url_3.jpg' : rng.random_sample(size=embedding_size)
}

############# python dict to pyspark dataframe #################

# one (url, dense vector) row per dict entry
data = [(url, Vectors.dense(vec)) for url, vec in image_info.items()]

col = ['url','embedding']

sdf = spark.createDataFrame(data, col)

sdf.printSchema()
sdf.show(truncate=False)

##### sol 1 python udf: index the vector row by row ###########

def ith_(vector, i):
    """Return element ``i`` of ``vector`` as a Python float, or None if unavailable.

    Written to be wrapped as a Spark UDF, where returning None produces a null
    cell instead of failing the whole job.

    Parameters
    ----------
    vector : indexable sequence of numbers, or None
        The row's vector value (anything supporting ``vector[i]``).
    i : int or None
        Position to read.

    Returns
    -------
    float or None
        ``float(vector[i])``, or None when either argument is None or the
        index cannot be resolved.
    """
    # Null inputs propagate as null rather than raising TypeError.
    if vector is None or i is None:
        return None
    try:
        return float(vector[i])
    except (IndexError, ValueError):
        # Out-of-range positions raise IndexError on lists and NumPy arrays;
        # ValueError is still caught, as the original handler did.
        return None

# Register the Python helper as a UDF whose result column is a double.
ith = F.udf(ith_, T.DoubleType())

##### sol 2 VectorSlicer ##########

from pyspark.ml.feature import VectorSlicer

# Keep only position 0 of `embedding`, written to a new `by_slicer` column.
slicer = VectorSlicer(inputCol='embedding', outputCol='by_slicer', indices=[0])
o_sdf = slicer.transform(sdf)

# sol 1 result: element 0 pulled out by the UDF
sdf.withColumn('by_udf', ith('embedding', F.lit(0))).show(truncate=False)

# sol 2 result: element 0 selected by the slicer
o_sdf.show(truncate=False)

root
 |-- url: string (nullable = true)
 |-- embedding: vector (nullable = true)

+----------------+-----------------------------------------------------------+
|url             |embedding                                                  |
+----------------+-----------------------------------------------------------+
|http://url_1.jpg|[0.639014501169914,0.1791115748733353,0.7634177225718759]  |
|http://url_2.jpg|[0.9484816348336237,0.47793686383603895,0.5564647461191232]|
|http://url_3.jpg|[0.41118254425923606,0.6130700148483098,0.4367534799715509]|
+----------------+-----------------------------------------------------------+

+----------------+-----------------------------------------------------------+-------------------+
|url             |embedding                                                  |by_udf             |
+----------------+-----------------------------------------------------------+-------------------+
|http://url_1.jpg|[0.639014501169914,0.1791115748733353,0.763417722