In [1]:
# env : pixlake
# we focuing on pyspark dataframe processing
# documentation https://spark.apache.org/docs/2.4.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame
%load_ext autoreload
%autoreload 2

In [2]:
# make you auto compeletion faster
# https://stackoverflow.com/questions/40536560/ipython-and-jupyter-autocomplete-not-working
%config Completer.use_jedi = False

In [3]:
import os
import sys

def get_workstation_spark_path(where_are_you : str) -> str:
    if where_are_you == 'titan':
        return '/home/data/ryanchao2012/lib'
    elif where_are_you == 'thor':
        return '/opt/spark/versions'
    else:
        raise ValueError("wrong work station name")

spark_path = get_workstation_spark_path('thor')

print('You have pyspark version : ', os.listdir(spark_path))
os.environ['PYSPARK_PYTHON'] = sys.executable
# spark-2.3, spark-2.4
# os.environ['SPARK_HOME'] = os.path.join(spark_path,'spark-2.3')

# use spakr 3.1

os.environ['SPARK_HOME'] = '/opt/spark/versions/spark-3.1'

You have pyspark version :  ['spark-2.3', 'spark-3.1.2-bin-hadoop2.7', 'spark-3.0', 'spark-3.0.1-bin-hadoop2.7', 'spark-2.3.4-bin-hadoop2.7', 'spark-2.4.7-bin-hadoop2.7', 'spark-2.4', 'spark-3.1']


In [4]:
os.environ['SPARK_HOME']

'/opt/spark/versions/spark-3.1'

In [5]:
from os.path import join
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession as Session
from pyspark.sql import DataFrame
from pyspark import SparkConf as Conf
from pyspark.sql import functions as F, Window as W, types as T
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

C = F.col

SEED = 42


In [6]:
conf = (Conf()
    .set('spark.sql.sources.partitionOverwriteMode', 'dynamic')
    .set('spark.driver.memory', '4g')
    .set('spark.driver.maxResultSize', '1g')
   )

spark = (Session
     .builder
     .appName('utils')
     .master('local[2]')
     .config(conf=conf)
     .getOrCreate())

22/02/28 16:31:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/28 16:31:49 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [7]:
spark

# Numbers in column

In [8]:
# L2

In [9]:
# L2 distance

columns = ["x", "y"]
data = [
    (1,3),
    (2,4),
    (3,5)
]

(
    spark.createDataFrame(data, columns)
    .withColumn('distance',F.hypot(C("x"), C("y")))
).show()

[Stage 0:>                                                          (0 + 1) / 1]

+---+---+------------------+
|  x|  y|          distance|
+---+---+------------------+
|  1|  3|3.1622776601683795|
|  2|  4|  4.47213595499958|
|  3|  5| 5.830951894845301|
+---+---+------------------+



                                                                                

In [10]:
# cosine similarity

columns = ['article_id',
           "keyword", #a.k.a. vector component
           "tfidf" #a.k.a vector value
          ]
data = [
    (1,'奇異博士',15),
    (1,'marvel',7),
    (1,'電影',4),
    (2,'marvel',8),
    (2,'奇異博士',10),
    (2,'電影',2),
    (2,'驚奇隊長',10),
    (3,'電影',3),
    (3,'蝙蝠俠',20),
    
]

article = W.partitionBy('article_id')

vec_sdf = (
    spark.createDataFrame(data, columns)
    .withColumnRenamed('keyword','component')
    .withColumn('vector_length',
                F.sum(F.pow(C("tfidf"),2))
                 .over(article))
    .withColumn('vector_length',F.sqrt('vector_length'))
)

vec_sdf.show()

safe_cosine_sim : C = (
    C("inner_prod") / (C("a_vector_length") * C("b_vector_length") + 1e-10)
)

pairs_sdf = (
    vec_sdf
    .select(
        'component',
            C('article_id').alias('a_article_id'),
            C('tfidf').alias('a_tfidf'),
            C('vector_length').alias('a_vector_length')
            )
    .join(
        (
            vec_sdf
            .select(
                'component',
                    C('article_id').alias('b_article_id'),
                    C('tfidf').alias('b_tfidf'),
                    C('vector_length').alias('b_vector_length')
                )
        ),
        on=['component'],
    )
    .where(C("a_article_id") != C("b_article_id"))
    .withColumn( 'inner_prod',
                F.sum(C("a_tfidf") * C("b_tfidf"))
                 .over(W.partitionBy('a_article_id'))
               )
#     .withColumn('inner_prod',)
    .withColumn('cosine_sim',safe_cosine_sim)
)


print(pairs_sdf.count())
pairs_sdf.orderBy(-C("cosine_sim")).show(vertical=True,truncate=False)
pairs_sdf.printSchema()



+----------+---------+-----+------------------+
|article_id|component|tfidf|     vector_length|
+----------+---------+-----+------------------+
|         1| 奇異博士|   15|17.029386365926403|
|         1|   marvel|    7|17.029386365926403|
|         1|     電影|    4|17.029386365926403|
|         3|     電影|    3|20.223748416156685|
|         3|   蝙蝠俠|   20|20.223748416156685|
|         2|   marvel|    8|  16.3707055437449|
|         2| 奇異博士|   10|  16.3707055437449|
|         2|     電影|    2|  16.3707055437449|
|         2| 驚奇隊長|   10|  16.3707055437449|
+----------+---------+-----+------------------+





10




-RECORD 0------------------------------
 component       | marvel              
 a_article_id    | 1                   
 a_tfidf         | 7                   
 a_vector_length | 17.029386365926403  
 b_article_id    | 2                   
 b_tfidf         | 8                   
 b_vector_length | 16.3707055437449    
 inner_prod      | 226                 
 cosine_sim      | 0.8106661576414668  
-RECORD 1------------------------------
 component       | 電影                
 a_article_id    | 1                   
 a_tfidf         | 4                   
 a_vector_length | 17.029386365926403  
 b_article_id    | 2                   
 b_tfidf         | 2                   
 b_vector_length | 16.3707055437449    
 inner_prod      | 226                 
 cosine_sim      | 0.8106661576414668  
-RECORD 2------------------------------
 component       | 奇異博士            
 a_article_id    | 1                   
 a_tfidf         | 15                  
 a_vector_length | 17.029386365926403  
 b_art



# Vectors

In [79]:
from pyspark.ml.linalg import Vectors, DenseVector, SparseVector

In [12]:
# 1 create sparse vector from dict and list
data = [
    (0, Vectors.sparse(6, [0, 1, 2],[1.0, 1.0, 1.0])),
    (1, Vectors.sparse(6, {3:1.0, 4:1.0, 5:1.0}))
]

cols = ['id','features']

df = spark.createDataFrame(data, cols)
df.show(n=5, vertical=True, truncate=False)
df.printSchema()

-RECORD 0-----------------------------
 id       | 0                         
 features | (6,[0,1,2],[1.0,1.0,1.0]) 
-RECORD 1-----------------------------
 id       | 1                         
 features | (6,[3,4,5],[1.0,1.0,1.0]) 

root
 |-- id: long (nullable = true)
 |-- features: vector (nullable = true)



In [13]:
# 2 create dense vector from python list

v = Vectors.dense([1, 2])
u = Vectors.dense([3, 4])

print(
    v + u,
    2 - v,
    v / 2,
    v * u,
    u / v,
    u % 2,
    -v,
    sep='\n'
)

[4.0,6.0]
[1.0,0.0]
[0.5,1.0]
[3.0,8.0]
[3.0,2.0]
[1.0,0.0]
[-1.0,-2.0]


In [14]:
# 3 convert vector into np.array and vice versa
# https://spark.apache.org/docs/2.3.4/api/python/pyspark.ml.html#module-pyspark.ml.linalg

# dense vector MLlib use Numpy array type
# sparse vector, scipy.sparse
v = Vectors.dense(np.array([1, 2]))
u = Vectors.dense(np.array([3, 4]))

np_v = v.toArray()
np_u = u.toArray()
print(
    u,
    v,
    np_v,
    np_u,
    type(np_u)
     )
    

[3.0,4.0] [1.0,2.0] [1. 2.] [3. 4.] <class 'numpy.ndarray'>


In [80]:
# THIS ONLY CREATE PYTHON STRING OBJECT
# POOR PYSPARK API
dense_vec = DenseVector(np.array([1, 2]))
sparse_vec = SparseVector(5, [0, 2], [1.0, 3.0])

print(dense_vec)
print(sparse_vec)

for mark,obj in zip([dense_vec,sparse_vec],['dense','sparse']):
    print(f'{mark} vector method and attrs')
    print()
    print(type(obj),dir(obj))
    print()

[1.0,2.0]
(5,[0,2],[1.0,3.0])
[1.0,2.0] vector method and attrs

<class 'str'> ['__add__', '__class__', '__contains__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getnewargs__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__mod__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__rmod__', '__rmul__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'capitalize', 'casefold', 'center', 'count', 'encode', 'endswith', 'expandtabs', 'find', 'format', 'format_map', 'index', 'isalnum', 'isalpha', 'isascii', 'isdecimal', 'isdigit', 'isidentifier', 'islower', 'isnumeric', 'isprintable', 'isspace', 'istitle', 'isupper', 'join', 'ljust', 'lower', 'lstrip', 'maketrans', 'partition', 'replace', 'rfind', 'rindex', 'rjust', 'rpartition', 'rsplit', 'rstrip', 'split', 'splitlines', 'startswith', 'strip', 'swapcase', 'title', 'translate', 'upper

In [109]:
# method of sparse vector
# https://spark.apache.org/docs/3.1.2/api/python/reference/api/pyspark.ml.linalg.SparseVector.html
# https://spark.apache.org/docs/3.1.2/api/python/reference/api/pyspark.ml.linalg.DenseVector.html
# dot

a = SparseVector(4, [1, 3], [3.0, 4.0])
b = SparseVector(4, [2], [1.0])

print(a.dot(a))
print(a.dot(b))

# norm

print(a.norm(1), print(a.norm(2)))

# numNonzeros

print(a.numNonzeros())

# squared_distance
print(a.squared_distance(a))
print(a.squared_distance(b))

# toArray()
print(a.toArray())

25.0
0.0
5.0
7.0 None
2
0.0
26.0
[0. 3. 0. 4.]


In [15]:
# 3 Converting embedding vector between python dict / pandas dataframe / pyspark dataframe
# pandas dataframe is a fake issue, np.array will not fit in dataframe well
embedding_size = 256

image_info = {
    'http://url_1.jpg' : np.random.random(size=embedding_size),
    'http://url_2.jpg' : np.random.random(size=embedding_size),
    'http://url_3.jpg' : np.random.random(size=embedding_size)
}

############# python dict to pyspark dataframe #################

data = [(url, Vectors.dense(vec)) for url, vec in image_info.items()]

col = ['url','embedding']

sdf = spark.createDataFrame(data, col)

sdf.printSchema()
sdf.show()

######### pyspark data frame to python dict #####################

# of course we need to collect the vectors ro driver

convert_image_info = {
    row.url : row.embedding.toArray()
    for row in 
    sdf.collect()
}

for url in convert_image_info.keys():
    print(np.isclose(convert_image_info[url], image_info[url]).all())


root
 |-- url: string (nullable = true)
 |-- embedding: vector (nullable = true)

+----------------+--------------------+
|             url|           embedding|
+----------------+--------------------+
|http://url_1.jpg|[0.22535035245629...|
|http://url_2.jpg|[0.00638243237756...|
|http://url_3.jpg|[0.98896816084838...|
+----------------+--------------------+

True
True
True


In [16]:
# 4 convert sparse vector from scipy / np into pyspark dataframe
# sparse vector
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html#scipy.sparse.csr_matrix
# from scipy.sparse import csr_matrix, coo_matrix

# row = np.array([0, 0, 1, 2, 2, 2])
# col = np.array([0, 2, 2, 0, 1, 2])
# data = np.array([1, 2, 3, 4, 5, 6])

# csr = csr_matrix((data, (row, col)), shape=(3, 3))

# coo = coo_matrix(matrix)

# for i,j,v in zip(coo.row, coo.col, coo.data):
#     print( "(%d, %d), %s" % (i,j,v))


In [18]:
# 6 access the element in a column of vector

embedding_size = 3

image_info = {
    'http://url_1.jpg' : np.random.random(size=embedding_size),
    'http://url_2.jpg' : np.random.random(size=embedding_size),
    'http://url_3.jpg' : np.random.random(size=embedding_size)
}

############# python dict to pyspark dataframe #################

data = [(url, Vectors.dense(vec)) for url, vec in image_info.items()]

col = ['url','embedding']

sdf = spark.createDataFrame(data, col)

sdf.printSchema()
sdf.show(truncate=False)

##### sol 1 pyspark vector api suck ###########

def ith_(vector, i):
    try:
        return float(vector[i])
    except ValueError:
        return None

ith = F.udf(ith_, T.DoubleType())

##### sol 2 VectorSlicer ##########

from pyspark.ml.feature import VectorSlicer

slicer = VectorSlicer(inputCol='embedding', outputCol='by_slicer', indices=[0])

o_sdf = slicer.transform(sdf)

(
    sdf
    .withColumn('by_udf',ith("embedding", F.lit(0)))
).show(truncate=False)

o_sdf.show(truncate=False)

root
 |-- url: string (nullable = true)
 |-- embedding: vector (nullable = true)

+----------------+--------------------------------------------------------------+
|url             |embedding                                                     |
+----------------+--------------------------------------------------------------+
|http://url_1.jpg|[0.46216659110719005,0.45938098214983547,0.038087781028762424]|
|http://url_2.jpg|[0.6118475248691172,0.10542420267186592,0.4652360199396238]   |
|http://url_3.jpg|[0.855009243058626,0.5857817217020683,0.07805699066680649]    |
+----------------+--------------------------------------------------------------+

+----------------+--------------------------------------------------------------+-------------------+
|url             |embedding                                                     |by_udf             |
+----------------+--------------------------------------------------------------+-------------------+
|http://url_1.jpg|[0.4621665911071900

In [19]:
# 7 
# Spark sparse vector 2 scipy csr matrix
# https://stackoverflow.com/questions/40557577/pyspark-sparse-vectors-to-scipy-sparse-matrix

from pyspark.ml.linalg import SparseVector
from pyspark.rdd import PipelinedRDD
from operator import attrgetter

df = spark.sparkContext.parallelize([
    (SparseVector(5, [0, 2], [1.0, 3.0]), ),
    (SparseVector(5, [1], [4.0]), ),
    (SparseVector(5, [2], [1.0]), )
]).toDF(["features"])

df.show(truncate=False)

from scipy.sparse import vstack, csr_matrix
import numpy as np

def as_matrix(vec : SparseVector) -> csr_matrix:
    data, indices = vec.values, vec.indices
    shape = 1, vec.size
    return csr_matrix((data, indices, np.array([0, vec.values.size])), shape)


test_sparse_matrix = as_matrix(SparseVector(5, [0, 2], [1.0, 3.0]))
print(
    'test the typing : ',
    test_sparse_matrix.todense()
)
print()

mat : csr_matrix = (
    df
    .rdd
    .map(attrgetter("features"))
    .map(as_matrix)
    .reduce(lambda x, y: vstack([x, y]))
)

features : PipelinedRDD = df.rdd.map(attrgetter("features"))
mats : PipelinedRDD = features.map(as_matrix)
    
# mat : csr_matrix = mats.reduce(lambda x, y: vstack([x, y]))

print(
    type(features),
    type(mats),
    type(mat),
    mat.todense(),
    sep='\n'
)



+-------------------+
|features           |
+-------------------+
|(5,[0,2],[1.0,3.0])|
|(5,[1],[4.0])      |
|(5,[2],[1.0])      |
+-------------------+

test the typing :  [[1. 0. 3. 0. 0.]]

<class 'pyspark.rdd.PipelinedRDD'>
<class 'pyspark.rdd.PipelinedRDD'>
<class 'scipy.sparse.csr.csr_matrix'>
[[1. 0. 3. 0. 0.]
 [0. 4. 0. 0. 0.]
 [0. 0. 1. 0. 0.]]


In [20]:
# 8 create a sturct to sparse vector --> No built in api support

In [70]:
# 9 compute cosine of 2 sparse Vector

from itertools import combinations

from pyspark.ml import Pipeline

raw_data = [
    (0,SparseVector(5, [0, 2], [1.0, 3.0])),
    (1,SparseVector(5, [1], [4.0])),
    (2,SparseVector(5, [2], [1.0]))
]

# chain for flattern
# combinsations for build Choose 2 from N

data = []
for pair in combinations(raw_data,2):
    data.append(
        tuple((*pair[0],*pair[1]))
    )

df = (
    spark.createDataFrame(
        data,
        ['a_url_id','a_feature_vec','b_url_id','b_feature_vec']
    )
    )

df.show(truncate=False)

+--------+-------------------+--------+-------------+
|a_url_id|a_feature_vec      |b_url_id|b_feature_vec|
+--------+-------------------+--------+-------------+
|0       |(5,[0,2],[1.0,3.0])|1       |(5,[1],[4.0])|
|0       |(5,[0,2],[1.0,3.0])|2       |(5,[2],[1.0])|
|1       |(5,[1],[4.0])      |2       |(5,[2],[1.0])|
+--------+-------------------+--------+-------------+



In [71]:
# approach A
# build-in normalizer
# SUCK

try:   
    normalizers = []
    for col in ['a_feature_vec','b_feature_vec']:
        l2_norm = Normalizer()
        # due to poor spark api
        l2_norm.__setattr__('inputCol',col)
        l2_norm.__setattr__('outputCol',f'norm_{col}')
        normalizers.append(l2_norm)

    pipeline = Pipeline(stages=normalizers)

    model = pipeline.fit(df)

    model.transform(df)
    
except Exception as e:
    print(e)




Cannot recognize a pipeline stage of type <class 'pyspark.mllib.feature.Normalizer'>.


In [130]:
# another approach
from typing import List,Tuple
def safe_cosine(
    a : SparseVector,
    b : SparseVector,
    base=1e-10,
) -> float:
#     print(a.dot(b))
#     print(a.norm(2))
    return float(
        a.dot(b) / (a.norm(2) * b.norm(2) + base)
#         a.dot(b)
    )
(
    df
    .withColumn('cosine',
                F.udf(safe_cosine,T.DoubleType())
                ('a_feature_vec','b_feature_vec')
    )
).show()


+--------+-------------------+--------+-------------+------------------+
|a_url_id|      a_feature_vec|b_url_id|b_feature_vec|            cosine|
+--------+-------------------+--------+-------------+------------------+
|       0|(5,[0,2],[1.0,3.0])|       1|(5,[1],[4.0])|               0.0|
|       0|(5,[0,2],[1.0,3.0])|       2|(5,[2],[1.0])|0.9486832980205138|
|       1|      (5,[1],[4.0])|       2|(5,[2],[1.0])|               0.0|
+--------+-------------------+--------+-------------+------------------+



# Sampling

In [22]:
#  Stratified sampling in Spark

data = [
    ('a1','美味食記'),
    ('a2','美味食記'),
    ('a3','美味食記'),
    ('a4','美味食記'),
    ('a5','國內旅遊'),
    ('a6','國內旅遊'),
    ('a7','不設分類'),
    ('a8','不設分類'),
    ('a9','不設分類'),
]

sdf = (
    spark.createDataFrame(data,['link','category'])
)


# Taking 60% of each category into training set
# It's a approx solution

trn = sdf.sampleBy("category",fractions={
    '美味食記':0.6,
    '國內旅遊':0.6,
    '不設分類':0.6,
},seed=2)

tst = sdf.subtract(trn)

trn.show()
tst.show()

+----+--------+
|link|category|
+----+--------+
|  a1|    美味食記|
|  a2|    美味食記|
|  a3|    美味食記|
|  a4|    美味食記|
|  a5|    國內旅遊|
|  a7|    不設分類|
|  a8|    不設分類|
+----+--------+

+----+--------+
|link|category|
+----+--------+
|  a6|    國內旅遊|
|  a9|    不設分類|
+----+--------+

