In [2]:
# total : 58 problem and solutions

In [3]:
# env : pixlake
# we focuing on pyspark dataframe processing
# documentation https://spark.apache.org/docs/2.4.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame
%load_ext autoreload
%autoreload 2

In [4]:
# make you auto compeletion faster
# https://stackoverflow.com/questions/40536560/ipython-and-jupyter-autocomplete-not-working
%config Completer.use_jedi = False

In [5]:
import os
import sys

def get_workstation_spark_path(where_are_you : str) -> str:
    if where_are_you == 'titan':
        return '/home/data/ryanchao2012/lib'
    elif where_are_you == 'thor':
        return '/opt/spark/versions'
    else:
        raise ValueError("wrong work station name")

spark_path = get_workstation_spark_path('thor')

print('You have pyspark version : ', os.listdir(spark_path))
os.environ['PYSPARK_PYTHON'] = sys.executable
# spark-2.3, spark-2.4
os.environ['SPARK_HOME'] = os.path.join(spark_path,'spark-2.3')

You have pyspark version :  ['spark-2.3', 'spark-3.0', 'spark-3.0.1-bin-hadoop2.7', 'spark-2.3.4-bin-hadoop2.7', 'spark-2.4.7-bin-hadoop2.7', 'spark-2.4']


In [6]:
os.environ['SPARK_HOME']

'/opt/spark/versions/spark-2.3'

In [7]:
from os.path import join
import pandas as pd
from pyspark.sql import SparkSession as Session
from pyspark.sql import DataFrame
from pyspark import SparkConf as Conf
from pyspark.sql import functions as F, Window as W, types as T
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
C = F.col

In [8]:
conf = (Conf()
    .set('spark.sql.sources.partitionOverwriteMode', 'dynamic')
    .set('spark.driver.memory', '4g')
    .set('spark.driver.maxResultSize', '1g')
   )

In [9]:
spark = (Session
     .builder
     .appName('pyspark-challenge')
     .master('local[2]')
     .config(conf=conf)
     .getOrCreate())

In [10]:
spark

# Creating DataFrame (16+)

In [11]:
# 0. know what spark session can do and its version
print(dir(spark), f'your spark version : {spark.version}'
      , sep='\n\n')

['Builder', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__enter__', '__eq__', '__exit__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_conf', '_convert_from_pandas', '_createFromLocal', '_createFromRDD', '_create_from_pandas_with_arrow', '_get_numpy_record_dtype', '_inferSchema', '_inferSchemaFromList', '_instantiatedSession', '_jsc', '_jsparkSession', '_jvm', '_jwrapped', '_repr_html_', '_sc', '_wrapped', 'builder', 'catalog', 'conf', 'createDataFrame', 'newSession', 'range', 'read', 'readStream', 'sparkContext', 'sql', 'stop', 'streams', 'table', 'udf', 'version']

your spark version : 2.3.4


In [12]:
# # 1. read data from csv
# print(os.listdir('../data'))
# df_from_csv_1 = spark.read.csv('../data/zipcodes.csv',
#                                header=True,
#                               inferSchema=True)
# df_from_csv_1.printSchema()
# df_from_csv_1.limit(5).toPandas()

In [13]:
# # 2 read data from json
# print(os.listdir('../data'))
# print(dir(spark.read))
# # 沒有infer_schema
# df_from_json = spark.read.json('../data/zipcodes.json')
# df_from_json.printSchema()
# df_from_json.limit(5).toPandas()

In [14]:
# 3 create dataframe from rdd list
columns = ["language","user_counts"]
data = [
    ("Java","20000"),
    ("Python","100000"),
    ("Scala","3000")
       ]
# 先分散到rdd
# rdd = spark.sparkContext.parallelize(data)
# print(dir(rdd), type(rdd), sep='\n\n')
# print()
# df_from_rdd = rdd.toDF(schema=columns)
# df_from_rdd.show(n=5)

# 直接create，讓spark dataframe進行分散
df = spark.createDataFrame(data=data,schema=columns)
df.show(n=5)



+--------+-----------+
|language|user_counts|
+--------+-----------+
|    Java|      20000|
|  Python|     100000|
|   Scala|       3000|
+--------+-----------+



In [15]:
# 4 create 5 row fake data using spark range
print(dir(spark))
print(type(spark.range(start=0,end=10)))
columns = ['row_number']
single_column_df = spark.range(start=0,end=10)
single_column_df.show(n=5)

['Builder', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__enter__', '__eq__', '__exit__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_conf', '_convert_from_pandas', '_createFromLocal', '_createFromRDD', '_create_from_pandas_with_arrow', '_get_numpy_record_dtype', '_inferSchema', '_inferSchemaFromList', '_instantiatedSession', '_jsc', '_jsparkSession', '_jvm', '_jwrapped', '_repr_html_', '_sc', '_wrapped', 'builder', 'catalog', 'conf', 'createDataFrame', 'newSession', 'range', 'read', 'readStream', 'sparkContext', 'sql', 'stop', 'streams', 'table', 'udf', 'version']
<class 'pyspark.sql.dataframe.DataFrame'>
+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+
only showing top 5 rows



In [16]:
# 6 create empty dataframe

columns = ["language","user_counts"]

# empty RDD + schema won't work
# df_1 = spark.createDataFrame(data=spark.sparkContext.emptyRDD(),
#                              schema=columns)
# df_1.show(n=5)

# empty list -> rdd -> df won't work
# df2 = spark.sparkContext.parallelize([]).toDF(columns)

# df3 = spark.createDataFrame([]) # won't work this version

In [17]:
# 7 get dataframe shape
print(df.count(), len(df.columns))

3 2


In [18]:
# 8 add const column to a existing dataframe

columns = ["language","user_counts"]
data = [
    ("Java","20000"),
    ("Python","100000"),
    ("Scala","3000")
       ]
df = spark.createDataFrame(data=data,schema=columns)
df = (
    df.withColumn("new_column",F.lit("ABC")) 
    # F.lit means literal, retrurn a column
)
df.show(n=5)

+--------+-----------+----------+
|language|user_counts|new_column|
+--------+-----------+----------+
|    Java|      20000|       ABC|
|  Python|     100000|       ABC|
|   Scala|       3000|       ABC|
+--------+-----------+----------+



In [19]:
# 9 add a row_id column from a exisiting dataframe
# https://stackoverflow.com/questions/53082891/adding-a-unique-consecutive-row-number-to-dataframe-in-pyspark
columns = ["language","user_counts"]
data = [
    ("Java","20000"),
    ("Python","100000"),
    ("Scala","3000")
       ]
df_1 = spark.createDataFrame(data=data,schema=columns)
df_1 = (
    df_1.withColumn("index", 
                  F.row_number().over(
                      W.orderBy(F.monotonically_increasing_id() - 1)
                  )
                 )
    # F.monotonically_increasing_id does not give 1 ~ N
    # So we use window function to work around
)
df_1.show(n=5)

# Mre clear way to do that
df_2 = spark.createDataFrame(data=data,schema=columns)
w = W.orderBy(F.lit('A'))
df_2 = (
    df_2.withColumn("row_num", F.row_number().over(w))
)
df_2.show(n=5)

+--------+-----------+-----+
|language|user_counts|index|
+--------+-----------+-----+
|    Java|      20000|    1|
|  Python|     100000|    2|
|   Scala|       3000|    3|
+--------+-----------+-----+

+--------+-----------+-------+
|language|user_counts|row_num|
+--------+-----------+-------+
|    Java|      20000|      1|
|  Python|     100000|      2|
|   Scala|       3000|      3|
+--------+-----------+-------+



In [20]:
# 10 add a random number to a exisit column
df = (
#     df.withColumn('random_number', F.when(F.rand() > 0.5, 1).otherwise(0))
        df.withColumn('random_number', F.rand())
)

df.show(n=5)

+--------+-----------+----------+-------------------+
|language|user_counts|new_column|      random_number|
+--------+-----------+----------+-------------------+
|    Java|      20000|       ABC|0.45733416683599093|
|  Python|     100000|       ABC| 0.9843752039051096|
|   Scala|       3000|       ABC| 0.9477803309870161|
+--------+-----------+----------+-------------------+



In [21]:
# 11 add a binary 0, 1 based on condition to an exisit column
df = (
    df.withColumn('binary_cut_05',F.when(F.rand() > 0.5, 1).otherwise(0))
)
df.limit(5).toPandas()

Unnamed: 0,language,user_counts,new_column,random_number,binary_cut_05
0,Java,20000,ABC,0.457334,0
1,Python,100000,ABC,0.984375,0
2,Scala,3000,ABC,0.94778,1


In [22]:
# 12 create a dataframe contains row_index and fake data

columns = ["row_id","language","user_counts"]
data = [
    (0, "Java","20000"),
    (1, "Python","100000"),
    (2, "Scala","3000")
       ]

df = spark.createDataFrame(data=data, schema=columns)
df.show(n=5)


+------+--------+-----------+
|row_id|language|user_counts|
+------+--------+-----------+
|     0|    Java|      20000|
|     1|  Python|     100000|
|     2|   Scala|       3000|
+------+--------+-----------+



In [23]:
# 13 construct a complex data for spark dataframe
# using StructType

# Case 1
data = [
    ("James","","Smith","36636","M", 3000),
    ("Michael","Rose","","40288","M", 4000),
    ("Robert","","Williams","42114","M", 4000),
    ("Maria","Anne","Jones","39192","F", 4000),
    ("Jen","Mary","Brown","","F", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])
df = spark.createDataFrame(data=data,schema=schema)
df.show(n=5)

# Case 2

struct_data = [
    (("James","","Smith"),"36636","M", 3000),
    (("Michael","Rose",""),"40288","M", 4000),
    (("Robert","","Williams"),"42114","M", 4000),
    (("Maria","Anne","Jones"),"39192","F", 4000),
    (("Jen","Mary","Brown"),"","F", -1)
]

structure_schema = StructType([
    StructField('name',
        StructType([
            StructField("firstname",StringType(), True), # Nullable True
            StructField("middlename",StringType(), True),
            StructField("lastname",StringType(), True),
    ])),
    StructField("id",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])

df = spark.createDataFrame(data=struct_data, schema=structure_schema)
df.show(n=5)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|salary|
+---------+----------+--------+-----+------+------+
|    James|          |   Smith|36636|     M|  3000|
|  Michael|      Rose|        |40288|     M|  4000|
|   Robert|          |Williams|42114|     M|  4000|
|    Maria|      Anne|   Jones|39192|     F|  4000|
|      Jen|      Mary|   Brown|     |     F|    -1|
+---------+----------+--------+-----+------+------+

+--------------------+-----+------+------+
|                name|   id|gender|salary|
+--------------------+-----+------+------+
|    [James, , Smith]|36636|     M|  3000|
|   [Michael, Rose, ]|40288|     M|  4000|
|[Robert, , Williams]|42114|     M|  4000|
|[Maria, Anne, Jones]|39192|     F|  4000|
|  [Jen, Mary, Brown]|     |     F|    -1|
+--------------------+-----+------+------+



In [24]:
# 14 construct a complex data for spark dataframe
# using ArratyType
# https://github.com/spark-examples/pyspark-examples/blob/master/pyspark-array-string.py

columns = ["name","languagesAtSchool","currentState"]
data = [("James,,Smith",["Java","Scala","C++"],"CA"), \
    ("Michael,Rose,",["Spark","Java","C++"],"NJ"), \
    ("Robert,,Williams",["CSharp","VB"],"NV")]

df = spark.createDataFrame(data=data,schema=columns)
df.printSchema()
df.show(truncate=False)

df.select(df.name, F.explode(df.languagesAtSchool)).show(n=5)

root
 |-- name: string (nullable = true)
 |-- languagesAtSchool: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- currentState: string (nullable = true)

+----------------+------------------+------------+
|name            |languagesAtSchool |currentState|
+----------------+------------------+------------+
|James,,Smith    |[Java, Scala, C++]|CA          |
|Michael,Rose,   |[Spark, Java, C++]|NJ          |
|Robert,,Williams|[CSharp, VB]      |NV          |
+----------------+------------------+------------+

+-------------+-----+
|         name|  col|
+-------------+-----+
| James,,Smith| Java|
| James,,Smith|Scala|
| James,,Smith|  C++|
|Michael,Rose,|Spark|
|Michael,Rose,| Java|
+-------------+-----+
only showing top 5 rows



In [25]:
# 15 construct a complex data for spark dataframe
# using MapType
# https://github.com/spark-examples/pyspark-examples/blob/master/pyspark-explode-array-map.py
data = [
        ('James',['Java','Scala'],{'hair':'black','eye':'brown'}),
        ('Michael',['Spark','Java',None],{'hair':'brown','eye':None}),
        ('Robert',['CSharp',''],{'hair':'red','eye':''}),
        ('Washington',None,None),
        ('Jefferson',['1','2'],{})
]

columns = ['name','language','peoperties']

df = spark.createDataFrame(data=data, schema=columns)

df.printSchema()
df.show()

root
 |-- name: string (nullable = true)
 |-- language: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- peoperties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+----------+--------------+--------------------+
|      name|      language|          peoperties|
+----------+--------------+--------------------+
|     James| [Java, Scala]|[eye -> brown, ha...|
|   Michael|[Spark, Java,]|[eye ->, hair -> ...|
|    Robert|    [CSharp, ]|[eye -> , hair ->...|
|Washington|          null|                null|
| Jefferson|        [1, 2]|                  []|
+----------+--------------+--------------------+



In [26]:
# 16 create a nested array-type dataframe
# https://github.com/spark-examples/pyspark-examples/blob/master/pyspark-explode-nested-array.py

In [27]:
# 17 create a datetime column for spark dataframe

In [28]:
# 18 read a csv with array-of-string schema (fake issue)
# you should read it from json


columns = ["languagesAtSchool","currentState"]
data = [
    (["Java","Scala","C++"],"CA"), 
    (["Spark","Java","C++"],"NJ"),
    (["CSharp","VB"],"NV")
]

df = spark.createDataFrame(data=data,schema=columns)

df.show()

# we write the data into csv
SAVE = True
if SAVE:
    df.toPandas().to_csv('tmp.csv',index=False)

# You will get sting
(
    spark.read.csv('tmp.csv',inferSchema=True, header=True).printSchema()
)

# Convert it by schema? - No, not supported...

schema = (
    T.StructType()
    .add("languagesAtSchool", T.ArrayType(T.StringType()), True)
    .add("currentState", T.StringType(), True)
)
try:
    (
        spark.read.csv('tmp.csv',schema=schema, header=True).show()
    )
except Exception as e:
#     print(e)
    # short answer
    print('java.lang.UnsupportedOperationException: CSV data source does not support array<string> data type.')

# How about use F.json to convert that?

try:
    (
        spark.read.csv('tmp.csv',inferSchema=True, header=True)
        .cache()
        .withColumn('languagesAtSchool',F.from_json(C("languagesAtSchool"),'array<string>'))
    ).show()
except Exception as e:
#     print(e)
    # ashort answer
    print("java.lang.UnsupportedOperationException: CSV data source does not support array<string> data type.")
    
    
# convert it into json, make your life easiler

SAVE_JSON = True
if SAVE_JSON:
    pd.read_csv('tmp.csv').to_json('tmp.json',orient='records',force_ascii=False)

# Amazing!
spark.read.json('tmp.json').show()

+------------------+------------+
| languagesAtSchool|currentState|
+------------------+------------+
|[Java, Scala, C++]|          CA|
|[Spark, Java, C++]|          NJ|
|      [CSharp, VB]|          NV|
+------------------+------------+

root
 |-- languagesAtSchool: string (nullable = true)
 |-- currentState: string (nullable = true)

java.lang.UnsupportedOperationException: CSV data source does not support array<string> data type.
java.lang.UnsupportedOperationException: CSV data source does not support array<string> data type.
+------------+--------------------+
|currentState|   languagesAtSchool|
+------------+--------------------+
|          CA|['Java', 'Scala',...|
|          NJ|['Spark', 'Java',...|
|          NV|    ['CSharp', 'VB']|
+------------+--------------------+



# Column Operations (8+)

In [29]:
# 1 create new column based on original column

columns = ["language", "user_counts"]
data = [
    ("Java",20000),
    ("Python",100000),
    ("Scala",3000)
]

df = spark.createDataFrame(data=data, schema=columns)
df = (
    df.withColumn("user_count_100", C("user_counts") * 100)\
    .withColumn("user_count_log", F.log10(C("user_counts")))
)

df.show(n=5)

+--------+-----------+--------------+------------------+
|language|user_counts|user_count_100|    user_count_log|
+--------+-----------+--------------+------------------+
|    Java|      20000|       2000000| 4.301029995663981|
|  Python|     100000|      10000000|               5.0|
|   Scala|       3000|        300000|3.4771212547196626|
+--------+-----------+--------------+------------------+



In [30]:
# 2 rename, drop, add constant column to existing dataframe
# https://stackoverflow.com/questions/34077353/how-to-change-dataframe-column-names-in-pyspark
columns = ["language", "user_counts"]
data = [
    ("Java",20000),
    ("Python",100000),
    ("Scala",3000)
]

df = spark.createDataFrame(data, columns)
df = (
    df.withColumn('const_col',F.lit('ABC'))\
    .withColumnRenamed("language","lang")\
    .drop("user_counts")
)
df.show(n=5)

+------+---------+
|  lang|const_col|
+------+---------+
|  Java|      ABC|
|Python|      ABC|
| Scala|      ABC|
+------+---------+



In [31]:
# 3 know what method and attribute can be called with column object


print(type(C("language")), dir(C("language")), sep='\n\n')

# alias - 可以換名字
# asc, desc - 可以排序
# astype,cast - 可以轉型
# between - 可以傳入start_date以及end_date過濾
# bitwiseAND, bitwiseOR, bitwiseXOR - 可以做布林運算
# contains - 可以做字串搜尋
# endwith, startwith, rlike, substring - 可以做字串比對
# eqNullSafe, isNotNull, isNull - 可以檢查null值，Python須以None傳入
# isin, like - 可以做值的比對(數值，字串值)
# name - 可以取得欄位名稱
# when, otherwise - 可以做條件判斷


<class 'pyspark.sql.column.Column'>

['__add__', '__and__', '__bool__', '__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__div__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__invert__', '__iter__', '__le__', '__lt__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__or__', '__pow__', '__radd__', '__rand__', '__rdiv__', '__reduce__', '__reduce_ex__', '__repr__', '__rmod__', '__rmul__', '__ror__', '__rpow__', '__rsub__', '__rtruediv__', '__setattr__', '__sizeof__', '__str__', '__sub__', '__subclasshook__', '__truediv__', '__weakref__', '_asc_doc', '_bitwiseAND_doc', '_bitwiseOR_doc', '_bitwiseXOR_doc', '_contains_doc', '_desc_doc', '_endswith_doc', '_eqNullSafe_doc', '_isNotNull_doc', '_isNull_doc', '_jc', '_like_doc', '_rlike_doc', '_startswith_doc', 'alias', 'asc', 'astype', 'between', 'bitwiseAND', 'bitwiseOR', 'bitwiseXOR

In [32]:
# +=*/

# df = spark.createDataFrame([(6,3), (7, 3), (13,6), (5, 0)], ["x", "y"])

# df = (
#     df.withColumn("mod_cross_col", C("x") % C("y"))
#     df.withColumn("mod_contant", C("x") )
#      )
# df.show()

In [33]:
# 4 create a new dynamic column(if else condition based on old column)
columns = ["language", "user_counts"]
data = [
    ("Java",20000),
    ("Python",100000),
    ("Scala",3000)
]

df = spark.createDataFrame(data, columns)

df = (
    df.withColumn("is_many_users",
                  F.when(C('user_counts') > 50000, 1).otherwise(0)
                 )
)

df.show(n=5)

+--------+-----------+-------------+
|language|user_counts|is_many_users|
+--------+-----------+-------------+
|    Java|      20000|            0|
|  Python|     100000|            1|
|   Scala|       3000|            0|
+--------+-----------+-------------+



In [34]:
# 5 create a new dynamic column(if else condition based on old column) plus empty string replacement

data = [
    ("James","","Smith","36636","M", 3000),
    ("Michael","Rose","","40288","M", 4000),
    ("Robert","","Williams","42114","M", 4000),
    ("Maria","Anne","Jones","39192","F", 4000),
    ("Jen","Mary","Brown","","F", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])
df = spark.createDataFrame(data=data,schema=schema)

# we cannot compare column values with empty string
# so the work-around method is replace empty string to null
# then using isNotNull()
is_full_name_exist = (C("firstname").isNotNull() & C("middlename").isNotNull() & C("lastname").isNotNull())


def blank_as_null(x):
    """
    helper function for converting row value from empty string to null
    https://stackoverflow.com/questions/33287886/replace-empty-strings-with-none-null-values-in-dataframe
    """
    return F.when(C(x) != "", C(x)).otherwise(None)

print("Before")
df.show(n=5)
df = (
    df.withColumn("firstname", blank_as_null("firstname"))\
    .withColumn("middlename", blank_as_null("middlename"))\
    .withColumn("lastname", blank_as_null("lastname"))\
    .withColumn("full_name",
                  F.when(
                      is_full_name_exist,
                      F.concat(C("firstname"), F.lit(' '),
                               C("middlename"), F.lit(' '),
                               C("lastname"))
                  ).otherwise(F.lit('N/A'))
                 )
)
print("After")
df.show(n=5)

Before
+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|salary|
+---------+----------+--------+-----+------+------+
|    James|          |   Smith|36636|     M|  3000|
|  Michael|      Rose|        |40288|     M|  4000|
|   Robert|          |Williams|42114|     M|  4000|
|    Maria|      Anne|   Jones|39192|     F|  4000|
|      Jen|      Mary|   Brown|     |     F|    -1|
+---------+----------+--------+-----+------+------+

After
+---------+----------+--------+-----+------+------+----------------+
|firstname|middlename|lastname|   id|gender|salary|       full_name|
+---------+----------+--------+-----+------+------+----------------+
|    James|      null|   Smith|36636|     M|  3000|             N/A|
|  Michael|      Rose|    null|40288|     M|  4000|             N/A|
|   Robert|      null|Williams|42114|     M|  4000|             N/A|
|    Maria|      Anne|   Jones|39192|     F|  4000|Maria Anne Jones|
|      Jen|      Mary|   Brown|    

In [35]:
# 6 select columns which also exist on another dataframe

columns_1 = ["language", "user_counts"]
data_1 = [
    ("Java",20000),
    ("Python",100000),
    ("Scala",3000)
]

columns_2 = ["language", "user_counts","note"]
data_2 = [
    ("Java",20000,"nothing"),
    ("Python",100000,"nothing"),
    ("Scala",3000,"nothing")
]

df_1 = spark.createDataFrame(data_1, columns_1)
df_2 = spark.createDataFrame(data_2, columns_2)

# union columns
same_cols = [F.col(c) for c in df_2.columns if c in df_1.columns]
print(same_cols, type(same_cols), type(same_cols[0]))

df_same_col = df_1.select(*same_cols)
df_same_col.show(n=5)

[Column<b'language'>, Column<b'user_counts'>] <class 'list'> <class 'pyspark.sql.column.Column'>
+--------+-----------+
|language|user_counts|
+--------+-----------+
|    Java|      20000|
|  Python|     100000|
|   Scala|       3000|
+--------+-----------+



In [36]:
# 7 create a const numerical column


columns = ["language", "user_counts"]
data = [
    ("Java",20000),
    ("Python",100000),
    ("Scala",3000)
]

df = spark.createDataFrame(data, columns)

df = (
    df.withColumn("const",F.lit(10000))\
    .withColumn("scientific_sign_1", F.lit(1e40))
    .withColumn("scientific_sign_2", F.lit(1e-40))
)

df.show(n=5)
df.printSchema()

+--------+-----------+-----+-----------------+-----------------+
|language|user_counts|const|scientific_sign_1|scientific_sign_2|
+--------+-----------+-----+-----------------+-----------------+
|    Java|      20000|10000|           1.0E40|          1.0E-40|
|  Python|     100000|10000|           1.0E40|          1.0E-40|
|   Scala|       3000|10000|           1.0E40|          1.0E-40|
+--------+-----------+-----+-----------------+-----------------+

root
 |-- language: string (nullable = true)
 |-- user_counts: long (nullable = true)
 |-- const: integer (nullable = false)
 |-- scientific_sign_1: double (nullable = false)
 |-- scientific_sign_2: double (nullable = false)



In [37]:
# 8 create a random variable column
# https://spark.apache.org/docs/2.3.4/api/python/pyspark.sql.html#pyspark.sql.DataFrame
# rand uniform [0, 1]
# randn Normal distribution mu = 0, sigma = 1

columns = ["language", "user_counts"]
data = [
    ("Java",20000),
    ("Python",100000),
    ("Scala",3000)
]

df = spark.createDataFrame(data, columns)

df = (
    df.withColumn("uniform_0_1",F.rand(seed=42))\
      .withColumn("uniform_0_100",100 * F.rand(seed=42))\
      .withColumn("normal_0_1", F.randn(seed=42))
)

df.show(n=5)
df.printSchema()

+--------+-----------+-------------------+------------------+-------------------+
|language|user_counts|        uniform_0_1|     uniform_0_100|         normal_0_1|
+--------+-----------+-------------------+------------------+-------------------+
|    Java|      20000| 0.6661236774413726| 66.61236774413726| 0.4085363219031828|
|  Python|     100000| 0.3856203005100328| 38.56203005100328|-0.7556247885860078|
|   Scala|       3000|0.27636619934035966|27.636619934035966|-1.4773884185536659|
+--------+-----------+-------------------+------------------+-------------------+

root
 |-- language: string (nullable = true)
 |-- user_counts: long (nullable = true)
 |-- uniform_0_1: double (nullable = false)
 |-- uniform_0_100: double (nullable = false)
 |-- normal_0_1: double (nullable = false)



In [38]:
# 9 convert pyspark dataframe column to a python list
# write a def func

columns = ["language", "user_counts"]
data = [
    ("Java",20000),
    ("Python",100000),
    ("Scala",3000)
]

df = spark.createDataFrame(data, columns)
df.show()

row_list = df.select("language").collect()
language_list = [row.language for row in row_list]
print(language_list)

+--------+-----------+
|language|user_counts|
+--------+-----------+
|    Java|      20000|
|  Python|     100000|
|   Scala|       3000|
+--------+-----------+

['Java', 'Python', 'Scala']


In [39]:
# 10 check NA for all columns
def calculate_null(sdf):
    return sdf.select([F.count(F.when(F.isnan(c), c)).alias(c)
                for c in sdf.columns])
    


columns = ["language", "user_counts"]
data = [
    ("Java",20000),
    ("Python",100000),
    ("Scala",3000)
]

df = spark.createDataFrame(data, columns)
df.show()

calculate_null(df).show()

calculate_null(df).printSchema()

+--------+-----------+
|language|user_counts|
+--------+-----------+
|    Java|      20000|
|  Python|     100000|
|   Scala|       3000|
+--------+-----------+

+--------+-----------+
|language|user_counts|
+--------+-----------+
|       0|          0|
+--------+-----------+

root
 |-- language: long (nullable = false)
 |-- user_counts: long (nullable = false)



In [40]:
# 11 fillna in columns

columns = ["language", "user_counts"]
data = [
    ("Java",20000),
    ("Python",100000),
    ("Scala",3000)
]

# df.withColumn('new_column', lit(None).cast(StringType()))
df = (
    spark.createDataFrame(data,columns)
    .withColumn("user_counts", F.lit(None).cast(T.IntegerType()))
    .withColumn("fillme", F.lit(None).cast(T.StringType()))
).na.fill({
        'user_counts' : 50,
        'fillme' : "FillingString" 
    })

df.show()

+--------+-----------+-------------+
|language|user_counts|       fillme|
+--------+-----------+-------------+
|    Java|         50|FillingString|
|  Python|         50|FillingString|
|   Scala|         50|FillingString|
+--------+-----------+-------------+



# String Operation (0+)

In [41]:
# 1 string concat two column values to a new column from an existing dataframe

In [42]:
# 2 cut of left 3 char of specific column to a new column from an existing dataframe

In [43]:
# 3 convert string type column to int/float type column

In [44]:
# 4 convert string type column to datetime type column

# Filtering

## Numerical filtering (6+)

In [45]:
# 1. filter on equal condition

data = [
    ("James","","Smith","36636","M", 3000),
    ("Michael","Rose","","40288","M", 4000),
    ("Robert","","Williams","42114","M", 4000),
    ("Maria","Anne","Jones","39192","F", 4000),
    ("Jen","Mary","Brown","","F", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])
df = spark.createDataFrame(data=data,schema=schema)

df = (
    df.filter(C("gender") == "M")
)

df.show(n=5)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|salary|
+---------+----------+--------+-----+------+------+
|    James|          |   Smith|36636|     M|  3000|
|  Michael|      Rose|        |40288|     M|  4000|
|   Robert|          |Williams|42114|     M|  4000|
+---------+----------+--------+-----+------+------+



In [46]:
# 2 filter on >, <, >=, <=

data = [
    ("James","","Smith","36636","M", 3000),
    ("Michael","Rose","","40288","M", 4000),
    ("Robert","","Williams","42114","M", 4000),
    ("Maria","Anne","Jones","39192","F", 6000),
    ("Jen","Mary","Brown","","F", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])
df = spark.createDataFrame(data=data,schema=schema)

df = (
    df.filter(C("salary") >= 4000)
)

df.show(n=5)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|salary|
+---------+----------+--------+-----+------+------+
|  Michael|      Rose|        |40288|     M|  4000|
|   Robert|          |Williams|42114|     M|  4000|
|    Maria|      Anne|   Jones|39192|     F|  6000|
+---------+----------+--------+-----+------+------+



In [47]:
# 3 multiple conditions require parenthese around each condition


data = [
    ("James","","Smith","36636","M", 3000),
    ("Michael","Rose","","40288","M", 4000),
    ("Robert","","Williams","42114","M", 4000),
    ("Maria","Anne","Jones","39192","F", 6000),
    ("Jen","Mary","Brown","","F", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])
df = spark.createDataFrame(data=data,schema=schema)

# This is lazy computing
rich_man_who_worth_married = (
    (C("gender") == "M") &
    (C("salary") >= 4000)
)

df = (
    df.filter(rich_man_who_worth_married)
)

df.show(n=5)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|salary|
+---------+----------+--------+-----+------+------+
|  Michael|      Rose|        |40288|     M|  4000|
|   Robert|          |Williams|42114|     M|  4000|
+---------+----------+--------+-----+------+------+



In [48]:
# 4 Compare against a list of allowed values


data = [
    ("James","","Smith","36636","M", 3000),
    ("Michael","Rose","","40288","M", 4000),
    ("Robert","","Williams","42114","M", 4000),
    ("Maria","Anne","Jones","39192","F", 6000),
    ("Jen","Mary","Brown","","F", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])
df = spark.createDataFrame(data=data,schema=schema)

df = (
    df.filter(C("gender").isin(["F"]))
)

df.show(n=5)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|salary|
+---------+----------+--------+-----+------+------+
|    Maria|      Anne|   Jones|39192|     F|  6000|
|      Jen|      Mary|   Brown|     |     F|    -1|
+---------+----------+--------+-----+------+------+



In [49]:
# 5 Sort result

data = [
    ("James","","Smith","36636","M", 3000),
    ("Michael","Rose","","40288","M", 4000),
    ("Robert","","Williams","42114","M", 4000),
    ("Maria","Anne","Jones","39192","F", 6000),
    ("Jen","Mary","Brown","","F", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])
df = spark.createDataFrame(data=data,schema=schema)


df = df.orderBy(C("salary").desc())

df.show(n=5)

df = df.orderBy(C("salary").asc())

df.show(n=5)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|salary|
+---------+----------+--------+-----+------+------+
|    Maria|      Anne|   Jones|39192|     F|  6000|
|   Robert|          |Williams|42114|     M|  4000|
|  Michael|      Rose|        |40288|     M|  4000|
|    James|          |   Smith|36636|     M|  3000|
|      Jen|      Mary|   Brown|     |     F|    -1|
+---------+----------+--------+-----+------+------+

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|salary|
+---------+----------+--------+-----+------+------+
|      Jen|      Mary|   Brown|     |     F|    -1|
|    James|          |   Smith|36636|     M|  3000|
|  Michael|      Rose|        |40288|     M|  4000|
|   Robert|          |Williams|42114|     M|  4000|
|    Maria|      Anne|   Jones|39192|     F|  6000|
+---------+----------+--------+-----+------+------+



In [50]:
# 6 select distinct rows based on certain column but keep first row
# In this case, model prediction to filter the same images

############## DROP DUPLICATED doesn't work in this case
# https://stackoverflow.com/questions/38687212/spark-dataframe-drop-duplicates-and-keep-first
data = [
    (14431,0.99834,'https://pic.pimg.tw/happy78/1528543947-685380499_n.jpg'),
    (14431,0.99834,'https://pic.pimg.tw/happy78/1528543947-362759723_n.jpg'),
    (14431,0.97611,'https://pic.pimg.tw/happy78/1528543962-2265924582_n.jpg'),
    (67789,0.93422,'https://pic.pimg.tw/happy78/1528543962-4007835890_n.jpg'),
    (67789,0.94231,'https://pic.pimg.tw/happy78/1528543962-45890_n.jpg')
]

columns = ['article_id','pred','img_url']

df = spark.createDataFrame(data=data, schema=columns)
print('before')

df.orderBy('pred').show(n=5)

print('Sol 1 FAILED SOMETIMES WHEN PARTITION != 1')
df_1 = (
    df.drop_duplicates(subset=["pred"])
)

df_1.orderBy('pred').show(n=5)


############## Using Window Function and sort, rank, worked!
# You can check the 5 th question of Aggregation, The solution is the same

print('Sol 2 WORKED WITH ANY PARTITION')
df_2 = (
    df.withColumn("rank_by_pred",
                  F.row_number().over(
                  W.partitionBy("pred")\
                      .orderBy(F.desc("pred"))
                  )
                 )\
    .filter(F.col("rank_by_pred") == 1)\
    .drop('rank_by_pred')
)
df_2.orderBy('pred').show(n=5)

before
+----------+-------+--------------------+
|article_id|   pred|             img_url|
+----------+-------+--------------------+
|     67789|0.93422|https://pic.pimg....|
|     67789|0.94231|https://pic.pimg....|
|     14431|0.97611|https://pic.pimg....|
|     14431|0.99834|https://pic.pimg....|
|     14431|0.99834|https://pic.pimg....|
+----------+-------+--------------------+

Sol 1 FAILED SOMETIMES WHEN PARTITION != 1
+----------+-------+--------------------+
|article_id|   pred|             img_url|
+----------+-------+--------------------+
|     67789|0.93422|https://pic.pimg....|
|     67789|0.94231|https://pic.pimg....|
|     14431|0.97611|https://pic.pimg....|
|     14431|0.99834|https://pic.pimg....|
+----------+-------+--------------------+

Sol 2 WORKED WITH ANY PARTITION
+----------+-------+--------------------+
|article_id|   pred|             img_url|
+----------+-------+--------------------+
|     67789|0.93422|https://pic.pimg....|
|     67789|0.94231|https://pic.pi

## String filtering (2+)

In [51]:
# 1. regax filtering
# pyspark regexp_extract api cannot get all the groups
data = [
    (14431,0.99834,'https://pic.pimg.tw/happy78/1528543947-685380499_n.jpg mfkvmdfasx'),
    (14431,0.99834,'https://pic.pimg.tw/happy78/1528543947-685380499_n.jpgsd,clsd,cluah'),
    (0,0.78,'src="https://pic.pimg.tw/happy78/1528543947-2503982521_n.jpg" title="IMG_5216.jpg"'),
    (1,0.45,'https://s.yimg.com/bt/api/res/1.2/OzjFf8Ov8yUBECUAWKMcDw--/YXBwaWQ9eW5ld3NfbGVnbztxPTg1O3c9NjAw/http://media.zenfs.com/zh_hant_tw/News/stormmedia/20160601-093346_U720_M161728_0c0b.jpg'),
    (14431,0.99834,'https://pic.pimg.tw/happy78/1528543947-362759723_n.jpg'),
    (14431,0.97611,'https://pic.pimg.tw/happy78/1528543962-2265924582_n.jpg'),
    (67789,0.93422,'https://pic.pimg.tw/happy78/1528543962-4007835890_n.jpg'),
    (67789,0.94231,'https://pic.pimg.tw/happy78/1528543962-45890_n.jpg'),
    (67789,0.94111,'https://pic.pimg.tw/happy78/1528543962-45890_n.png'),
    (67789,0.94111,'png')
]

columns = ['article_id','pred','img_url']


df = spark.createDataFrame(data, columns)


df_filted = (
    df.withColumn("img_url", F.regexp_extract(C('img_url'),
                                              r'(http\S+jpg\b)|(http\S+png\b)',
                                              0))
)

df_filted.show(n=20, vertical=True, truncate=False)

-RECORD 0---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 article_id | 14431                                                                                                                                                                                   
 pred       | 0.99834                                                                                                                                                                                 
 img_url    | https://pic.pimg.tw/happy78/1528543947-685380499_n.jpg                                                                                                                                  
-RECORD 1---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 arti

In [52]:
# 1.2 regax replace (filtering)
# example <a href="url">link text</a> --> ""
# <a href="https://www.w3schools.com/">Visit W3Schools.com!</a>

############### Case I #############################
data = [
    (14431,'<p>今天要帶大家去吃吃台中有名的這一鍋皇室秘藏鍋物，據說會有著「帝王般」的享受（是有沒有這麼誇張），話說這一鍋北中南都有分店，但是不知道是從哪裡起家的？我們這次吃的是「台中朝富店」喔</p>\n\n<p>地點：<span style="-webkit-text-stroke-width:0px; background-color:rgba(0, 0, 0, 0.03); color:rgba(0, 0, 0, 0.87); display:inline !important; float:none; font-family:roboto,noto sans tc,arial,sans-serif; font-size:13px; font-style:normal; font-variant-caps:normal; font-variant-ligatures:normal; font-weight:400; letter-spacing:normal; orphans:2; text-align:left; text-decoration-color:initial; text-decoration-style:initial; text-indent:0px; text-transform:none; white-space:normal; widows:2; word-spacing:0px">台中市西屯區朝富路36號</span></p>\n\n<p>電話：<span style="-webkit-text-stroke-width:0px; background-color:rgba(0, 0, 0, 0.03); color:rgba(0, 0, 0, 0.87); display:inline !important; float:none; font-family:roboto,noto sans tc,arial,sans-serif; font-size:13px; font-style:normal; font-variant-caps:normal; font-variant-ligatures:normal; font-weight:400; letter-spacing:normal; orphans:2; text-align:left; text-decoration-color:initial; text-decoration-style:initial; text-indent:0px; text-transform:none; white-space:normal; widows:2; word-spacing:0px">04 3609 0088</span></p>\n\n<p>營業時間：11:30~03:00</p>\n\n<p><iframe class="" frameborder="0" height="350" marginheight="0" marginwidth="0" scrolling="no" src="//maps.google.com/?ie=UTF8&amp;f=q&amp;source=s_q&amp;q=loc:24.166896,+120.63769000000002+(%E5%8F%B0%E7%81%A3%E5%8F%B0%E4%B8%AD%E5%B8%82%E8%A5%BF%E5%B1%AF%E5%8D%80%E6%9C%9D%E5%AF%8C%E8%B7%AF%E9%80%99%E4%B8%80%E9%8D%8B%E5%8F%B0%E4%B8%AD%E6%9C%9D%E5%AF%8C%E6%AE%BF)&amp;sll=24.160044,120.62857&amp;ll=24.166896,120.63769&amp;marker=24.166896,120.63769&amp;mrt=loc&amp;z=15&amp;t=m&amp;output=embed" width="425"></iframe><br>\n<small><a href="//maps.google.com/?ie=UTF8&amp;f=q&amp;source=s_q&amp;q=loc:24.166896,+120.63769000000002+(%E5%8F%B0%E7%81%A3%E5%8F%B0%E4%B8%AD%E5%B8%82%E8%A5%BF%E5%B1%AF%E5%8D%80%E6%9C%9D%E5%AF%8C%E8%B7%AF%E9%80%99%E4%B8%80%E9%8D%8B%E5%8F%B0%E4%B8%AD%E6%9C%9D%E5%AF%8C%E6%AE%BF)&amp;sll=24.160044,120.62857&amp;ll=24.166896,120.63769&amp;marker=24.166896,120.63769&amp;mrt=loc&amp;z=15&amp;t=m&amp;source=embed" style="color:#0000FF;text-align:left">檢視較大的地圖</a></small></p>\n\n<p><strong><span style="font-size:14px"><a href="https://www.instagram.com/p/BhtcYEmhi9U/" target="_blank"><img alt="" height="20" src="//s.pixfs.net/f.pixnet.net/images/emotions/032.gif" title="" width="20">想看IG介紹請點我<img alt="" height="20" src="//s.pixfs.net/f.pixnet.net/images/emotions/019.gif" title="" width="20"></a></span></strong></p>\n\n<p><small>朝富店的外觀非常壯闊氣派，不禁讓人點點頭「嗯，果然是台中的餐廳，建築物都要超。大」</small></p>\n\n<p><img alt="IMG_5208.jpg" src="https://pic.pimg.tw/happy78/1528543947-685380499_n.jpg" title="IMG_5208.jpg"></p>\n\n<p>進去唄～</p>\n\n<p><img alt="IMG_5217.jpg" src="https://pic.pimg.tw/happy78/1528543947-362759723_n.jpg" title="IMG_5217.jpg"></p>\n\n<p>一樓有個大水池，看起來很富麗堂皇</p>\n\n<p><img alt="IMG_5232.jpg" src="https://pic.pimg.tw/happy78/1528543962-2265924582_n.jpg" title="IMG_5232.jpg"></p>\n\n<p><a href="https://vbtrax.com/track/clicks/1006/ce2bc2bd9f093698a587f264d175924223674fdf2aabebf50760b4?subid_1=&amp;subid_2=&amp;subid_3=&amp;subid_4=&amp;subid_5="><img alt="" border="0" src="https://vbtrax.com/track/imp/img/368/ce2bc2bd9f093698a587f264d175924223674fdf2aabebf50760b4?subid_1=&amp;subid_2=&amp;subid_3=&amp;subid_4=&amp;subid_5=" title=""></a></p>\n\n<p>最讓我讚嘆的就是這個牡丹（？中國風的樓梯了！！好美喔～～</p>\n\n<p><img alt="IMG_5230.jpg" src="https://pic.pimg.tw/happy78/1528543962-4007835890_n.jpg" title="IMG_5230.jpg"></p>\n\n<p>整棟建築物看起來用餐環境很棒</p>\n\n<p><img alt="IMG_5229.jpg" src="https://pic.pimg.tw/happy78/1528543962-1785144547_n.jpg" title="IMG_5229.jpg"></p>\n\n<p>我看到一二樓都有這些宮廷風服飾，我沒問服務生是不是可以穿（因為我也不想穿XD而且整家店我也沒看到有人在穿XD）不過如果你想穿的話我想應該是沒人會阻止</p>\n\n<p><img alt="IMG_5228.jpg" src="https://pic.pimg.tw/happy78/1528543960-3197815885_n.jpg" title="IMG_5228.jpg"></p>\n\n<p>醬料也有滿多選擇的</p>\n\n<p><img alt="IMG_5216.jpg" src="https://pic.pimg.tw/happy78/1528543947-2503982521_n.jpg" title="IMG_5216.jpg"></p>\n\n<p><a href="https://vbtrax.com/track/clicks/1096/ce2bc2b79e073698a587f264d175924223674fdf2aabebf50769b4?subid_1=&amp;subid_2=&amp;subid_3=&amp;subid_4=&amp;subid_5="><img alt="" border="0" src="https://vbtrax.com/track/imp/img/976/ce2bc2b79e073698a587f264d175924223674fdf2aabebf50769b4?subid_1=&amp;subid_2=&amp;subid_3=&amp;subid_4=&amp;subid_5=" title=""></a></p>\n\n<p>我們點了麻辣鍋跟老火湯的鴛鴦鍋</p>\n\n<p><img alt="IMG_5221.jpg" src="https://pic.pimg.tw/happy78/1528543950-1832601825_n.jpg" title="IMG_5221.jpg"></p>\n\n<p>因為我們沒有很餓，所以沒有點很多，以下就一一介紹：（對了，這次忘記拍菜單，想看菜單的人可以去它<a href="http://www.toponepot.com/product_detail.php" target="_blank"><strong>官網</strong></a>看喔）</p>\n\n<p>這是梅花豬268$</p>\n\n<p><img alt="IMG_5219.jpg" src="https://pic.pimg.tw/happy78/1528543949-4214686345_n.jpg" title="IMG_5219.jpg"></p>\n\n<p>和牛梅花598$</p>\n\n<p><img alt="IMG_5222.jpg" src="https://pic.pimg.tw/happy78/1528543955-3093638684_n.jpg" title="IMG_5222.jpg"></p>\n\n<p>因為跟長輩吃比較不好意思慢慢拍，所以照片沒有很多，照片裡還有玉米筍跟盛味綜合丸238$，另外右上角是店家招待的香蔥油條，我覺得滿特別的，乾吃的話會很有嚼勁，泡湯吃也很美味</p>\n\n<p>&nbsp;</p>\n\n<p><img alt="IMG_5224.jpg" src="https://pic.pimg.tw/happy78/1528543955-714147528_n.jpg" title="IMG_5224.jpg"></p>\n\n<p><a href="https://vbtrax.com/track/clicks/3456/ce2bc2bd9c0123daefcda67f8835ce1328684bc177fbb9b20a63b60061?subid_1=&amp;subid_2=&amp;subid_3=&amp;subid_4=&amp;subid_5="><img alt="" border="0" src="https://vbtrax.com/track/imp/img/35035/ce2bc2bd9c0123daefcda67f8835ce1328684bc177fbb9b20a63b60061?subid_1=&amp;subid_2=&amp;subid_3=&amp;subid_4=&amp;subid_5=" title=""></a></p>\n\n<p>要營養均衡，所以當然要吃青菜啦，這是五彩鮮蔬盤'),
    (55444,'<pstyle="text-align:left;"><imgsrc="https://pic.pimg.tw/peko721/1555405696-3356669416.jpg"alt="麥食達韓式料理.麥食達菜單.麥食達.台北車站美食.北車美食.石鍋拌飯."/></p><pstyle="text-align:left;"><strong><spanstyle="color:#0000ff;font-size:14pt;">麥食達</span></strong></p><pstyle="text-align:left;">地址：台北市中正區懷寧街86號</p><pstyle="text-align:left;">這裡不是裝潢華麗的韓式料理餐廳，而是一家有點像家庭食堂的小店，紅蔘茶、冬粉、味噌湯都是無限量供應。</p><pstyle="text-align:left;">相關文章：<atitle="【台北車站美食】麥食達韓式料理｜228公園旁的平價石鍋拌飯、鍋物｜附麥食達菜單"href="https://peko721.pixnet.net/blog/post/46669347"target="_blank">麥食達韓式料理｜228公園旁的平價石鍋拌飯、鍋物</a></p><divstyle="text-align:center;">')
    
]

columns = ['article_id','raw_content']


df = spark.createDataFrame(data, columns)


df_filted = (
    df.withColumn("raw_content", F.regexp_replace(C('raw_content'),
#                                               r'(<a href.*</a>)|(<atitle.*href.*</a>)',
                                              r'(<.*href.*</a>)',
                                              ''))
)

df_filted.show(n=20, vertical=True, truncate=False)
############### Case II ###########################


with open("../data/webpage_1.txt", "r") as f:
    text = [f.read()]

schema = StructType(
    [
    StructField("article_id", IntegerType(), True),
    StructField("raw_content", StringType(), True)
    ]
)
# data format
# [(r1_col1, r1_col2, ...),
#  (r2_col1, r2_col2, ...),
# ]
df = spark.createDataFrame([(0, text)], schema=schema)
# df.show(truncate=False)

df_filted = (
    df.withColumn("raw_content", F.regexp_replace(C('raw_content'),
#                                               r'(<a href.*</a>)|(<atitle.*href.*</a>)',
                                              r'(<.*href.*</a>)',
                                              ''))
)

df_filted.show(n=20, vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [53]:
# 2. regax filtering
# contains
# startswith
# endwith

data = [
    (14431,0.99834,'https://pic.pimg.tw/happy78/1528543947-685380499_n.jpg mfkvmdfasx'),
    (14431,0.99834,'https://pic.pimg.tw/happy78/1528543947-362759723_n.jpg'),
    (14431,0.97611,'https://pic.pimg.tw/happy78/1528543962-2265924582_n.jpg'),
    (67789,0.93422,'https://pic.pimg.tw/happy78/1528543962-4007835890_n.jpg'),
    (67789,0.94231,'https://pic.pimg.tw/happy78/1528543962-45890_n.jpg'),
    (67789,0.94111,'https://pic.pimg.tw/happy78/1528543962-45890_n.png'),
    (67789,0.94111,'png')
]

columns = ['article_id','pred','img_url']


df = spark.createDataFrame(data, columns)

df_con = (
    df.filter(df.img_url.contains('mfkv'))
)
df_con.show(vertical=True,truncate=False)

df_startwith = (
    df.filter(df.img_url.startswith("png"))
)

df_con.show(vertical=True,truncate=False)

df_endwith = (
    df.filter(df.img_url.endswith("png"))
)

df_endwith.show(vertical=True,truncate=False)

-RECORD 0-----------------------------------------------------------------------
 article_id | 14431                                                             
 pred       | 0.99834                                                           
 img_url    | https://pic.pimg.tw/happy78/1528543947-685380499_n.jpg mfkvmdfasx 

-RECORD 0-----------------------------------------------------------------------
 article_id | 14431                                                             
 pred       | 0.99834                                                           
 img_url    | https://pic.pimg.tw/happy78/1528543947-685380499_n.jpg mfkvmdfasx 

-RECORD 0--------------------------------------------------------
 article_id | 67789                                              
 pred       | 0.94111                                            
 img_url    | https://pic.pimg.tw/happy78/1528543962-45890_n.png 
-RECORD 1--------------------------------------------------------
 article_id | 67789 

In [54]:
# html - related, separate each part of html tag


data = [
    (14431,'<p>今天要帶大家去吃吃台中有名的這一鍋皇室秘藏鍋物，據說會有著「帝王般」的享受（是有沒有這麼誇張），話說這一鍋北中南都有分店，但是不知道是從哪裡起家的？我們這次吃的是「台中朝富店」喔</p>\n\n<p>地點：<span style="-webkit-text-stroke-width:0px; background-color:rgba(0, 0, 0, 0.03); color:rgba(0, 0, 0, 0.87); display:inline !important; float:none; font-family:roboto,noto sans tc,arial,sans-serif; font-size:13px; font-style:normal; font-variant-caps:normal; font-variant-ligatures:normal; font-weight:400; letter-spacing:normal; orphans:2; text-align:left; text-decoration-color:initial; text-decoration-style:initial; text-indent:0px; text-transform:none; white-space:normal; widows:2; word-spacing:0px">台中市西屯區朝富路36號</span></p>\n\n<p>電話：<span style="-webkit-text-stroke-width:0px; background-color:rgba(0, 0, 0, 0.03); color:rgba(0, 0, 0, 0.87); display:inline !important; float:none; font-family:roboto,noto sans tc,arial,sans-serif; font-size:13px; font-style:normal; font-variant-caps:normal; font-variant-ligatures:normal; font-weight:400; letter-spacing:normal; orphans:2; text-align:left; text-decoration-color:initial; text-decoration-style:initial; text-indent:0px; text-transform:none; white-space:normal; widows:2; word-spacing:0px">04 3609 0088</span></p>\n\n<p>營業時間：11:30~03:00</p>\n\n<p><iframe class="" frameborder="0" height="350" marginheight="0" marginwidth="0" scrolling="no" src="//maps.google.com/?ie=UTF8&amp;f=q&amp;source=s_q&amp;q=loc:24.166896,+120.63769000000002+(%E5%8F%B0%E7%81%A3%E5%8F%B0%E4%B8%AD%E5%B8%82%E8%A5%BF%E5%B1%AF%E5%8D%80%E6%9C%9D%E5%AF%8C%E8%B7%AF%E9%80%99%E4%B8%80%E9%8D%8B%E5%8F%B0%E4%B8%AD%E6%9C%9D%E5%AF%8C%E6%AE%BF)&amp;sll=24.160044,120.62857&amp;ll=24.166896,120.63769&amp;marker=24.166896,120.63769&amp;mrt=loc&amp;z=15&amp;t=m&amp;output=embed" width="425"></iframe><br>\n<small><a href="//maps.google.com/?ie=UTF8&amp;f=q&amp;source=s_q&amp;q=loc:24.166896,+120.63769000000002+(%E5%8F%B0%E7%81%A3%E5%8F%B0%E4%B8%AD%E5%B8%82%E8%A5%BF%E5%B1%AF%E5%8D%80%E6%9C%9D%E5%AF%8C%E8%B7%AF%E9%80%99%E4%B8%80%E9%8D%8B%E5%8F%B0%E4%B8%AD%E6%9C%9D%E5%AF%8C%E6%AE%BF)&amp;sll=24.160044,120.62857&amp;ll=24.166896,120.63769&amp;marker=24.166896,120.63769&amp;mrt=loc&amp;z=15&amp;t=m&amp;source=embed" style="color:#0000FF;text-align:left">檢視較大的地圖</a></small></p>\n\n<p><strong><span style="font-size:14px"><a href="https://www.instagram.com/p/BhtcYEmhi9U/" target="_blank"><img alt="" height="20" src="//s.pixfs.net/f.pixnet.net/images/emotions/032.gif" title="" width="20">想看IG介紹請點我<img alt="" height="20" src="//s.pixfs.net/f.pixnet.net/images/emotions/019.gif" title="" width="20"></a></span></strong></p>\n\n<p><small>朝富店的外觀非常壯闊氣派，不禁讓人點點頭「嗯，果然是台中的餐廳，建築物都要超。大」</small></p>\n\n<p><img alt="IMG_5208.jpg" src="https://pic.pimg.tw/happy78/1528543947-685380499_n.jpg" title="IMG_5208.jpg"></p>\n\n<p>進去唄～</p>\n\n<p><img alt="IMG_5217.jpg" src="https://pic.pimg.tw/happy78/1528543947-362759723_n.jpg" title="IMG_5217.jpg"></p>\n\n<p>一樓有個大水池，看起來很富麗堂皇</p>\n\n<p><img alt="IMG_5232.jpg" src="https://pic.pimg.tw/happy78/1528543962-2265924582_n.jpg" title="IMG_5232.jpg"></p>\n\n<p><a href="https://vbtrax.com/track/clicks/1006/ce2bc2bd9f093698a587f264d175924223674fdf2aabebf50760b4?subid_1=&amp;subid_2=&amp;subid_3=&amp;subid_4=&amp;subid_5="><img alt="" border="0" src="https://vbtrax.com/track/imp/img/368/ce2bc2bd9f093698a587f264d175924223674fdf2aabebf50760b4?subid_1=&amp;subid_2=&amp;subid_3=&amp;subid_4=&amp;subid_5=" title=""></a></p>\n\n<p>最讓我讚嘆的就是這個牡丹（？中國風的樓梯了！！好美喔～～</p>\n\n<p><img alt="IMG_5230.jpg" src="https://pic.pimg.tw/happy78/1528543962-4007835890_n.jpg" title="IMG_5230.jpg"></p>\n\n<p>整棟建築物看起來用餐環境很棒</p>\n\n<p><img alt="IMG_5229.jpg" src="https://pic.pimg.tw/happy78/1528543962-1785144547_n.jpg" title="IMG_5229.jpg"></p>\n\n<p>我看到一二樓都有這些宮廷風服飾，我沒問服務生是不是可以穿（因為我也不想穿XD而且整家店我也沒看到有人在穿XD）不過如果你想穿的話我想應該是沒人會阻止</p>\n\n<p><img alt="IMG_5228.jpg" src="https://pic.pimg.tw/happy78/1528543960-3197815885_n.jpg" title="IMG_5228.jpg"></p>\n\n<p>醬料也有滿多選擇的</p>\n\n<p><img alt="IMG_5216.jpg" src="https://pic.pimg.tw/happy78/1528543947-2503982521_n.jpg" title="IMG_5216.jpg"></p>\n\n<p><a href="https://vbtrax.com/track/clicks/1096/ce2bc2b79e073698a587f264d175924223674fdf2aabebf50769b4?subid_1=&amp;subid_2=&amp;subid_3=&amp;subid_4=&amp;subid_5="><img alt="" border="0" src="https://vbtrax.com/track/imp/img/976/ce2bc2b79e073698a587f264d175924223674fdf2aabebf50769b4?subid_1=&amp;subid_2=&amp;subid_3=&amp;subid_4=&amp;subid_5=" title=""></a></p>\n\n<p>我們點了麻辣鍋跟老火湯的鴛鴦鍋</p>\n\n<p><img alt="IMG_5221.jpg" src="https://pic.pimg.tw/happy78/1528543950-1832601825_n.jpg" title="IMG_5221.jpg"></p>\n\n<p>因為我們沒有很餓，所以沒有點很多，以下就一一介紹：（對了，這次忘記拍菜單，想看菜單的人可以去它<a href="http://www.toponepot.com/product_detail.php" target="_blank"><strong>官網</strong></a>看喔）</p>\n\n<p>這是梅花豬268$</p>\n\n<p><img alt="IMG_5219.jpg" src="https://pic.pimg.tw/happy78/1528543949-4214686345_n.jpg" title="IMG_5219.jpg"></p>\n\n<p>和牛梅花598$</p>\n\n<p><img alt="IMG_5222.jpg" src="https://pic.pimg.tw/happy78/1528543955-3093638684_n.jpg" title="IMG_5222.jpg"></p>\n\n<p>因為跟長輩吃比較不好意思慢慢拍，所以照片沒有很多，照片裡還有玉米筍跟盛味綜合丸238$，另外右上角是店家招待的香蔥油條，我覺得滿特別的，乾吃的話會很有嚼勁，泡湯吃也很美味</p>\n\n<p>&nbsp;</p>\n\n<p><img alt="IMG_5224.jpg" src="https://pic.pimg.tw/happy78/1528543955-714147528_n.jpg" title="IMG_5224.jpg"></p>\n\n<p><a href="https://vbtrax.com/track/clicks/3456/ce2bc2bd9c0123daefcda67f8835ce1328684bc177fbb9b20a63b60061?subid_1=&amp;subid_2=&amp;subid_3=&amp;subid_4=&amp;subid_5="><img alt="" border="0" src="https://vbtrax.com/track/imp/img/35035/ce2bc2bd9c0123daefcda67f8835ce1328684bc177fbb9b20a63b60061?subid_1=&amp;subid_2=&amp;subid_3=&amp;subid_4=&amp;subid_5=" title=""></a></p>\n\n<p>要營養均衡，所以當然要吃青菜啦，這是五彩鮮蔬盤'),
    (55444,'<pstyle="text-align:left;"><imgsrc="https://pic.pimg.tw/peko721/1555405696-3356669416.jpg"alt="麥食達韓式料理.麥食達菜單.麥食達.台北車站美食.北車美食.石鍋拌飯."/></p><pstyle="text-align:left;"><strong><spanstyle="color:#0000ff;font-size:14pt;">麥食達</span></strong></p><pstyle="text-align:left;">地址：台北市中正區懷寧街86號</p><pstyle="text-align:left;">這裡不是裝潢華麗的韓式料理餐廳，而是一家有點像家庭食堂的小店，紅蔘茶、冬粉、味噌湯都是無限量供應。</p><pstyle="text-align:left;">相關文章：<atitle="【台北車站美食】麥食達韓式料理｜228公園旁的平價石鍋拌飯、鍋物｜附麥食達菜單"href="https://peko721.pixnet.net/blog/post/46669347"target="_blank">麥食達韓式料理｜228公園旁的平價石鍋拌飯、鍋物</a></p><divstyle="text-align:center;">'),
    (666,'<html><head></head><body><h1>This is python</h1></body></html>')
]

columns = ['article_id','raw_content']


df = spark.createDataFrame(data, columns)



df_filted = (
    df
    .withColumn("cleaned",
                  F.explode_outer(
                      F.split(C("raw_content"),r'<[^>]*>|\p{Z}+|\s+'))
                 )
    .withColumn("cleaned", F.regexp_replace(C("cleaned"), r"\p{Z}*", ""))
    .withColumn("cleaned",F.trim(C("cleaned")))
    .withColumn("length", F.length(C("cleaned")))
    .where(F.length(C("cleaned")) > 0)
    .drop('raw_content')
)

df_filted.show(n=200,vertical=True, truncate=False)

-RECORD 0-----------------------------------------------------------------------------------------------
 article_id | 14431                                                                                     
 cleaned    | 今天要帶大家去吃吃台中有名的這一鍋皇室秘藏鍋物，據說會有著「帝王般」的享受（是有沒有這麼誇張），話說這一鍋北中南都有分店，但是不知道是從哪裡起家的？我們這次吃的是「台中朝富店」喔 
 length     | 89                                                                                        
-RECORD 1-----------------------------------------------------------------------------------------------
 article_id | 14431                                                                                     
 cleaned    | 地點：                                                                                       
 length     | 3                                                                                         
-RECORD 2-----------------------------------------------------------------------------------------------
 article_id | 14431                                    

In [55]:
# 3 reg-like
# Search all the pattern mahor_use in raw_content


columns = ["name","languagesAtSchool","major_use","raw_content"]
data = [
    ("James,,Smith",["Java","Scala","C++"],"Java","sdmcsmcmpkla,pxozas,pxmasmxpJavasodmcasmdcpso"), \
    ("Michael,Rose,",["Spark","Java","C++"],"Python","sd,mcmsdcopmsocmpsmdcpython"), \
    ("Robert,,Williams",["CSharp","VB"],"","smdkcadpmcpowmcpmspdcmpsdc"),
    ("ABC,ss,Williams",[],"Scala","smdoicmsocomamsodmcosdcoms"),
    ("ABC,ss,Williams",[],"Scala","smdoicmsocomamsodmcosdcomsSca")
       ]


df = spark.createDataFrame(data=data,schema=columns)
df.printSchema()
df.show(truncate=False)

(
    df
    .withColumn("matched",C("raw_content").contains(C("major_use")))
).show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- languagesAtSchool: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- major_use: string (nullable = true)
 |-- raw_content: string (nullable = true)

+----------------+------------------+---------+---------------------------------------------+
|name            |languagesAtSchool |major_use|raw_content                                  |
+----------------+------------------+---------+---------------------------------------------+
|James,,Smith    |[Java, Scala, C++]|Java     |sdmcsmcmpkla,pxozas,pxmasmxpJavasodmcasmdcpso|
|Michael,Rose,   |[Spark, Java, C++]|Python   |sd,mcmsdcopmsocmpsmdcpython                  |
|Robert,,Williams|[CSharp, VB]      |         |smdkcadpmcpowmcpmspdcmpsdc                   |
|ABC,ss,Williams |[]                |Scala    |smdoicmsocomamsodmcosdcoms                   |
|ABC,ss,Williams |[]                |Scala    |smdoicmsocomamsodmcosdcomsSca                |
+----------------+--------

## filtering in complex type (1+)

In [56]:
# 1. filter in array type
columns = ["name","languagesAtSchool","currentState"]
data = [("James,,Smith",["Java","Scala","C++"],"CA"), \
    ("Michael,Rose,",["Spark","Java","C++"],"NJ"), \
    ("Robert,,Williams",["CSharp","VB"],"NV")]

df = spark.createDataFrame(data=data,schema=columns)
df.printSchema()
df.show(truncate=False)

######### Case 1, single value #############

df = (
    df.withColumn("filter_languagesAtSchool",F.array_contains(C("languagesAtSchool"), "Java"))
)
df.show(truncate=False)


######### Case 2, multiple values ?
 
# must_language = ["Spark","Java"]
# must_language_lit = [F.lit(i) for i in must_language]
# print(must_language_lit)
# df = (
#     df.withColumn("filter_languagesAtSchool",F.array_contains(C("languagesAtSchool"), must_language_lit))
# )
# df.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- languagesAtSchool: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- currentState: string (nullable = true)

+----------------+------------------+------------+
|name            |languagesAtSchool |currentState|
+----------------+------------------+------------+
|James,,Smith    |[Java, Scala, C++]|CA          |
|Michael,Rose,   |[Spark, Java, C++]|NJ          |
|Robert,,Williams|[CSharp, VB]      |NV          |
+----------------+------------------+------------+

+----------------+------------------+------------+------------------------+
|name            |languagesAtSchool |currentState|filter_languagesAtSchool|
+----------------+------------------+------------+------------------------+
|James,,Smith    |[Java, Scala, C++]|CA          |true                    |
|Michael,Rose,   |[Spark, Java, C++]|NJ          |true                    |
|Robert,,Williams|[CSharp, VB]      |NV          |false                   |
+---

In [57]:

# 2 column A array contains column B
# https://stackoverflow.com/questions/48488463/use-is-in-between-2-spark-dataframe-columns

columns = ["name","languagesAtSchool","major_use"]
data = [("James,,Smith",["Java","Scala","C++"],"Java"), \
    ("Michael,Rose,",["Spark","Java","C++"],"Python"), \
    ("Robert,,Williams",["CSharp","VB"],""),
    ("ABC,ss,Williams",[],"Scala")]


df = spark.createDataFrame(data=data,schema=columns)
df.printSchema()
df.show(truncate=False)

######### Case 1, single value #############

df = (
    df
    .withColumn("is_major_been_taught",F.expr("array_contains(languagesAtSchool, major_use)").cast("integer"))
#     .withColumn("is_major_been_taught",F.expr("array_contains(languagesAtSchool, major_use)"))
)
df.show(truncate=False)
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- languagesAtSchool: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- major_use: string (nullable = true)

+----------------+------------------+---------+
|name            |languagesAtSchool |major_use|
+----------------+------------------+---------+
|James,,Smith    |[Java, Scala, C++]|Java     |
|Michael,Rose,   |[Spark, Java, C++]|Python   |
|Robert,,Williams|[CSharp, VB]      |         |
|ABC,ss,Williams |[]                |Scala    |
+----------------+------------------+---------+

+----------------+------------------+---------+--------------------+
|name            |languagesAtSchool |major_use|is_major_been_taught|
+----------------+------------------+---------+--------------------+
|James,,Smith    |[Java, Scala, C++]|Java     |1                   |
|Michael,Rose,   |[Spark, Java, C++]|Python   |0                   |
|Robert,,Williams|[CSharp, VB]      |         |0                   |
|ABC,ss,Williams |[]  

# JOIN (3+)

In [58]:
# 1 left, right, inner, outer, cross
# cross join ( cartesian product with another DataFrame )
# cross join usually use high computational cost which we should avoid to use it 
# https://spark.apache.org/docs/2.3.0/api/python/pyspark.sql.html#pyspark.sql.GroupedData.apply


# https://www.youtube.com/watch?v=fp53QhSfQcI
# check the viodeo 05:55 ~ 9.08
# The shuffle hash join works best when
# distribute evenly with the key we are joining on
# have an adequate number of keys for parallesim
# The problem often happens when the table you wanna join is unevenly distribute (8 : 00)
left = spark.createDataFrame(
    [
        ("A", 1),
        ("B", 2),
        ("C", 3)
    ],
    ("col1","co12")
)

right = spark.createDataFrame(
    [
        ("A", 20),
        ("Y", 30),
        ("Z", 50)
    ],
    ("col1","co12")
)

left.show()
right.show()

print('left join')
left.join(right,on='col1',how='left').show()
print('right join')
left.join(right,on='col1',how='right').show()
print('inner join')
left.join(right,on='col1',how='inner').show()
print('outer join')
left.join(right,on='col1',how='outer').show()
print('cross')
left.crossJoin(right.select('col1')).show()

+----+----+
|col1|co12|
+----+----+
|   A|   1|
|   B|   2|
|   C|   3|
+----+----+

+----+----+
|col1|co12|
+----+----+
|   A|  20|
|   Y|  30|
|   Z|  50|
+----+----+

left join
+----+----+----+
|col1|co12|co12|
+----+----+----+
|   B|   2|null|
|   C|   3|null|
|   A|   1|  20|
+----+----+----+

right join
+----+----+----+
|col1|co12|co12|
+----+----+----+
|   Y|null|  30|
|   Z|null|  50|
|   A|   1|  20|
+----+----+----+

inner join
+----+----+----+
|col1|co12|co12|
+----+----+----+
|   A|   1|  20|
+----+----+----+

outer join
+----+----+----+
|col1|co12|co12|
+----+----+----+
|   B|   2|null|
|   Y|null|  30|
|   C|   3|null|
|   Z|null|  50|
|   A|   1|  20|
+----+----+----+

cross
+----+----+----+
|col1|co12|col1|
+----+----+----+
|   A|   1|   A|
|   A|   1|   Y|
|   A|   1|   Z|
|   B|   2|   A|
|   C|   3|   A|
|   B|   2|   Y|
|   B|   2|   Z|
|   C|   3|   Y|
|   C|   3|   Z|
+----+----+----+



In [59]:
# broadcast join
# https://www.youtube.com/watch?v=fp53QhSfQcI
# if we perform broadcast join 14:50 (no shuffling)
# if we DOESNT perform broad join 05:55(shuffling a lot)

data = [
    (14431,'https://pic.pimg.tw/happy78/1528543947-685380499_n.jpg'),
    (14431,'https://pic.pimg.tw/happy78/1528543947-362759723_n.jpg'),
    (14431,'https://pic.pimg.tw/happy78/1528543962-2265924582_n.jpg'),
    (67789,'https://pic.pimg.tw/happy78/1528543962-4007835890_n.jpg'),
    (67789,'https://pic.pimg.tw/happy78/1528543962-45890_n.jpg')
]

columns = ['article_id','img_url']

df_big = spark.createDataFrame(data=data, schema=columns)
df_big.show(n=5)

df_small = df_big.limit(1)
df_small.show(n=5)

# NOTE, broadcast join support only left join
df_joined = df_big.join(F.broadcast(df_small), on=['img_url'],how='left')
df_joined.explain()
df_joined.show()



+----------+--------------------+
|article_id|             img_url|
+----------+--------------------+
|     14431|https://pic.pimg....|
|     14431|https://pic.pimg....|
|     14431|https://pic.pimg....|
|     67789|https://pic.pimg....|
|     67789|https://pic.pimg....|
+----------+--------------------+

+----------+--------------------+
|article_id|             img_url|
+----------+--------------------+
|     14431|https://pic.pimg....|
+----------+--------------------+

== Physical Plan ==
*(3) Project [img_url#1230, article_id#1229L, article_id#1251L]
+- *(3) BroadcastHashJoin [img_url#1230], [img_url#1252], LeftOuter, BuildRight
   :- Scan ExistingRDD[article_id#1229L,img_url#1230]
   +- BroadcastExchange HashedRelationBroadcastMode(List(input[1, string, true]))
      +- *(2) GlobalLimit 1
         +- Exchange SinglePartition
            +- *(1) LocalLimit 1
               +- Scan ExistingRDD[article_id#1251L,img_url#1252]
+--------------------+----------+----------+
|            

In [60]:
# left_anti and ledt_semi
# https://dzone.com/articles/pyspark-join-explained-with-examples


data = [
    (14431,'https://pic.pimg.tw/happy78/1528543947-685380499_n.jpg'),
    (14431,'https://pic.pimg.tw/happy78/1528543947-362759723_n.jpg'),
    (14431,'https://pic.pimg.tw/happy78/1528543962-2265924582_n.jpg'),
    (67789,'https://pic.pimg.tw/happy78/1528543962-4007835890_n.jpg'),
    (67789,'https://pic.pimg.tw/happy78/1528543962-45890_n.jpg')
]

columns = ['article_id','img_url']

df_big = spark.createDataFrame(data=data, schema=columns)
df_big.show(n=5)

df_small = df_big.limit(1)
df_small.show(n=5)

print('left_semi : inner join but return only left dataframe column')

df_left_semi = df_big.join(df_small,on=['img_url'],how='left_semi')
df_left_semi.show()

print('left_anti : difference left - right but return only left dataframe column')

df_left_anti = df_big.join(df_small,on=['img_url'],how='left_anti')
df_left_anti.show()

+----------+--------------------+
|article_id|             img_url|
+----------+--------------------+
|     14431|https://pic.pimg....|
|     14431|https://pic.pimg....|
|     14431|https://pic.pimg....|
|     67789|https://pic.pimg....|
|     67789|https://pic.pimg....|
+----------+--------------------+

+----------+--------------------+
|article_id|             img_url|
+----------+--------------------+
|     14431|https://pic.pimg....|
+----------+--------------------+

left_semi : inner join but return only left dataframe column
+--------------------+----------+
|             img_url|article_id|
+--------------------+----------+
|https://pic.pimg....|     14431|
+--------------------+----------+

left_anti : difference left - right but return only left dataframe column
+--------------------+----------+
|             img_url|article_id|
+--------------------+----------+
|https://pic.pimg....|     14431|
|https://pic.pimg....|     14431|
|https://pic.pimg....|     67789|
|https://pic

# Aggregation Operations

In [61]:
# 2 broadcast join
# boardcast join for big dataframe and small data frame join
# broadcast small dataframe to each worker,
# them the excute plan make narrow dependency instead of wide dependency
# code - https://stackoverflow.com/questions/37487318/spark-sql-broadcast-hash-join
# documentation - search broadcast join in doc https://spark.apache.org/docs/2.3.2/api/python/pyspark.sql.html
# concept https://www.youtube.com/watch?v=fp53QhSfQcI 14:32 ~ 14:59
data = [
    ("hotpop","Meat",3,"https//:123.png","rtyg11"),
    ("hotpop","Meat",3,"https//:456.png","rtyg11"),
    ("hotpop","Meat",3,"https//:789.png","rtyg11"),
    ("hotpop","Vegetable",2,"https//:111.png","rtyg11"),
    ("hotpop","Vegetable",2,"https//:222.png","rtyg11"),
    ("branch","Fried food",1,"https//:333.png","bvc1"),
    ("branch","Dessert",1,"https//:444.png","7854"),
  ]

columns = ["store_name","food_category","food_category_popularity","img_url","author_id"]
df_big = spark.createDataFrame(data = data, schema = columns)

df_small = df_big.limit(1)

df_small.show()

df_broadcast_join = (
 df_big
    .join(F.broadcast(df_small), on="img_url")
)

df_broadcast_join.explain()

df_broadcast_join.toPandas()

+----------+-------------+------------------------+---------------+---------+
|store_name|food_category|food_category_popularity|        img_url|author_id|
+----------+-------------+------------------------+---------------+---------+
|    hotpop|         Meat|                       3|https//:123.png|   rtyg11|
+----------+-------------+------------------------+---------------+---------+

== Physical Plan ==
*(3) Project [img_url#1315, store_name#1312, food_category#1313, food_category_popularity#1314L, author_id#1316, store_name#1341, food_category#1342, food_category_popularity#1343L, author_id#1345]
+- *(3) BroadcastHashJoin [img_url#1315], [img_url#1344], Inner, BuildRight
   :- *(3) Filter isnotnull(img_url#1315)
   :  +- Scan ExistingRDD[store_name#1312,food_category#1313,food_category_popularity#1314L,img_url#1315,author_id#1316]
   +- BroadcastExchange HashedRelationBroadcastMode(List(input[3, string, false]))
      +- *(2) Filter isnotnull(img_url#1344)
         +- *(2) GlobalL

Unnamed: 0,img_url,store_name,food_category,food_category_popularity,author_id,store_name.1,food_category.1,food_category_popularity.1,author_id.1
0,https//:123.png,hotpop,Meat,3,rtyg11,hotpop,Meat,3,rtyg11


In [62]:
# 3 left semi, left anti
# right semi, right anti
# https://dzone.com/articles/pyspark-join-explained-with-examples

left = spark.createDataFrame(
    [
        ("A", 1),
        ("B", 2),
        ("C", 3)
    ],
    ("col1","co12")
)

right = spark.createDataFrame(
    [
        ("A", 20),
        ("Y", 30),
        ("Z", 50)
    ],
    ("col1","co12")
)

left.show()
right.show()

print('inner - inner join and keeps columns with two tables')
print('which will be annoying if there are same column name or dropping column by yourself')

left.join(right, on='col1',how='inner').show()

print('left_semi - inner join but only keeps left table columns')

left.join(right, on='col1',how='left_semi').show()

print('left_anti - selects all rows from left that are not present in right.')

left.join(right, on='col1',how='left_anti').show()

+----+----+
|col1|co12|
+----+----+
|   A|   1|
|   B|   2|
|   C|   3|
+----+----+

+----+----+
|col1|co12|
+----+----+
|   A|  20|
|   Y|  30|
|   Z|  50|
+----+----+

inner - inner join and keeps columns with two tables
which will be annoying if there are same column name or dropping column by yourself
+----+----+----+
|col1|co12|co12|
+----+----+----+
|   A|   1|  20|
+----+----+----+

left_semi - inner join but only keeps left table columns
+----+----+
|col1|co12|
+----+----+
|   A|   1|
+----+----+

left_anti - selects all rows from left that are not present in right.
+----+----+
|col1|co12|
+----+----+
|   B|   2|
|   C|   3|
+----+----+



# Aggregation (12+)

In [63]:
# 1 knowing the groupby object method

data = [
    ("James","","Smith","36636","RD","M", 3000),
    ("Michael","Rose","","40288","RD","M", 4000),
    ("Robert","","Williams","42114","SRE","M", 4000),
    ("Maria","Anne","Jones","39192","SRE","F", 4000),
    ("Jen","Mary","Brown","","F","BACKEND", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("apartment",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])
df = spark.createDataFrame(data=data,schema=schema)

df_grp = df.groupBy("salary")

print(type(df_grp), dir(df_grp), sep='\n\n')

# avg, count, max, mean, sum  - Common aggregation
# pivot - two column x, y with value in the table
# sql_ctx - apply sql command
# custom function - agg, apply


<class 'pyspark.sql.group.GroupedData'>

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_df', '_jgd', 'agg', 'apply', 'avg', 'count', 'max', 'mean', 'min', 'pivot', 'sql_ctx', 'sum']


In [64]:
# 2 apply single aggregation fuction on groupby object
data = [
    ("James","","Smith","36636","RD","M", 3000),
    ("Michael","Rose","","40288","RD","M", 4000),
    ("Robert","","Williams","42114","SRE","M", 4000),
    ("Maria","Anne","Jones","39192","SRE","F", 4000),
    ("Jen","Mary","Brown","","F","BACKEND", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("deparment",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])
df = spark.createDataFrame(data=data,schema=schema)

df_grp_department = (
    df.groupby("deparment").mean("salary").alias("mean_salary")
)

df_grp_department.show(n=5)

+---------+-----------+
|deparment|avg(salary)|
+---------+-----------+
|        F|       -1.0|
|       RD|     3500.0|
|      SRE|     4000.0|
+---------+-----------+



In [65]:
# 3 apply multiple aggregation fuction on groupby object
data = [
    ("James","","Smith","36636","RD","M", 3000),
    ("Michael","Rose","","40288","RD","M", 4000),
    ("Robert","","Williams","42114","SRE","M", 4000),
    ("Maria","Anne","Jones","39192","SRE","F", 4000),
    ("Jen","Mary","Brown","","F","BACKEND", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("deparment",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])
df = spark.createDataFrame(data=data,schema=schema)

df_grp_department = df.groupby("deparment").agg(
    F.sum("salary").alias("sum_salary"),
    F.avg("salary").alias("avg_salary"),
    F.max("salary").alias("max_salary"),
    F.min("salary").alias("min_salary"),
    F.count("salary").alias("group_size")
)

df_grp_department.show(n=5)

+---------+----------+----------+----------+----------+----------+
|deparment|sum_salary|avg_salary|max_salary|min_salary|group_size|
+---------+----------+----------+----------+----------+----------+
|        F|        -1|      -1.0|        -1|        -1|         1|
|       RD|      7000|    3500.0|      4000|      3000|         2|
|      SRE|      8000|    4000.0|      4000|      4000|         2|
+---------+----------+----------+----------+----------+----------+



In [66]:
# 4 collect data point for each group with the stats(min, max, sum, avg, count)


# apply multiple aggregation fuction on groupby object
data = [
    ("James","","Smith","36636","RD","M", 3000),
    ("Michael","Rose","","40288","RD","M", 4000),
    ("Robert","","Williams","42114","SRE","M", 4000),
    ("Maria","Anne","Jones","39192","SRE","F", 4000),
    ("Jen","Mary","Brown","","F","BACKEND", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("deparment",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])
df = spark.createDataFrame(data=data,schema=schema)

df_grp_department = df.groupby("deparment").agg(
    F.sum("salary").alias("sum_salary"),
    F.avg("salary").alias("avg_salary"),
    F.max("salary").alias("max_salary"),
    F.min("salary").alias("min_salary"),
    F.count("salary").alias("count_rows"),
    F.collect_list("salary").alias("all_rows")
)

df_grp_department.toPandas()


Unnamed: 0,deparment,sum_salary,avg_salary,max_salary,min_salary,count_rows,all_rows
0,F,-1,-1.0,-1,-1,1,[-1]
1,RD,7000,3500.0,4000,3000,2,"[3000, 4000]"
2,SRE,8000,4000.0,4000,4000,2,"[4000, 4000]"


In [67]:
# 5 get first one row in each group
# We use Window Function here
# Key to think about this, we rank the data in each group, then 
# filtering
# no nothing is groupby
# which is different in pandas

data = [
    ("James","","Smith","36636","RD","M", 3000),
    ("Michael","Rose","","40288","RD","M", 8000),
    ("Robert","","Williams","42114","SRE","M", 4000),
    ("Maria","Anne","Jones","39192","SRE","F", 6000),
    ("Jen","Mary","Brown","","F","BACKEND", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("deparment",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])


df = spark.createDataFrame(data=data,schema=schema)

df = (
    df.withColumn("rank_salary_by_deparment",
                  F.row_number().over(
                  W.partitionBy("deparment")\
                      .orderBy(F.desc("salary"))
                  )
                 )\
    .filter(F.col("rank_salary_by_deparment") == 1)\
    .drop('rank_salary_by_deparment')
)

df.show(n=5)


+---------+----------+--------+-----+---------+-------+------+
|firstname|middlename|lastname|   id|deparment| gender|salary|
+---------+----------+--------+-----+---------+-------+------+
|      Jen|      Mary|   Brown|     |        F|BACKEND|    -1|
|  Michael|      Rose|        |40288|       RD|      M|  8000|
|    Maria|      Anne|   Jones|39192|      SRE|      F|  6000|
+---------+----------+--------+-----+---------+-------+------+



In [68]:
# 6 groupby and filtering

data = [
    ("James","","Smith","36636","RD","M", 3000),
    ("Michael","Rose","","40288","RD","M", 4000),
    ("Robert","","Williams","42114","SRE","M", 4000),
    ("Maria","Anne","Jones","39192","SRE","F", 4000),
    ("Jen","Mary","Brown","","F","BACKEND", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("deparment",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])
df = spark.createDataFrame(data=data,schema=schema)

df_grp_department = (
    df.groupby("deparment").agg(
        F.sum("salary").alias("sum_salary"),
        F.avg("salary").alias("avg_salary"),
        F.max("salary").alias("max_salary"),
        F.min("salary").alias("min_salary"),
        F.count("salary").alias("count_rows"))
    .filter(C("sum_salary") > 0)
    
)

df_grp_department.show(n=5)
df_grp_department.printSchema()

+---------+----------+----------+----------+----------+----------+
|deparment|sum_salary|avg_salary|max_salary|min_salary|count_rows|
+---------+----------+----------+----------+----------+----------+
|       RD|      7000|    3500.0|      4000|      3000|         2|
|      SRE|      8000|    4000.0|      4000|      4000|         2|
+---------+----------+----------+----------+----------+----------+

root
 |-- deparment: string (nullable = true)
 |-- sum_salary: long (nullable = true)
 |-- avg_salary: double (nullable = true)
 |-- max_salary: integer (nullable = true)
 |-- min_salary: integer (nullable = true)
 |-- count_rows: long (nullable = false)



In [69]:
# 7 rank, dense_rank, and row_number
# https://stackoverflow.com/questions/44968912/difference-in-dense-rank-and-row-number-in-spark
# The window functions

data = [
    ("a",10),
    ('a',10),
    ('a',20)
]

columns = ['item','score']

df = spark.createDataFrame(data, columns)
df.show(n=5)

window_spec = W.partitionBy("item").orderBy("score")
df = (
    df.withColumn("rank", F.rank().over(window_spec))\
    .withColumn("dense_rank", F.dense_rank().over(window_spec))\
    .withColumn("row_number", F.row_number().over(window_spec))
)

df.show(n=5)

+----+-----+
|item|score|
+----+-----+
|   a|   10|
|   a|   10|
|   a|   20|
+----+-----+

+----+-----+----+----------+----------+
|item|score|rank|dense_rank|row_number|
+----+-----+----+----------+----------+
|   a|   10|   1|         1|         1|
|   a|   10|   1|         1|         2|
|   a|   20|   3|         2|         3|
+----+-----+----+----------+----------+



In [70]:
# 11 groupby and sum by a window function
data = [
    ("hotpop","Meat","url_1",),
    ("hotpop","Meat","url_2"),
    ("hotpop","Meat","url_3"),
    ("hotpop","Vegetable","url_4"),
    ("hotpop","Vegetable","url_5"),
    ("branch","Fried food","url_6"),
    ("branch","Dessert","ulr_7"),
  ]

columns = ["store_name","food_category","url"]
window_spec = w.partitionBy("store_name","food_category")

df = spark.createDataFrame(data = data, schema = columns)
df = (
    df.withColumn("food_cat_count", F.count("food_category").over(window_spec))
)
df.show()

+----------+-------------+-----+--------------+
|store_name|food_category|  url|food_cat_count|
+----------+-------------+-----+--------------+
|    branch|      Dessert|ulr_7|             1|
|    hotpop|         Meat|url_1|             3|
|    hotpop|         Meat|url_2|             3|
|    hotpop|         Meat|url_3|             3|
|    hotpop|    Vegetable|url_4|             2|
|    hotpop|    Vegetable|url_5|             2|
|    branch|   Fried food|url_6|             1|
+----------+-------------+-----+--------------+



In [71]:
# Difference between countdistinct


## melt operation

In [72]:
# 9 melt the dataframe (wide dataframe to long dataframe)
from typing import Iterable
import pandas as pd
from IPython.core.display import display
# https://stackoverflow.com/questions/41670103/how-to-melt-spark-dataframe


def melt(
        df: DataFrame, 
        id_vars: Iterable[str], value_vars: Iterable[str], 
        var_name: str="variable", value_name: str="value") -> DataFrame:
    """Convert :class:`DataFrame` from wide to long format."""

    value_names_dtype = dict(df.select(value_vars).dtypes)
    unique_dtype = set(value_names_dtype.values())
    assert len(unique_dtype) == 1, f"value_vars should be the same dtype, your dype fo columns : {value_names_dtype}"

    # Create array<struct<variable: str, value: ...>>
    _vars_and_vals = F.array(*(
        F.struct(F.lit(c).alias(var_name), C(c).alias(value_name)) 
        for c in value_vars))

    # Add to the DataFrame and explode
    _tmp = df.withColumn("_vars_and_vals", F.explode(_vars_and_vals))

    cols = id_vars + [
            C("_vars_and_vals")[x].alias(x) for x in [var_name, value_name]]
    return _tmp.select(*cols)

# pdf = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c', 4 : 'a'},
#                     'B': {0: 1, 1: 3, 2: 5, 4 : 11},
#                     'C': {0: 2, 1: 4, 2: 6, 4 : 12}
#                    })

# pdf_result = pd.melt(pdf, id_vars=['A'], value_vars=['B', 'C']).sort_values(by=['A'])


# display(
#     "Pandas",
#     pdf,
#     pdf_result,
#     "PySpark",
#        )

# # Case 1
# # pdf['C'] = pdf['C'].astype(str) # then you can convert to spark df
# sdf = spark.createDataFrame(pdf)
# sdf.show()
# sdf.printSchema()
# melt(sdf, id_vars=['A'], value_vars=['B', 'C']).show()

############### Case 2 ##############

data = [
    ("丹丹漢堡",                                                                
     "https://pic.pimg.tw/ksdelicacy/1438450572-4270481322.jpg",               
     "318",                                                                    
     "http://ksdelicacy.pixnet.net/blog/post/55774905",
     "ksdelicacy",                                                             
     "Fried food"
    ),
    ("拿坡里"   ,                                                                 
"https://rmfoodie.com/wp-content/uploads/rm/1446140572-1157042739_n.jpg",
"333",                                                                    
"http://rmlove30.pixnet.net/blog/post/61986505",                          
"RMlove30",                                                               
    "Bread")
]

cols = ['poi_name',
        'img_url',
        'food_cat_pop_score',
        'img_article_url',
        'img_author_id',
        'food_cat']

df = spark.createDataFrame(data, cols)
df.show(vertical=True, truncate=False)
df.printSchema()
melt(
        df,
        id_vars=['poi_name'],
        value_vars=["img_url",
                    'food_cat_pop_score',
                    "img_article_url",
                    "img_author_id",
                    'food_cat'
                    ],
        var_name = 'menu_key',value_name = 'menu_value'
        ).show(vertical=True)

-RECORD 0------------------------------------------------------------------------------------
 poi_name           | 丹丹漢堡                                                                   
 img_url            | https://pic.pimg.tw/ksdelicacy/1438450572-4270481322.jpg               
 food_cat_pop_score | 318                                                                    
 img_article_url    | http://ksdelicacy.pixnet.net/blog/post/55774905                        
 img_author_id      | ksdelicacy                                                             
 food_cat           | Fried food                                                             
-RECORD 1------------------------------------------------------------------------------------
 poi_name           | 拿坡里                                                                    
 img_url            | https://rmfoodie.com/wp-content/uploads/rm/1446140572-1157042739_n.jpg 
 food_cat_pop_score | 333                                   

## create complex type when aggregation

In [73]:
# 8 collect dict (map) with a group
# https://stackoverflow.com/questions/55308482/pyspark-create-dictionary-within-groupby

# collect_list : return a list of objects with duplicated
# collect_set : return a set of objects without duplicated
# struct : create a new struct column
# ( > 2.4.0)map_from_entries : returns a map created from the given array of entries
# create_map

######### pyspark < 2.4.0
data = [
    (1,'a',123),
    (1,'b',234),
    (1,'c',345),
    (2,'a',12),
    (2,'x',23),
    (2,'y',123)
]

columns = ['id','key','value']

df = spark.createDataFrame(data, columns)
df.show(n=5)

######## pyspark < 2.4.0

df_agg = df.groupBy("id").agg(
    F.collect_list(F.create_map(C("key"),C("value"))).alias('collections')
)

df_agg.printSchema()
df_agg.show(n=10, truncate=False)
print(df_agg.collect())
df_agg.toPandas().to_json('output/tmp.json',
                          orient='records',
                          force_ascii=False,
                          lines=True)


# to_json(join(SERVING_POI_FOOD_IMG_FOLDER,serving_fname),
#                                        orient='records',
#                                        force_ascii=False,
#                                        lines=True)
######### pyspark > 2.4.0
# df.groupBy("id").agg(
#     F.map_from_entries(
#         F.collect_list(
#             F.struct("key","value"))).alias("key_value")
# ).show()

############## Case 2 #######################
for i in range(3):
    print()
print('----------------- Case 2 -------------------')
for i in range(3):
    print()
    
data = [
    ("hotpop","Meat",3,"https//:123.png","rtyg11"),
    ("hotpop","Meat",3,"https//:456.png","rtyg11"),
    ("hotpop","Meat",3,"https//:789.png","rtyg11"),
    ("hotpop","Vegetable",2,"https//:111.png","rtyg11"),
    ("hotpop","Vegetable",2,"https//:222.png","rtyg11"),
    ("branch","Fried food",1,"https//:333.png","bvc1"),
    ("branch","Dessert",1,"https//:444.png","7854"),
  ]

columns = ["store_name","food_category","food_category_popularity","img_url","author_id"]
df = spark.createDataFrame(data = data, schema = columns)

df.show(n=10)


# convert the melt column to string
# because the column you wanna melt should be the same dtype
df = (
    df.withColumn("food_category_popularity", C("food_category_popularity").cast(StringType()))
)
df_complex = (
    melt(df, id_vars=['store_name'],
             value_vars=['food_category','food_category_popularity','img_url','author_id'],
             var_name = 'menu_key',value_name = 'menu_value'
        ).groupBy("store_name").agg(
        F.collect_list(F.create_map(C("menu_key"), C("menu_value"))).alias("menu")
    )
)

df_complex.show()
df_complex.collect()



+---+---+-----+
| id|key|value|
+---+---+-----+
|  1|  a|  123|
|  1|  b|  234|
|  1|  c|  345|
|  2|  a|   12|
|  2|  x|   23|
+---+---+-----+
only showing top 5 rows

root
 |-- id: long (nullable = true)
 |-- collections: array (nullable = true)
 |    |-- element: map (containsNull = true)
 |    |    |-- key: string
 |    |    |-- value: long (valueContainsNull = true)

+---+------------------------------------+
|id |collections                         |
+---+------------------------------------+
|1  |[[a -> 123], [b -> 234], [c -> 345]]|
|2  |[[a -> 12], [x -> 23], [y -> 123]]  |
+---+------------------------------------+

[Row(id=1, collections=[{'a': 123}, {'b': 234}, {'c': 345}]), Row(id=2, collections=[{'a': 12}, {'x': 23}, {'y': 123}])]



----------------- Case 2 -------------------



+----------+-------------+------------------------+---------------+---------+
|store_name|food_category|food_category_popularity|        img_url|author_id|
+----------+-------------+------------

[Row(store_name='hotpop', menu=[{'food_category': 'Meat'}, {'food_category_popularity': '3'}, {'img_url': 'https//:123.png'}, {'author_id': 'rtyg11'}, {'food_category': 'Meat'}, {'food_category_popularity': '3'}, {'img_url': 'https//:456.png'}, {'author_id': 'rtyg11'}, {'food_category': 'Meat'}, {'food_category_popularity': '3'}, {'img_url': 'https//:789.png'}, {'author_id': 'rtyg11'}, {'food_category': 'Vegetable'}, {'food_category_popularity': '2'}, {'img_url': 'https//:111.png'}, {'author_id': 'rtyg11'}, {'food_category': 'Vegetable'}, {'food_category_popularity': '2'}, {'img_url': 'https//:222.png'}, {'author_id': 'rtyg11'}]),
 Row(store_name='branch', menu=[{'food_category': 'Fried food'}, {'food_category_popularity': '1'}, {'img_url': 'https//:333.png'}, {'author_id': 'bvc1'}, {'food_category': 'Dessert'}, {'food_category_popularity': '1'}, {'img_url': 'https//:444.png'}, {'author_id': '7854'}])]

In [74]:
# group by key, create a complexy json format like

# ...

In [75]:
# 12 groupby , concat the element in array

# collect in array list, and flatern then

columns = ["group","name","languagesAtSchool","currentState"]
data = [
    ('1',"James,,Smith",["Java","Scala","C++"],"CA"), \
    ('1',"Michael,Rose,",["Spark","Java","C++"],"NJ"), \
    ('2',"Robert,,Williams",["CSharp","VB"],"NV"),
    ('2',"Robert,,Williams",None,"NV")
]

df = spark.createDataFrame(data=data,schema=columns)
df.printSchema()
df.show(truncate=False)

# collect list with collect list
# we need to explode it first

df_agg = (
    df
    .withColumn("flattern_tags",F.explode(C("languagesAtSchool")))
    .drop('languagesAtSchool')
    .groupBy("group")
    .agg(
        F.collect_list("flattern_tags").alias('languagesAtSchool')
    )
).show(truncate=False)

root
 |-- group: string (nullable = true)
 |-- name: string (nullable = true)
 |-- languagesAtSchool: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- currentState: string (nullable = true)

+-----+----------------+------------------+------------+
|group|name            |languagesAtSchool |currentState|
+-----+----------------+------------------+------------+
|1    |James,,Smith    |[Java, Scala, C++]|CA          |
|1    |Michael,Rose,   |[Spark, Java, C++]|NJ          |
|2    |Robert,,Williams|[CSharp, VB]      |NV          |
|2    |Robert,,Williams|null              |NV          |
+-----+----------------+------------------+------------+

+-----+------------------------------------+
|group|languagesAtSchool                   |
+-----+------------------------------------+
|1    |[Java, Scala, C++, Spark, Java, C++]|
|2    |[CSharp, VB]                        |
+-----+------------------------------------+



In [76]:
# 13 groupby and collect arrays into arrays
# Yes we can

columns = ["group","name","languagesAtSchool","currentState"]
data = [
    ('1',"James,,Smith",["Java","Scala","C++"],"CA"), \
    ('1',"Michael,Rose,",["Spark","Java","C++"],"NJ"), \
    ('2',"Robert,,Williams",["CSharp","VB"],"NV"),
    ('2',"Robert,,Williams",None,"NV")
]

df = spark.createDataFrame(data=data,schema=columns)
df.printSchema()
df.show(truncate=False)

df_agg = (
 df.groupBy("group")
    .agg(
        F.collect_list(C("languagesAtSchool")).alias("tags")
    )
)

df_agg.show(truncate=False)


(
    df_agg
    .withColumn("explode_tags",F.explode(C("tags")))
).show(truncate=False)

root
 |-- group: string (nullable = true)
 |-- name: string (nullable = true)
 |-- languagesAtSchool: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- currentState: string (nullable = true)

+-----+----------------+------------------+------------+
|group|name            |languagesAtSchool |currentState|
+-----+----------------+------------------+------------+
|1    |James,,Smith    |[Java, Scala, C++]|CA          |
|1    |Michael,Rose,   |[Spark, Java, C++]|NJ          |
|2    |Robert,,Williams|[CSharp, VB]      |NV          |
|2    |Robert,,Williams|null              |NV          |
+-----+----------------+------------------+------------+

+-----+----------------------------------------+
|group|tags                                    |
+-----+----------------------------------------+
|1    |[[Java, Scala, C++], [Spark, Java, C++]]|
|2    |[[CSharp, VB]]                          |
+-----+----------------------------------------+

+-----+----------------------

In [77]:
# 14 groupby column C1 , get first row and last row once, order by column C2


data = [
    (38,"medicine"),
    (41,"medicine"),
    (55,"medicine"),
    (15,"technology"),
    (88,"technology"),
    (88,"technology"),
    (75,"technology"),
    (75,"mba"),
    (75,"mba"),
    (75,"mba")
    ]


columns = ['age','dept']

df = (
    spark.createDataFrame(data=data, schema=columns)
    .withColumn("id",
                F.row_number()
                .over(
                    W.orderBy(F.monotonically_increasing_id() - 1)
                ))

)


df.show()

# https://stackoverflow.com/questions/52273186/pyspark-spark-window-function-first-last-issue
print("This is wrong answer")

age_in_dept = W.partitionBy('dept').orderBy("age")
df_res = (
    df
    .withColumn("firstID", F.first('id').over(age_in_dept))
    .withColumn("lastID", F.last('id').over(age_in_dept))
)

df_res.show()

print('Because we prodive a orderBy clause, default frame is ')
print("RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW")

df_res.explain()

print("So we need to re-define the wibndow")

age_in_dept = (
    W.partitionBy('dept')
    .orderBy("age")
    .rowsBetween(W.unboundedPreceding, W.unboundedFollowing)
)


##### withcolumn, select gives you the same answer, you need to do your own filtering
df_res = (
    df
    .withColumn("firstID", F.first('id').over(age_in_dept))
    .withColumn("lastID", F.last('id').over(age_in_dept))
    .withColumn("age_rank_in_dept",
                F.row_number()
                .over(
                W.partitionBy('dept')
                 .orderBy("age")
                ))
)

df_res.explain()
df_res.show()


# df_res = (
#     df.select(
#         "*",
#         F.first('id').over(age_in_dept).alias('first_id'),
#         F.last('id').over(age_in_dept).alias('last_id'),
#     )
# )

# df_res.show()

+---+----------+---+
|age|      dept| id|
+---+----------+---+
| 38|  medicine|  1|
| 41|  medicine|  2|
| 55|  medicine|  3|
| 15|technology|  4|
| 88|technology|  5|
| 88|technology|  6|
| 75|technology|  7|
| 75|       mba|  8|
| 75|       mba|  9|
| 75|       mba| 10|
+---+----------+---+

This is wrong answer
+---+----------+---+-------+------+
|age|      dept| id|firstID|lastID|
+---+----------+---+-------+------+
| 75|       mba|  8|      8|    10|
| 75|       mba|  9|      8|    10|
| 75|       mba| 10|      8|    10|
| 38|  medicine|  1|      1|     1|
| 41|  medicine|  2|      1|     2|
| 55|  medicine|  3|      1|     3|
| 15|technology|  4|      4|     4|
| 75|technology|  7|      4|     7|
| 88|technology|  5|      4|     6|
| 88|technology|  6|      4|     6|
+---+----------+---+-------+------+

Because we prodive a orderBy clause, default frame is 
RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
== Physical Plan ==
Window [first(id#2459, false) windowspecdefinition(dep

In [78]:
# 15. Perform collect_list, colect set in a wondow
data = [
    ("a",10),
    ('a',10),
    ('a',20),
    ('b',5),
    ('b',3)
]

columns = ['item','score']

df = spark.createDataFrame(data, columns)
df.show(n=5)

score_in_item_from_unbounded_to_curr = (
                W.partitionBy("item")
                 .orderBy("score"))
score_in_item = (
                W.partitionBy('item')
                 .orderBy("score")
                 .rowsBetween(W.unboundedPreceding, W.unboundedFollowing)
)
df_window = (
    df
    .withColumn("collect_score_list_to_current",
                F.collect_list("score").over(score_in_item_from_unbounded_to_curr))
    .withColumn("collect_score_set_to_current",
                F.collect_set("score").over(score_in_item_from_unbounded_to_curr))
    .withColumn("collect_score_list",
                F.collect_list("score").over(score_in_item)
               )
    .withColumn("collect_score_set",
                F.collect_set("score").over(score_in_item)
               )
)

df_window.show(n=5, vertical=True, truncate=False)

df_window.explain()

+----+-----+
|item|score|
+----+-----+
|   a|   10|
|   a|   10|
|   a|   20|
|   b|    5|
|   b|    3|
+----+-----+

-RECORD 0-------------------------------------
 item                          | b            
 score                         | 3            
 collect_score_list_to_current | [3]          
 collect_score_set_to_current  | [3]          
 collect_score_list            | [3, 5]       
 collect_score_set             | [5, 3]       
-RECORD 1-------------------------------------
 item                          | b            
 score                         | 5            
 collect_score_list_to_current | [3, 5]       
 collect_score_set_to_current  | [5, 3]       
 collect_score_list            | [3, 5]       
 collect_score_set             | [5, 3]       
-RECORD 2-------------------------------------
 item                          | a            
 score                         | 10           
 collect_score_list_to_current | [10, 10]     
 collect_score_set_to_current  | [10

In [81]:
# 15. Perform collect_list with a filter
# collect only the score > 0
# https://stackoverflow.com/questions/61468705/pyspark-using-collect-list-over-window-with-condition
data = [
    ("a",10),
    ('a',0),
    ('a',20),
    ('b',5),
    ('b',3),
    ('c',0)
]

columns = ['item','score']

df = spark.createDataFrame(data, columns)
df.show(n=5)

item = (
        W.partitionBy("item")
)


df_window = (
    df
    .withColumn("collect_score_list",
                F.collect_list(
                    F.when(C("score") > 0, C("score"))
                     .otherwise(F.lit(None))
                    ).over(item)
               )

)

df_window.show(n=5, vertical=True, truncate=False)

+----+-----+
|item|score|
+----+-----+
|   a|   10|
|   a|    0|
|   a|   20|
|   b|    5|
|   b|    3|
+----+-----+
only showing top 5 rows

-RECORD 0----------------------
 item               | c        
 score              | 0        
 collect_score_list | []       
-RECORD 1----------------------
 item               | b        
 score              | 5        
 collect_score_list | [5, 3]   
-RECORD 2----------------------
 item               | b        
 score              | 3        
 collect_score_list | [5, 3]   
-RECORD 3----------------------
 item               | a        
 score              | 10       
 collect_score_list | [10, 20] 
-RECORD 4----------------------
 item               | a        
 score              | 0        
 collect_score_list | [10, 20] 
only showing top 5 rows



# udf & pandas_udf (5+)

* pandas_udf return maximum 2G

* https://issues.apache.org/jira/browse/ARROW-1907

* [pandas_udf in classmethod, you need to write a new wrapper! which is not easy](https://stackoverflow.com/questions/58170261/how-to-use-pandas-udf-in-class)

In [None]:
# 1. Use Pyspark to send request, get image and store as b64string 
# https://stackoverflow.com/questions/49353752/use-requests-module-and-return-response-to-pyspark-dataframe

data = [
    (14431,'https://pic.pimg.tw/happy78/1528543947-685380499_n.jpg'),
    (14431,'https://pic.pimg.tw/happy78/1528543947-362759723_n.jpg'),
    (14431,'https://pic.pimg.tw/happy78/1528543962-2265924582_n.jpg'),
    (67789,'https://pic.pimg.tw/happy78/1528543962-4007835890_n.jpg'),
    (67789,'https://pic.pimg.tw/happy78/1528543962-45890_n.jpg')
]

columns = ['article_id','img_url']

df = spark.createDataFrame(data=data, schema=columns)
print('before')
df.show(n=5)

@F.udf(returnType=StringType())
def get_img_binary(url : str) -> str:
    import requests
    import base64
    resp = requests.get(url)
    if resp.status_code == 200:
        return base64.encodestring(resp.content)
    else:
        return None
df = (
    df.withColumn("img_b64_str", get_img_binary(C("img_url")))
)

df.show(n=5)

In [None]:
# 2 udf return two column values, e.g. model prediction with label and probability
data = [
    (1,64),
    (2,76),
    (3,54),
    (4,11),
    (5,100),
]
columns = ['id','features']

df = spark.createDataFrame(data=data, schema=columns)
print("Before : ")
df.show(n=5)

############# sol #################
# using Row object to return multiple column
from pyspark.sql import Row

model_pred = StructType([
    StructField("category", StringType(), False),
    StructField("prob", T.FloatType(), False)
])

@F.udf(returnType=model_pred)
def model_pred(n):
    import random
    category = random.choice(['food','env','compose','drink'])
    prob = random.random()
    return Row('category', 'prob')(category, prob)



newDF = df.withColumn("pred", model_pred(df["features"]))

print(newDF.dtypes)

# newDF = newDF.select("id", "features", "pred.*")

newDF.show(truncate=False)


In [None]:
# 3. Use Pyspark to load a tf.keras model
# serieslize the model and make prediction

In [None]:
# 4. Pandas udf
# documentation and concept
# user defined function, but vectorlized by Arrow
# 2.3.0
# https://spark.apache.org/docs/2.3.0/sql-programming-guide.html#pandas-udfs-aka-vectorized-udfs
# 3.0 support more!
# https://spark.apache.org/docs/3.0.0/sql-pyspark-pandas-with-arrow.html#pandas-udfs-aka-vectorized-udfs


# for 2.3.0
#  Currently, there are two types of Pandas UDF: Scalar and Grouped Map.
#  Input pd.Series, Output pd.Series

# Scalar type
@F.pandas_udf(returnType=T.LongType())
def multiply_func(a, b):
    return a * b


x = pd.Series([1, 2, 3])

df = spark.createDataFrame(pd.DataFrame(x), schema=["x"])
df.select(multiply_func(C("x"), C("x"))).show()

In [None]:
# from pyspark.sql.functions import pandas_udf, PandasUDFType

# # Use pandas_udf to define a Pandas UDF
# @pandas_udf('double', PandasUDFType.SCALAR)
# # Input/output are both a pandas.Series of doubles

# def pandas_plus_one(x):
#     return x + 1

# df.withColumn('v2', pandas_plus_one(df.x))
# df.show()

In [None]:
# 5 pandasUDF return a dataframe
# like a model prediction
# # predict label and probability
# A grouped map UDF defines transformation:
# A pandas.DataFrame -> A pandas.DataFrame The returnType 
# should be a StructType describing the schema of the returned pandas.DataFrame.
# The length of the returned pandas.DataFrame can be arbitrary and the columns must be indexed so that their position matches the corresponding field in the schema.
# Grouped map UDFs are used with pyspark.sql.GroupedData.apply().
# https://databricks.com/blog/2017/10/30/introducing-vectorized-udfs-for-pyspark.html

# We can use BucketID for this ID

from pyspark.sql.functions import pandas_udf, PandasUDFType

df = spark.createDataFrame(
    [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
    ("ID", "v"))

df.show()
########## case 1 ##############

@pandas_udf("id long, v double, n_rows long", PandasUDFType.GROUPED_MAP)
def substract_mean(pdf):
    # pdf is a pandas.DataFrame
    n_rows = len(pdf)
    v = pdf.v
    return pdf.assign(
        v=v - v.mean(),
        n_rows=n_rows
    )

df.groupby("ID").apply(substract_mean).show()

######### case 2 ##############

@pandas_udf("ID long, v double, new_col_1 double, new_col_2 double", PandasUDFType.GROUPED_MAP)
def get_more_col(pdf):
    # pdf is a pandas.DataFrame
    import random
    n_counts = len(pdf)
    
    return pdf.assign(
        new_col_1 = [i for i in range(n_counts)],
        new_col_2 = [random.random() for i in range(n_counts)]
    )
# sdf groupby its bucket_id and apply

df.groupby("ID").apply(get_more_col).show()


######## case 3 #################

In [None]:
s = pd.Series([5,5,7,7,8])
for row in s:
    print(row)

In [None]:
# 6 pandas udf return Null used for sending request to get images
# return string for pandas_udf
# https://stackoverflow.com/questions/65694026/spark-exception-error-using-pandas-udf-with-logical-statement
from typing import Union
import requests
import base64


@F.udf(returnType=StringType())
def get_img_binary(url : str) -> str:
    import requests
    import base64
    resp = requests.get(url, timeout=10)
    if resp.status_code == 200:
        return base64.encodestring(resp.content)
    else:
        return None
# df = (
#     df.withColumn("img_b64_str", get_img_binary(C("img_url")))
# )


# If you wanna dealing with a lot of data, use the rep
# @F.pandas_udf('string', PandasUDFType.SCALAR)
def download_img_to_b64_pd(img_url : pd.Series) -> pd.Series:
    """
    get image in the response and save to base64 string
    data type flow : 
    resp.content - (bytes) 
    -> base64.encode - (bytes) 
    -> decode('utf-8') - str
    """
    b64_str_list = []
    for link in img_url:   
        try:
            resp = requests.get(link, timeout=10)
            if resp.status_code == 200:
                b64_str_list.append(base64.encodebytes(resp.content).decode('utf-8'))
            else:
                b64_str_list.append(None)
        except Exception as e:
            b64_str_list.append(None)
    return pd.Series(b64_str_list)



data = [
    (14431,'https://pic.pimg.tw/happy78/1528543947-685380499_n.jpg'),
    (14431,'https://pic.pimg.tw/happy78/1528543947-3623_n.jpg'), # This one will be not found
    (14431,'https://pic.pimg.tw/happy78/1528543962-2265924582_n.jpg'),
    (67789,'https://pic.pimg.tw/happy78/1528543962-4007835890_n.jpg'),
    (67789,'https://pic.pimg.tw/happy78/1528543962-45890_n.jpg')
]

columns = ['article_id','img_url']

df = spark.createDataFrame(data=data, schema=columns)
# df.show(vertical=True, truncate=False)

#             .withColumn(
#                 "bucket_id",
#                 F.udf(self.simple_random, returnType="integer")("bucket_size"),
#             )

df = (
    df\
#     .withColumn("img_b64_str", download_img_to_b64_pd(C("img_url")))
    .withColumn("img_b64_str",F.pandas_udf(download_img_to_b64_pd, "string",PandasUDFType.SCALAR)("img_url"))
    .filter(C("img_b64_str").isNotNull())
)

df.show()

In [None]:
def my_range(n):
    x = 0
    while True:
        if x < n:
            yield x
            x += 1 
        else:
            break

In [None]:
for i in my_range(10):
    print(i)

In [None]:
# 7 pandas udf using generator
# due to pd.Series return should under 2G
# we're using another apporoach (GroupMap)

def get_img_b64_generator(img_url : pd.Series):
    '''
    we can move this out of the udf
    '''
    for link in img_url:
        try:
            resp = requests.get(link, timeout=10)
            if resp.status_code == 200:
                yield base64.encodebytes(resp.content).decode('utf-8')
            else:
                yield None
        except Exception as e:
            yield None

            
return_type = "article_id long, img_url string"
@F.pandas_udf(return_type,
            F.PandasUDFType.GROUPED_MAP)
def download_img_to_b64(df_with_url : pd.DataFrame) -> pd.DataFrame:
    import requests
    import base64
    
    img_url_series = df_with_img_b64.img_url
    
    # Using generator to avoid big series OOM/Serilization error
    
    return df_with_url.assign(
        img_b64_str = img_type_series,
        )

data = [
    (14431,'https://pic.pimg.tw/happy78/1528543947-685380499_n.jpg'),
    (14431,'https://pic.pimg.tw/happy78/1528543947-3623_n.jpg'), # This one will be not found
    (14431,'https://pic.pimg.tw/happy78/1528543962-2265924582_n.jpg'),
    (67789,'https://pic.pimg.tw/happy78/1528543962-4007835890_n.jpg'),
    (67789,'https://pic.pimg.tw/happy78/1528543962-45890_n.jpg')
]

columns = ['article_id','img_url']

df = spark.createDataFrame(data=data, schema=columns)
pdf = df.toPandas()
pdf



# broadcasting (2+)

In [None]:
# 1
# broadcast the dictionary to spark 
# (which is a way that enhance multi-processing cross machine using your python code)
# the broadcast variable should be serializable

states = {"NY":"New York", "CA":"California", "FL":"Florida"}
broadcastStates = spark.sparkContext.broadcast(states)

data = [("James","Smith","USA","CA"),
    ("Michael","Rose","USA","NY"),
    ("Robert","Williams","USA","CA"),
    ("Maria","Jones","USA","FL")
  ]

columns = ["firstname","lastname","country","state"]
df = spark.createDataFrame(data = data, schema = columns)
df.show(n=5)

def state_convert(code):
    return broadcastStates.value[code]

# case 1, using rdd
result_rdd = df.rdd.map(lambda x: (x[0],x[1],x[2],state_convert(x[3]))).toDF(columns)
result_rdd.show(n=5)


# case 2, using pdf

@F.udf(returnType=StringType())
def state_convert_udf(code : str) -> str:
    return broadcastStates.value[code]

result_df = (
    df.withColumn("converted_state", state_convert_udf(C("state")))
)

result_df.show(n=5)

In [None]:
# 2
# Knowing broacsting object

# broadcast the dictionary to spark 
# (which is a way that enhance multi-processing cross machine using your python code)
# the broadcast variable should be serializable
# https://spark.apache.org/docs/2.3.3/api/python/_modules/pyspark/broadcast.html
states = {"NY":"New York", "CA":"California", "FL":"Florida"}
broadcastStates = spark.sparkContext.broadcast(states)
print(type(broadcastStates), dir(broadcastStates))

# value to access the object
broadcastStates.value, type(broadcastStates.value)

In [None]:
# braordcast join

# Export dataframe(2+)

In [None]:
# 1
# write dataframe to jsonl format
# https://stackoverflow.com/questions/43269244/pyspark-dataframe-write-to-single-json-file-with-specific-name
# https://sparkbyexamples.com/pyspark/pyspark-read-json-file-into-dataframe/
data = [
    (14431,'https://pic.pimg.tw/happy78/1528543947-685380499_n.jpg'),
    (14431,'https://pic.pimg.tw/happy78/1528543947-362759723_n.jpg'),
    (14431,'https://pic.pimg.tw/happy78/1528543962-2265924582_n.jpg'),
    (67789,'https://pic.pimg.tw/happy78/1528543962-4007835890_n.jpg'),
    (67789,'https://pic.pimg.tw/happy78/1528543962-45890_n.jpg')
]

columns = ['article_id','img_url']

df = spark.createDataFrame(data, columns)

fname_folder = join('output','jsonl_format_folder.json')
# This one will creat a folder contains part file for better multiple worker IO
df.coalesce(1).write.format('json').save(fname_folder, mode='overwrite')
df_new = spark.read.json(fname_folder)
df_new.show(n=10)
# However, if you wanna save it in a single file, use pandas
fname = join('output','jsonl_format.json')
# df.toPandas().to_json('path/file_name.json', orient='records', force_ascii=False, lines=True)
df.toPandas().to_json(fname, orient='records',force_ascii=False,lines=True)
df_new_pd = pd.read_json(fname,orient='records',lines=True)
df_new_pd

In [None]:
# 2 write parquet by date parittion


# 1
# write dataframe to jsonl format
# https://sparkbyexamples.com/pyspark/pyspark-read-and-write-parquet-file/
data_d1 = [
    (14431,20210224,'https://pic.pimg.tw/happy78/1528543947-685380499_n.jpg'),
    (14431,20210224,'https://pic.pimg.tw/happy78/1528543947-362759723_n.jpg'),
    (14431,20210224,'https://pic.pimg.tw/happy78/1528543962-2265924582_n.jpg'),
    (67789,20210224,'https://pic.pimg.tw/happy78/1528543962-4007835890_n.jpg'),
    (67789,20210224,'https://pic.pimg.tw/happy78/1528543962-45890_n.jpg')
]

data_d2 = [
    (86481,20210225,'https://pic.pimg.tw/happy78/1528543947-685380499_n.jpg'),
    (45213,20210225,'https://pic.pimg.tw/happy78/1528543947-362759723_n.jpg'),
    (24561,20210225,'https://pic.pimg.tw/happy78/1528543962-2265924582_n.jpg'),
    (75371,20210225,'https://pic.pimg.tw/happy78/1528543962-4007835890_n.jpg'),
    (25691,20210225,'https://pic.pimg.tw/happy78/1528543962-45890_n.jpg')
]

data_d3 = [
    (7861,20210304,'https://pic.pimg.tw/happy78/1528543947-685380499_n.jpg'),
    (45213,20210304,'https://pic.pimg.tw/happy78/1528543947-362759723_n.jpg'),
    (1111,20210304,'https://pic.pimg.tw/happy78/1528543962-2265924582_n.jpg'),
    (76661,20210304,'https://pic.pimg.tw/happy78/1528543962-4007835890_n.jpg'),
    (8888,20210304,'https://pic.pimg.tw/happy78/1528543962-45890_n.jpg')
]


columns = ['article_id','date','img_url']

df_d1 = spark.createDataFrame(data_d1, columns)
df_d2 = spark.createDataFrame(data_d2, columns)
df_d3 = spark.createDataFrame(data_d3, columns)

for df in [df_d1, df_d2, df_d3]:
    df.show(n=5)

# save it
parquet_fname = join("output","save_by_date_partition.parquet")
for df in [df_d1, df_d2, df_d3]:
    df.write.parquet(parquet_fname, mode="overwrite", partitionBy="date")

# read by date range
start_date = 20210224
end_date = 20210301
# support int and daterange, string might be problem

new_df = spark.read.parquet(parquet_fname)\
         .where(C("date").between(start_date, end_date))

new_df.show()


In [None]:
# read it
start_date = 20210224
end_date = 20210304

# df.filter(df.year >= myYear)
new_df = spark.read.parquet(parquet_fname)
date_range_cond = (new_df.date >= start_date) & (new_df.date <= end_date)
new_df.filter(date_range_cond).show()

# Other (3+)

In [None]:
# 1
# rank the food category popularity by store_name but crossed and rotated


# create a mix ranking number
# create a popularity_rank_score in each store_name
    # Top popular in each store_name -> score 0
    # Second popular in each store_name -> score 0.1
# Add row number in each store_name, food_category -> cat_rank
# Create category_popularity_mix_rank_score = cat_rank + popularity_rank_score
# Sort the category_popularity_mix_rank_score by store_name

data = [
    ("hotpop","Meat",3,),
    ("hotpop","Meat",3),
    ("hotpop","Meat",3),
    ("hotpop","Vegetable",2),
    ("hotpop","Vegetable",2),
    ("branch","Fried food",1),
    ("branch","Dessert",1),
  ]

columns = ["store_name","food_category","food_category_popularity"]
df = spark.createDataFrame(data = data, schema = columns)

print('before')
df.show(n=10)

################# sol #######################
store_cat_pop_rank_score = F.when(C("store_cat_pop_rank") == 1, 0)\
                            .when(C("store_cat_pop_rank") == 2, 0.1)


window_sotre_cat_pop = W.partitionBy('store_name').orderBy(C("food_category_popularity").desc())
window_sotre_cat = W.partitionBy(['store_name','food_category']).orderBy(C("food_category"))
window_sotr_cat_mix_rank = W.partitionBy(['store_name']).orderBy(C("mix_cat_pop_rank_score"))
df = (
    df.withColumn("food_cat_pop_score", 100 * C("food_category_popularity") + 20 * F.randn(seed=42))\
      .withColumn("cat_idx", F.row_number().over(window_sotre_cat))\
      .withColumn("store_cat_pop_rank", F.dense_rank().over(window_sotre_cat_pop))
      .withColumn("store_cat_pop_rank_score", store_cat_pop_rank_score)\
      .withColumn("mix_cat_pop_rank_score", C("cat_idx") + C("store_cat_pop_rank_score"))\
      .withColumn("mix_cat_pop_rank", F.row_number().over(window_sotr_cat_mix_rank))\
)
df.toPandas()

In [None]:
# 2
# create a food category popularity score
# rank the food category popularity score but crossed and rotated



# create a mix ranking number
# create a popularity_rank_score in each store_name
    # Top popular in each store_name -> score 0
    # Second popular in each store_name -> score 0.1
# Add row number in each store_name, food_category -> cat_rank
# Create category_popularity_mix_rank_score = cat_rank + popularity_rank_score
# Sort the category_popularity_mix_rank_score by store_name

data = [
    ("hotpop","Meat",3,),
    ("hotpop","Meat",3),
    ("hotpop","Meat",3),
    ("hotpop","Vegetable",2),
    ("hotpop","Vegetable",2),
    ("branch","Fried food",1),
    ("branch","Dessert",1),
  ]

columns = ["store_name","food_category","food_category_popularity"]
df = spark.createDataFrame(data = data, schema = columns)

print('before')
df.show(n=10)

################# sol 1 #######################
store_cat_pop_rank_score = F.when(C("store_cat_pop_rank") == 1, 0)\
                            .when(C("store_cat_pop_rank") == 2, 0.1)


window_sotre_cat_pop = W.partitionBy('store_name').orderBy(C("food_category_popularity").desc())
window_sotre_cat = W.partitionBy(['store_name','food_category']).orderBy(C("food_cat_pop_score").desc())
window_sotr_cat_mix_rank = W.partitionBy(['store_name']).orderBy(C("mix_cat_pop_rank_score"))
df = (
    df.withColumn("food_cat_pop_score",
                  F.round(100 * C("food_category_popularity") + 20 * F.randn(seed=42))
                 )\
      .withColumn("cat_idx", F.row_number().over(window_sotre_cat))\
      .withColumn("store_cat_pop_rank", F.dense_rank().over(window_sotre_cat_pop))
      .withColumn("store_cat_pop_rank_score", store_cat_pop_rank_score)\
      .withColumn("mix_cat_pop_rank_score", C("cat_idx") + C("store_cat_pop_rank_score"))\
      .withColumn("mix_cat_pop_rank", F.row_number().over(window_sotr_cat_mix_rank))\
)

print('sol - 1')
display(df.toPandas())

############### sol 2 ########################

# Use another way to sort it
# sort by cat_idx and store_cat_pop_rank



category_id_in_poi = W.partitionBy(['store_name','food_category']).orderBy(C("food_category"))
category_popularity_rank_in_poi = W.partitionBy('store_name').orderBy(C("food_category_popularity").desc())
window_sotr_cat_mix_rank = W.partitionBy(['store_name']).orderBy(
    C("category_id_in_poi"),
    C("category_popularity_rank_in_poi")
)
df_sol_2 = (
    df.withColumn("food_cat_pop_score", 100 * C("food_category_popularity") + 20 * F.randn(seed=42))\
      .withColumn("category_id_in_poi", F.row_number().over(category_id_in_poi))\
      .withColumn("category_popularity_rank_in_poi", F.dense_rank().over(category_popularity_rank_in_poi))
      .withColumn("staggered_rank", F.row_number().over(window_sotr_cat_mix_rank))\
)

print('so1 - 2')

display(df_sol_2.toPandas())





In [None]:
# 3 Knowing the functions of dataframe operation 

dir(F)

In [None]:
# 4 explode_outer
# https://spark.apache.org/docs/2.3.0/api/python/pyspark.sql.html#pyspark.sql.GroupedData.apply

# return a new row for each element in the given array or map
# Unlike explode, if the array/map is null or empty
# the null is produced

df = spark.createDataFrame(
    [
        (1, ["foo","bar"], {"x" : 1.0}),
        (2, [], {}),
        (3, None, None)
    ],
    ("id", "an_array","a_map")
)

df.show()

df.select("id", F.explode_outer(C("a_map"))).show()

df.select("id", F.explode(C("a_map"))).show() # null thing will be nothing

df.select("id", F.explode_outer(C("an_array"))).show()

df.select("id", F.explode(C("an_array"))).show() # null thing will be nothing


In [None]:
# 4 iterate your dataframe row by row
# you will not use it in production
# but it is useful when debugging and develop your algorithm

df = spark.createDataFrame(
    [
        (1, ["foo","bar"], {"x" : 1.0}),
        (2, [], {}),
        (3, None, None)
    ],
    ("id", "an_array","a_map")
)


print('default you will get row object')
print('you can convert row into dataframe')
print('sometimes you might inidcate schema')
print('\n\n\n')
    
for row_idx, row in enumerate(df.rdd.toLocalIterator()):
    print(row_idx, type(row))
    slice_sdf = spark.createDataFrame([row],schema=df.schema)
    slice_sdf.show()

In [None]:
# cache and persistant

# cache and persist, spark provides an optimization mechanism to astore thre indermediate compurtation 
# of a Spark DataFrame so they can be resued in subsequent actions

# persist -> each node stores it's partitioned data in memory and reuse them in other actions on the datasaet

# Cost efficient - Spark computations are very expensive hence reusing the computations are used to save cost

# Time efficient - Reusing the repeated computations save lots of time.


# SparkDataFrame.cache() storage level `MEMORY_AND_DISK`
# RDD.cache() storage level `MEMORY_ONLY`


data = [
    ("hotpop","Meat",3,),
    ("hotpop","Meat",3),
    ("hotpop","Meat",3),
    ("hotpop","Vegetable",2),
    ("hotpop","Vegetable",2),
    ("branch","Fried food",1),
    ("branch","Dessert",1),
  ]

columns = ["store_name","food_category","food_category_popularity"]

# first df
df = spark.createDataFrame(data = data, schema = columns)
print('df1', df.explain())
# second stage

df2 = df.where(C("store_name") == "branch").cache()

print('df2', df2.explain())

# third stage
df3 = df2.where(
    C("food_category") == "Dessert"
)

print('df3', df3.explain())

df3.show()