In [1]:
# env : pixlake
# we focuing on pyspark dataframe processing
# documentation https://spark.apache.org/docs/2.4.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame
%load_ext autoreload
%autoreload 2

In [2]:
# make you auto compeletion faster
# https://stackoverflow.com/questions/40536560/ipython-and-jupyter-autocomplete-not-working
%config Completer.use_jedi = False

In [3]:
import os
import sys
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['SPARK_HOME'] = '/opt/spark/versions/spark-2.3'

In [4]:
from pyspark.sql import SparkSession as Session
from pyspark import SparkConf as Conf
from pyspark.sql import functions as F, Window as W, types as T
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
C = F.col

In [5]:
conf = (Conf()
    .set('spark.sql.sources.partitionOverwriteMode', 'dynamic')
    .set('spark.driver.memory', '8g')
    .set('spark.driver.maxResultSize', '2g')
   )

In [6]:
spark = (Session
     .builder
     .appName('pyspark-challenge')
     .master('local[2]')
     .config(conf=conf)
     .getOrCreate())

# Creating DataFrame

In [7]:
# 0. know what spark session can do and its version
print(dir(spark), f'your spark version : {spark.version}'
      , sep='\n\n')

['Builder', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__enter__', '__eq__', '__exit__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_convert_from_pandas', '_createFromLocal', '_createFromRDD', '_create_from_pandas_with_arrow', '_get_numpy_record_dtype', '_inferSchema', '_inferSchemaFromList', '_instantiatedSession', '_jsc', '_jsparkSession', '_jvm', '_jwrapped', '_repr_html_', '_sc', '_wrapped', 'builder', 'catalog', 'conf', 'createDataFrame', 'newSession', 'range', 'read', 'readStream', 'sparkContext', 'sql', 'stop', 'streams', 'table', 'udf', 'version']

your spark version : 2.3.4


In [8]:
# 1. read data from csv
print(os.listdir('../data'))
df_from_csv_1 = spark.read.csv('../data/zipcodes.csv',
                               header=True,
                              inferSchema=True)
df_from_csv_1.printSchema()
df_from_csv_1.limit(5).toPandas()

['zipcode1.json', 'titanic_train.csv', 'small_zipcode.csv', 'Meteorite_Landings.csv', 'zipcodes.csv', 'zipcodes.json', 'webpage_1.txt', 'multiline-zipcode.json', 'simple_text.txt', 'zipcode2.json', 'titanic_test.csv']
root
 |-- RecordNumber: integer (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- ZipCodeType: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- LocationType: string (nullable = true)
 |-- Lat: double (nullable = true)
 |-- Long: double (nullable = true)
 |-- Xaxis: double (nullable = true)
 |-- Yaxis: double (nullable = true)
 |-- Zaxis: double (nullable = true)
 |-- WorldRegion: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- LocationText: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Decommisioned: boolean (nullable = true)
 |-- TaxReturnsFiled: integer (nullable = true)
 |-- EstimatedPopulation: integer (nullable = true)
 |-- TotalWages: integer (nullable = t

Unnamed: 0,RecordNumber,Zipcode,ZipCodeType,City,State,LocationType,Lat,Long,Xaxis,Yaxis,Zaxis,WorldRegion,Country,LocationText,Location,Decommisioned,TaxReturnsFiled,EstimatedPopulation,TotalWages,Notes
0,1,704,STANDARD,PARC PARQUE,PR,NOT ACCEPTABLE,17.96,-66.22,0.38,-0.87,0.3,,US,"Parc Parque, PR",NA-US-PR-PARC PARQUE,False,,,,
1,2,704,STANDARD,PASEO COSTA DEL SUR,PR,NOT ACCEPTABLE,17.96,-66.22,0.38,-0.87,0.3,,US,"Paseo Costa Del Sur, PR",NA-US-PR-PASEO COSTA DEL SUR,False,,,,
2,10,709,STANDARD,BDA SAN LUIS,PR,NOT ACCEPTABLE,18.14,-66.26,0.38,-0.86,0.31,,US,"Bda San Luis, PR",NA-US-PR-BDA SAN LUIS,False,,,,
3,61391,76166,UNIQUE,CINGULAR WIRELESS,TX,NOT ACCEPTABLE,32.72,-97.31,-0.1,-0.83,0.54,,US,"Cingular Wireless, TX",NA-US-TX-CINGULAR WIRELESS,False,,,,
4,61392,76177,STANDARD,FORT WORTH,TX,PRIMARY,32.75,-97.33,-0.1,-0.83,0.54,,US,"Fort Worth, TX",NA-US-TX-FORT WORTH,False,2126.0,4053.0,122396986.0,


In [9]:
# 2 read data from json
print(os.listdir('../data'))
print(dir(spark.read))
# 沒有infer_schema
df_from_json = spark.read.json('../data/zipcodes.json')
df_from_json.printSchema()
df_from_json.limit(5).toPandas()

['zipcode1.json', 'titanic_train.csv', 'small_zipcode.csv', 'Meteorite_Landings.csv', 'zipcodes.csv', 'zipcodes.json', 'webpage_1.txt', 'multiline-zipcode.json', 'simple_text.txt', 'zipcode2.json', 'titanic_test.csv']
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_df', '_jreader', '_set_opts', '_spark', 'csv', 'format', 'jdbc', 'json', 'load', 'option', 'options', 'orc', 'parquet', 'schema', 'table', 'text']
root
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Decommisioned: boolean (nullable = true)
 |-- EstimatedPopulation: long (nullable = true)
 |-- Lat: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- LocationText: string (nullable = 

Unnamed: 0,City,Country,Decommisioned,EstimatedPopulation,Lat,Location,LocationText,LocationType,Long,Notes,RecordNumber,State,TaxReturnsFiled,TotalWages,WorldRegion,Xaxis,Yaxis,Zaxis,ZipCodeType,Zipcode
0,PARC PARQUE,US,False,,17.96,NA-US-PR-PARC PARQUE,"Parc Parque, PR",NOT ACCEPTABLE,-66.22,,1,PR,,,,0.38,-0.87,0.3,STANDARD,704
1,PASEO COSTA DEL SUR,US,False,,17.96,NA-US-PR-PASEO COSTA DEL SUR,"Paseo Costa Del Sur, PR",NOT ACCEPTABLE,-66.22,,2,PR,,,,0.38,-0.87,0.3,STANDARD,704
2,BDA SAN LUIS,US,False,,18.14,NA-US-PR-BDA SAN LUIS,"Bda San Luis, PR",NOT ACCEPTABLE,-66.26,,10,PR,,,,0.38,-0.86,0.31,STANDARD,709
3,CINGULAR WIRELESS,US,False,,32.72,NA-US-TX-CINGULAR WIRELESS,"Cingular Wireless, TX",NOT ACCEPTABLE,-97.31,,61391,TX,,,,-0.1,-0.83,0.54,UNIQUE,76166
4,FORT WORTH,US,False,4053.0,32.75,NA-US-TX-FORT WORTH,"Fort Worth, TX",PRIMARY,-97.33,,61392,TX,2126.0,122396986.0,,-0.1,-0.83,0.54,STANDARD,76177


In [10]:
# 3 create dataframe from rdd list
columns = ["language","user_counts"]
data = [
    ("Java","20000"),
    ("Python","100000"),
    ("Scala","3000")
       ]
# 先分散到rdd
rdd = spark.sparkContext.parallelize(data)
print(dir(rdd), type(rdd), sep='\n\n')
print()
df_from_rdd = rdd.toDF(schema=columns)
df_from_rdd.show(n=5)

# 直接create，讓spark dataframe進行分散
df = spark.createDataFrame(data=data,schema=columns)
df.show(n=5)



['__add__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getnewargs__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_computeFractionForSampleSize', '_defaultReducePartitions', '_id', '_jrdd', '_jrdd_deserializer', '_memory_limit', '_pickled', '_reserialize', '_to_java_object_rdd', 'aggregate', 'aggregateByKey', 'cache', 'cartesian', 'checkpoint', 'coalesce', 'cogroup', 'collect', 'collectAsMap', 'combineByKey', 'context', 'count', 'countApprox', 'countApproxDistinct', 'countByKey', 'countByValue', 'ctx', 'distinct', 'filter', 'first', 'flatMap', 'flatMapValues', 'fold', 'foldByKey', 'foreach', 'foreachPartition', 'fullOuterJoin', 'getCheckpointFile', 'getNumPartitions', 'getStorageLevel', 'glom', 'groupBy', 'groupByKey', 'groupWith

In [11]:
# 4 create 5 row fake data using spark range
print(dir(spark))
print(type(spark.range(start=0,end=10)))
columns = ['row_number']
single_column_df = spark.range(start=0,end=10)
single_column_df.show(n=5)

['Builder', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__enter__', '__eq__', '__exit__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_conf', '_convert_from_pandas', '_createFromLocal', '_createFromRDD', '_create_from_pandas_with_arrow', '_get_numpy_record_dtype', '_inferSchema', '_inferSchemaFromList', '_instantiatedSession', '_jsc', '_jsparkSession', '_jvm', '_jwrapped', '_repr_html_', '_sc', '_wrapped', 'builder', 'catalog', 'conf', 'createDataFrame', 'newSession', 'range', 'read', 'readStream', 'sparkContext', 'sql', 'stop', 'streams', 'table', 'udf', 'version']
<class 'pyspark.sql.dataframe.DataFrame'>
+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+
only showing top 5 rows



In [12]:
# 6 create empty dataframe

columns = ["language","user_counts"]

# empty RDD + schema won't work
# df_1 = spark.createDataFrame(data=spark.sparkContext.emptyRDD(),
#                              schema=columns)
# df_1.show(n=5)

# empty list -> rdd -> df won't work
# df2 = spark.sparkContext.parallelize([]).toDF(columns)

# df3 = spark.createDataFrame([]) # won't work this version

In [13]:
# 7 get dataframe shape
print(df.count(), len(df.columns))

3 2


In [14]:
# 8 know what methods are supported by sql.function
print(dir(F))



In [15]:
# 9 add const column to a existing dataframe

columns = ["language","user_counts"]
data = [
    ("Java","20000"),
    ("Python","100000"),
    ("Scala","3000")
       ]
df = spark.createDataFrame(data=data,schema=columns)
df = (
    df.withColumn("new_column",F.lit("ABC")) 
    # F.lit means literal, retrurn a column
)
df.show(n=5)

+--------+-----------+----------+
|language|user_counts|new_column|
+--------+-----------+----------+
|    Java|      20000|       ABC|
|  Python|     100000|       ABC|
|   Scala|       3000|       ABC|
+--------+-----------+----------+



In [16]:
# 10 add a row_id column from a exisiting dataframe
# https://stackoverflow.com/questions/53082891/adding-a-unique-consecutive-row-number-to-dataframe-in-pyspark
columns = ["language","user_counts"]
data = [
    ("Java","20000"),
    ("Python","100000"),
    ("Scala","3000")
       ]
df_1 = spark.createDataFrame(data=data,schema=columns)
df_1 = (
    df_1.withColumn("index", 
                  F.row_number().over(
                      W.orderBy(F.monotonically_increasing_id() - 1)
                  )
                 )
    # F.monotonically_increasing_id does not give 1 ~ N
    # So we use window function to work around
)
df_1.show(n=5)

# Mre clear way to do that
df_2 = spark.createDataFrame(data=data,schema=columns)
w = W.orderBy(F.lit('A'))
df_2 = (
    df_2.withColumn("row_num", F.row_number().over(w))
)
df_2.show(n=5)

+--------+-----------+-----+
|language|user_counts|index|
+--------+-----------+-----+
|    Java|      20000|    1|
|  Python|     100000|    2|
|   Scala|       3000|    3|
+--------+-----------+-----+

+--------+-----------+-------+
|language|user_counts|row_num|
+--------+-----------+-------+
|    Java|      20000|      1|
|  Python|     100000|      2|
|   Scala|       3000|      3|
+--------+-----------+-------+



In [17]:
# 10 add a random number to a exisit column
df = (
#     df.withColumn('random_number', F.when(F.rand() > 0.5, 1).otherwise(0))
        df.withColumn('random_number', F.rand())
)

df.show(n=5)

+--------+-----------+----------+------------------+
|language|user_counts|new_column|     random_number|
+--------+-----------+----------+------------------+
|    Java|      20000|       ABC|0.4006027800520249|
|  Python|     100000|       ABC|0.5876548641605677|
|   Scala|       3000|       ABC| 0.843797467557134|
+--------+-----------+----------+------------------+



In [18]:
# 11 add a binary 0, 1 based on condition to an exisit column
df = (
    df.withColumn('binary_cut_05',F.when(F.rand() > 0.5, 1).otherwise(0))
)
df.limit(5).toPandas()

Unnamed: 0,language,user_counts,new_column,random_number,binary_cut_05
0,Java,20000,ABC,0.400603,0
1,Python,100000,ABC,0.587655,1
2,Scala,3000,ABC,0.843797,0


In [19]:
# 12 create a dataframe contains row_index and fake data

columns = ["row_id","language","user_counts"]
data = [
    (0, "Java","20000"),
    (1, "Python","100000"),
    (2, "Scala","3000")
       ]

df = spark.createDataFrame(data=data, schema=columns)
df.show(n=5)


+------+--------+-----------+
|row_id|language|user_counts|
+------+--------+-----------+
|     0|    Java|      20000|
|     1|  Python|     100000|
|     2|   Scala|       3000|
+------+--------+-----------+



In [20]:
# 13 construct a complex data for spark dataframe
# using StructType


# Case 1
data = [
    ("James","","Smith","36636","M", 3000),
    ("Michael","Rose","","40288","M", 4000),
    ("Robert","","Williams","42114","M", 4000),
    ("Maria","Anne","Jones","39192","F", 4000),
    ("Jen","Mary","Brown","","F", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])
df = spark.createDataFrame(data=data,schema=schema)
df.show(n=5)

# Case 2

struct_data = [
    (("James","","Smith"),"36636","M", 3000),
    (("Michael","Rose",""),"40288","M", 4000),
    (("Robert","","Williams"),"42114","M", 4000),
    (("Maria","Anne","Jones"),"39192","F", 4000),
    (("Jen","Mary","Brown"),"","F", -1)
]

structure_schema = StructType([
    StructField('name',
        StructType([
            StructField("firstname",StringType(), True), # Nullable True
            StructField("middlename",StringType(), True),
            StructField("lastname",StringType(), True),
    ])),
    StructField("id",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])

df = spark.createDataFrame(data=struct_data, schema=structure_schema)
df.show(n=5)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|salary|
+---------+----------+--------+-----+------+------+
|    James|          |   Smith|36636|     M|  3000|
|  Michael|      Rose|        |40288|     M|  4000|
|   Robert|          |Williams|42114|     M|  4000|
|    Maria|      Anne|   Jones|39192|     F|  4000|
|      Jen|      Mary|   Brown|     |     F|    -1|
+---------+----------+--------+-----+------+------+

+--------------------+-----+------+------+
|                name|   id|gender|salary|
+--------------------+-----+------+------+
|    [James, , Smith]|36636|     M|  3000|
|   [Michael, Rose, ]|40288|     M|  4000|
|[Robert, , Williams]|42114|     M|  4000|
|[Maria, Anne, Jones]|39192|     F|  4000|
|  [Jen, Mary, Brown]|     |     F|    -1|
+--------------------+-----+------+------+



In [21]:
# 14 construct a complex data for spark dataframe
# using ArratyType

In [22]:
# 15 construct a complex data for spark dataframe
# using MapType

In [23]:
# 16 create a datetime column for spark dataframe

# Column Operations

In [24]:
# 1 create new column based on original column

columns = ["language", "user_counts"]
data = [
    ("Java",20000),
    ("Python",100000),
    ("Scala",3000)
]

df = spark.createDataFrame(data=data, schema=columns)
df = (
    df.withColumn("user_count_100", C("user_counts") * 100)\
    .withColumn("user_count_log", F.log10(C("user_counts")))
)

df.show(n=5)

+--------+-----------+--------------+------------------+
|language|user_counts|user_count_100|    user_count_log|
+--------+-----------+--------------+------------------+
|    Java|      20000|       2000000| 4.301029995663981|
|  Python|     100000|      10000000|               5.0|
|   Scala|       3000|        300000|3.4771212547196626|
+--------+-----------+--------------+------------------+



In [25]:
# 2 rename, drop, add constant column to existing dataframe
# https://stackoverflow.com/questions/34077353/how-to-change-dataframe-column-names-in-pyspark
columns = ["language", "user_counts"]
data = [
    ("Java",20000),
    ("Python",100000),
    ("Scala",3000)
]

df = spark.createDataFrame(data, columns)
df = (
    df.withColumn('const_col',F.lit('ABC'))\
    .withColumnRenamed("language","lang")\
    .drop("user_counts")
)
df.show(n=5)

+------+---------+
|  lang|const_col|
+------+---------+
|  Java|      ABC|
|Python|      ABC|
| Scala|      ABC|
+------+---------+



In [26]:
# know what method and attribute can be called with column object


print(type(C("language")), dir(C("language")), sep='\n\n')

# alias - 可以換名字
# asc, desc - 可以排序
# astype,cast - 可以轉型
# between - 可以傳入start_date以及end_date過濾
# bitwiseAND, bitwiseOR, bitwiseXOR - 可以做布林運算
# contains - 可以做字串搜尋
# endwith, startwith, rlike, substring - 可以做字串比對
# eqNullSafe, isNotNull, isNull - 可以檢查null值，Python須以None傳入
# isin, like - 可以做值的比對(數值，字串值)
# name - 可以取得欄位名稱
# when, otherwise - 可以做條件判斷


<class 'pyspark.sql.column.Column'>

['__add__', '__and__', '__bool__', '__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__div__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__invert__', '__iter__', '__le__', '__lt__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__or__', '__pow__', '__radd__', '__rand__', '__rdiv__', '__reduce__', '__reduce_ex__', '__repr__', '__rmod__', '__rmul__', '__ror__', '__rpow__', '__rsub__', '__rtruediv__', '__setattr__', '__sizeof__', '__str__', '__sub__', '__subclasshook__', '__truediv__', '__weakref__', '_asc_doc', '_bitwiseAND_doc', '_bitwiseOR_doc', '_bitwiseXOR_doc', '_contains_doc', '_desc_doc', '_endswith_doc', '_eqNullSafe_doc', '_isNotNull_doc', '_isNull_doc', '_jc', '_like_doc', '_rlike_doc', '_startswith_doc', 'alias', 'asc', 'astype', 'between', 'bitwiseAND', 'bitwiseOR', 'bitwiseXOR

In [27]:
# 3 create a new dynamic column(if else condition based on old column)
# Case 1
columns = ["language", "user_counts"]
data = [
    ("Java",20000),
    ("Python",100000),
    ("Scala",3000)
]

df = spark.createDataFrame(data, columns)

df = (
    df.withColumn("is_many_users",
                  F.when(C('user_counts') > 50000, 1).otherwise(0)
                 )
)

df.show(n=5)

+--------+-----------+-------------+
|language|user_counts|is_many_users|
+--------+-----------+-------------+
|    Java|      20000|            0|
|  Python|     100000|            1|
|   Scala|       3000|            0|
+--------+-----------+-------------+



In [28]:
# create a new dynamic column(if else condition based on old column) plus empty string replacement

data = [
    ("James","","Smith","36636","M", 3000),
    ("Michael","Rose","","40288","M", 4000),
    ("Robert","","Williams","42114","M", 4000),
    ("Maria","Anne","Jones","39192","F", 4000),
    ("Jen","Mary","Brown","","F", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])
df = spark.createDataFrame(data=data,schema=schema)

# we cannot compare column values with empty string
# so the work-around method is replace empty string to null
# then using isNotNull()
is_full_name_exist = (C("firstname").isNotNull() & C("middlename").isNotNull() & C("lastname").isNotNull())


def blank_as_null(x):
    """
    helper function for converting row value from empty string to null
    https://stackoverflow.com/questions/33287886/replace-empty-strings-with-none-null-values-in-dataframe
    """
    return F.when(C(x) != "", C(x)).otherwise(None)

print("Before")
df.show(n=5)
df = (
    df.withColumn("firstname", blank_as_null("firstname"))\
    .withColumn("middlename", blank_as_null("middlename"))\
    .withColumn("lastname", blank_as_null("lastname"))\
    .withColumn("full_name",
                  F.when(
                      is_full_name_exist,
                      F.concat(C("firstname"), F.lit(' '),
                               C("middlename"), F.lit(' '),
                               C("lastname"))
                  ).otherwise(F.lit('N/A'))
                 )
)
print("After")
df.show(n=5)

Before
+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|salary|
+---------+----------+--------+-----+------+------+
|    James|          |   Smith|36636|     M|  3000|
|  Michael|      Rose|        |40288|     M|  4000|
|   Robert|          |Williams|42114|     M|  4000|
|    Maria|      Anne|   Jones|39192|     F|  4000|
|      Jen|      Mary|   Brown|     |     F|    -1|
+---------+----------+--------+-----+------+------+

After
+---------+----------+--------+-----+------+------+----------------+
|firstname|middlename|lastname|   id|gender|salary|       full_name|
+---------+----------+--------+-----+------+------+----------------+
|    James|      null|   Smith|36636|     M|  3000|             N/A|
|  Michael|      Rose|    null|40288|     M|  4000|             N/A|
|   Robert|      null|Williams|42114|     M|  4000|             N/A|
|    Maria|      Anne|   Jones|39192|     F|  4000|Maria Anne Jones|
|      Jen|      Mary|   Brown|    

In [29]:
# 4 select columns which also exist on another dataframe

columns_1 = ["language", "user_counts"]
data_1 = [
    ("Java",20000),
    ("Python",100000),
    ("Scala",3000)
]

columns_2 = ["language", "user_counts","note"]
data_2 = [
    ("Java",20000,"nothing"),
    ("Python",100000,"nothing"),
    ("Scala",3000,"nothing")
]

df_1 = spark.createDataFrame(data_1, columns_1)
df_2 = spark.createDataFrame(data_2, columns_2)

# union columns
same_cols = [F.col(c) for c in df_2.columns if c in df_1.columns]
print(same_cols, type(same_cols), type(same_cols[0]))

df_same_col = df_1.select(*same_cols)
df_same_col.show(n=5)

[Column<b'language'>, Column<b'user_counts'>] <class 'list'> <class 'pyspark.sql.column.Column'>
+--------+-----------+
|language|user_counts|
+--------+-----------+
|    Java|      20000|
|  Python|     100000|
|   Scala|       3000|
+--------+-----------+



# String Operation

In [30]:
# 5 string concat two column values to a new column from an existing dataframe

In [31]:
# 6 cut of left 3 char of specific column to a new column from an existing dataframe

In [32]:
# 7 convert string type column to int/float type column

In [33]:
# 8 convert string type column to datetime type column

# Common Patterns

## Filtering

In [34]:
# 1. filter on equal condition

data = [
    ("James","","Smith","36636","M", 3000),
    ("Michael","Rose","","40288","M", 4000),
    ("Robert","","Williams","42114","M", 4000),
    ("Maria","Anne","Jones","39192","F", 4000),
    ("Jen","Mary","Brown","","F", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])
df = spark.createDataFrame(data=data,schema=schema)

df = (
    df.filter(C("gender") == "M")
)

df.show(n=5)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|salary|
+---------+----------+--------+-----+------+------+
|    James|          |   Smith|36636|     M|  3000|
|  Michael|      Rose|        |40288|     M|  4000|
|   Robert|          |Williams|42114|     M|  4000|
+---------+----------+--------+-----+------+------+



In [35]:
# 2 filter on >, <, >=, <=

data = [
    ("James","","Smith","36636","M", 3000),
    ("Michael","Rose","","40288","M", 4000),
    ("Robert","","Williams","42114","M", 4000),
    ("Maria","Anne","Jones","39192","F", 6000),
    ("Jen","Mary","Brown","","F", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])
df = spark.createDataFrame(data=data,schema=schema)

df = (
    df.filter(C("salary") >= 4000)
)

df.show(n=5)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|salary|
+---------+----------+--------+-----+------+------+
|  Michael|      Rose|        |40288|     M|  4000|
|   Robert|          |Williams|42114|     M|  4000|
|    Maria|      Anne|   Jones|39192|     F|  6000|
+---------+----------+--------+-----+------+------+



In [36]:
# 3 multiple conditions require parenthese around each condition


data = [
    ("James","","Smith","36636","M", 3000),
    ("Michael","Rose","","40288","M", 4000),
    ("Robert","","Williams","42114","M", 4000),
    ("Maria","Anne","Jones","39192","F", 6000),
    ("Jen","Mary","Brown","","F", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])
df = spark.createDataFrame(data=data,schema=schema)

# This is lazy computing
rich_man_who_worth_married = (
    (C("gender") == "M") &
    (C("salary") >= 4000)
)

df = (
    df.filter(rich_man_who_worth_married)
)

df.show(n=5)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|salary|
+---------+----------+--------+-----+------+------+
|  Michael|      Rose|        |40288|     M|  4000|
|   Robert|          |Williams|42114|     M|  4000|
+---------+----------+--------+-----+------+------+



In [37]:
# 4 Compare against a list of allowed values


data = [
    ("James","","Smith","36636","M", 3000),
    ("Michael","Rose","","40288","M", 4000),
    ("Robert","","Williams","42114","M", 4000),
    ("Maria","Anne","Jones","39192","F", 6000),
    ("Jen","Mary","Brown","","F", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])
df = spark.createDataFrame(data=data,schema=schema)

df = (
    df.filter(C("gender").isin(["F"]))
)

df.show(n=5)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|salary|
+---------+----------+--------+-----+------+------+
|    Maria|      Anne|   Jones|39192|     F|  6000|
|      Jen|      Mary|   Brown|     |     F|    -1|
+---------+----------+--------+-----+------+------+



In [38]:
# 5 Sort result

data = [
    ("James","","Smith","36636","M", 3000),
    ("Michael","Rose","","40288","M", 4000),
    ("Robert","","Williams","42114","M", 4000),
    ("Maria","Anne","Jones","39192","F", 6000),
    ("Jen","Mary","Brown","","F", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])
df = spark.createDataFrame(data=data,schema=schema)


df = df.orderBy(C("salary").desc())

df.show(n=5)

df = df.orderBy(C("salary").asc())

df.show(n=5)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|salary|
+---------+----------+--------+-----+------+------+
|    Maria|      Anne|   Jones|39192|     F|  6000|
|  Michael|      Rose|        |40288|     M|  4000|
|   Robert|          |Williams|42114|     M|  4000|
|    James|          |   Smith|36636|     M|  3000|
|      Jen|      Mary|   Brown|     |     F|    -1|
+---------+----------+--------+-----+------+------+

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|salary|
+---------+----------+--------+-----+------+------+
|      Jen|      Mary|   Brown|     |     F|    -1|
|    James|          |   Smith|36636|     M|  3000|
|  Michael|      Rose|        |40288|     M|  4000|
|   Robert|          |Williams|42114|     M|  4000|
|    Maria|      Anne|   Jones|39192|     F|  6000|
+---------+----------+--------+-----+------+------+



In [61]:
# 6 select distinct rows based on certain column but keep first row
# In this case, model prediction to filter the same images

############## DROP DUPLICATED doesn't work in this case
# https://stackoverflow.com/questions/38687212/spark-dataframe-drop-duplicates-and-keep-first
data = [
    (14431,0.99834,'https://pic.pimg.tw/happy78/1528543947-685380499_n.jpg'),
    (14431,0.99834,'https://pic.pimg.tw/happy78/1528543947-362759723_n.jpg'),
    (14431,0.97611,'https://pic.pimg.tw/happy78/1528543962-2265924582_n.jpg'),
    (67789,0.93422,'https://pic.pimg.tw/happy78/1528543962-4007835890_n.jpg'),
    (67789,0.94231,'https://pic.pimg.tw/happy78/1528543962-45890_n.jpg')
]

columns = ['article_id','pred','img_url']

df = spark.createDataFrame(data=data, schema=columns)
print('before')

df.orderBy('pred').show(n=5)

print('Sol 1 FAILED SOMETIMES WHEN PARTITION != 1')
df_1 = (
    df.drop_duplicates(subset=["pred"])
)

df_1.orderBy('pred').show(n=5)


############## Using Window Function and sort, rank, worked!
# You can check the 5 th question of Aggregation, The solution is the same

print('Sol 2 WORKED WITH ANY PARTITION')
df_2 = (
    df.withColumn("rank_by_pred",
                  F.row_number().over(
                  W.partitionBy("pred")\
                      .orderBy(F.desc("pred"))
                  )
                 )\
    .filter(F.col("rank_by_pred") == 1)\
    .drop('rank_by_pred')
)
df_2.orderBy('pred').show(n=5)

before
+----------+-------+--------------------+
|article_id|   pred|             img_url|
+----------+-------+--------------------+
|     67789|0.93422|https://pic.pimg....|
|     67789|0.94231|https://pic.pimg....|
|     14431|0.97611|https://pic.pimg....|
|     14431|0.99834|https://pic.pimg....|
|     14431|0.99834|https://pic.pimg....|
+----------+-------+--------------------+

Sol 1 FAILED SOMETIMES WHEN PARTITION != 1
+----------+-------+--------------------+
|article_id|   pred|             img_url|
+----------+-------+--------------------+
|     67789|0.93422|https://pic.pimg....|
|     67789|0.94231|https://pic.pimg....|
|     14431|0.97611|https://pic.pimg....|
|     14431|0.99834|https://pic.pimg....|
+----------+-------+--------------------+

Sol 2 WORKED WITH ANY PARTITION
+----------+-------+--------------------+
|article_id|   pred|             img_url|
+----------+-------+--------------------+
|     67789|0.93422|https://pic.pimg....|
|     67789|0.94231|https://pic.pi

# Aggregation Operations

In [39]:
# 1 knowing the groupby object method

data = [
    ("James","","Smith","36636","RD","M", 3000),
    ("Michael","Rose","","40288","RD","M", 4000),
    ("Robert","","Williams","42114","SRE","M", 4000),
    ("Maria","Anne","Jones","39192","SRE","F", 4000),
    ("Jen","Mary","Brown","","F","BACKEND", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("apartment",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])
df = spark.createDataFrame(data=data,schema=schema)

df_grp = df.groupBy("salary")

print(type(df_grp), dir(df_grp), sep='\n\n')

# avg, count, max, mean, sum  - Common aggregation
# pivot - two column x, y with value in the table
# sql_ctx - apply sql command
# custom function - agg, apply


<class 'pyspark.sql.group.GroupedData'>

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_df', '_jgd', 'agg', 'apply', 'avg', 'count', 'max', 'mean', 'min', 'pivot', 'sql_ctx', 'sum']


In [40]:
# 2 apply single aggregation fuction on groupby object
data = [
    ("James","","Smith","36636","RD","M", 3000),
    ("Michael","Rose","","40288","RD","M", 4000),
    ("Robert","","Williams","42114","SRE","M", 4000),
    ("Maria","Anne","Jones","39192","SRE","F", 4000),
    ("Jen","Mary","Brown","","F","BACKEND", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("deparment",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])
df = spark.createDataFrame(data=data,schema=schema)

df_grp_department = (
    df.groupby("deparment").mean("salary").alias("mean_salary")
)

df_grp_department.show(n=5)

+---------+-----------+
|deparment|avg(salary)|
+---------+-----------+
|        F|       -1.0|
|       RD|     3500.0|
|      SRE|     4000.0|
+---------+-----------+



In [41]:
# 3 apply multiple aggregation fuction on groupby object
data = [
    ("James","","Smith","36636","RD","M", 3000),
    ("Michael","Rose","","40288","RD","M", 4000),
    ("Robert","","Williams","42114","SRE","M", 4000),
    ("Maria","Anne","Jones","39192","SRE","F", 4000),
    ("Jen","Mary","Brown","","F","BACKEND", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("deparment",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])
df = spark.createDataFrame(data=data,schema=schema)

df_grp_department = df.groupby("deparment").agg(
    F.sum("salary").alias("sum_salary"),
    F.avg("salary").alias("avg_salary"),
    F.max("salary").alias("max_salary"),
    F.min("salary").alias("min_salary"),
    F.count("salary").alias("group_size")
)

df_grp_department.show(n=5)

+---------+----------+----------+----------+----------+----------+
|deparment|sum_salary|avg_salary|max_salary|min_salary|group_size|
+---------+----------+----------+----------+----------+----------+
|        F|        -1|      -1.0|        -1|        -1|         1|
|       RD|      7000|    3500.0|      4000|      3000|         2|
|      SRE|      8000|    4000.0|      4000|      4000|         2|
+---------+----------+----------+----------+----------+----------+



In [42]:
# 4 collect data point for each group with the stats(min, max, sum, avg, count)


# apply multiple aggregation fuction on groupby object
data = [
    ("James","","Smith","36636","RD","M", 3000),
    ("Michael","Rose","","40288","RD","M", 4000),
    ("Robert","","Williams","42114","SRE","M", 4000),
    ("Maria","Anne","Jones","39192","SRE","F", 4000),
    ("Jen","Mary","Brown","","F","BACKEND", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("deparment",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])
df = spark.createDataFrame(data=data,schema=schema)

df_grp_department = df.groupby("deparment").agg(
    F.sum("salary").alias("sum_salary"),
    F.avg("salary").alias("avg_salary"),
    F.max("salary").alias("max_salary"),
    F.min("salary").alias("min_salary"),
    F.count("salary").alias("count_rows"),
    F.collect_list("salary").alias("all_rows")
)

df_grp_department.toPandas()


Unnamed: 0,deparment,sum_salary,avg_salary,max_salary,min_salary,count_rows,all_rows
0,F,-1,-1.0,-1,-1,1,[-1]
1,RD,7000,3500.0,4000,3000,2,"[3000, 4000]"
2,SRE,8000,4000.0,4000,4000,2,"[4000, 4000]"


In [43]:
# 5 get first one row in each group
# We use Window Function here
# Key to think about this, we rank the data in each group, then 
# filtering
# no nothing is groupby
# which is different in pandas

data = [
    ("James","","Smith","36636","RD","M", 3000),
    ("Michael","Rose","","40288","RD","M", 8000),
    ("Robert","","Williams","42114","SRE","M", 4000),
    ("Maria","Anne","Jones","39192","SRE","F", 6000),
    ("Jen","Mary","Brown","","F","BACKEND", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("deparment",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])


df = spark.createDataFrame(data=data,schema=schema)

df = (
    df.withColumn("rank_salary_by_deparment",
                  F.row_number().over(
                  W.partitionBy("deparment")\
                      .orderBy(F.desc("salary"))
                  )
                 )\
    .filter(F.col("rank_salary_by_deparment") == 1)\
    .drop('rank_salary_by_deparment')
)

df.show(n=5)


+---------+----------+--------+-----+---------+-------+------+
|firstname|middlename|lastname|   id|deparment| gender|salary|
+---------+----------+--------+-----+---------+-------+------+
|      Jen|      Mary|   Brown|     |        F|BACKEND|    -1|
|  Michael|      Rose|        |40288|       RD|      M|  8000|
|    Maria|      Anne|   Jones|39192|      SRE|      F|  6000|
+---------+----------+--------+-----+---------+-------+------+



In [44]:
# 6 groupby and filtering


data = [
    ("James","","Smith","36636","RD","M", 3000),
    ("Michael","Rose","","40288","RD","M", 4000),
    ("Robert","","Williams","42114","SRE","M", 4000),
    ("Maria","Anne","Jones","39192","SRE","F", 4000),
    ("Jen","Mary","Brown","","F","BACKEND", -1)
]

schema = StructType([
    StructField("firstname",StringType(), True), # Nullable True
    StructField("middlename",StringType(), True),
    StructField("lastname",StringType(), True),
    StructField("id",StringType(), True),
    StructField("deparment",StringType(), True),
    StructField("gender",StringType(), True),
    StructField("salary", IntegerType(), True)
    ])
df = spark.createDataFrame(data=data,schema=schema)

df_grp_department = (
    df.groupby("deparment").agg(
        F.sum("salary").alias("sum_salary"),
        F.avg("salary").alias("avg_salary"),
        F.max("salary").alias("max_salary"),
        F.min("salary").alias("min_salary"),
        F.count("salary").alias("count_rows"))
    .filter(C("sum_salary") > 0)
    
)

df_grp_department.show(n=5)
df_grp_department.printSchema()

+---------+----------+----------+----------+----------+----------+
|deparment|sum_salary|avg_salary|max_salary|min_salary|count_rows|
+---------+----------+----------+----------+----------+----------+
|       RD|      7000|    3500.0|      4000|      3000|         2|
|      SRE|      8000|    4000.0|      4000|      4000|         2|
+---------+----------+----------+----------+----------+----------+

root
 |-- deparment: string (nullable = true)
 |-- sum_salary: long (nullable = true)
 |-- avg_salary: double (nullable = true)
 |-- max_salary: integer (nullable = true)
 |-- min_salary: integer (nullable = true)
 |-- count_rows: long (nullable = false)



# udf

In [45]:
# 1. Use Pyspark to send request, get image and store as b64string 
# https://stackoverflow.com/questions/49353752/use-requests-module-and-return-response-to-pyspark-dataframe

data = [
    (14431,'https://pic.pimg.tw/happy78/1528543947-685380499_n.jpg'),
    (14431,'https://pic.pimg.tw/happy78/1528543947-362759723_n.jpg'),
    (14431,'https://pic.pimg.tw/happy78/1528543962-2265924582_n.jpg'),
    (67789,'https://pic.pimg.tw/happy78/1528543962-4007835890_n.jpg'),
    (67789,'https://pic.pimg.tw/happy78/1528543962-45890_n.jpg')
]

columns = ['article_id','img_url']

df = spark.createDataFrame(data=data, schema=columns)
print('before')
df.show(n=5)

@F.udf(returnType=StringType())
def get_img_binary(url : str) -> str:
    import requests
    import base64
    resp = requests.get(url)
    if resp.status_code == 200:
        return base64.encodestring(resp.content)
    else:
        return None
df = (
    df.withColumn("img_b64_str", get_img_binary(C("img_url")))
)

df.show(n=5)

before
+----------+--------------------+
|article_id|             img_url|
+----------+--------------------+
|     14431|https://pic.pimg....|
|     14431|https://pic.pimg....|
|     14431|https://pic.pimg....|
|     67789|https://pic.pimg....|
|     67789|https://pic.pimg....|
+----------+--------------------+

+----------+--------------------+-----------+
|article_id|             img_url|img_b64_str|
+----------+--------------------+-----------+
|     14431|https://pic.pimg....|[B@67cbd78a|
|     14431|https://pic.pimg....|[B@28ab9f8c|
|     14431|https://pic.pimg....|[B@6d77a5a1|
|     67789|https://pic.pimg....|[B@305106b4|
|     67789|https://pic.pimg....|       null|
+----------+--------------------+-----------+



In [46]:
from pyspark.sql import Row
Row()

<Row()>

In [48]:
# udf return two column values, e.g. model prediction with label and probability

# import numpy as np
# from typing import List

# class FakeModel:
#     def __init__(self, n_class : int):
#         self.n_class = n_class
#     def predict(self):
#         return np.random.random(size=(self.n_class, ))

# catlog = ["food", "env", "drink", "compose"]

# m = FakeModel(n_class=4)
# m.predict()

data = [
    (1,64),
    (2,76),
    (3,54),
    (4,11),
    (5,100),
]
columns = ['id','features']

df = spark.createDataFrame(data=data, schema=columns)
print("Before : ")
df.show(n=5)

# # Strategy 1 

# # Struct Type, explode the Array
# # https://stackoverflow.com/questions/35322764/apache-spark-assign-the-result-of-udf-to-multiple-dataframe-columns

# model_pred = StructType([
#         StructField("class", StringType(), False),
#         StructField("prob", T.FloatType(), False)
# ])

# @F.udf(returnType=model_pred)
# def get_prediction(x : int) -> model_pred:

#     # setting code
#     import numpy as np
#     class FakeModel:
#         def __init__(self, n_class : int):
#             self.n_class = n_class
#         def predict(self):
#             return np.random.random(size=(self.n_class, ))
#     m = FakeModel(n_class=4)
#     catlog = ["food", "env", "drink", "compose"]
#     # preprocessing code
#     # ...
    
#     pred = m.predict()
#     idx = np.argmax(pred)
# #     return  [catlog[idx], pred[idx]] # There is a showString issue, use Row object
#     return Row('class','prob')(catlog[idx], pred[idx])

    
# df = (
#     df.withColumn("prediction", get_prediction(C("features")))
# )

# df.printSchema()
# df.show(n=5)


# df = spark.createDataFrame([("Alive", 4)], ["Name", "Number"])

model_pred = StructType([
    StructField("category", StringType(), False),
    StructField("prob", T.FloatType(), False)])

@F.udf(returnType=model_pred)
def model_pred(n):
    import random
    category = random.choice(['food','env','compose','drink'])
    prob = random.random()
    return Row('category', 'prob')(category, prob)


newDF = df.withColumn("pred", model_pred(df["features"]))
newDF = newDF.select("id", "features", "pred.*")

newDF.show(truncate=False)


Before : 
+---+--------+
| id|features|
+---+--------+
|  1|      64|
|  2|      76|
|  3|      54|
|  4|      11|
|  5|     100|
+---+--------+

+---+--------+--------+----------+
|id |features|category|prob      |
+---+--------+--------+----------+
|1  |64      |env     |0.94676584|
|2  |76      |food    |0.50463367|
|3  |54      |drink   |0.60564864|
|4  |11      |compose |0.38281012|
|5  |100     |drink   |0.61121887|
+---+--------+--------+----------+



In [None]:
df = spark.createDataFrame([("Alive", 4)], ["Name", "Number"])


def example(n):
    return Row('Out1', 'Out2')(n + 2, n - 2)


schema = StructType([
    StructField("Out1", IntegerType(), False),
    StructField("Out2", IntegerType(), False)])

example_udf = F.UserDefinedFunction(example, schema)

newDF = df.withColumn("Output", example_udf(df["Number"]))
newDF = newDF.select("Name", "Number", "Output.*")

newDF.show(truncate=False)


In [None]:
# 2. Use Pyspark to load a tf.keras model
# serieslize the model and make prediction

In [None]:
# 3. Pandas udf
# documentation and concept

# broadcasting

In [None]:
# broadcast the dictionary to spark 
# (which is a way that enhance multi-processing cross machine using your python code)
# the broadcast variable should be serializable

states = {"NY":"New York", "CA":"California", "FL":"Florida"}
broadcastStates = spark.sparkContext.broadcast(states)

data = [("James","Smith","USA","CA"),
    ("Michael","Rose","USA","NY"),
    ("Robert","Williams","USA","CA"),
    ("Maria","Jones","USA","FL")
  ]

columns = ["firstname","lastname","country","state"]
df = spark.createDataFrame(data = data, schema = columns)
df.show(n=5)

def state_convert(code):
    return broadcastStates.value[code]

# case 1, using rdd
result_rdd = df.rdd.map(lambda x: (x[0],x[1],x[2],state_convert(x[3]))).toDF(columns)
result_rdd.show(n=5)


# case 2, using pdf

@F.udf(returnType=StringType())
def state_convert_udf(code : str) -> str:
    return broadcastStates.value[code]

result_df = (
    df.withColumn("converted_state", state_convert_udf(C("state")))
)

result_df.show(n=5)

In [None]:
# Knowing broacsting object


# broadcast the dictionary to spark 
# (which is a way that enhance multi-processing cross machine using your python code)
# the broadcast variable should be serializable
# https://spark.apache.org/docs/2.3.3/api/python/_modules/pyspark/broadcast.html
states = {"NY":"New York", "CA":"California", "FL":"Florida"}
broadcastStates = spark.sparkContext.broadcast(states)
print(type(broadcastStates), dir(broadcastStates))

# value to access the object
broadcastStates.value, type(broadcastStates.value)