In [None]:
### create (or reuse) the Spark session used by the rest of the notebook
from pyspark.sql import SparkSession

# getOrCreate() hands back the active session when one already exists,
# otherwise it builds a new one with the application name set below.
spark = (
    SparkSession.builder
    .appName("medistream-05")
    .getOrCreate()
)

In [None]:
### project only the "id" column out of `data` (demonstrates Column.alias)
from pyspark.sql.functions import col

# alias() sets the output column name; here it is the same name, "id".
df = data.select(col("id").alias("id"))

In [None]:
### regexp_replace: strip unwanted characters from "description"
from pyspark.sql.functions import regexp_replace

# The pattern is a character class matching a newline, a carriage return,
# an asterisk or a comma; every match is replaced by the empty string.
# The result overwrites the existing "description" column.
df = data.withColumn("description", regexp_replace("description", "[\n\r*,]", ""))

In [None]:
# add a column holding the character length of "description"
from pyspark.sql.functions import length

df = data.withColumn("description_length", length("description"))

In [None]:
# add a column holding the number of elements in the "images" collection column
from pyspark.sql.functions import size

df = data.withColumn("images_count", size("images"))

In [None]:
# string + list => list: collect every homepage URL into one array column
from pyspark.sql.functions import flatten, array

# Wrap the single 'homepages_repr.url' value in a one-element array, pair it
# with the 'homepages_etc.url' array, then flatten the nested result.
# (assumes homepages_repr.url is a scalar and homepages_etc.url is an array,
# as the "string + list" heading suggests — TODO confirm against the schema)
df = data.withColumn(
    'homepages_url',
    flatten(
        array(
            array('homepages_repr.url'),
            'homepages_etc.url',
        )
    ),
)

In [None]:
# startswith: boolean flag for phone numbers that begin with '010'
# (the column name suggests '010' marks mobile numbers — confirm with the data owner)
#
# Column.startswith is a method on Column, so only `col` has to be imported.
# The standalone pyspark.sql.functions.startswith exists only in Spark >= 3.5;
# importing it on older versions raises ImportError, and it was never used here.
from pyspark.sql.functions import col
df = data.withColumn(
    'is_smart_phone',
    col('phone').startswith('010')
)

In [None]:
# array_contains: boolean flag telling whether 'payment_info' includes '제로페이'
from pyspark.sql.functions import array_contains

# array_contains accepts the column by name, so no col(...) wrapper is needed.
df = data.withColumn('is_zero_pay', array_contains('payment_info', '제로페이'))

In [None]:
# extract the first element (index 0) of the 'keywords' array into its own column
# Only `col` is needed here; the previous array_contains import was a
# copy-paste leftover and left `col` dependent on another cell having run.
from pyspark.sql.functions import col
df = data.withColumn(
    'keywords_1',
    col('keywords')[0]
)

In [None]:
# turn every array-typed column of df into a single comma-separated string
from pyspark.sql.types import ArrayType
from pyspark.sql.functions import concat_ws

# names of the columns whose Spark data type is ArrayType
arr_col_lst = [
    field.name
    for field in df.schema.fields
    if isinstance(field.dataType, ArrayType)
]

# overwrite each of those columns, under the same name, with its elements
# joined by ","
for arr_col in arr_col_lst:
    joined = concat_ws(",", arr_col)
    df = df.withColumn(arr_col, joined)

In [None]:
# left outer join df with df2: keep every row of df and attach the df2 row
# whose root_id equals df.id
# The right-hand side of the condition must reference df2. Comparing
# df.id with df.root_id only looks at the left frame and never matches
# rows between the two DataFrames.
# (assumes root_id is a column of df2 — TODO confirm against the schema)
df = df.join(df2, df.id == df2.root_id, "left_outer")
# root_id is no longer needed once the join is done
df = df.drop("root_id")

In [None]:
# upload df to Redshift
# NOTE(review): the three self-assignments below are no-ops and look like
# placeholders. Replace the right-hand sides with real values, ideally read
# from environment variables or config rather than hard-coded. On a fresh
# kernel they raise NameError unless the names were defined earlier.
jdbc_url = jdbc_url
temp_dir = temp_dir
db_table = db_table

# Write df to Redshift.
# The chain is wrapped in parentheses instead of using backslash continuations:
# a comment after a trailing backslash is a SyntaxError in Python.
(
    df.write
    # output format: the spark-redshift community connector
    .format("io.github.spark_redshift_community.spark.redshift")
    # JDBC driver class used for the connection
    .option("driver", "com.amazon.redshift.jdbc42.Driver")
    # forward Spark's S3 credentials to Redshift; if an IAM role is
    # available, use the IAM role instead
    .option("forward_spark_s3_credentials", True)
    .option("url", jdbc_url)
    .option("dbtable", db_table)
    .option("tempdir", temp_dir)
    # "overwrite" replaces any existing contents of the target table
    .mode("overwrite")
    .save()
)