# CSV -> JSON

#### 이번 Lab에서는 Glue Job의 기본 Template을 살펴보고, CSV 파일을 JSON으로 변환하는 Glue Job을 만들어 실행하고 디버깅하는 과정을 살펴봅니다.

#### S3에 업로드한 데이터를 읽어오기 위해 각자 S3 bucket 이름에 지정한 account id를 ACCOUNT_ID 변수에 할당합니다.

In [None]:
# Account id appended to 's3://aws-glue-hol-' to form the lab bucket URI.
# Fill this in before running the cells that build `s3_bucket` from it.
ACCOUNT_ID = ''

#### File Read & Write with Spark API
#### 아래 코드 실행 후 s3://aws-glue-hol-[account id]/output 디렉토리에 json 파일이 정상적으로 생성되었는지 확인합니다.

In [None]:
import sys

from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.functions import regexp_extract, col

# Create a GlueContext from the SparkContext; `sc` is expected to already
# exist in the session (it is not defined in this cell).
glueContext = GlueContext(sc)

# Lab bucket URI: 's3://aws-glue-hol-' followed by ACCOUNT_ID.
s3_bucket = 's3://aws-glue-hol-' + ACCOUNT_ID

# Read the CSV data under <bucket>/train using the Spark API, treating the
# first row as the header.
titanic_csv_df = spark.read.csv(s3_bucket + '/train', header=True)

# Create the 'initial' column using the Spark API: capture group 1 is the run
# of word characters directly followed by a literal "." in the Name column.
# The pattern is a raw string so that "\w" and "\." are passed to the regex
# engine verbatim instead of triggering Python's invalid-escape warning.
titanic_csv_df = titanic_csv_df.withColumn(
    'initial', regexp_extract(col('Name'), r"(\w+)\.", 1))

# Drop the Name column using the Glue API (DataFrame -> DynamicFrame).
titanic_csv_dyf = DynamicFrame.fromDF(
    titanic_csv_df, glueContext, 'titanic_csv_dyf').drop_fields('Name')

# Write JSON using the Spark API; 'overwrite' replaces any existing content
# under <bucket>/output.
(titanic_csv_dyf.toDF().write
    .format('json')
    .mode('overwrite')
    .save(s3_bucket + '/output'))

#### File Read & Write with Glue API

In [None]:
import sys

from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.functions import regexp_extract, col

# Create a GlueContext from the SparkContext; `sc` is expected to already
# exist in the session (it is not defined in this cell).
glueContext = GlueContext(sc)

# Lab bucket URI: 's3://aws-glue-hol-' followed by ACCOUNT_ID.
s3_bucket = 's3://aws-glue-hol-' + ACCOUNT_ID

# Read the source data using the Glue API, from the catalog table
# 'titanic_train' in database 'analytics_hol'.
titanic_dyf = glueContext.create_dynamic_frame.from_catalog(
    database='analytics_hol',
    table_name='titanic_train',
    transformation_ctx='titanic_dyf')

# Create the 'initial' column using the Spark API: capture group 1 is the run
# of word characters directly followed by a literal "." in the Name column.
# The pattern is a raw string so that "\w" and "\." are passed to the regex
# engine verbatim instead of triggering Python's invalid-escape warning.
titanic_csv_df = titanic_dyf.toDF()
titanic_csv_df = titanic_csv_df.withColumn(
    'initial', regexp_extract(col('Name'), r"(\w+)\.", 1))

# Drop the Name column using the Glue API (DataFrame -> DynamicFrame).
titanic_csv_dyf = DynamicFrame.fromDF(
    titanic_csv_df, glueContext, 'titanic_csv_dyf').drop_fields('Name')

# Write JSON to <bucket>/output using the Glue API.
glueContext.write_dynamic_frame.from_options(
    frame=titanic_csv_dyf,
    connection_type="s3",
    connection_options={"path": s3_bucket + '/output'},
    format="json",
    transformation_ctx="titanic_json_dyf")

#### 실제 Glue Job을 실습하기 위한 코드입니다.
#### Lab Guide를 따라 Glue Console에서 Job을 만들고 아래 코드를 Copy해서 Job을 실행합니다.

In [None]:
import sys
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.functions import regexp_extract, col

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

# Create the SparkContext
sc = SparkContext()
# Create the GlueContext
glueContext = GlueContext(sc)
# Obtain the SparkSession from the GlueContext
spark = glueContext.spark_session
# Create the Job
job = Job(glueContext)
# Initialize the Job with its name and resolved arguments
job.init(args['JOB_NAME'], args)

# NOTE(review): ACCOUNT_ID is not defined anywhere in this script. When this
# code is pasted into a standalone Glue Job, ACCOUNT_ID must be defined before
# the next line (e.g. ACCOUNT_ID = '<your account id>'), otherwise it raises
# NameError.
s3_bucket = 's3://aws-glue-hol-' + ACCOUNT_ID

# Read the CSV data under <bucket>/train from S3 into a DynamicFrame,
# treating the first row as the header and using "," as the delimiter.
titanic_dyf = glueContext.create_dynamic_frame_from_options(
    connection_type='s3',
    connection_options={'paths': [s3_bucket + '/train']},
    format='csv',
    format_options={
        "withHeader": True,
        "delimiter": ','
    })

# Spark: convert the DynamicFrame to a DataFrame and add the 'initial' column.
# Capture group 1 is the run of word characters directly followed by a literal
# "." in the Name column. The pattern is a raw string so that "\w" and "\."
# are passed to the regex engine verbatim instead of triggering Python's
# invalid-escape warning.
titanic_csv_df = titanic_dyf.toDF()
titanic_csv_df = titanic_csv_df.withColumn(
    'initial', regexp_extract(col('Name'), r"(\w+)\.", 1))

# Glue: convert the DataFrame back to a DynamicFrame and drop the Name column.
titanic_csv_dyf = DynamicFrame.fromDF(
    titanic_csv_df, glueContext, 'titanic_csv_dyf').drop_fields('Name')

# Save as JSON under <bucket>/output; 'overwrite' replaces existing content.
(titanic_csv_dyf.toDF().write
    .format('json')
    .mode('overwrite')
    .save(s3_bucket + '/output'))
# Commit the Job
job.commit()