# Python Shell

#### 지금까지는 Glue 작업 타입 중 Spark 작업을 살펴보았고 이번 Lab에서는 Python Shell 작업을 생성하고 실행하는 과정을 살펴봅니다.

#### CSV -> JSON

#### RDS의 Table을 Reset하기 위해 HOST, PASSWD 정보를 입력하고 아래 코드 Cell들을 순서대로 실행합니다. (주의: 대상 데이터베이스 전체를 삭제한 뒤 다시 생성합니다.)

In [None]:
# --- RDS connection settings (fill in HOST and PASSWD before running) ---
USER = 'admin'
HOST = ''
PASSWD = ''

# --- Target schema and table used by the cells below ---
TABLE = 'titanic_train'
DATABASE = 'analytics_hol'

In [None]:
import pymysql

# Connect at server level (no default schema is selected) because the schema
# itself is dropped and recreated below.
conn = pymysql.connect(host=HOST, user=USER, passwd=PASSWD, connect_timeout=5)

try:
    with conn.cursor() as cursor:
        # Start from a clean slate: remove the schema if it is present ...
        drop_db_sql = 'DROP DATABASE IF EXISTS {DATABASE}'.format(DATABASE=DATABASE)
        cursor.execute(drop_db_sql)

        # ... and create it again, empty.
        create_db_sql = 'CREATE DATABASE IF NOT EXISTS {DATABASE}'.format(DATABASE=DATABASE)
        cursor.execute(create_db_sql)

        # Table definition: one row per passenger, keyed by passengerid, with
        # secondary indexes and automatically maintained timestamps.
        create_table_sql = '''
CREATE TABLE IF NOT EXISTS {DATABASE}.{TABLE} (
  `passengerid` int(11) NOT NULL,
  `survived` tinyint(1) DEFAULT NULL,
  `pclass` tinyint(4) DEFAULT NULL,
  `name` varchar(128) COLLATE utf8_bin DEFAULT NULL,
  `sex` char(8) DEFAULT NULL,
  `age` tinyint(4) DEFAULT NULL,
  `sibsp` tinyint(4) DEFAULT NULL,
  `parch` tinyint(4) DEFAULT NULL,
  `ticket` varchar(32) COLLATE utf8_bin DEFAULT NULL,
  `fare` DECIMAL(10, 6) DEFAULT NULL,
  `cabin` varchar(32) COLLATE utf8_bin DEFAULT NULL,
  `embarked` char(1) DEFAULT NULL,
  `created_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
  `updated_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  PRIMARY KEY (`passengerid`),
  KEY `survived` (`survived`),
  KEY `pclass` (`pclass`),
  KEY `sex` (`sex`),
  KEY `embarked` (`embarked`),
  KEY `created_time` (`created_time`),
  KEY `updated_time` (`updated_time`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin
        '''.format(DATABASE=DATABASE, TABLE=TABLE)
        cursor.execute(create_table_sql)

        conn.commit()
except Exception as err:
    # Show the failure in the notebook output, then let it propagate so the
    # cell is still marked as failed.
    print('[ERROR]: {}'.format(err))
    raise
finally:
    # Release the connection whether or not the reset succeeded.
    conn.close()

#### Write CSV file to S3(JSON) and JDBC

#### ACCOUNT_ID, HOST, USER, PASSWD, DATABASE, TABLE 정보를 입력하고 아래 코드 Cell을 복사하여 Glue Python Shell Job에 붙여넣습니다.
#### Glue Python Shell Job을 만드는 방법은 Lab 가이드를 참고합니다.

In [None]:
# Glue Python Shell job (Python 3): reads the Titanic training CSV from S3,
# writes it back to S3 as JSON Lines (one JSON object per row), and upserts
# the same rows into the MySQL table over a direct connection.
import csv
import decimal
import io
import json

import boto3
import pymysql

ACCOUNT_ID = ''
HOST = ''
USER = 'admin'
PASSWD = ''
DATABASE = 'analytics_hol'
TABLE = 'titanic_train'

# Zero-based CSV positions of the numeric columns that may be blank in the
# data: Survived, Pclass, Age, SibSp, Parch, Fare. Blank values are stored as 0.
# Text columns (Name, Sex, Ticket, Cabin, Embarked) keep their empty string.
NUMERIC_COLUMNS = (1, 2, 5, 6, 7, 9)

# Bound before the try block so the cleanup in `finally` never fails with a
# NameError (which would hide the real error) when S3 or the connect call fails.
conn = None

try:
    # Read CSV S3 file
    s3_client = boto3.client('s3')
    obj = s3_client.get_object(Bucket='aws-glue-hol-' + ACCOUNT_ID, Key='train/titanic_train.csv')
    data = obj['Body'].read()
    # The csv module works on text in Python 3, so decode once at the boundary.
    text = data.decode('utf-8')

    # Convert CSV rows to JSON Lines
    columns = ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
    csv_dict_reader = csv.DictReader(io.StringIO(text, newline=''), columns)
    next(csv_dict_reader, None)  # skip the header row
    json_lines = ''.join(json.dumps(row) + '\n' for row in csv_dict_reader)

    # Write JSON Lines to S3
    s3_resource = boto3.resource('s3')
    s3_resource.Object('aws-glue-hol-' + ACCOUNT_ID, 'output/titanic_train.json').put(Body=json_lines.encode('utf-8'))

    # Write CSV to JDBC
    csv_reader = csv.reader(io.StringIO(text, newline=''), delimiter=',')
    next(csv_reader, None)  # skip the header row
    conn = pymysql.connect(host=HOST, user=USER, passwd=PASSWD, db=DATABASE, connect_timeout=5)

    # Identifiers cannot be bound as parameters, so only the schema and table
    # names are formatted in; every value is passed separately to execute() so
    # the driver escapes quotes in names, tickets and cabins.
    query = '''
insert into {DATABASE}.{TABLE}(passengerid, survived, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked)
values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
on duplicate key update passengerid=values(passengerid), survived=values(survived), pclass=values(pclass), name=values(name), sex=values(sex), age=values(age), sibsp=values(sibsp), parch=values(parch), ticket=values(ticket), fare=values(fare), cabin=values(cabin), embarked=values(embarked)
    '''.format(DATABASE=DATABASE, TABLE=TABLE)

    with conn.cursor() as cursor:
        for row in csv_reader:
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue
            if len(row) != len(columns):
                raise ValueError('expected {} fields, got {}: {!r}'.format(len(columns), len(row), row))

            row[0] = int(row[0])
            for n in NUMERIC_COLUMNS:
                # Decimal is rendered as an exact numeric literal, so MySQL
                # rounds fractional values (e.g. ages) like a plain number.
                row[n] = decimal.Decimal(row[n]) if row[n] != '' else 0

            cursor.execute(query, tuple(row))
        # Single commit: either every row is upserted or none is.
        conn.commit()
finally:
    if conn is not None:
        conn.close()


#### JDBC Write 결과 확인

In [None]:
from awsglue.context import GlueContext

# Create the GlueContext on top of the notebook's SparkContext (`sc`).
glueContext = GlueContext(sc)

# Read the table through the Glue Data Catalog (JDBC source).
# The catalog table name is "<DATABASE>_<TABLE>".
# NOTE(review): hashexpression/hashpartitions presumably split the JDBC read
# into 5 parallel partitions keyed on passengerid - confirm against Glue docs.
titanic_dyf = glueContext.create_dynamic_frame.from_catalog(
    database=DATABASE,
    table_name='_'.join((DATABASE, TABLE)),
    transformation_ctx='titanic_dyf',
    additional_options={
        'hashexpression': 'passengerid',
        'hashpartitions': 5,
    },
)

# Convert to a Spark DataFrame and print the first rows to verify the load.
titanic_dyf.toDF().show()