# Load label

#### 이번 Lab에서는 Glue Job을 이용해 Database에 접근하고 Read, Write하는 예제를 살펴봅니다.

#### 1. S3 버킷을 생성할 때 입력한 S3_BUCKET_POSTFIX와 RDS의 HOST, USER, PASSWD 정보를 입력합니다.

In [None]:
# Fill these in before running the rest of the notebook.
# Suffix appended to 's3://analytics-hol-' to form the lab's S3 bucket name.
S3_BUCKET_POSTFIX = ''
# RDS endpoint host name; used for both the pymysql connection and the JDBC URL.
HOST = ''
# RDS login used by pymysql and by the Spark JDBC reader/writer.
USER = ''
PASSWD = ''

In [None]:
# Database (schema) and table that this lab creates and then loads.
RDS_DATABASE = 'hol'
RDS_TABLE = 'label'

# MySQL JDBC endpoint for that database, on port 3306.
JDBC_URL = 'jdbc:mysql://{HOST}:3306/{DATABASE}'.format(
    DATABASE=RDS_DATABASE,
    HOST=HOST,
)

#### 2. Lab을 진행하기에 앞서 필요한 Database와 Table을 생성합니다.

In [None]:
import pymysql

# Connect to the server without selecting a database: the database itself is
# dropped and recreated below, so every statement uses a qualified name.
conn = pymysql.connect(host=HOST, user=USER, passwd=PASSWD, connect_timeout=5)

try:
    with conn.cursor() as cursor:
        # Remove any earlier copy of the lab database so the run starts clean.
        drop_db_sql = 'DROP DATABASE IF EXISTS {DATABASE}'.format(DATABASE=RDS_DATABASE)

        # Recreate the (now absent) database.
        create_db_sql = 'CREATE DATABASE IF NOT EXISTS {DATABASE}'.format(DATABASE=RDS_DATABASE)

        # Target table for the label data: a passenger id plus a nullable flag.
        create_table_sql = '''
CREATE TABLE IF NOT EXISTS {DATABASE}.{TABLE} (
  `passengerid` int(11) NOT NULL,
  `label` tinyint(1) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin
        '''.format(DATABASE=RDS_DATABASE, TABLE=RDS_TABLE)

        # Order matters: drop, then create the database, then the table in it.
        for statement in (drop_db_sql, create_db_sql, create_table_sql):
            cursor.execute(statement)
        conn.commit()
except Exception as err:
    # Surface the failure in the notebook output, then let it propagate.
    print('[ERROR]: {}'.format(err))
    raise
finally:
    # Always release the connection, whether or not the setup succeeded.
    conn.close()

#### 3. JDBC Write

In [None]:
# Read the label CSV (first row is the header) from the lab's S3 bucket.
# No schema is supplied and inferSchema is off, so every column is a string.
s3_bucket = 's3://analytics-hol-' + S3_BUCKET_POSTFIX
label_df = spark.read.csv(s3_bucket + '/label', header=True)

# Credentials and driver class passed to the MySQL JDBC driver.
connectionProperties = {
    "user" : USER,
    "password" : PASSWD,
    "driver" : "com.mysql.jdbc.Driver"
}

# Write using JDBC.
# With a plain "overwrite", Spark drops the target table and recreates it from
# the DataFrame schema (all strings here), which throws away the column types
# defined when the table was created in step 2. Setting `truncate` makes Spark
# empty the existing table instead, so its definition is kept.
# NOTE(review): this requires the CSV header names to match the table's
# columns (passengerid, label) -- confirm against the file in S3.
label_df.write.option("truncate", "true").jdbc(
    url=JDBC_URL,
    table=RDS_TABLE,
    mode="overwrite",
    properties=connectionProperties
)

#### 4. JDBC Read

In [None]:
from awsglue.context import GlueContext

# Connection settings for reading back over JDBC. `fetchsize` is the JDBC
# fetch-size hint, i.e. how many rows the driver requests per round trip.
connectionProperties = dict(
    user=USER,
    password=PASSWD,
    driver="com.mysql.jdbc.Driver",
    fetchsize="1000",
)

# Load the whole table into a DataFrame and print its first rows.
label_df = spark.read.jdbc(JDBC_URL, RDS_TABLE, properties=connectionProperties)
label_df.show()