# Load label

#### 이번 Lab에서는 Glue Job을 이용해 Database에 접근하고 Read, Write하는 예제를 살펴봅니다.

#### 1. ACCOUNT_ID와 RDS의 HOST, USER, PASSWD 정보를 입력합니다.

In [6]:
# Lab settings: fill in every value below before running the later cells.
ACCOUNT_ID = ""  # appended to the Glue database name and the S3 bucket name
HOST = ""        # RDS host used for the pymysql connection and the JDBC URL
USER = ""        # RDS user name
PASSWD = ""      # RDS password

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Names of the RDS database/table and the Glue database/table used in this lab.
RDS_DATABASE, RDS_TABLE = 'hol', 'label'
GLUE_DATABASE = 'analytics_hol_' + ACCOUNT_ID
GLUE_TABLE = 's3_label'

# JDBC URL for the RDS MySQL instance (port 3306), pointing at RDS_DATABASE.
_jdbc_url_fields = {'HOST': HOST, 'DATABASE': RDS_DATABASE}
JDBC_URL = 'jdbc:mysql://{HOST}:3306/{DATABASE}'.format(**_jdbc_url_fields)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

#### 2. Lab을 진행하기에 앞서 필요한 Database와 Table을 생성합니다.

In [8]:
import pymysql

# One connection is used for all of the setup DDL; it is always closed in the
# `finally` clause below, whether or not the statements succeed.
conn = pymysql.connect(host=HOST, user=USER, passwd=PASSWD, connect_timeout=5)

try:
    # Start from a clean slate: drop the database, recreate it, then create
    # the label table inside it. The statements run in exactly this order.
    ddl_statements = (
        'DROP DATABASE IF EXISTS {DATABASE}'.format(DATABASE=RDS_DATABASE),
        'CREATE DATABASE IF NOT EXISTS {DATABASE}'.format(DATABASE=RDS_DATABASE),
        '''
CREATE TABLE IF NOT EXISTS {DATABASE}.{TABLE} (
  `passengerid` int(11) NOT NULL,
  `label` tinyint(1) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin
        '''.format(DATABASE=RDS_DATABASE, TABLE=RDS_TABLE),
    )

    with conn.cursor() as cursor:
        for query in ddl_statements:
            cursor.execute(query)
        conn.commit()
except Exception as err:
    # Report the failure, then let it propagate so the cell visibly fails.
    print('[ERROR]: {}'.format(err))
    raise
finally:
    conn.close()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

1
1
0

#### 3. JDBC Write

In [9]:
# Read the label CSV from the lab's S3 bucket, using the first row as header.
s3_bucket = 's3://analytics-hol-' + ACCOUNT_ID
label_path = s3_bucket + '/label'
label_df = spark.read.csv(label_path, header=True)

# Write to RDS over JDBC.
connectionProperties = dict(
    user=USER,
    password=PASSWD,
    driver="com.mysql.jdbc.Driver"
)

# NOTE(review): per the Spark JDBC documentation, overwrite mode drops and
# recreates the target table unless the `truncate` option is set, so the
# column definitions from the earlier CREATE TABLE step are presumably
# replaced here -- confirm this is intended.
label_df.write.mode("overwrite").jdbc(
    url=JDBC_URL,
    table=RDS_TABLE,
    properties=connectionProperties
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

#### 4. JDBC Read

In [10]:
from awsglue.context import GlueContext

# JDBC connection settings; "fetchsize" is handed to the JDBC reader as an
# option alongside the credentials and driver class.
connectionProperties = dict(
    user=USER,
    password=PASSWD,
    driver="com.mysql.jdbc.Driver",
    fetchsize="1000"
)

# lowerBound, upperBound and numPartitions only decide how the read is split
# into partitions on `passengerid`; they do not limit which rows are read.
partitioning = dict(
    column="passengerid",
    lowerBound=1,
    upperBound=1000,
    numPartitions=5
)

label_df = spark.read.jdbc(
    url=JDBC_URL,
    table=RDS_TABLE,
    properties=connectionProperties,
    **partitioning
)

# Print the first rows to confirm the data written over JDBC can be read back.
label_df.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------+-----+
|PassengerId|Label|
+-----------+-----+
|          1|    0|
|          2|    1|
|          3|    1|
|          4|    1|
|          5|    0|
|          6|    0|
|          7|    0|
|          8|    0|
|          9|    1|
|         10|    1|
|         11|    1|
|         12|    1|
|         13|    0|
|         14|    0|
|         15|    0|
|         16|    1|
|         17|    0|
|         18|    1|
|         19|    0|
|         20|    1|
+-----------+-----+
only showing top 20 rows