In [None]:
import boto3
import io
import pandas as pd
from sagemaker import get_execution_role
role = get_execution_role()

In [None]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

region = boto3.session.Session().region_name
role = get_execution_role()

sklearn_processor = SKLearnProcessor(
    framework_version="0.20.0", role=role, instance_type="ml.m5.large", instance_count=5
)

In [None]:
import pandas as pd

input_data = "s3://sagemaker-kproject/data/KR7000210005.csv".format(region)
df = pd.read_csv(input_data)
df

Unnamed: 0,TRD_DD,ISU_CD,ISU_NM,TDD_CLSPRC,TDD_OPNPRC,TDD_HGPRC,TDD_LWPRC,MKTCAP,ACC_TRDVOL,EPS,PER,BPS,PBR,DPS,DVD_YLD
0,2021/12/27,210,DL,63000,63900,64400,62600,1320220692000,54725,13077,4.82,67178,0.94,1300,2.06
1,2021/12/24,210,DL,63900,63700,64400,63500,1339080987600,36855,13077,4.89,67178,0.95,1300,2.03
2,2021/12/23,210,DL,63700,63300,63700,63000,1334889810800,25141,13077,4.87,67178,0.95,1300,2.04
3,2021/12/22,210,DL,63500,62900,63900,62800,1330698634000,37836,13077,4.86,67178,0.95,1300,2.05
4,2021/12/21,210,DL,62800,63800,63900,62200,1316029515200,46287,13077,4.80,67178,0.93,1300,2.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6747,1995/05/08,210,대림산업,17200,17300,17300,17000,379794318000,13190,-,-,-,-,-,-
6748,1995/05/06,210,대림산업,17300,17700,17700,17300,382002424500,9560,-,-,-,-,-,-
6749,1995/05/04,210,대림산업,17600,18100,18100,17600,388626744000,25040,-,-,-,-,-,-
6750,1995/05/03,210,대림산업,18100,18200,18300,18000,399667276500,46070,-,-,-,-,-,-


In [None]:
df.shape

(6752, 15)

In [None]:
%%writefile preprocessing.py

import argparse
import os
import warnings
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.exceptions import DataConversionWarning


warnings.filterwarnings(action='ignore', category=DataConversionWarning)


def print_shape(df):
    print('Data shape: {}'.format(df.shape))

        
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-test-split-ratio", type=float, default=0.3)
    args, _ = parser.parse_known_args()

    
    print("Received arguments {}".format(args))        
    input_data_path = os.path.join("/opt/ml/processing/input", "KR7000210005.csv")

    print("Reading input data from {}".format(input_data_path))
    df = pd.read_csv(input_data_path)
    df = pd.DataFrame(data=df)    
    
    
    print(df)
          
    print("data shape before preprocessing: {}".format(df.shape))

    # 날짜, 시가총액 열만 가져오기
    df = df[['TRD_DD','MKTCAP']]
    
    
    print("data shape after preprocessing: {}".format(df.shape))
    
    
    train_output_path = os.path.join('/opt/ml/processing/train', 'train.csv')

    test_output_path = os.path.join('/opt/ml/processing/test', 'test.csv')

    print('Saving train data {}'.format(train_output_path))

    pd.DataFrame(df).to_csv( train_output_path, index=False)

    print('Saving test data {}'.format(test_output_path))
    pd.DataFrame(df).to_csv(test_output_path, index=False)



Overwriting preprocessing.py


In [None]:

sklearn_processor.run(
    code="preprocessing.py",
    inputs=[ProcessingInput(source='s3://sagemaker-kproject/data/KR7000210005.csv', 
                            destination="/opt/ml/processing/input")],
    outputs=[
        ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"),
        ProcessingOutput(output_name="test_data", source="/opt/ml/processing/test"),
    ]
    
)


Job Name:  sagemaker-scikit-learn-2022-01-03-16-10-23-559
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-kproject/data/KR7000210005.csv', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-2-268367265700/sagemaker-scikit-learn-2022-01-03-16-10-23-559/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'train_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-ap-northeast-2-268367265700/sagemaker-scikit-learn-2022-01-03-16-10-23-559/output/train_data', 'LocalPath': '/opt/ml/processing/train', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'test_data', 

In [None]:
# 결과 확인

test_data = "s3://sagemaker-ap-northeast-2-268367265700/sagemaker-scikit-learn-2022-01-03-16-10-23-559/output/test_data/test.csv".format(region)
test = pd.read_csv(test_data)
test


Unnamed: 0,TRD_DD,MKTCAP
0,2021/12/27,1320220692000
1,2021/12/24,1339080987600
2,2021/12/23,1334889810800
3,2021/12/22,1330698634000
4,2021/12/21,1316029515200
...,...,...
6747,1995/05/08,379794318000
6748,1995/05/06,382002424500
6749,1995/05/04,388626744000
6750,1995/05/03,399667276500


In [None]:
train_data = "s3://sagemaker-ap-northeast-2-268367265700/sagemaker-scikit-learn-2022-01-03-16-10-23-559/output/train_data/train.csv".format(region)
train = pd.read_csv(train_data)
train

Unnamed: 0,TRD_DD,MKTCAP
0,2021/12/27,1320220692000
1,2021/12/24,1339080987600
2,2021/12/23,1334889810800
3,2021/12/22,1330698634000
4,2021/12/21,1316029515200
...,...,...
6747,1995/05/08,379794318000
6748,1995/05/06,382002424500
6749,1995/05/04,388626744000
6750,1995/05/03,399667276500
