In [35]:
# Upgrade pip and the SageMaker Python SDK in the kernel's environment.
# NOTE(review): no versions are pinned here; the recorded run in this notebook
# reports sagemaker 2.168.0 - consider pinning for reproducibility.
%pip install --upgrade pip sagemaker

[0mNote: you may need to restart the kernel to use updated packages.


In [36]:
# Shut the kernel down with restart=True so the freshly installed packages
# are the ones imported by the following cells
import IPython

kernel = IPython.Application.instance().kernel
kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [3]:
import time
import os
import json
import botocore
import boto3
import numpy as np  
import pandas as pd 
import sagemaker

sagemaker.__version__

'2.168.0'

In [4]:
import logging
import boto3
from botocore.exceptions import ClientError


def create_bucket(bucket_name, region=None):
    """Create an S3 bucket in a specified region

    If a region is not specified, the bucket is created in the S3 default
    region (us-east-1).

    'us-east-1' is the one region that must not be sent as a
    ``LocationConstraint``: S3 rejects the request when it is. An explicit
    ``region='us-east-1'`` is therefore handled like the default case and the
    bucket is created without a ``CreateBucketConfiguration``.

    :param bucket_name: Bucket to create
    :param region: String region to create bucket in, e.g., 'us-west-2'
    :return: True if bucket created, else False (the ClientError is logged)
    """

    try:
        # Build the client first; only the region argument differs.
        if region is None:
            s3_client = boto3.client('s3')
        else:
            s3_client = boto3.client('s3', region_name=region)

        # Create bucket
        if region is None or region == 'us-east-1':
            # Default region: no location constraint may be supplied.
            s3_client.create_bucket(Bucket=bucket_name)
        else:
            location = {'LocationConstraint': region}
            s3_client.create_bucket(Bucket=bucket_name,
                                    CreateBucketConfiguration=location)
    except ClientError as e:
        logging.error(e)
        return False
    return True

In [5]:
# Name of the S3 bucket this notebook creates and uses for the scraped images
bucket_name = "sagemaker-zara-blouses-generation"

In [None]:
# Create the bucket in eu-central-1; the cell shows True on success and False
# when S3 answered with a ClientError (which create_bucket logs).
create_bucket(bucket_name, region='eu-central-1')

In [7]:
# Session handles and settings needed to talk to the SageMaker service
bucket_prefix = "loading_dataset"

boto_session = boto3.Session()
sm_client = boto_session.client("sagemaker")
region = boto_session.region_name

sm_session = sagemaker.Session()
sm_role = sagemaker.get_execution_role()

# Marker that this setup cell has run; a later cell checks for it
initialized = True

In [8]:
# Persist these values with the %store magic so that other notebooks (or this
# one after a kernel restart) can restore them with `%store -r`
%store bucket_name
%store bucket_prefix
%store sm_role
%store region
%store initialized

Stored 'bucket_name' (str)
Stored 'bucket_prefix' (str)
Stored 'sm_role' (str)
Stored 'region' (str)
Stored 'initialized' (bool)


In [9]:
NOTEBOOK_METADATA_FILE = "/opt/ml/metadata/resource-metadata.json"
domain_id = None

if os.path.exists(NOTEBOOK_METADATA_FILE):
    with open(NOTEBOOK_METADATA_FILE, "rb") as f:
        domain_id = json.loads(f.read()).get('DomainId')
        print(f"SageMaker domain id: {domain_id}")

%store domain_id

SageMaker domain id: d-ivd5gnez0yil
Stored 'domain_id' (str)


In [10]:
# Settings for the SageMaker processing job
processing_instance_count = 1
processing_instance_type = "ml.m5.large"
# Handed to SKLearnProcessor as its framework_version
framework_version = "0.23-1"

In [11]:
# Restore every variable previously saved with %store
%store -r 

# List the stored variables and their values
%store

# `initialized` is set by the setup cell / notebook; if it was neither defined
# nor restored, the lookup raises NameError and a reminder is printed.
try:
    initialized
except NameError:
    print("+++++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOU HAVE TO RUN 00-start-here notebook   ")
    print("+++++++++++++++++++++++++++++++++++++++++++++++++")

Stored variables and their in-db values:
baseline_s3_url                          -> 's3://sagemaker-eu-central-1-567821811420/from-ide
bucket_name                              -> 'sagemaker-zara-blouses-generation'
bucket_prefix                            -> 'loading_dataset'
customers_count                          -> 10000
customers_feature_group_name             -> 'fscw-customers-06-21-08-24'
domain_id                                -> 'd-ivd5gnez0yil'
evaluation_s3_url                        -> 's3://sagemaker-eu-central-1-567821811420/from-ide
experiment_name                          -> 'from-idea-to-prod-experiment-19-09-21-01'
initialized                              -> True
input_s3_url                             -> 's3://sagemaker-eu-central-1-567821811420/from-ide
model_package_group_name                 -> 'from-idea-to-prod-model-group'
orders_count                             -> 100000
orders_feature_group_name                -> 'fscw-orders-06-21-08-24'
output_s3_url  

In [12]:
# SageMaker SDK session and the SageMaker client exposed by that session;
# `session` is passed to the processor and the experiment run below, `sm` is
# used to poll the processing job status.
session = sagemaker.Session()
sm = session.sagemaker_client

In [13]:
# Sanity check: show the bucket name and prefix that the S3 URLs are built from
print(bucket_name,bucket_prefix)

sagemaker-zara-blouses-generation loading_dataset


In [None]:
# S3 location that receives the images downloaded by the processing job
# (it is used as the destination of the job's ProcessingOutput)
zara_images_s3_url = f"s3://{bucket_name}/{bucket_prefix}/zara_images_initial"
print(zara_images_s3_url)
%store zara_images_s3_url

In [16]:
%%writefile dataset_loading.py
import sys
import subprocess
import requests
import urllib.parse
import json

# Install the packages listed in the requirements file found in the processing
# input directory, using the interpreter that runs this script. This has to
# happen before the scrapy imports below.
# NOTE(review): presumably requirements.txt lists scrapy - confirm.
subprocess.check_call([
    sys.executable, "-m", "pip", "install", "-r",
    "/opt/ml/processing/input/requirements.txt",
])

from scrapy.http import TextResponse
from scrapy import Spider

import os

# SECURITY: an API key should not live in source control. The key is now read
# from the SCRAPEOPS_API_KEY environment variable when it is set; the literal
# below is kept only as a backward-compatible fallback so existing runs keep
# working.
# TODO: rotate this key, supply it through the job's environment, and delete
# the fallback.
API_KEY = os.environ.get('SCRAPEOPS_API_KEY', '92744398-93ae-4cbc-ab11-f69bc621ba2e')

def get_scrapeops_url(url):
    """Return the ScrapeOps proxy URL that fetches `url`.

    The API key and the target URL are URL-encoded into the query string of
    the proxy endpoint.

    :param url: Address of the page to fetch through the proxy
    :return: Full proxy URL as a string
    """
    payload = {'api_key': API_KEY, 'url': url}
    return 'https://proxy.scrapeops.io/v1/?' + urllib.parse.urlencode(payload)

# Upper bound (seconds) for every HTTP request so that a stalled connection
# cannot hang the processing job forever.
REQUEST_TIMEOUT = 120

page = 1

# Walk the listing page by page until a page yields no products.
while True:
    url = 'https://www.zara.com/es/es/mujer-camisas-l1217.html?v1=2184370&page=' + str(page)
    r = requests.get(get_scrapeops_url(url), timeout=REQUEST_TIMEOUT)
    resp = TextResponse(body=r.content, url=url)
    data = resp.css("script[type='application/ld+json']::text").get()

    # `.get()` returns None when the selector matches nothing; handle that as
    # "no products" instead of crashing inside json.loads(None).
    if data is None:
        print(f"No JSON-LD product list on page {page}; stopping")
        break

    elements = json.loads(data)['itemListElement']

    # An empty product list marks the end of the catalogue.
    if len(elements) == 0:
        break

    for idx, item in enumerate(elements):

        # Entry 8 on page 6 is deliberately skipped.
        # NOTE(review): presumably a known-bad item at crawl time - confirm
        # whether this exclusion is still needed.
        if page == 6 and idx == 8:
            continue

        image_name = item['name']
        image_url = item['image']
        try:
            # A '/' in the product name would be read as a directory separator
            # and make open() fail, so it is replaced.
            safe_name = image_name.replace('/', '_')
            img_name = '/opt/ml/processing/output/'+'page_'+str(page)+'_idx_'+str(idx)+'_name_'+safe_name+'.jpg'

            img_resp = requests.get(image_url, timeout=REQUEST_TIMEOUT)
            # Do not store an HTTP error body as if it were a JPEG.
            img_resp.raise_for_status()

            with open(img_name, 'wb') as handler:
                handler.write(img_resp.content)
        except Exception as exc:
            # Best effort: report the failing image and carry on with the next
            # one. `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            print(image_url, repr(exc))

    page += 1

Writing dataset_loading.py


In [55]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Content of input_s3_url is made available to the job under
# /opt/ml/processing/input.
# NOTE(review): input_s3_url is not assigned in the visible cells; presumably
# it is restored by `%store -r` - confirm it holds the requirements.txt that
# dataset_loading.py installs.
requirements_input = ProcessingInput(
    source=input_s3_url,
    destination="/opt/ml/processing/input",
)

# Whatever the job leaves in /opt/ml/processing/output goes to zara_images_s3_url
images_output = ProcessingOutput(
    source="/opt/ml/processing/output",
    destination=zara_images_s3_url,
)

processing_inputs = [requirements_input]
processing_outputs = [images_output]

In [23]:
from sagemaker.sklearn.processing import SKLearnProcessor
# Processor that will run the dataset loading script, configured from the
# settings and session defined in the earlier cells
sklearn_processor = SKLearnProcessor(
    base_job_name="loading-dataset",
    framework_version=framework_version,
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    role=sm_role,
    sagemaker_session=session,
)

In [24]:
from time import gmtime, strftime, sleep
from sagemaker.experiments.run import Run, load_run

# Take a single timestamp so the experiment name and the run name describe the
# same instant.
now = gmtime()

# Experiment name suffix: day-hour-minute-second
experiment_name = f"loading-dataset-{strftime('%d-%H-%M-%S', now)}"

# Run name suffix: year-month-day-hour-minute-second.
# Note that %M is minutes; day and hour are %d and %H.
run_suffix = strftime('%Y-%m-%d-%H-%M-%S', now)
run_name = f"container-processing-{run_suffix}"

# Create the run only to obtain the experiment configuration that is passed
# to the processing job in a later cell.
with Run(experiment_name=experiment_name,
         run_name=run_name,
         run_display_name="container-processing",
         sagemaker_session=session
        ) as run:
    # No parameters are logged yet; add job settings to this dict if needed.
    run.log_parameters(
        {}
    )

    experiment_config = run.experiment_config

In [None]:

import botocore

# Launch the processing job and block until it finishes (wait=True).
try:
    sklearn_processor.run(
        code='dataset_loading.py',
        inputs=processing_inputs,
        outputs=processing_outputs,
        experiment_config=experiment_config,
        wait=True,
        # arguments = ['arg1', 'arg2'],
    )
except botocore.exceptions.ClientError as e:
    error_code = e.response['Error']['Code']
    # Only AccessDeniedException is tolerated; every other client error is re-raised.
    if error_code != 'AccessDeniedException':
        raise e
    print(f"Ignore AccessDeniedException: {e.response['Error']['Message']} because of the slow resource tag auto propagation")

In [None]:
# If you set wait to False in the previous code cell, wait until the job completes
job_name = sklearn_processor._current_job_name

while True:
    job = sm.describe_processing_job(ProcessingJobName=job_name)
    status = job["ProcessingJobStatus"]

    if status == "Completed":
        break

    # A job that failed or was stopped never becomes "Completed"; stop polling
    # and surface the reason instead of looping forever.
    if status in ("Failed", "Stopped"):
        reason = job.get("FailureReason", "no failure reason reported")
        raise RuntimeError(
            f"Processing job {job_name} ended with status {status}: {reason}"
        )

    time.sleep(10)
    print(f"Wait until {job_name} completed")