In [5]:
%pip install --upgrade pip sagemaker

Collecting PyYAML==6.0 (from sagemaker)
  Using cached PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
Installing collected packages: PyYAML
  Attempting uninstall: PyYAML
    Found existing installation: PyYAML 5.3
[31mERROR: Cannot uninstall 'PyYAML'. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
# Restart kernel to get the packages
# do_shutdown(True) requests a kernel shutdown with restart enabled, so the
# packages installed above are picked up by the imports in the next cells.
# NOTE(review): the restart presumably interrupts a "Run All" at this cell, so
# the remaining cells likely have to be started again by hand — confirm.
import IPython
IPython.Application.instance().kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [None]:
import time
import os
import json
import botocore
import boto3
import numpy as np  
import pandas as pd 
import sagemaker

# Display the version of the sagemaker SDK that the kernel imported.
sagemaker.__version__

In [2]:
import logging
import boto3
from botocore.exceptions import ClientError


def create_bucket(bucket_name, region=None):
    """Create an S3 bucket in a specified region.

    If a region is not specified, or the region is 'us-east-1', the bucket is
    created without a location constraint, i.e. in the S3 default region
    (us-east-1).

    :param bucket_name: Bucket to create
    :param region: String region to create bucket in, e.g., 'us-west-2'
    :return: True if bucket created, else False (the ClientError is logged)
    """
    create_args = {'Bucket': bucket_name}
    # 'us-east-1' is the default region and must not be sent as a
    # LocationConstraint: S3 rejects that request, so only other regions get
    # an explicit CreateBucketConfiguration.
    if region is not None and region != 'us-east-1':
        create_args['CreateBucketConfiguration'] = {'LocationConstraint': region}

    try:
        # region_name=None makes boto3 fall back to the session's default region.
        s3_client = boto3.client('s3', region_name=region)
        s3_client.create_bucket(**create_args)
    except ClientError as e:
        logging.error(e)
        return False
    return True

In [7]:
# Name of the S3 bucket used by this notebook; it is created in the next cell
# and persisted with %store further down.
bucket_name = "sagemaker-zara-blouses-generation"

In [8]:
# Create the bucket in eu-central-1. The call returns True on success and
# False when the S3 request raised a ClientError (which is logged).
create_bucket(bucket_name, region='eu-central-1')

True

In [9]:
# Get some variables you need to interact with SageMaker service
boto_session = boto3.Session()
# Region configured on the default boto3 session.
region = boto_session.region_name
# Key prefix under which this notebook keeps its objects inside the bucket.
bucket_prefix = "loading_dataset"  
sm_session = sagemaker.Session()
sm_client = boto_session.client("sagemaker")
# Execution role of this notebook; later passed as `role` to the processor.
sm_role = sagemaker.get_execution_role()

# Marker that this setup ran; checked after `%store -r` further down.
initialized = True

In [10]:
# Persist these variables with the %store magic so that they can be restored
# later with `%store -r` (in another notebook or after a kernel restart).
%store bucket_name
%store bucket_prefix
%store sm_role
%store region
%store initialized

Stored 'bucket_name' (str)
Stored 'bucket_prefix' (str)
Stored 'sm_role' (str)
Stored 'region' (str)
Stored 'initialized' (bool)


In [11]:
# Resource metadata JSON; when the file is present its 'DomainId' entry is
# read, otherwise domain_id stays None.
NOTEBOOK_METADATA_FILE = "/opt/ml/metadata/resource-metadata.json"
domain_id = None

if os.path.exists(NOTEBOOK_METADATA_FILE):
    with open(NOTEBOOK_METADATA_FILE, "rb") as metadata_file:
        resource_metadata = json.load(metadata_file)
    domain_id = resource_metadata.get('DomainId')
    print(f"SageMaker domain id: {domain_id}")

%store domain_id

SageMaker domain id: d-ivd5gnez0yil
Stored 'domain_id' (str)


In [12]:
# Settings passed to the SKLearnProcessor that runs the dataset loading script.
framework_version = "0.23-1"  # forwarded as SKLearnProcessor(framework_version=...)
processing_instance_type = "ml.m5.large"  # instance type of the processing job
processing_instance_count = 1  # number of instances of the processing job

In [13]:
# Restore every variable that was saved with %store.
%store -r 

# List the stored variables and their values.
%store

# `initialized` only exists if the setup cell that stores it has been run;
# referencing it raises NameError otherwise, which triggers the hint below.
try:
    initialized
except NameError:
    print("+++++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOU HAVE TO RUN 00-start-here notebook   ")
    print("+++++++++++++++++++++++++++++++++++++++++++++++++")

Stored variables and their in-db values:
bucket_name                    -> 'sagemaker-zara-blouses-generation'
bucket_prefix                  -> 'loading_dataset'
domain_id                      -> 'd-ivd5gnez0yil'
initialized                    -> True
input_s3_url                   -> 's3://sagemaker-eu-central-1-567821811420/loading_
region                         -> 'eu-central-1'
sm_role                        -> 'arn:aws:iam::567821811420:role/service-role/Amazo
zara_images_s3_url             -> 's3://sagemaker-eu-central-1-567821811420/loading_


In [14]:
# SageMaker session used for uploads, the processor and the experiment run;
# `sm` is its low-level SageMaker client (used to poll the processing job).
session = sagemaker.Session()
sm = session.sagemaker_client

In [52]:
# S3 location that receives the images downloaded by the processing job
# (used as the destination of its ProcessingOutput).
zara_images_s3_url = f"s3://{bucket_name}/{bucket_prefix}/zara_images"
print(zara_images_s3_url)
%store zara_images_s3_url

s3://sagemaker-zara-blouses-generation/loading_dataset/zara_images
Stored 'zara_images_s3_url' (str)


In [56]:
%%writefile dataset_loading.py
import sys
import subprocess
import requests
import urllib.parse
import json

subprocess.check_call([
    sys.executable, "-m", "pip", "install", "-r",
    "/opt/ml/processing/input/requirements.txt",
])

from scrapy.http import TextResponse
from scrapy import Spider

API_KEY = '92744398-93ae-4cbc-ab11-f69bc621ba2e'

def get_scrapeops_url(url):
    payload = {'api_key': API_KEY, 'url': url}
    proxy_url = 'https://proxy.scrapeops.io/v1/?' + urllib.parse.urlencode(payload)
    return proxy_url

page = 1

while True:
  url = 'https://www.zara.com/es/es/mujer-camisas-l1217.html?v1=2184370&page='+str(page)
  r = requests.get(get_scrapeops_url(url))
  resp = TextResponse(body=r.content, url=url)
  data = resp.css("script[type='application/ld+json']::text").get()
  elements = json.loads(data)['itemListElement']

  for idx, item in enumerate(elements):
        
    if page == 6 and idx == 8:
      continue
        
    image_name = item['name']
    image_url = item['image']
    try:
      img_data = requests.get(image_url).content
      img_name = '/opt/ml/processing/output/'+'page_'+str(page)+'_idx_'+str(idx)+'_name_'+image_name+'.jpg'
      with open(img_name, 'wb') as handler:
        handler.write(img_data)
    except:
      print(image_url)
  if len(elements) > 0:
    page += 1
  else:
    break

Overwriting dataset_loading.py


In [19]:
# Upload the local requirements.txt under <bucket_prefix>/input in the bucket.
# The returned S3 URI is used as the source of the job's ProcessingInput.
input_s3_url = session.upload_data(
    path="requirements.txt",
    bucket=bucket_name,
    key_prefix=f"{bucket_prefix}/input"
)

# Persist the URI for later restores via `%store -r`.
%store input_s3_url

Stored 'input_s3_url' (str)


In [54]:
# List every object currently stored under the notebook's prefix in the bucket.
!aws s3 ls {bucket_name}/{bucket_prefix} --recursive

2023-06-20 10:20:02          6 loading_dataset/input/requirements.txt


In [21]:
# Show the S3 URI of the uploaded requirements file.
print(input_s3_url)

s3://sagemaker-zara-blouses-generation/loading_dataset/input/requirements.txt


In [55]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Input: the uploaded requirements file, made available to the job under
# /opt/ml/processing/input.
requirements_input = ProcessingInput(
    source=input_s3_url,
    destination="/opt/ml/processing/input",
)
processing_inputs = [requirements_input]

# Output: everything the script writes to /opt/ml/processing/output is sent
# to the zara_images location in S3.
images_output = ProcessingOutput(
    source="/opt/ml/processing/output",
    destination=zara_images_s3_url,
)
processing_outputs = [images_output]

In [23]:
from sagemaker.sklearn.processing import SKLearnProcessor
# Processor that runs dataset_loading.py, configured from the settings
# defined earlier in the notebook.
processor_settings = {
    "framework_version": framework_version,
    "role": sm_role,
    "instance_type": processing_instance_type,
    "instance_count": processing_instance_count,
    "base_job_name": 'loading-dataset',
    "sagemaker_session": session,
}
sklearn_processor = SKLearnProcessor(**processor_settings)

In [24]:
from time import gmtime, strftime, sleep
from sagemaker.experiments.run import Run, load_run
# Take one timestamp so that both names refer to the same instant.
_now = gmtime()
experiment_name = f"loading-dataset-{strftime('%d-%H-%M-%S', _now)}"
# Year-month-day-hour-minute-second. The earlier '%Y-%m-%M-%S' format produced
# year-month-minute-second (%M is the minute), so run names lacked day and
# hour and could repeat within the same month.
run_suffix = strftime('%Y-%m-%d-%H-%M-%S', _now)
run_name = f"container-processing-{run_suffix}"

# Open an experiment run and keep its experiment_config so the processing job
# started later can be attached to it.
with Run(experiment_name=experiment_name,
         run_name=run_name,
         run_display_name="container-processing",
         sagemaker_session=session
        ) as run:
    # No parameters are recorded yet: the dict passed here is empty.
    run.log_parameters(
        {}
    )
   
    # Passed as experiment_config to sklearn_processor.run(...).
    experiment_config = run.experiment_config

In [57]:

import botocore
# Start the processing job and block until it ends (wait=True).
run_arguments = dict(
    inputs=processing_inputs,
    outputs=processing_outputs,
    code='dataset_loading.py',
    wait=True,
    experiment_config=experiment_config,
)

try:
    sklearn_processor.run(**run_arguments)
except botocore.exceptions.ClientError as client_error:
    error_details = client_error.response['Error']
    # Only AccessDeniedException is tolerated; anything else is re-raised.
    if error_details['Code'] != 'AccessDeniedException':
        raise client_error
    print(f"Ignore AccessDeniedException: {error_details['Message']} because of the slow resource tag auto propagation")

..........................[34mCollecting scrapy
  Downloading Scrapy-2.9.0-py2.py3-none-any.whl (277 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 277.2/277.2 kB 25.9 MB/s eta 0:00:00[0m
[34mCollecting itemloaders>=1.0.1
  Downloading itemloaders-1.1.0-py3-none-any.whl (11 kB)[0m
[34mCollecting parsel>=1.5.0
  Downloading parsel-1.8.1-py2.py3-none-any.whl (17 kB)[0m
[34mCollecting itemadapter>=0.1.0
  Downloading itemadapter-0.8.0-py3-none-any.whl (11 kB)[0m
[34mCollecting service-identity>=18.1.0
  Downloading service_identity-21.1.0-py2.py3-none-any.whl (12 kB)[0m
[34mCollecting queuelib>=1.4.2
  Downloading queuelib-1.6.2-py2.py3-none-any.whl (13 kB)[0m
[34mCollecting lxml>=4.3.0
  Downloading lxml-4.9.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (6.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.6/6.6 MB 96.9 MB/s eta 0:00:00[0m
[34mCollecting cssselect>=0.9.1
  Downloading cssselect-1.2.0-py2.py3-none-any.whl (18 kB)[0m


UnexpectedStatusException: Error for Processing job loading-dataset-2023-06-20-11-23-54-240: Failed. Reason: AlgorithmError: See job logs for more information

In [48]:
# Delete every object under the zara_images prefix (e.g. to clear the images
# of an earlier run). This removes data; it does not list it.
!aws s3 rm s3://{bucket_name}/{bucket_prefix}/zara_images/ --recursive

delete: s3://sagemaker-zara-blouses-generation/loading_dataset/zara_images/page_1_idx_2_name_BLUSA LARGA BORDADA.jpg
delete: s3://sagemaker-zara-blouses-generation/loading_dataset/zara_images/page_1_idx_1_name_BLUSA ALGODÓN BORDADOS.jpg
delete: s3://sagemaker-zara-blouses-generation/loading_dataset/zara_images/page_1_idx_3_name_BLUSA LINO.jpg
delete: s3://sagemaker-zara-blouses-generation/loading_dataset/zara_images/page_1_idx_4_name_CAMISA 100% LINO.jpg
delete: s3://sagemaker-zara-blouses-generation/loading_dataset/zara_images/page_1_idx_0_name_CAMISA SATINADA VOLANTES..jpg
delete: s3://sagemaker-zara-blouses-generation/loading_dataset/zara_images/page_1_idx_7_name_CAMISA LINO BAJO ASIMÉTRICO.jpg
delete: s3://sagemaker-zara-blouses-generation/loading_dataset/zara_images/page_2_idx_5_name_CAMISA SATINADA.jpg
delete: s3://sagemaker-zara-blouses-generation/loading_dataset/zara_images/page_2_idx_0_name_CAMISA LINO ASIMÉTRICA.jpg
delete: s3://sagemaker-zara-blouses-generation/loading_datas

In [59]:
# List everything under the notebook's prefix, including the images uploaded
# by the processing job.
!aws s3 ls s3://{bucket_name}/{bucket_prefix}/ --recursive

2023-06-20 10:20:02          6 loading_dataset/input/requirements.txt
2023-06-20 11:30:01        394 loading_dataset/zara_images/page_1_idx_0_name_CAMISA SATINADA VOLANTES..jpg
2023-06-20 11:30:01        394 loading_dataset/zara_images/page_1_idx_1_name_BLUSA ALGODÓN BORDADOS.jpg
2023-06-20 11:30:01        394 loading_dataset/zara_images/page_1_idx_2_name_BLUSA LARGA BORDADA.jpg
2023-06-20 11:30:01        394 loading_dataset/zara_images/page_1_idx_3_name_BLUSA LINO.jpg
2023-06-20 11:30:01        395 loading_dataset/zara_images/page_1_idx_4_name_CAMISA 100% LINO.jpg
2023-06-20 11:30:01        394 loading_dataset/zara_images/page_1_idx_5_name_BLUSA ESTAMPADA BRILLO.jpg
2023-06-20 11:30:01        394 loading_dataset/zara_images/page_1_idx_6_name_CAMISA SATINADA VOLANTES..jpg
2023-06-20 11:30:01        394 loading_dataset/zara_images/page_1_idx_7_name_CAMISA LINO BAJO ASIMÉTRICO.jpg
2023-06-20 11:30:01        394 loading_dataset/zara_images/page_1_idx_8_name_BLUSA BORDADOS PERFORADOS.jpg
2

In [None]:
# If you set wait to False in the previous code cell, wait until the job completes
# Poll until the job reaches a terminal state. Waiting only for "Completed"
# would loop forever when the job ends as "Failed" or "Stopped".
job_name = sklearn_processor._current_job_name
while True:
    job_description = sm.describe_processing_job(ProcessingJobName=job_name)
    job_status = job_description["ProcessingJobStatus"]
    if job_status in ("Completed", "Failed", "Stopped"):
        break
    print(f"Wait until {job_name} completed")
    time.sleep(10)

# Surface an unsuccessful job instead of silently continuing.
if job_status != "Completed":
    raise RuntimeError(
        f"Processing job {job_name} ended with status {job_status}: "
        f"{job_description.get('FailureReason', 'no failure reason reported')}"
    )