## LIBRARIES

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
from io import BytesIO
import gzip
import json
import re
import os
import csv

## SETTINGS

In [None]:
# Compressed sitemap index that lists the per-country sitemaps.
url = 'https://www.zara.com/sitemaps/sitemap-index.xml.gz'
# Regex pattern searched in each sitemap url to pick the country sitemap.
country = 'sitemap-es-es'
# Regex pattern searched in each country-sitemap url (in that country's language).
gender = 'mujer'
# Clothing types to keep; the entries are joined with '|' into a single regex.
item_types = ['vestido']
# Text searched in each <script> tag to find the one holding the product data.
keyword_script = 'detailedComposition'
# Upper bound on the number of item urls visited by fillDataFrames.
output = 60 # raise it to the number of scraped item urls to extract the whole category
# NOTE(review): shadows the builtin dir() and is not referenced by the code shown here.
dir = 'data'
# Target file names handed to storeCSV.
file_name_items = 'itemsDimension.csv'
file_name_comp = 'compDimension.csv'
# Base value added to each item's position to build its item_code.
code = 100000

## FUNCTIONS

In [None]:
## REQUESTS - SOUP

def decompressRequestSoup(url, timeout=30):
    """Given an url perform a get request,
    decompress the gzip content and inject
    it in a soup object.

    Args:
    :param url: an url which returns gzip-compressed
    content, string.
    :param timeout: seconds to wait for the server
    before giving up (default 30).

    Return: soup object of the decompressed response contents

    Raises: requests.HTTPError when the server answers with an
    error status, requests.Timeout when it does not answer in
    time, OSError when the body is not valid gzip data.

    """
    # Without an explicit timeout requests waits forever on a stalled server.
    r_comp = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here instead of trying to gunzip an error page.
    r_comp.raise_for_status()
    decomp = gzip.decompress(r_comp.content)
    return BeautifulSoup(decomp, features="html.parser")



def requestSoup(url, timeout=30):
    """Given an url perform a get request and
    set response in a soup object.

    Args:
    :param url: an url to soup, string.
    :param timeout: seconds to wait for the server
    before giving up (default 30).

    Return: soup object of get response

    Raises: requests.HTTPError when the server answers with an
    error status, requests.Timeout when it does not answer in time.

    """
    # Without an explicit timeout requests waits forever on a stalled server.
    r = requests.get(url, timeout=timeout)
    # Do not parse error pages as if they were real content.
    r.raise_for_status()
    return BeautifulSoup(r.content, features="html.parser")


## ZARA SITE NAVIGATION

def getGeneralLinks(url, country, gender):
    """Get all general clothing urls from web sitemap

    Args:
    :param url: sitemap index, string.
    :param country: regex pattern selecting the country
    sitemap(s) listed in the index, string.
    :param gender: regex pattern selecting the urls to keep,
    language depends on country pattern, string.

    Return: list of general clothing types urls gathered from
    every matching country sitemap; empty list when no sitemap
    matches the country pattern.

    """
    index_soup = decompressRequestSoup(url)
    gender_links = []
    for loc in index_soup.find_all('loc'):
        href = loc.text
        if re.search(country, href) is None:
            continue
        time_1 = time.time()
        country_soup = decompressRequestSoup(href)
        # Accumulate instead of overwriting so that several matching
        # sitemaps all contribute to the result.
        gender_links.extend(entry.text
                            for entry in country_soup.find_all('loc')
                            if re.search(gender, entry.text) is not None)
        resp_time = time.time() - time_1
        # Pause in proportion to the time the request took before the next one.
        time.sleep(resp_time + 1)

    return gender_links


def selectGeneralType(url, country, gender, item_types):
    """Filter the general clothing urls of the sitemap by type.

    Args:
    :param url: sitemap, string.
    :param country: pattern string.
    :param gender: pattern string, language depends on
    country pattern.
    :param item_types: list of types of clothing to
    filter; the entries are joined with '|' into one pattern.

    Returns: list of general clothing sections urls.

    """
    candidates = getGeneralLinks(url, country, gender)
    type_pattern = '|'.join(item_types)
    selected = []
    for candidate in candidates:
        # Keep only the urls in which one of the requested types appears.
        if re.search(type_pattern, candidate) is None:
            continue
        selected.append(candidate)
    return selected



def getItemUrl(url, country, gender, item_types):
    """Given a list of general cloth sections it
     gets the url of each item in each.

    Args:
    :param url: (sitemap)
    :param country: (pattern string)
    :param gender: (pattern string, language depends on
    country pattern)
    :param item_types: (list of types of clothing to
    filter).

    Returns: list of items urls gathered from every selected
    section; empty list when no section is selected.

    """
    item_urls = []
    for gen_url in selectGeneralType(url, country, gender, item_types):
        time_1 = time.time()
        gen_soup = requestSoup(gen_url)
        resp_time = time.time() - time_1
        for anchor in gen_soup.find_all('a', class_="name _item"):
            href = anchor.get('href')
            # Anchors without a target cannot be visited; skip them.
            if href:
                item_urls.append(href + '#')  # adding hash to access app content page
        # Pause in proportion to the time the request took before the next one.
        time.sleep(resp_time + 2)

    # Returning after the loop (not inside it) covers every section.
    return item_urls


## DATAFRAME FILLING

# Maps the accented vowels that are stripped from the payload before parsing.
_ACCENT_TABLE = str.maketrans('áéíóú', 'aeiou')


def _extractPayload(script_text):
    """Return the parsed dataLayer payload of a script, or None if unusable."""
    start = script_text.find(';window.zara.dataLayer =')
    if start == -1:
        return None
    raw = (script_text[start:]
           .replace(';window.zara.viewPayload = window.zara.dataLayer;', '')
           .replace(';window.zara.dataLayer =', '')
           .translate(_ACCENT_TABLE))
    try:
        return json.loads(raw)
    except ValueError:
        # Not valid JSON after trimming: best effort, this script is skipped.
        return None


def _buildItemRow(parsed, item_code):
    """Build the items-dataset row (name, description, price, Join Life)."""
    product = parsed['product']
    detail = product['detail']
    name = product['name']
    desc = '"' + detail['rawDescription'] + '"'
    price = detail['colors'][0]['price']

    # The Join Life block is optional; its absence only clears the flag.
    try:
        extra = detail['extraInfo']['joinLifeExtraInfo']
        joinlife_title = extra['title']
        joinlife_desc = '"' + extra['description'] + '"'
        join_life = True
    except KeyError:
        joinlife_title = ""
        joinlife_desc = ""
        join_life = False

    return {'item_code': item_code,
            'item_name': name,
            'item_desc': desc.replace('\n', ' '),
            'join_life': join_life,
            'joinlife_title': joinlife_title.replace('\n', ' '),
            'joinlife_desc': joinlife_desc.replace('\n', ' '),
            'item_price': price}


def _buildCompositionRows(parsed, item_code):
    """Build one composition-dataset row per material of each garment part."""
    rows = []
    for part in parsed['product']['detail']['detailedComposition']['parts']:
        part_name = part['description']
        for component in part['components']:
            rows.append({'item_code': item_code,
                         'part_name': part_name,
                         'material': component['material'],
                         'percent': component['percentage']})
    return rows


def _appendRows(df, rows):
    """Append dict rows to df in place, adding any column df lacks."""
    if not rows:
        return
    # Same effect as ignore_index=True: labels run 0..n-1, so len(df) is new.
    df.reset_index(drop=True, inplace=True)
    for row in rows:
        for column in row:
            if column not in df.columns:
                df[column] = None
        df.loc[len(df)] = pd.Series(row)


def fillDataFrames(url, country, gender, item_types, keyword_script, output, code, df_items, df_comp):
    """Given the list of items urls, extract key features (price, composition,
    description) and add them to the two datasets.

    The rows are appended to ``df_items`` and ``df_comp`` in place, so frames
    that already hold data keep it, and the two frames are also returned.

    :param url: (sitemap)
    :param country: (pattern string)
    :param gender: (pattern string, language depends on
    country pattern)
    :param item_types: (list of types of clothing to
    filter).
    :param keyword_script: string to match right script
    :param output: limit of items that we want to get; capped at the number
    of item urls actually found
    :param code: integer we'll use to add to our index to create item_codes
    :param df_items: items dataframe to fill
    :param df_comp: composition dataframe to fill
    :return: tuple (df_items, df_comp) with key information of every item
    (price, composition, description)
    """
    item_urls = getItemUrl(url, country, gender, item_types) or []
    # Never index past the scraped list, whatever limit was requested.
    limit = max(0, min(output, len(item_urls)))

    for i, item_url in enumerate(item_urls[:limit]):
        item_code = code + i
        time_1 = time.time()
        item_script_all = requestSoup(item_url).find_all('script',
                                                         type="text/javascript")
        resp_time = time.time() - time_1
        item_script = [scrpt.text for scrpt in item_script_all
                       if re.search(keyword_script, str(scrpt)) is not None]
        # Pause in proportion to the time the request took before the next one.
        time.sleep(resp_time + 7)

        for item_app in item_script:
            parsed = _extractPayload(item_app)
            if parsed is None:
                continue
            # Rows are stored per item so earlier results survive a later failure.
            _appendRows(df_items, [_buildItemRow(parsed, item_code)])
            _appendRows(df_comp, _buildCompositionRows(parsed, item_code))

    return df_items, df_comp


## STORAGE

def storeCSV(df, file_name):
    """Write a dataframe to disk as a CSV file.

    The file is utf-8 encoded, uses the pipe symbol (|) as
    field separator and leaves out the dataframe index.

    :param df: dataset to store
    :param file_name: name of the file we want to create

    """
    csv_options = {'sep': '|', 'index': False, 'encoding': 'utf-8'}
    df.to_csv(file_name, **csv_options)


## MAIN

In [None]:
# 1 - Initialize the items and composition datasets as empty frames.
#     To keep filling frames from a previous run instead of starting
#     over, comment out these two definitions:

df_items = pd.DataFrame(columns=['item_code',
                                 'item_name',
                                 'item_desc',
                                 'join_life',
                                 'joinlife_title',
                                 'joinlife_desc',
                                 'item_price'])

df_comp = pd.DataFrame(columns=['item_code',
                                'part_name',
                                'material',
                                'percent'])

In [None]:
# 2 - Execute function to fill the dataframes:

fillDataFrames(url, country, gender, item_types, keyword_script, output, code,
               df_items, df_comp)

In [None]:
# 3 - Execute storeCSV function for each of the datasets to get your
#     output saved in a CSV file separated by pipe symbol (|)

storeCSV(df_items,file_name_items)
storeCSV(df_comp,file_name_comp)