In [78]:
# Standard library imports.
import os
import requests
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Third party imports.
import pandas as pd
from tqdm import tqdm

In [40]:
class MELIData:
    """Small client for the Mercado Libre REST API plus CSV persistence helpers.

    The API methods return the decoded JSON body of the response as-is; HTTP
    error statuses are *not* raised, so callers must inspect the payload
    (for example, check that ``'results'`` is present in a search response).

    Attributes:
        meli_url: Base URL of the API.
        dataset_path: Directory where CSV files are written and read.
        timeout: Seconds to wait for the server before ``requests`` gives up.
    """

    def __init__(self, timeout: float = 30) -> None:
        self.meli_url = "https://api.mercadolibre.com/"
        self.dataset_path = "../data/raw"
        self.timeout = timeout

    def _build_url(self, path):
        """Join ``path`` to the base URL with exactly one ``/`` between them."""
        return f"{self.meli_url.rstrip('/')}/{path.lstrip('/')}"

    def _get_json(self, path, params=None):
        """GET ``path`` on the API and return the decoded JSON body."""
        # A timeout keeps a single stalled connection from hanging a long scrape.
        response = requests.get(
            self._build_url(path), params=params, timeout=self.timeout
        )
        return response.json()

    def get_sites(self):
        """Return the decoded JSON of the ``/sites`` endpoint."""
        return self._get_json("sites")

    def get_categories(self, site_id):
        """Return the decoded JSON of ``/sites/{site_id}/categories``."""
        return self._get_json(f"sites/{site_id}/categories")

    def search_item_by_category(self, site_id, cat_id, offset=50):
        """Search a site's items by category, starting at ``offset``.

        Args:
            site_id: Site identifier used in the URL (e.g. ``'MCO'``).
            cat_id: Category identifier sent as the ``category`` query parameter.
            offset: Value sent as the ``offset`` query parameter.
                NOTE(review): the default of 50 rather than 0 looks
                unintentional; it is kept so existing callers that rely on the
                default are not affected.

        Returns:
            The decoded JSON body of the search response.
        """
        params = {'category': cat_id, 'offset': offset}
        return self._get_json(f"sites/{site_id}/search", params=params)

    def get_item_features(self, item_id):
        """Return the decoded JSON of ``/items/{item_id}``."""
        return self._get_json(f"items/{item_id}")

    def convert_and_save_dataframe(self, arr, df_name):
        """Build a DataFrame from ``arr`` and write it to ``{dataset_path}/{df_name}.csv``.

        The dataset directory is created when it does not exist yet, so the
        first save on a fresh checkout does not fail.
        """
        os.makedirs(self.dataset_path, exist_ok=True)
        dataframe = pd.DataFrame(arr)
        dataframe.to_csv(f"{self.dataset_path}/{df_name}.csv", index=False)

    def read_df(self, df_name):
        """Read ``{dataset_path}/{df_name}.csv`` and return it as a DataFrame."""
        return pd.read_csv(f"{self.dataset_path}/{df_name}.csv")

In [41]:
# Single MELIData instance reused by the cells below for API calls and CSV I/O.
melidataset = MELIData()

In [42]:
# Fetch the list of sites (one entry, with its own ID, per country)
# and report the container type and the type of its first element.
sites = melidataset.get_sites()
print(f"sites: {type(sites)}")
print(f"sites[0]: {type(sites[0])}")

sites: <class 'list'>
sites[0]: <class 'dict'>


In [43]:
# Persist the fetched sites as sites.csv in the dataset directory.
melidataset.convert_and_save_dataframe(arr=sites, df_name="sites")

In [44]:
# Fetch the marketplace item categories of one site
# and report the container type and the type of its first element.
site_id = 'MCO'
categories = melidataset.get_categories(site_id)
print(f"categories: {type(categories)}")
print(f"categories[0]: {type(categories[0])}")

categories: <class 'list'>
categories[0]: <class 'dict'>


In [45]:
# Persist the fetched categories as categories_MCO.csv in the dataset directory.
melidataset.convert_and_save_dataframe(arr=categories, df_name="categories_MCO")

In [None]:
# Get items
# For every category of one site, page through the search results, flatten each
# listing into one row, and collect its attributes and pictures as separate tables.
site_id = 'MCO'
df_name = f"categories_{site_id}"
categories = melidataset.read_df(df_name)

# Get the first 1000 items per search
# for each category in the marketplace.
MAX_ITEMS_PER_CATEGORY = 1000
OFFSET_STEP = 50  # presumably the API's page size; TODO confirm
offsets = list(range(0, MAX_ITEMS_PER_CATEGORY, OFFSET_STEP))

# Rows are accumulated in plain lists and turned into DataFrames once at the
# end; growing a DataFrame with pd.concat on every item is quadratic.
item_rows = []
attribute_rows = []
picture_rows = []

for _, row in tqdm(categories.iterrows(), total=len(categories)):
    category_id = row['id']

    for offset in tqdm(offsets, total=len(offsets)):
        item_cats = melidataset.search_item_by_category(site_id, category_id, offset)
        # Responses without a usable 'results' entry are skipped.
        if 'results' not in item_cats or item_cats['results'] is None:
            continue
        resulting_items = item_cats['results']

        for resulting_item in tqdm(resulting_items, total=len(resulting_items)):
            # 'shipping' and 'installments' are dictionaries (or missing/None):
            # their keys are merged directly into the item.
            # NOTE(review): merged keys overwrite item keys of the same name.
            resulting_item.update(resulting_item.pop('shipping', None) or {})
            resulting_item.update(resulting_item.pop('installments', None) or {})

            # Keep only the seller id from the nested 'seller' dictionary.
            seller = resulting_item.pop('seller', None) or {}
            resulting_item['seller_id'] = seller.get('id')

            item_id = resulting_item['id']
            row_keys = {
                'site_id': site_id,
                'category_id': category_id,
                'item_id': item_id,
            }

            # The item attributes are a list of dictionaries;
            # they are stored in their own table, keyed by item.
            attributes = resulting_item.pop('attributes', None) or []
            attribute_rows.extend({**row_keys, **attr} for attr in attributes)

            # The search result is not complete, so the item endpoint is queried
            # for the extra fields. .get() keeps one failed or partial detail
            # response from aborting the whole run before anything is saved.
            features = melidataset.get_item_features(item_id)
            resulting_item['initial_quantity'] = features.get('initial_quantity')
            resulting_item['warranty'] = features.get('warranty')

            # The 'pictures' are a list of dictionaries;
            # they are stored in their own table, keyed by item.
            pictures = features.get('pictures') or []
            picture_rows.extend({**row_keys, **picture} for picture in pictures)

            item_rows.append(resulting_item)
        # Kept from the original cell: clears the terminal after each page.
        os.system('clear')

items = pd.DataFrame(item_rows)
item_attributes = pd.DataFrame(attribute_rows)
item_pictures = pd.DataFrame(picture_rows)

melidataset.convert_and_save_dataframe(items, f"items_{site_id}")

# DataFrame.drop returns a new frame, so the result must be assigned;
# errors='ignore' covers runs where no attribute carried a 'values' key.
item_attributes = item_attributes.drop(columns=['values'], errors='ignore')
melidataset.convert_and_save_dataframe(item_attributes, f"item_attributes_{site_id}")

melidataset.convert_and_save_dataframe(item_pictures, f"item_pictures_{site_id}")

In [68]:
# Reload the three saved item tables from their CSV files.
items, item_attributes, item_pictures = (
    melidataset.read_df(table_name)
    for table_name in ("items_MCO", "item_attributes_MCO", "item_pictures_MCO")
)

  dataframe = pd.read_csv(f"{self.dataset_path}/{df_name}.csv")


In [69]:
# Column names and column count of the items table, then a preview of its rows.
item_cols = items.columns.tolist()
print(f"Item cols: {item_cols} {len(item_cols)}", end="\n\n")
print(items.head())

Item cols: ['id', 'title', 'condition', 'thumbnail_id', 'catalog_product_id', 'listing_type_id', 'permalink', 'buying_mode', 'site_id', 'category_id', 'domain_id', 'thumbnail', 'currency_id', 'order_backend', 'price', 'original_price', 'sale_price', 'available_quantity', 'official_store_id', 'use_thumbnail_id', 'accepts_mercadopago', 'stop_time', 'winner_item_id', 'catalog_listing', 'discounts', 'promotions', 'inventory_id', 'store_pick_up', 'free_shipping', 'logistic_type', 'mode', 'tags', 'benefits', 'promise', 'quantity', 'amount', 'rate', 'seller_id', 'differential_pricing', 'variation_filters', 'variations_data', 'official_store_name', 'location', 'seller_contact'] 44

              id                                              title condition  \
0  MCO2196744036  Intercomunicador Auriculares Casco Bluetooh Bt...       new   
1  MCO1403134263  Espejo Retrovisor Con Cámara Táctil 9.66 Inch ...       new   
2  MCO1312378271                Llanta Hifly Hf201 P 185/60r14 82 H       

In [70]:
# Column names and column count of the item attributes table, then a preview of its rows.
item_attribute_cols = item_attributes.columns.tolist()
print(f"Item attribute cols: {item_attribute_cols} {len(item_attribute_cols)}", end="\n\n")
print(item_attributes.head())

Item attribute cols: ['item_id', 'id', 'name', 'value_id', 'value_name', 'attribute_group_id', 'attribute_group_name', 'value_struct', 'source', 'value_type'] 10

         item_id              id                          name    value_id  \
0  MCO2196744036           BRAND                         Marca  19945578.0   
1  MCO2196744036  ITEM_CONDITION            Condición del ítem   2230284.0   
2  MCO2196744036           MODEL                        Modelo   5856612.0   
3  MCO1403134263           BRAND                         Marca    276243.0   
4  MCO1403134263            GTIN  Código universal de producto         NaN   

      value_name attribute_group_id attribute_group_name value_struct  \
0          Sikeo             OTHERS                Otros          NaN   
1          Nuevo             OTHERS                Otros          NaN   
2          BT 12             OTHERS                Otros          NaN   
3       Genérica             OTHERS                Otros          NaN   
4  

In [65]:
# Column names and column count of the item pictures table, then a preview of its rows.
item_picture_cols = item_pictures.columns.tolist()
print(f"Item picture cols: {item_picture_cols} {len(item_picture_cols)}", end="\n\n")
print(item_pictures.head())

Item feature cols: ['id', 'site_id', 'title', 'seller_id', 'category_id', 'official_store_id', 'price', 'base_price', 'original_price', 'currency_id', 'initial_quantity', 'sale_terms', 'buying_mode', 'listing_type_id', 'condition', 'permalink', 'thumbnail_id', 'thumbnail', 'pictures', 'video_id', 'descriptions', 'accepts_mercadopago', 'non_mercado_pago_payment_methods', 'shipping', 'international_delivery_mode', 'seller_address', 'seller_contact', 'location', 'coverage_areas', 'attributes', 'listing_source', 'variations', 'status', 'sub_status', 'tags', 'warranty', 'catalog_product_id', 'domain_id', 'parent_item_id', 'deal_ids', 'automatic_relist', 'date_created', 'last_updated', 'health', 'catalog_listing'] 45

              id site_id                                              title  \
0  MCO2196744036     MCO  Intercomunicador Auriculares Casco Bluetooh Bt...   
1  MCO1403134263     MCO  Espejo Retrovisor Con Cámara Táctil 9.66 Inch ...   
2  MCO1312378271     MCO                L