In [None]:
import logging
import os
from itertools import chain
from math import ceil
from multiprocessing import cpu_count
from multiprocessing.pool import ThreadPool
from urllib.parse import urlencode

import numpy as np
import pandas as pd
import requests
from lightgbm import LGBMRegressor
from pandas_profiling import ProfileReport
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [None]:
# Grab the root logger and set its threshold to INFO. The helper functions in
# this notebook log through the module-level `logging.*` calls, which also go
# to the root logger, so their DEBUG messages are filtered out at this level.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [None]:
# Shared pool used by get_all_data to fetch several categories concurrently.
threads = cpu_count() # Number of CPUs reported by the machine; used as the pool size
thread_pool = ThreadPool(threads)

In [None]:
# Application credentials passed to get_token. They are read from the
# environment so the real secret never has to be saved inside the notebook;
# the literals are only fallbacks used when the variables are not set.
APP_INFO = dict(app_id=os.environ.get("MELI_APP_ID", 7214923947282925),
                secret_key=os.environ.get("MELI_SECRET_KEY", ""))

In [None]:
# Refresh token passed to get_token. Read from the environment so it is not
# stored in the notebook; falls back to an empty string when unset.
AUTH = {"refresh_token": os.environ.get("MELI_REFRESH_TOKEN", "")}

In [None]:
# Root of every request URL; parse_query appends the path segments to it.
BASE_URL = "https://api.mercadolibre.com/"
# Pagination settings: get_max_results requests offsets 0..MAX_OFFSET
# (inclusive) in steps of OFFSET.
MAX_OFFSET = 1000
OFFSET = 50
# Only used in the progress message logged by get_max_results.
MAX_REQUESTS = ceil(MAX_OFFSET / OFFSET)
# Factor applied in process_data to prices whose currency_id is "USD"
# (presumably the ARS-per-USD exchange rate -- TODO confirm it is current).
PRECIO_DOLAR = 110

In [None]:
def parse_query(endpoint, filter_id=None, attr=None, query_params=None):
    """Build the full request URL for an API call.

    Parameters
    ----------
    endpoint : str
        First path segment appended to ``BASE_URL``.
    filter_id : str, optional
        Second path segment, skipped when None, by default None
    attr : str, optional
        Third path segment, skipped when None, by default None
    query_params : dict, optional
        Query-string parameters. Entries whose key or value is None are
        left out, by default None

    Returns
    -------
    str
        ``BASE_URL`` followed by the "/"-joined path segments and, when at
        least one parameter remains, "?" plus the encoded query string.
    """
    path_segments = [part for part in (endpoint, filter_id, attr) if part is not None]
    url = BASE_URL + "/".join(path_segments)
    # Discard unset parameters first, so a dict that only holds None values
    # does not leave a dangling "?" at the end of the URL.
    usable_params = {
        key: value
        for key, value in (query_params or {}).items()
        if key is not None and value is not None
    }
    if usable_params:
        # urlencode percent-escapes keys and values, so spaces, "&", "=" or
        # non-ASCII characters cannot corrupt the query string.
        url += "?" + urlencode(usable_params)
    return url

In [None]:
def get_token(app_id, secret_key, refresh_token):
    """Exchange a refresh token for an access token.

    Parameters
    ----------
    app_id : int or str
        Application id, sent as ``client_id``.
    secret_key : str
        Application secret, sent as ``client_secret``.
    refresh_token : str
        Refresh token previously issued for the application.

    Returns
    -------
    str or None
        Value of ``access_token`` in the JSON response, or None when the
        response does not contain that field (a warning is logged).
    """
    url = parse_query(endpoint="oauth", attr="token")
    payload = {"grant_type": "refresh_token",
               "client_id": app_id,
               "client_secret": secret_key,
               "refresh_token": refresh_token}
    # Credentials go in the form-encoded request body rather than in the URL,
    # so the secret and refresh token do not end up in URLs that get logged.
    # The timeout keeps the notebook from hanging on an unresponsive server.
    req = requests.post(url,
                        data=payload,
                        headers={"accept": "application/json"},
                        timeout=20)
    token = req.json().get("access_token")
    if token is None:
        # Keep returning None (callers simply omit the token in that case),
        # but make the failure visible instead of silent.
        logging.warning("Token request returned no access_token (HTTP %s)", req.status_code)
    return token

In [None]:
def add_main_category(response, query_params):
    """Tag every result of a search response with the category it was queried for.

    Parameters
    ----------
    response : dict
        API response containing a ``results`` list of dicts. It is modified
        in place.
    query_params : dict
        Parameters used for the request; its ``category`` entry is copied
        into each result under the ``main_category`` key.

    Returns
    -------
    dict
        The same ``response`` object, after tagging.

    Raises
    ------
    ValueError
        If ``query_params`` has no ``category`` entry or it is None.
    """
    # .get() makes a missing key fail with the same explicit error as an
    # explicit None, instead of leaking a bare KeyError.
    category = query_params.get("category")
    if category is None:
        raise ValueError("A category must be supplied")
    for result in response['results']:
        result["main_category"] = category
    return response

In [None]:
def query_api(endpoint, filter_id=None, attr=None, query_params=None, include_category=False):
    """General function to query the API

    Parameters
    ----------
    endpoint : str
        Main API endpoint
    filter_id : str, optional
        Filter id (e.g $SITE_ID, $ITEM_ID) required for some endpoints, by default None
    attr : str, optional
        Attribute to extract from base response, by default None
    query_params : dict, optional
        Query values, parameters, and filters, by default None (no parameters)
    include_category : bool, optional
        When True, every entry of the response's ``results`` list is tagged
        with ``query_params["category"]`` (see ``add_main_category``),
        by default False

    Returns
    -------
    dict
        Json with API response

    Raises
    ------
    Exception
        If the HTTP status is not successful. The text is the ``message``
        field of the error body when available, otherwise the status code.
    """
    query_params = {} if query_params is None else query_params
    query = parse_query(endpoint, filter_id, attr, query_params)
    logging.debug(f"Querying {query}")
    # Use the session as a context manager so its pooled connections are
    # released after the request instead of leaking one session per call.
    with requests.Session() as session:
        session.mount("https://", HTTPAdapter(max_retries=Retry(total=3)))
        resp = session.get(query, timeout=20)
    if not resp.ok:
        # Error bodies are not guaranteed to be a JSON object; fall back to
        # the status code rather than failing while building the error.
        try:
            message = resp.json().get("message", "")
        except (ValueError, AttributeError):
            message = ""
        raise Exception(message or f"Request failed with HTTP status {resp.status_code}")
    response = resp.json()
    if include_category:
        response = add_main_category(response, query_params)
    return response
        

In [None]:
def get_max_results(endpoint, filter_id, attr, query_params, include_category, results=None):
    """Page through an endpoint and accumulate every ``results`` entry.

    One request is made per offset in ``range(0, MAX_OFFSET + 1, OFFSET)``.
    A page that fails (request error or response without ``results``) is
    logged and skipped, so the remaining pages are still fetched.

    Parameters
    ----------
    endpoint, filter_id, attr : str
        Forwarded unchanged to ``query_api``.
    query_params : dict
        Base query parameters. The dict is copied, so the caller's object is
        not modified when the ``offset`` entry is set.
    include_category : bool
        Forwarded unchanged to ``query_api``.
    results : list, optional
        List extended in place with the fetched entries. A new list is
        created on each call when omitted, by default None

    Returns
    -------
    list
        ``results`` (the list passed in, or the newly created one).
    """
    # A literal [] default would be created once and shared by every call
    # that omits `results`, silently mixing the output of unrelated calls.
    if results is None:
        results = []
    page_params = query_params.copy()
    for current_offset in range(0, MAX_OFFSET+1, OFFSET):
        logging.debug(f"Request {int(current_offset/OFFSET)} from {MAX_REQUESTS}")
        page_params['offset'] = current_offset
        try:
            # The request itself is inside the try block: it is the step that
            # can actually fail, and one bad page should not discard the rest.
            info = query_api(endpoint, filter_id, attr, page_params, include_category)
            # In-place extension keeps a caller-supplied list shared.
            results += info['results']
        except Exception:
            logging.exception("Error fetching data")
    return results

In [None]:
def get_categories_subcategories(categories):
    """Look up the child categories of each given category.

    Parameters
    ----------
    categories : list of dict
        Category records; only the ``id`` entry of each one is used.

    Returns
    -------
    dict
        Maps each category id to the list of ids found in the
        ``children_categories`` field of its ``categories`` endpoint response
        (None for a child record without an ``id``).
    """
    children_by_category = {}
    for position, entry in enumerate(categories):
        logging.debug(f"Category {position} of {len(categories)}")
        parent_id = entry['id']
        details = query_api(endpoint="categories", filter_id=parent_id)
        children_by_category[parent_id] = [
            child.get("id") for child in details['children_categories']
        ]
    return children_by_category
        

In [None]:
def get_all_data(endpoint, filter_id, attr, full_cats, include_category, get_subcats=False):
    """Fetch the paged results of every category using the shared thread pool.

    Parameters
    ----------
    endpoint, filter_id, attr : str
        Forwarded unchanged to ``get_max_results``.
    full_cats : dict
        Maps a category id to the list of its subcategory ids.
    include_category : bool
        Forwarded unchanged to ``get_max_results``.
    get_subcats : bool, optional
        When True the subcategory ids (dict values) are queried, otherwise
        the category ids (dict keys), by default False

    Returns
    -------
    list
        Every result collected by the workers. If a worker raises, the error
        is logged and whatever was collected so far is returned.
    """
    collected = []
    if get_subcats:
        category_ids = list(chain.from_iterable(full_cats.values()))
    else:
        category_ids = list(full_cats)
    requests_params = [
        {"category": category_id, "access_token": TOKEN}
        for category_id in category_ids
    ]

    def fetch_category(params):
        # Every worker appends into the same `collected` list.
        return get_max_results(endpoint, filter_id, attr, params, include_category, collected)

    try:
        thread_pool.map(fetch_category, requests_params)
    except Exception:
        logging.exception("A request failed")
    return collected

In [None]:
# Access token that get_all_data attaches to every search request.
TOKEN = get_token(APP_INFO['app_id'], APP_INFO['secret_key'], AUTH['refresh_token'])

In [None]:
# Find the id of the site named "Argentina" among all available sites.
sites = query_api(endpoint="sites")
arg_site_id = [site for site in sites if site["name"] == "Argentina"][0]['id']
# Top-level categories of that site, leaving out the "Servicios" category.
categories = [
    category
    for category in query_api(endpoint="sites", filter_id=arg_site_id, attr="categories")
    if category["name"] != "Servicios"
]

In [None]:
# Map each top-level category id to the ids of its child categories.
cats = get_categories_subcategories(categories)

In [None]:
# Keyword arguments for get_all_data: search the Argentinian site and tag
# every result with the category it was requested for.
args = {
    "endpoint": "sites",
    "filter_id": arg_site_id,
    "attr": "search",
    "full_cats": cats,
    "include_category": True,
}

In [None]:
# Download the search results for every top-level category
# (get_subcats keeps its default of False, so subcategories are not queried).
a = get_all_data(**args)

In [None]:
# Flatten the nested result records into one row per result; nested keys
# become dotted column names (e.g. "seller.id", as used in process_data).
df = pd.json_normalize(a)

Veamos un poco la estructura que tienen nuestros datos

In [None]:
# Build a profiling report of the raw frame and render it inside the notebook.
profile = ProfileReport(df, title="Report")
profile.to_notebook_iframe()

Eliminemos algunas variables:
- site_id: para este análisis nos vamos a concentrar únicamente en publicaciones de Argentina
- id, link y permalink: valores únicos
- attributes: para este análisis y por simplicidad, eliminémosla, dado que su estructura es compleja y, observándola en detalle, el atributo más interesante puede llegar a ser brand, que no está presente siempre
- seller: seller tiene una infinidad de variables, de las cuales muchas son NA y otras se pueden conseguir en la respuesta base (city, state, etc.). Nos quedamos solo con el id y su status.
- installments: currency ya está declarada en las variables regulares, y rate correlaciona perfectamente con quantity

In [None]:
def process_data(dframe):
    """Clean the normalized listings frame and derive the modelling columns.

    Steps, in order:
    1. Drop identifier, seller, location, shipping, address and installment
       columns that are not used (columns that are absent are skipped).
    2. Add ``has_discount`` and ``discount`` from ``original_price`` and
       ``price``, then drop ``original_price``.
    3. Multiply prices listed in "USD" by ``PRECIO_DOLAR`` and drop
       ``currency_id``.
    4. Expand the ``tags`` lists into one count column per tag.
    5. Lower-case ``address.city_name``.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Frame with at least the ``original_price``, ``price``,
        ``currency_id``, ``tags`` and ``address.city_name`` columns.

    Returns
    -------
    pandas.DataFrame
        A processed copy; ``dframe`` itself is not modified.
    """
    df = dframe.copy()
    # Remove undesired columns
    undesired_seller = [col for col in df if (col.startswith("seller") and col not in ("seller.id", "seller.power_seller_status"))]
    undesired_location = [col for col in df if col.startswith("location")]
    undesired_shipping = ["shipping.tags", "shipping.mode"]
    # state_id is always present and needs no string handling, while
    # city_name carries more data than city_id, so those two are kept.
    undesired_address = ["address.state_name", "address.city_id", "address.area_code", "address.phone1"]
    undesired_installments = ["installments.currency_id", "installments.rate"]
    undesired_regular = ["id", "site_id", "permalink", "thumbnail", "title", "attributes", "stop_time", "installments"]
    undesired_cols = undesired_seller + undesired_shipping + undesired_address + undesired_installments + undesired_regular + undesired_location
    # errors="ignore": which columns exist depends on the records that were
    # fetched, so a listed column that is absent must not abort processing.
    df = df.drop(columns=undesired_cols, errors="ignore")
    # Create columns
    df['has_discount'] = df['original_price'].notnull()
    df['discount'] = ((df['original_price'] - df['price']) / df['original_price']).fillna(0)
    df = df.drop(columns='original_price')
    # Convert prices listed in dollars
    df["price"] = np.where(df['currency_id'] == "USD", df['price'] * PRECIO_DOLAR, df['price'])
    df = df.drop(columns="currency_id")
    # Unpack tags: join each list into one string ("-" becomes "_" so a tag
    # stays a single token); a missing tag list counts as "no tags".
    tag_text = df["tags"].apply(
        lambda tags: " ".join(tags).replace("-", "_") if isinstance(tags, (list, tuple)) else ""
    )
    cvect = CountVectorizer()
    tag_counts = cvect.fit_transform(tag_text).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back for older releases.
    if hasattr(cvect, "get_feature_names_out"):
        tag_names = cvect.get_feature_names_out()
    else:
        tag_names = cvect.get_feature_names()
    for i, col in enumerate(tag_names):
        df[str(col)] = tag_counts[:, i]
    df = df.drop(columns='tags')
    # Transform city name; the .str accessor leaves missing values untouched
    # instead of failing on them.
    df['address.city_name'] = df['address.city_name'].str.lower()
    return df

In [None]:
# Apply the cleaning / feature-derivation step to the normalized frame.
final_df = process_data(df)

In [None]:
# Target is the sold_quantity column; every other column is a feature.
y = final_df['sold_quantity']
X = final_df.drop('sold_quantity', axis=1)

In [None]:
# Columns treated as numbers; values that cannot be parsed become NaN.
numeric_features = ["price", "discount", "available_quantity", "installments.quantity", "installments.amount"]
X[numeric_features] = X[numeric_features].apply(lambda x: pd.to_numeric(x, errors="coerce"))
# Every remaining column is treated as categorical and converted to text.
categorical_features = [col for col in X.columns if col not in numeric_features]
# Series.map(str) converts element by element exactly like the former
# DataFrame.applymap(str), which is deprecated since pandas 2.1.
X[categorical_features] = X[categorical_features].apply(lambda column: column.map(str))

In [None]:
# Fixed seed so the split (and every score derived from it) is reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Numeric columns: fill missing values with the column median.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])
# Categorical columns: fill missing values with the '__' placeholder, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='__')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
# Route each group of columns to its transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [None]:
# Chain the preprocessing with a LightGBM regressor (constructed with its
# default arguments) and fit the whole pipeline on the training split.
estimator = LGBMRegressor()
pipe = Pipeline([('preprocessor', preprocessor), ('estimator', estimator)])
pipe.fit(X_train, y_train)

In [None]:
# Reach the fitted one-hot encoder by step name rather than by position, so
# reordering or inserting pipeline steps does not silently pick another one.
onehot_encoder = pipe.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot']
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back for older releases.
if hasattr(onehot_encoder, "get_feature_names_out"):
    ohe_feature_names = onehot_encoder.get_feature_names_out(categorical_features)
else:
    ohe_feature_names = onehot_encoder.get_feature_names(categorical_features)

In [None]:
# Importances reported by the fitted estimator (second step of the pipeline).
feature_importances = pipe.steps[1][1].feature_importances_

In [None]:
# Pair every transformed column name with its importance. Numeric names come
# first because 'num' is listed before 'cat' in the ColumnTransformer.
cols = list(numeric_features) + list(ohe_feature_names)
f_i = list(zip(cols, feature_importances))

In [None]:
# Features with a non-zero importance, most important first.
(
    pd.DataFrame(f_i, columns=["feature_names", "importance"])
    .sort_values(by='importance', ascending=False)
    .reset_index(drop=True)
    .query("importance > 0")
)

In [None]:
# Number of importance values reported by the estimator.
len(pipe.steps[1][1].feature_importances_)

In [None]:
# 4-fold cross-validation of the whole pipeline on the training split.
cv = KFold(n_splits=4)

# The scorer is the negated mean absolute error, so values closer to zero are better.
scores = cross_val_score(pipe, X_train, y_train, cv = cv, scoring='neg_mean_absolute_error')