In [347]:
import requests
import pandas as pd
from dataclasses import dataclass
import warnings
import spacy
from typing import Protocol
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import unicodedata


warnings.filterwarnings("ignore")
pd.options.display.max_columns = 100


In [348]:
@dataclass
class MeLiClient:
    """Minimal client for the api.mercadolibre.com endpoints used in this notebook.

    Attributes:
        site: Site id interpolated into every request URL (e.g. ``"MCO"``).
    """

    site: str

    # Deliberately un-annotated so the dataclass does not turn it into a field
    # (keeps ``MeLiClient(site)`` and the generated ``__eq__``/``__repr__`` unchanged).
    REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled connection hangs the cell

    def get_categories(self) -> pd.DataFrame:
        """Return the category listing of ``self.site`` as a DataFrame."""
        url = f"https://api.mercadolibre.com/sites/{self.site}/categories"
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        return pd.DataFrame(response.json())

    def get_items_in_category(self, category_id: str, offset=0) -> pd.DataFrame:
        """Return one page of search results for ``category_id``.

        Args:
            category_id: Category id sent as the ``category`` query parameter.
            offset: Value sent as the ``offset`` query parameter.

        Returns:
            A DataFrame built from the ``"results"`` entry of the JSON payload,
            or an empty DataFrame when the payload has no usable ``"results"``
            (best effort: pagination relies on this to detect the end).
        """
        url = f"https://api.mercadolibre.com/sites/{self.site}/search"
        response = requests.get(
            url,
            # Let requests build and escape the query string.
            params={"category": category_id, "offset": offset},
            timeout=self.REQUEST_TIMEOUT,
        )
        payload = response.json()

        results = payload.get("results") if isinstance(payload, dict) else None
        if not results:
            return pd.DataFrame()
        return pd.DataFrame(results)

    def get_all_items_in_category(self, category_id: str) -> pd.DataFrame:
        """Fetch every available page for ``category_id`` and concatenate them.

        Pages are requested until one comes back empty. The offset advances by
        the number of rows actually received, so no page size is hard-coded.
        The original per-page row labels are kept, so the returned index is not
        unique across pages.

        Returns:
            The concatenated pages, or an empty DataFrame when the first page
            is already empty.
        """
        pages = []
        offset = 0

        while True:
            page = self.get_items_in_category(category_id, offset=offset)
            if len(page) == 0:
                break
            pages.append(page)
            offset += len(page)

        if not pages:
            # pd.concat([]) raises; an empty frame matches what callers got before.
            return pd.DataFrame()
        return pd.concat(pages)


# Site id that MeLiClient interpolates into every request URL.
site = "MCO"
# Shared client instance; later cells pass it to build_dataset.
meli = MeLiClient(site)


In [349]:
# Download the category listing for the configured site.
categories = meli.get_categories()
# Show the row of the category that is analysed further down.
categories.query("id=='MCO1000'")


Unnamed: 0,id,name
17,MCO1000,"Electrónica, Audio y Video"


In [350]:
# Attribute reference: https://developers.mercadolibre.com.ar/es_co/atributos

# Probes kept for reference: filtering the items for these two attribute ids returned no rows.
# items[items["attributes"].apply(lambda sku: 'DESCRIPTIVE_TAGS' in (x["id"] for x in sku))] # empty
# items[items["attributes"].apply(lambda sku: 'PRODUCT_FEATURES' in (x["id"] for x in sku))] # empty


In [351]:
def get_attribute_value(attributes: list, attribute_name: str):
    """Return the ``value_name`` of the first attribute whose ``id`` matches.

    Args:
        attributes: Iterable of attribute mappings, each expected to carry
            ``"id"`` and ``"value_name"`` keys. Non-iterable input (``None``,
            ``NaN`` from a missing cell, ...) is treated as "no attributes".
        attribute_name: Attribute id to look for.

    Returns:
        The matching ``value_name``, or ``None`` when the attribute is absent,
        the input is not iterable, or the matching entry has no ``value_name``.
    """
    try:
        entries = iter(attributes)
    except TypeError:
        # Missing cells surface as None/NaN rather than as an empty list.
        return None

    for entry in entries:
        try:
            is_match = entry["id"] == attribute_name
        except (KeyError, TypeError, IndexError):
            # Malformed entry without an "id": skip it instead of failing the row.
            continue
        if is_match:
            try:
                return entry["value_name"]
            except (KeyError, TypeError, IndexError):
                return None
    return None


def build_dataset(
    meli: "MeLiClient",
    category_id: str,
    features: list[str],
    attributes: list[str],
    save=True,
):
    """Download a category's items and flatten selected attributes into columns.

    Args:
        meli: Client exposing ``get_all_items_in_category(category_id)``.
        category_id: Category whose items are downloaded.
        features: Columns of the raw items to keep.
        attributes: Attribute ids to extract from each item's ``attributes``
            list; each becomes a column named after the lower-cased id.
        save: When true, write the result to ``{category_id}_items.csv`` in the
            current working directory.

    Returns:
        A DataFrame with the requested features (minus ``attributes``) plus one
        column per extracted attribute. Columns that are entirely empty are
        dropped and the index is reset to 0..n-1.
    """
    # The client concatenates pages without renumbering, so row labels repeat;
    # renumber up front so column assignment never depends on duplicate labels.
    items = meli.get_all_items_in_category(category_id).reset_index(drop=True)

    # Explicit copy: the new columns are written to our own frame, not to a
    # slice of ``items``.
    dataset = items[features].copy()

    for attribute in attributes:
        dataset[attribute.lower()] = items["attributes"].apply(
            get_attribute_value, attribute_name=attribute
        )

    dataset = (
        # "attributes" may legitimately be absent from ``features``.
        dataset.drop(columns="attributes", errors="ignore")
        .dropna(how="all", axis=1)
        .reset_index(drop=True)
    )

    if save:
        dataset.to_csv(f"{category_id}_items.csv")

    return dataset


# Raw item columns to keep; "attributes" is kept here and dropped by build_dataset.
FEATURES = ["id", "title", "thumbnail", "domain_id", "attributes"]
# Attribute ids extracted into their own lower-cased columns.
INTERESTING_ATTRIBUTES = ["GTIN", "BRAND", "MODEL"]
# Fixed category; the trailing comment is the alternative of sampling a random category id.
category = "MCO1000"  # categories.id.sample().squeeze()
print(category)


# Downloads every page of the category and, with the default save=True, writes a CSV.
df = build_dataset(meli, category, FEATURES, INTERESTING_ATTRIBUTES)
df


MCO1000


Unnamed: 0,id,title,thumbnail,domain_id,brand,model
0,MCO618049088,Audífonos Sony Zx Series Mdr-zx110 Negro,http://http2.mlstatic.com/D_975252-MLA45258514...,MCO-HEADPHONES,Sony,MDR-ZX110
1,MCO627599757,Mini Drone Dji Mavic Mini 2 Drdji018 Fly More ...,http://http2.mlstatic.com/D_888397-MLA47979312...,MCO-DRONES,DJI,Mini 2
2,MCO616438574,Audífonos In-ear Jbl Tune 110 Black,http://http2.mlstatic.com/D_798368-MLA46444369...,MCO-HEADPHONES,JBL,110
3,MCO838086582,Soporte North Bayou Nb-p4 De Pared Para Tv/mon...,http://http2.mlstatic.com/D_831097-MLA46523626...,MCO-TV_AND_MONITOR_MOUNTS,North Bayou,NB-P4
4,MCO870314784,Google Chromecast 3.ª Generación Full Hd Carbón,http://http2.mlstatic.com/D_877407-MLA45315730...,MCO-STREAMING_MEDIA_DEVICES,Google,Chromecast
...,...,...,...,...,...,...
1044,MCO594300136,"Parlante Pequeño Bluetooth , Usb Y Radio Reca...",http://http2.mlstatic.com/D_672113-MCO44057050...,MCO-SPEAKERS,Specker,Pequeño
1045,MCO548760665,Cable Hdmi 30 Metros Mallado Doble Filtro Punt...,http://http2.mlstatic.com/D_920327-MCO40429690...,MCO-AUDIO_AND_VIDEO_CABLES_AND_ADAPTERS,HDMI,30M
1046,MCO499242229,Sensor Inductivo Sensor Proximidad Lj12a3-4-z/...,http://http2.mlstatic.com/D_782190-MCO43648332...,MCO-INDUCTIVE_SENSORS,Generico,EL DESCRITO EN LA PUBLICACION
1047,MCO545193597,Energeizer Pila Alkalina Grande Tipo D X 2 Und,http://http2.mlstatic.com/D_977856-MCO44257523...,MCO-CELL_BATTERIES,Energizer,ALKALINA


In [None]:
def preprocessor(text: str):
    """Normalise a raw title before tokenisation.

    Applies Unicode NFKD normalisation, lower-cases the result and trims
    surrounding whitespace. Combining marks produced by the decomposition are
    kept, not removed.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    lowered = decomposed.lower()
    return lowered.strip()


def tokenizer(text: str):
    """Split ``text`` into lemmas using the module-level ``nlp`` pipeline.

    Stop words, punctuation and whitespace-only tokens are discarded.
    Whitespace tokens appear when a title has leading or repeated spaces;
    left in, they become a vocabulary entry of their own and distort the
    similarity between otherwise unrelated titles.

    Returns:
        A list with the lemma of every remaining token, in order.
    """
    doc = nlp(text)  # probably overkill using spacy for this but ok
    return [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]


# spaCy pipeline used as a global by tokenizer() and SpacyEmbedding.
nlp = spacy.load("es_core_news_md")
# Count-based vectorizer fitted on the titles; feeds BagOfWordsEmbedding.
bow = CountVectorizer(tokenizer=tokenizer, preprocessor=preprocessor).fit(
    df.title.values
)
# TF-IDF vectorizer fitted on the same titles; feeds TFIDFEmbedding.
tfidf = TfidfVectorizer(tokenizer=tokenizer, preprocessor=preprocessor).fit(
    df.title.values
)


In [352]:
class Embedding(Protocol):
    """Structural type for objects that can score themselves against a peer."""

    def similarity(self, other: "Embedding") -> float:
        """Return the similarity between this embedding and ``other``."""


In [353]:
class SpacyEmbedding:
    """Embedding that wraps the object returned by the module-level ``nlp``."""

    def __init__(self, text: str):
        parsed = nlp(text)
        self.embedding = parsed

    def similarity(self, other: "SpacyEmbedding") -> float:
        """Delegate to the wrapped object's own ``similarity`` method."""
        mine, theirs = self.embedding, other.embedding
        return mine.similarity(theirs)

    def __repr__(self):
        # Render the wrapped object's items as a tuple.
        items = tuple(self.embedding)
        return str(items)


In [423]:
class BagOfWordsEmbedding:
    """Embedding built from the module-level ``bow`` vectorizer.

    ``embedding`` holds the first row of ``bow.transform([text])``; ``tokens``
    holds ``tokenizer(text)`` and is only used for the repr.
    """

    def __init__(self, text: str):
        transformed = bow.transform([text])
        self.embedding = transformed[0]
        self.tokens = tokenizer(text)

    def similarity(self, other: "BagOfWordsEmbedding") -> float:
        """Return the cosine similarity between the two stored rows."""
        pairwise = cosine_similarity(self.embedding, other.embedding)
        first_row = pairwise[0]
        return first_row[0]

    def __repr__(self):
        as_tuple = tuple(self.tokens)
        return str(as_tuple)


In [None]:
class TFIDFEmbedding:
    """Embedding built from the module-level ``tfidf`` vectorizer.

    ``embedding`` holds the first row of ``tfidf.transform([text])``;
    ``tokens`` holds ``tokenizer(text)`` and is only used for the repr.
    """

    def __init__(self, text: str):
        transformed = tfidf.transform([text])
        self.embedding = transformed[0]
        self.tokens = tokenizer(text)

    def similarity(self, other: "TFIDFEmbedding") -> float:
        """Return the cosine similarity between the two stored rows."""
        pairwise = cosine_similarity(self.embedding, other.embedding)
        first_row = pairwise[0]
        return first_row[0]

    def __repr__(self):
        as_tuple = tuple(self.tokens)
        return str(as_tuple)


In [424]:
# Text columns to embed; the trailing comment lists other candidates that are currently disabled.
EMBEDDING_COLS = ["title"]  # , "brand", "model"]
# Embedding class applied to every value of the selected columns.
embedding_type = BagOfWordsEmbedding

# Adds one "<col>_embedding" column per selected column, mutating df in place.
for col in EMBEDDING_COLS:
    df[f"{col}_embedding"] = df[col].apply(embedding_type)
df


Unnamed: 0,id,title,thumbnail,domain_id,brand,model,title_embedding
0,MCO618049088,Audífonos Sony Zx Series Mdr-zx110 Negro,http://http2.mlstatic.com/D_975252-MLA45258514...,MCO-HEADPHONES,Sony,MDR-ZX110,"('Audífonos', 'Sony', 'Zx', 'Series', 'Mdr-zx1..."
1,MCO627599757,Mini Drone Dji Mavic Mini 2 Drdji018 Fly More ...,http://http2.mlstatic.com/D_888397-MLA47979312...,MCO-DRONES,DJI,Mini 2,"('Mini', 'Drone', 'Dji', 'Mavic', 'Mini', '2',..."
2,MCO616438574,Audífonos In-ear Jbl Tune 110 Black,http://http2.mlstatic.com/D_798368-MLA46444369...,MCO-HEADPHONES,JBL,110,"('Audífonos', 'in-ear', 'Jbl', 'Tune', '110', ..."
3,MCO838086582,Soporte North Bayou Nb-p4 De Pared Para Tv/mon...,http://http2.mlstatic.com/D_831097-MLA46523626...,MCO-TV_AND_MONITOR_MOUNTS,North Bayou,NB-P4,"('Soporte', 'North', 'Bayou', 'Nb-p4', 'Pared'..."
4,MCO870314784,Google Chromecast 3.ª Generación Full Hd Carbón,http://http2.mlstatic.com/D_877407-MLA45315730...,MCO-STREAMING_MEDIA_DEVICES,Google,Chromecast,"('Google', 'Chromecast', '3.ª', 'Generación', ..."
...,...,...,...,...,...,...,...
1044,MCO594300136,"Parlante Pequeño Bluetooth , Usb Y Radio Reca...",http://http2.mlstatic.com/D_672113-MCO44057050...,MCO-SPEAKERS,Specker,Pequeño,"(' ', 'Parlante', 'Pequeño', 'Bluetooth', 'Usb..."
1045,MCO548760665,Cable Hdmi 30 Metros Mallado Doble Filtro Punt...,http://http2.mlstatic.com/D_920327-MCO40429690...,MCO-AUDIO_AND_VIDEO_CABLES_AND_ADAPTERS,HDMI,30M,"('Cable', 'Hdmi', '30', 'Metros', 'Mallado', '..."
1046,MCO499242229,Sensor Inductivo Sensor Proximidad Lj12a3-4-z/...,http://http2.mlstatic.com/D_782190-MCO43648332...,MCO-INDUCTIVE_SENSORS,Generico,EL DESCRITO EN LA PUBLICACION,"('Sensor', 'Inductivo', 'Sensor', 'Proximidad'..."
1047,MCO545193597,Energeizer Pila Alkalina Grande Tipo D X 2 Und,http://http2.mlstatic.com/D_977856-MCO44257523...,MCO-CELL_BATTERIES,Energizer,ALKALINA,"('Energeizer', 'Pila', 'Alkalina', 'grande', '..."


In [425]:
def find_similarities(
    df: pd.DataFrame, embedding: Embedding, feature="title_embedding"
):
    """Score every row's embedding against a query embedding.

    Args:
        df: Frame holding embedding objects in column ``feature``.
        embedding: Query embedding; its ``similarity`` method is called once
            per row with that row's embedding as the argument.
        feature: Name of the column that stores the embeddings.

    Returns:
        A Series of similarity scores aligned with ``df``'s index.
    """
    score_against_query = embedding.similarity
    stored_embeddings = df[feature]
    return stored_embeddings.apply(score_against_query)


# Score every title against the first row's embedding.
find_similarities(df, df.title_embedding.iloc[0])


0       1.000000
1       0.000000
2       0.166667
3       0.105409
4       0.000000
          ...   
1044    0.000000
1045    0.000000
1046    0.000000
1047    0.000000
1048    0.000000
Name: title_embedding, Length: 1049, dtype: float64

In [426]:
def feature_exact_match(df: pd.DataFrame, feature: str, feature_value):
    """Return a boolean Series marking rows whose ``feature`` equals ``feature_value``."""
    column = df[feature]
    is_match = column == feature_value
    return is_match


# Example call: flag the rows whose brand is exactly "Hill's".
feature_exact_match(df, "brand", "Hill's")


0       False
1       False
2       False
3       False
4       False
        ...  
1044    False
1045    False
1046    False
1047    False
1048    False
Name: brand, Length: 1049, dtype: bool

In [427]:
def find_similar_products(
    df: pd.DataFrame,
    product_id: str,
    embedding_feature="title_embedding",
    penalty_features=["brand", "domain_id"],
    penalty_value=0.1,
    threshold=0.7,
):
    """Find the rows of ``df`` that are similar to the product ``product_id``.

    The score starts as the embedding similarity to the reference product and
    loses ``penalty_value`` for every penalty feature whose value differs from
    the reference product's. Features for which the reference product has no
    value are not penalised.

    Args:
        df: Products frame with an ``id`` column, the embedding column and
            every penalty feature column.
        product_id: ``id`` of the reference product.
        embedding_feature: Column that stores the embeddings.
        penalty_features: Columns whose mismatch lowers the score. The default
            list is only iterated, never mutated.
        penalty_value: Amount subtracted per mismatching feature.
        threshold: Rows scoring strictly above this value are returned.

    Returns:
        ``(similar_products, details)``: ``details`` is a copy of ``df`` with
        ``similarity``, ``raw_similarity`` and one ``<feature>_penalty`` column
        per penalty feature; ``similar_products`` is its subset above
        ``threshold``.

    Raises:
        KeyError: If ``product_id`` is not present in ``df["id"]``.
    """
    # Boolean lookup instead of set_index(...).loc[...]: a repeated id would
    # otherwise give a DataFrame instead of a single row.
    candidates = df[df["id"] == product_id]
    if candidates.empty:
        raise KeyError(product_id)
    product = candidates.iloc[0]

    details = df.copy()
    details["similarity"] = find_similarities(
        df, product[embedding_feature], embedding_feature
    )
    details["raw_similarity"] = details["similarity"].copy()

    for penalty_feature in penalty_features:
        reference_value = product[penalty_feature]
        penalty_column = f"{penalty_feature}_penalty"

        if pd.notna(reference_value):
            is_mismatch = ~feature_exact_match(df, penalty_feature, reference_value)
            details[penalty_column] = is_mismatch * penalty_value
        else:
            # Nothing to compare against, so no row is penalised.
            details[penalty_column] = 0

        details["similarity"] -= details[penalty_column]

    # Filter with the mask directly; re-selecting by index label would repeat
    # rows whenever the index is not unique.
    similar_products = details[details["similarity"] > threshold]
    return similar_products, details


# Penalise a differing model as well, on top of the default brand and domain_id penalties.
PENALTY_FEATURES = ["brand", "domain_id", "model"]
# [0] selects the rows above the threshold; [1] would be the full scored frame.
find_similar_products(df, "MCO627599757", penalty_features=PENALTY_FEATURES)[0]


Unnamed: 0,id,title,thumbnail,domain_id,brand,model,title_embedding,similarity,raw_similarity,brand_penalty,domain_id_penalty,model_penalty
1,MCO627599757,Mini Drone Dji Mavic Mini 2 Drdji018 Fly More ...,http://http2.mlstatic.com/D_888397-MLA47979312...,MCO-DRONES,DJI,Mini 2,"('Mini', 'Drone', 'Dji', 'Mavic', 'Mini', '2',...",1.0,1.0,0.0,0.0,0.0
277,MCO614434985,Drone Dji Mavic Mini 2 Combo | Dji Mini 2 Fly ...,http://http2.mlstatic.com/D_807591-MCO45405669...,MCO-DRONES,DJI,Mini 2,"('Drone', 'Dji', 'Mavic', 'Mini', '2', 'Combo'...",0.720082,0.720082,0.0,0.0,0.0
374,MCO604193667,Drone Dji Mavic Mini 2 Fly More Combo 4k Envío...,http://http2.mlstatic.com/D_638855-MCO44718634...,MCO-DRONES,DJI,Mini 2,"('Drone', 'Dji', 'Mavic', 'Mini', '2', 'Fly', ...",0.710669,0.710669,0.0,0.0,0.0
498,MCO655218921,Mini Drone Dji Mavic Mini 2 Drdji017 Single Co...,http://http2.mlstatic.com/D_627189-MLA47977636...,MCO-DRONES,DJI,Mini 2,"('Mini', 'Drone', 'Dji', 'Mavic', 'Mini', '2',...",0.707107,0.707107,0.0,0.0,0.0


In [None]:
def load_test_dataset(filename: str):
    """Read the CSV at ``filename`` with pandas defaults and return the DataFrame."""
    dataset = pd.read_csv(filename)
    return dataset


# Relative path: the file is looked up in the current working directory.
load_test_dataset("test_electronica.csv")
