In [2]:
import gzip
import json

import nltk
import pandas as pd
from gensim import corpora
from gensim.parsing import preprocessing
from nltk.corpus import stopwords
from nltk.corpus import stopwords as nltk_stopwords
from nltk.tokenize import word_tokenize
from tqdm.notebook import tqdm

# Register tqdm's `progress_apply` / `progress_map` helpers on pandas objects;
# later cells call them on `data`.
tqdm.pandas()

# Fetch the NLTK stopword corpus used to build the per-language stopword lists.
nltk.download("stopwords")

  from pandas import Panel
[nltk_data] Downloading package stopwords to
[nltk_data]     /users/aquezada/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Per-language stopword lookup used when filtering title tokens.
# This cell rebinds the name `stopwords`, so the NLTK corpus reader is read
# through its `nltk_stopwords` alias; that keeps the cell re-runnable instead
# of failing on `dict.words` the second time it executes.
# Frozensets give constant-time membership tests for the per-token filter.
stopwords = {
    language: frozenset(nltk_stopwords.words(language))
    for language in ("spanish", "portuguese")
}

In [4]:
# Read every (language, split) shard and stack them into a single frame.
frames = []

for language in tqdm(["spanish", "portuguese"]):
    for split in tqdm(["train", "test", "validation"]):
        shard_path = f"../data/meli-challenge-2019/{language}.{split}.jsonl.gz"
        frames.append(pd.read_json(shard_path, lines=True))

# `ignore_index=True` gives the combined frame a fresh 0..n-1 index.
data = pd.concat(frames, ignore_index=True)
data.head()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))





Unnamed: 0,language,label_quality,title,category,split,tokenized_title,data,target,n_labels,size
0,spanish,reliable,Casita Muñecas Barbies Pintadas,DOLLHOUSES,train,"[casita, muñecas, barbies, pintadas]","[50001, 2, 50000, 3]",0,632,4895280
1,spanish,unreliable,Neceser Cromado Holográfico,TOILETRY_BAGS,train,"[neceser, cromado, holográfico]","[6, 4, 5]",1,632,4895280
2,spanish,unreliable,Funda Asiento A Medida D20 Chevrolet,CAR_SEAT_COVERS,train,"[funda, asiento, medida, chevrolet]","[9, 7, 10, 8]",2,632,4895280
3,spanish,unreliable,Embrague Ford Focus One 1.8 8v Td (90cv) Desde...,AUTOMOTIVE_CLUTCH_KITS,train,"[embrague, ford, focus, one]","[11, 13, 12, 14]",3,632,4895280
4,spanish,unreliable,Bateria Panasonic Dmwbcf10 Lumix Dmc-fx60n Dmc...,CAMERA_BATTERIES,train,"[bateria, panasonic, dmwbcf, lumix, dmc, fxn, ...","[15, 19, 17, 18, 16, 1, 1, 1]",4,632,4895280


In [6]:
# NOTE(review): `nltk` is already imported in the first cell; this re-import is
# redundant but harmless.
import nltk
# Download the "punkt" tokenizer data — presumably what `word_tokenize` needs
# in the title-cleaning cell; confirm against the NLTK docs.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /users/aquezada/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
# Sanity check: (rows, columns) of the combined frame.
data.shape

(12578045, 10)

In [7]:
def clean_titles(row):
    """Turn one row's raw title into a list of cleaned tokens.

    Reads ``row["title"]`` and ``row["language"]``. The lower-cased title is
    passed through gensim's ``strip_tags``, ``strip_punctuation`` and
    ``strip_numeric``, tokenized with ``word_tokenize`` for the row's language,
    and filtered to drop that language's stopwords and any token shorter than
    three characters.

    Returns:
        list of str: the surviving tokens, in their original order.
    """
    text = row["title"].lower()
    for strip in (
        preprocessing.strip_tags,
        preprocessing.strip_punctuation,
        preprocessing.strip_numeric,
    ):
        text = strip(text)

    language = row["language"]
    ignored = stopwords[language]
    return [
        token
        for token in word_tokenize(text, language=language)
        if token not in ignored and len(token) >= 3
    ]

# Tokenize every title row by row (axis=1 hands `clean_titles` one row at a
# time); `progress_apply` is the tqdm-wrapped variant of `apply`.
data["tokenized_title"] = data.progress_apply(clean_titles, axis=1)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12578045.0), HTML(value='')))




In [8]:
# For each language: build a token vocabulary, encode titles as token-id lists,
# encode categories as integer targets, and save the token -> id mapping.
#
# Group by the column name rather than a one-element list: when iterating a
# list-keyed groupby, pandas 2.x yields 1-tuples such as ("spanish",), which
# would leak into the output file name below.
for language, lang_df in data.groupby("language"):
    dictionary = corpora.Dictionary(lang_df["tokenized_title"].tolist())
    # Drop tokens that appear in fewer than 2 titles and cap the vocabulary at
    # 50000 entries; `no_above=1` disables the upper document-frequency cut.
    dictionary.filter_extremes(no_below=2, no_above=1, keep_n=50000)
    dictionary.compactify()
    # Reserve id 0 for padding and id 1 for out-of-vocabulary tokens.
    dictionary.patch_with_special_tokens({
        "[PAD]": 0,
        "[UNK]": 1
    })

    # Tokens missing from the vocabulary map to 1, the "[UNK]" id.
    data.loc[lang_df.index, "data"] = lang_df["tokenized_title"].progress_map(
        lambda tokens: dictionary.doc2idx(
            document=tokens,
            unknown_word_index=1
        )
    )

    # Targets are numbered from 0 in order of first appearance within the language.
    label_to_target = {label: index for index, label in enumerate(lang_df["category"].unique())}
    data.loc[lang_df.index, "target"] = lang_df["category"].progress_map(
        lambda label: label_to_target[label]
    )

    with gzip.open(f"../data/meli-challenge-2019/{language}_token_to_index.json.gz", "wt") as fh:
        json.dump(dictionary.token2id, fh)

data.head()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6395265.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6395265.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6182780.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6182780.0), HTML(value='')))




Unnamed: 0,language,label_quality,title,category,split,tokenized_title,data,target,n_labels,size
0,spanish,reliable,Casita Muñecas Barbies Pintadas,DOLLHOUSES,train,"[casita, muñecas, barbies, pintadas]","[50001, 2, 50000, 3]",0,632,4895280
1,spanish,unreliable,Neceser Cromado Holográfico,TOILETRY_BAGS,train,"[neceser, cromado, holográfico]","[6, 4, 5]",1,632,4895280
2,spanish,unreliable,Funda Asiento A Medida D20 Chevrolet,CAR_SEAT_COVERS,train,"[funda, asiento, medida, chevrolet]","[9, 7, 10, 8]",2,632,4895280
3,spanish,unreliable,Embrague Ford Focus One 1.8 8v Td (90cv) Desde...,AUTOMOTIVE_CLUTCH_KITS,train,"[embrague, ford, focus, one]","[11, 13, 12, 14]",3,632,4895280
4,spanish,unreliable,Bateria Panasonic Dmwbcf10 Lumix Dmc-fx60n Dmc...,CAMERA_BATTERIES,train,"[bateria, panasonic, dmwbcf, lumix, dmc, fxn, ...","[15, 19, 17, 18, 16, 1, 1, 1]",4,632,4895280


In [10]:
# Largest target id per language. Target ids are numbered from 0, so a later
# cell adds 1 to turn this into a label count.
max_target_by_language = data.groupby("language")["target"].max()
n_labels = max_target_by_language.to_dict()
n_labels

{'portuguese': 772, 'spanish': 631}

In [11]:
# Row count of each (language, split) partition, keyed by that pair.
rows_per_partition = data.groupby(["language", "split"]).size()
split_size = rows_per_partition.to_dict()
split_size

{('portuguese', 'test'): 73918,
 ('portuguese', 'train'): 5057077,
 ('portuguese', 'validation'): 1264270,
 ('spanish', 'test'): 63680,
 ('spanish', 'train'): 4895280,
 ('spanish', 'validation'): 1223820}

In [12]:
# Attach the per-language label count and the per-partition row count to every
# row. Column-wise lookups replace the row-wise `DataFrame.apply(axis=1)`,
# which builds a Series object for each of the millions of rows.
#
# `n_labels` holds the largest target id per language, hence the + 1.
data["n_labels"] = data["language"].map(n_labels) + 1
# `split_size` is keyed by (language, split) tuples, so pair the two columns up.
data["size"] = [
    split_size[key] for key in zip(data["language"], data["split"])
]
data.head()

Unnamed: 0,language,label_quality,title,category,split,tokenized_title,data,target,n_labels,size
0,spanish,reliable,Casita Muñecas Barbies Pintadas,DOLLHOUSES,train,"[casita, muñecas, barbies, pintadas]","[50001, 2, 50000, 3]",0,632,4895280
1,spanish,unreliable,Neceser Cromado Holográfico,TOILETRY_BAGS,train,"[neceser, cromado, holográfico]","[6, 4, 5]",1,632,4895280
2,spanish,unreliable,Funda Asiento A Medida D20 Chevrolet,CAR_SEAT_COVERS,train,"[funda, asiento, medida, chevrolet]","[9, 7, 10, 8]",2,632,4895280
3,spanish,unreliable,Embrague Ford Focus One 1.8 8v Td (90cv) Desde...,AUTOMOTIVE_CLUTCH_KITS,train,"[embrague, ford, focus, one]","[11, 13, 12, 14]",3,632,4895280
4,spanish,unreliable,Bateria Panasonic Dmwbcf10 Lumix Dmc-fx60n Dmc...,CAMERA_BATTERIES,train,"[bateria, panasonic, dmwbcf, lumix, dmc, fxn, ...","[15, 19, 17, 18, 16, 1, 1, 1]",4,632,4895280


In [13]:
# Persist each (language, split) partition as one JSON record per line.
# NOTE(review): these are the same paths the loading cell reads from, so the
# source shards are overwritten with the enriched frames — confirm this is intended.
for (language, split), partition in data.groupby(["language", "split"]):
    output_path = f"../data/meli-challenge-2019/{language}.{split}.jsonl.gz"
    partition.to_json(output_path, orient="records", lines=True)