In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from spacy.lang.es import Spanish
from spacy.lang.pt import Portuguese
from spacy.tokenizer import Tokenizer
from sklearn.metrics import balanced_accuracy_score
from multiprocessing import  Pool
import numpy as np
import fasttext
import csv
import unicodedata

## Preprocess

In [15]:
# Load the raw competition files: a capped sample of the training set and the full test set.
TRAIN_ROWS = 1000000  # number of training rows to read
data_train = pd.read_csv('../data/train.csv', nrows=TRAIN_ROWS)
data_test = pd.read_csv('../data/test.csv')

In [None]:
# Display the training row at position 5341 (manual spot check; nothing is assigned).
data_train.iloc[5341]

In [18]:
# Preview the first five raw titles as a NumPy array.
data_train["title"].iloc[:5].values

array(['Hidrolavadora Lavor One 120 Bar 1700w  Bomba Aluminio Italia',
       'Placa De Sonido - Behringer Umc22',
       'Maquina De Lavar Electrolux 12 Kilos',
       'Par Disco De Freio Diant Vent Gol 8v 08/ Fremax Bd5298',
       'Flashes Led Pestañas Luminoso Falso Pestañas Para Partido '],
      dtype=object)

In [29]:
def parallelize_dataframe(df, func, n_cores=8):
    """Apply ``func`` to contiguous row chunks of ``df`` in worker processes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is cut into ``n_cores`` contiguous row chunks;
        the first ``len(df) % n_cores`` chunks get one extra row.
    func : callable
        Called once per chunk inside a worker process. It has to be picklable
        (e.g. defined at module level) and return something ``pd.concat``
        accepts (DataFrame or Series).
    n_cores : int, default 8
        Number of chunks and of worker processes.

    Returns
    -------
    The per-chunk results concatenated with ``pd.concat`` in chunk order.

    Raises
    ------
    ValueError
        If ``n_cores`` is smaller than 1.
    """
    if n_cores < 1:
        raise ValueError("n_cores must be at least 1")

    # Cut the frame with ``iloc`` rather than ``np.array_split``: the NumPy
    # helper treats the frame as an array and goes through
    # ``DataFrame.swapaxes``, which recent pandas versions deprecate.
    base, extra = divmod(len(df), n_cores)
    bounds = [0]
    for i in range(n_cores):
        bounds.append(bounds[-1] + base + (1 if i < extra else 0))
    df_split = [df.iloc[start:stop] for start, stop in zip(bounds[:-1], bounds[1:])]

    pool = Pool(n_cores)
    try:
        results = pool.map(func, df_split)
    finally:
        # Always release the workers, even when ``func`` raised in one of them.
        pool.close()
        pool.join()
    return pd.concat(results)

In [22]:
def normalize_text(text,nlp):
    """Return ``text`` reduced to lowercase, accent-free, space-joined word tokens.

    ``text`` is tokenised with ``nlp.tokenizer``. A token is kept only when it
    is alphabetic, is not flagged as a digit or stop word, and is longer than
    one character. Kept tokens are lowercased; non-ASCII ones additionally
    have their combining marks (Unicode category ``Mn``) removed after NFD
    decomposition.
    """
    def _strip_marks(word):
        # Decompose accented letters, then drop the combining marks.
        decomposed = unicodedata.normalize('NFD', word)
        return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    kept = []
    for token in nlp.tokenizer(text):
        if not token.is_alpha:
            continue
        if token.is_digit or token.is_stop or len(token.text) == 1:
            continue
        if token.is_ascii:
            kept.append(token.text.lower())
        else:
            kept.append(_strip_marks(token.text.lower()))
    return ' '.join(kept)


In [None]:
# Sanity check: print the first five titles next to their normalised form (Spanish tokenizer).
nlp_es = Spanish()
sample_titles = data_train[0:5]["title"].values
for text in sample_titles:
    normalized = normalize_text(text, nlp_es)
    print(text,"||", normalized)


In [32]:
def preprocess(df):
    """Build the fastText training columns for a chunk of the training set.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``title``, ``language`` and ``category``.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and two columns: ``label``
        (``'__label__'`` + category) and ``tokens`` (the title passed through
        ``normalize_text``). Rows whose language is neither ``'spanish'`` nor
        ``'portuguese'`` get NaN tokens.
    """
    tokenizers = {'spanish': Spanish(), 'portuguese': Portuguese()}
    # Tokenise every title exactly once, with the tokenizer of its own
    # language, instead of running both tokenizers over the whole column.
    tokens = [
        normalize_text(title, tokenizers[language]) if language in tokenizers else np.nan
        for title, language in zip(df["title"], df["language"])
    ]
    labels = df["category"].apply(lambda x: '__label__'+ x)
    return pd.DataFrame(
        {"label": labels.to_numpy(), "tokens": np.array(tokens, dtype=object)},
        index=df.index,
    )

In [33]:
def preprocess_test(df):
    """Build the prediction input columns for a chunk of the test set.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``id``, ``title`` and ``language``.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and two columns: ``id`` and ``tokens``
        (the title passed through ``normalize_text``). Rows whose language is
        neither ``'spanish'`` nor ``'portuguese'`` get NaN tokens.
    """
    tokenizers = {'spanish': Spanish(), 'portuguese': Portuguese()}
    # Tokenise every title exactly once, with the tokenizer of its own
    # language, instead of running both tokenizers over the whole column.
    tokens = [
        normalize_text(title, tokenizers[language]) if language in tokenizers else np.nan
        for title, language in zip(df["title"], df["language"])
    ]
    return pd.DataFrame(
        {"id": df["id"].to_numpy(), "tokens": np.array(tokens, dtype=object)},
        index=df.index,
    )

In [34]:
def _write_fasttext_file(frame, path):
    """Write ``frame`` (label column, then tokens column) as one space-separated line per row."""
    # No header, no index, no quoting. The separator and the escape character
    # are both a space, so no quote or backslash characters end up in the file
    # (presumably spaces inside the tokens come out doubled -- verify if the
    # exact bytes matter).
    frame.to_csv(path,index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")


def create_fasttext_split_files(train_df, test_df):
    """Preprocess both data sets and write the fastText input files.

    The training frame is preprocessed in parallel and split 95/5 into train
    and validation parts, stratified by label with a fixed random state.
    Both parts are written under ``../data/small/``. The test frame is
    preprocessed the same way and only its ``tokens`` column is written, one
    title per line.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Raw training data, as expected by ``preprocess``.
    test_df : pandas.DataFrame
        Raw test data, as expected by ``preprocess_test``.

    Returns
    -------
    None. The three files are the only result.
    """
    # train and validation set files
    train = parallelize_dataframe(train_df, preprocess)
    X_train, X_val, y_train, y_val = train_test_split(train["tokens"], train["label"], test_size=0.05, random_state=42, stratify=train["label"])
    train_fasttext = pd.concat([y_train,X_train], axis=1)
    val_fasttext = pd.concat([y_val,X_val], axis=1)
    _write_fasttext_file(train_fasttext, '../data/small/train_fasttext_norm.csv')
    _write_fasttext_file(val_fasttext, '../data/small/val_fasttext_norm.csv')

    #test set file
    test = parallelize_dataframe(test_df, preprocess_test)
    test_path = "../data/small/test_fasttext_norm.txt"
    try:
        # Keyword name used by pandas >= 1.5 (the old spelling was removed in 2.0).
        test["tokens"].to_csv(test_path,index=False,header=False,lineterminator='\n')
    except TypeError:
        # Older pandas only knows the previous spelling of the keyword.
        test["tokens"].to_csv(test_path,index=False,header=False,line_terminator='\n')

In [35]:
# Preprocess both frames and write the fastText train/validation/test files; %time reports the cost.
%time create_fasttext_split_files(data_train, data_test)

CPU times: user 8.17 s, sys: 1.01 s, total: 9.18 s
Wall time: 1min 34s


## Training

In [40]:
# Train a supervised fastText classifier on the normalised training file
# (5 epochs, learning rate 0.5, word n-grams up to length 2, 8 threads).
%time model = fasttext.train_supervised(input="../data/small/train_fasttext_norm.csv", epoch=5, lr=0.5, wordNgrams=2, thread=8)

CPU times: user 38min 16s, sys: 1.12 s, total: 38min 17s
Wall time: 4min 51s


In [42]:
# Evaluate on the held-out validation file. Per the fastText Python docs the
# returned tuple is (number of examples, precision@1, recall@1).
%time model.test('../data/small/val_fasttext_norm.csv')

CPU times: user 13.1 s, sys: 84 ms, total: 13.1 s
Wall time: 13 s


(50000, 0.82822, 0.82822)

In [None]:
# Manual smoke test: ask the model for the 5 best labels for a short query.
model.predict('bici playera',5)

In [None]:
# Persist the trained model to disk so it can be reloaded without retraining.
model.save_model("../models/small/model1.bin")

## Predict

In [None]:
# Read back the normalised test titles written by create_fasttext_split_files
# (one title per line, no header). Keep empty titles as '' instead of NaN so
# every value handed to the model is a string.
test_data = pd.read_csv('../data/small/test_fasttext_norm.txt', names=['tokens'], dtype=str, keep_default_na=False)

In [None]:
# Predict the top label for every test title. The titles come from `test_data`,
# the frame loaded in this section.
predictions = model.predict(test_data["tokens"].values.tolist())

In [None]:
# predictions[0] holds the predicted labels per row; strip the fastText label
# prefix to recover the category name. A row without any predicted label
# (presumably possible for an empty title -- verify) becomes '' so the
# submission stays aligned with the test rows.
LABEL_PREFIX = '__label__'
submission = pd.Series([labels[0][len(LABEL_PREFIX):] if len(labels) else '' for labels in predictions[0]])

In [None]:
# Write the predicted categories to the submission file.
# NOTE(review): this path is relative to the notebook directory ("./"), while
# the data and model paths above use "../" -- confirm the intended location.
# NOTE(review): no index/header arguments are passed, so pandas' defaults
# apply -- confirm that matches the required submission format.
submission.to_csv("./submissions/small/submission1.txt")