In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from spacy.lang.es import Spanish
from spacy.lang.pt import Portuguese
from spacy.tokenizer import Tokenizer
from sklearn.metrics import balanced_accuracy_score
from multiprocessing import  Pool
import numpy as np
import fasttext
import csv
import unicodedata

## Preprocess

In [3]:
# Read the raw train and test CSV files (in that order) into DataFrames.
data_train, data_test = (
    pd.read_csv(csv_path) for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [4]:
def normalize_text(text,nlp):
    """Tokenize ``text`` and return its cleaned, lower-cased tokens joined by spaces.

    Parameters
    ----------
    text : str
        Raw text handed to ``nlp.tokenizer``.
    nlp : object
        Language object exposing a ``tokenizer`` callable whose tokens carry
        ``text``, ``is_alpha``, ``is_digit``, ``is_stop`` and ``is_ascii``.

    Returns
    -------
    str
        Space-separated tokens. A token is kept only when it is alphabetic,
        is not a digit, is not a stop word and is longer than one character.
        Kept tokens are lower-cased; non-ASCII ones also lose their combining
        marks (accents).
    """
    def _strip_marks(word):
        # NFD splits an accented letter into base letter + combining mark
        # (category 'Mn'); dropping the marks leaves the bare letters.
        decomposed = unicodedata.normalize('NFD', word)
        return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    kept = []
    for token in nlp.tokenizer(text):
        if not token.is_alpha:
            continue
        if token.is_digit or token.is_stop or len(token.text) == 1:
            continue
        if token.is_ascii:
            kept.append(token.text.lower())
        else:
            kept.append(_strip_marks(token.text.lower()))
    return ' '.join(kept)

In [7]:
def parallelize_dataframe(df, func, n_cores=8):
    """Apply ``func`` to row-wise chunks of ``df`` in worker processes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process; it is cut into ``n_cores`` consecutive row chunks
        (some may be empty when ``df`` has fewer rows than ``n_cores``).
    func : callable
        Called once per chunk inside a worker process. It must be picklable
        and return something ``pd.concat`` accepts (e.g. a DataFrame).
    n_cores : int, default 8
        Number of chunks and of worker processes.

    Returns
    -------
    The concatenation (in chunk order) of the per-chunk results.
    """
    # Split row *positions* and slice with iloc instead of handing the frame
    # itself to np.array_split: the latter goes through DataFrame.swapaxes,
    # which recent pandas versions deprecate.
    positions = np.array_split(np.arange(len(df)), n_cores)
    chunks = [df.iloc[idx] for idx in positions]

    # The context manager tears the pool down even when ``func`` raises in a
    # worker, so no worker processes are left behind.
    with Pool(n_cores) as pool:
        results = pool.map(func, chunks)
    return pd.concat(results)

In [5]:
def preprocess(df):
    """Build the fastText training columns for a chunk of the training data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``language``, ``title`` and ``category``.
        The frame is modified: ``tokens`` and ``label`` columns are added.

    Returns
    -------
    pandas.DataFrame
        The ``label`` and ``tokens`` columns. ``label`` is the category
        prefixed with ``__label__``; ``tokens`` is the normalized title, or
        NaN for rows whose language is neither 'spanish' nor 'portuguese'.
    """
    tokens = pd.Series(np.nan, index=df.index, dtype=object)
    for language, nlp in (("spanish", Spanish()), ("portuguese", Portuguese())):
        mask = df["language"] == language
        if mask.any():
            # Tokenize only the rows written in this language; running every
            # title through both pipelines would double the work for nothing.
            normalized = df.loc[mask, "title"].apply(normalize_text, args=(nlp,))
            tokens.loc[mask] = normalized.to_numpy()
    df["tokens"] = tokens
    df["label"] = df["category"].apply(lambda x: '__label__'+ x)
    return df[["label","tokens"]]

In [6]:
def preprocess_test(df):
    """Build the normalized-title column for a chunk of the test data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``id``, ``language`` and ``title``.
        The frame is modified: a ``tokens`` column is added.

    Returns
    -------
    pandas.DataFrame
        The ``id`` and ``tokens`` columns. ``tokens`` is the normalized
        title, or NaN for rows whose language is neither 'spanish' nor
        'portuguese'.
    """
    tokens = pd.Series(np.nan, index=df.index, dtype=object)
    for language, nlp in (("spanish", Spanish()), ("portuguese", Portuguese())):
        mask = df["language"] == language
        if mask.any():
            # Tokenize only the rows written in this language; running every
            # title through both pipelines would double the work for nothing.
            normalized = df.loc[mask, "title"].apply(normalize_text, args=(nlp,))
            tokens.loc[mask] = normalized.to_numpy()
    df["tokens"] = tokens
    return df[["id","tokens"]]

In [9]:
def create_fasttext_split_files(train_df, test_df):
    """Preprocess both datasets and write the fastText input files under ``../data``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Raw training data, passed chunk-wise to ``preprocess``.
    test_df : pandas.DataFrame
        Raw test data, passed chunk-wise to ``preprocess_test``.

    Writes
    ------
    ../data/train_fasttext_norm.csv : 95% of the training rows, ``label tokens``.
    ../data/val_fasttext_norm.csv   : 5% stratified hold-out, same format.
    ../data/test_fasttext_norm.txt  : one normalized test title per line.
    """
    # train and validation set files
    train = parallelize_dataframe(train_df, preprocess)
    # Stratified 95/5 split with a fixed seed so the validation set is reproducible.
    X_train, X_val, y_train, y_val = train_test_split(train["tokens"], train["label"], test_size=0.05, random_state=42, stratify=train["label"])
    train_fasttext = pd.concat([y_train,X_train], axis=1)
    val_fasttext = pd.concat([y_val,X_val], axis=1)
    # Space-separated, unquoted lines: "<label> <tokens>".
    # NOTE(review): the space escapechar presumably exists only to let the csv
    # writer emit the spaces inside ``tokens`` without quoting — confirm.
    train_fasttext.to_csv('../data/train_fasttext_norm.csv',index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")
    val_fasttext.to_csv('../data/val_fasttext_norm.csv',index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")

    #test set file
    test = parallelize_dataframe(test_df, preprocess_test)
    # Written with csv.writer rather than Series.to_csv(line_terminator=...):
    # that keyword was renamed to ``lineterminator`` in newer pandas, so the
    # original call is not portable across pandas versions. Missing titles
    # become empty fields, which the csv writer emits as "" (not a blank line).
    with open("../data/test_fasttext_norm.txt", "w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle, lineterminator="\n")
        writer.writerows([title] for title in test["tokens"].fillna(""))

In [10]:
# Preprocess the raw train/test frames and write the fastText input files;
# %time reports the CPU and wall-clock time of the whole run.
%time create_fasttext_split_files(data_train, data_test)

CPU times: user 3min 38s, sys: 21.1 s, total: 3min 59s
Wall time: 26min 7s


## Training

In [2]:
# Train a supervised fastText classifier (5 epochs, lr 0.5, word bigrams, 8 threads).
# NOTE(review): this reads ../data/train_fasttext.csv, while the preprocessing
# step above writes ../data/train_fasttext_norm.csv — confirm which file is intended.
%time model = fasttext.train_supervised(input="../data/train_fasttext.csv", epoch=5, lr=0.5, wordNgrams=2, thread=8)

CPU times: user 12h 55min 58s, sys: 22.4 s, total: 12h 56min 20s
Wall time: 1h 37min 34s


In [3]:
# Evaluate the trained model on the validation file. The result is a 3-tuple,
# presumably (sample count, precision@1, recall@1) given how print_results
# unpacks it further down — confirm against the fastText docs.
%time model.test('../data/val_fasttext.csv')

CPU times: user 4min 26s, sys: 1.38 s, total: 4min 28s
Wall time: 4min 25s


(1000000, 0.881847, 0.881847)

In [None]:
# Smoke test on a single title; the second argument (5) is presumably the
# number of labels to return — confirm against the fastText API.
model.predict('bici playera',5)

In [4]:
# Persist the trained model so it can be reloaded without retraining.
model.save_model("../models/model2.bin")

In [None]:
## Test

In [None]:
# First label of every prediction.
# NOTE(review): in the visible notebook `predictions` is only assigned in the
# Predict section further down, so this cell fails on a fresh top-to-bottom run.
[x[0] for x in predictions[0]]

In [None]:
# Inspect which columns the raw test set provides.
data_test.columns

In [None]:
# Experiment: same setup as `model` but with hierarchical softmax loss
# (loss='hs') and 10 epochs.
%time model2 = fasttext.train_supervised(input="../data/train_fasttext.csv", epoch=10, lr=0.5, wordNgrams=2, loss='hs', thread=8)

In [None]:
def print_results(N, p, r):
    """Print an evaluation summary as three tab-separated lines.

    N is printed as-is; p and r are printed as P@1 and R@1 with three decimals.
    """
    print("N\t" + str(N))
    for template, score in (("P@{}\t{:.3f}", p), ("R@{}\t{:.3f}", r)):
        print(template.format(1, score))

# Report the metrics of the 10-epoch model2.
# NOTE(review): this evaluates on ../data/test_fasttext.csv whereas the first
# model was evaluated on ../data/val_fasttext.csv — confirm the file is labelled.
print_results(*model2.test('../data/test_fasttext.csv'))

In [None]:
# Experiment: hierarchical softmax again, but only 5 epochs.
# This rebinds `model2`, discarding the 10-epoch model trained above.
%time model2 = fasttext.train_supervised(input="../data/train_fasttext.csv", epoch=5, lr=0.5, wordNgrams=2, loss='hs', thread=8)

In [None]:
def print_results(N, p, r):
    """Print an evaluation summary as three tab-separated lines.

    N is printed as-is; p and r are printed as P@1 and R@1 with three decimals.
    """
    print("N\t" + str(N))
    for template, score in (("P@{}\t{:.3f}", p), ("R@{}\t{:.3f}", r)):
        print(template.format(1, score))

# Report the metrics of the 5-epoch model2 on ../data/test_fasttext.csv.
print_results(*model2.test('../data/test_fasttext.csv'))

In [None]:
# Experiment: hierarchical softmax, 5 epochs, higher learning rate (0.8).
%time model3 = fasttext.train_supervised(input="../data/train_fasttext.csv", epoch=5, lr=0.8, wordNgrams=2, loss='hs', thread=8)

In [None]:
def print_results(N, p, r):
    """Print an evaluation summary as three tab-separated lines.

    N is printed as-is; p and r are printed as P@1 and R@1 with three decimals.
    """
    print("N\t" + str(N))
    for template, score in (("P@{}\t{:.3f}", p), ("R@{}\t{:.3f}", r)):
        print(template.format(1, score))

# Report the metrics of model3, the model trained in the preceding cell.
# (This cell previously evaluated model2 again, so the lr=0.8 experiment was
# never actually measured.)
print_results(*model3.test('../data/test_fasttext.csv'))

## Predict

In [18]:
# Load a previously saved model for inference.
# NOTE(review): the Training section saves ../models/model2.bin; no visible cell
# writes ../models/model_norm1.bin — confirm where this file comes from.
model = fasttext.load_model("../models/model_norm1.bin")




In [14]:
# Read the normalized test titles back in. The file is written without a header
# row by create_fasttext_split_files, so the column name is supplied here.
test_data = pd.read_csv('../data/test_fasttext_norm.txt',names=['tokens'])

In [15]:
# Titles that normalized to nothing are read back as NaN; substitute a
# placeholder string so every entry passed to the model is text.
# Reassigning (rather than mutating in place) keeps the cell safe to re-run.
test_data = test_data.fillna('notitle')

In [19]:
# Predict a label for every test title in one batch call; the result is indexed
# below as predictions[0] (labels per title).
%time predictions = model.predict(test_data["tokens"].values.tolist())

CPU times: user 1min 6s, sys: 164 ms, total: 1min 6s
Wall time: 1min 6s


In [None]:
# Keep the top label of each prediction and drop its '__label__' prefix,
# leaving the bare category name.
submission = pd.Series([labels[0][len('__label__'):] for labels in predictions[0]])

In [32]:
# Write the predicted categories to the submission file.
# NOTE(review): the default of `header` for Series.to_csv differs between pandas
# versions — consider passing header= explicitly so the file layout is stable.
submission.to_csv("./submissions/submission3.csv")

  """Entry point for launching an IPython kernel.


In [33]:
# Display the predicted categories (pandas shortens the Series repr).
submission

0                                  DIAPER_BAGS
1                           BABY_CHANGING_PADS
2                    ENGINE_COOLING_FAN_MOTORS
3         AUTOMOTIVE_SHOCK_ABSORBER_BUMP_STOPS
4                               BABY_CAR_SEATS
                          ...                 
246950                     VEHICLE_BRAKE_DISCS
246951                          WALKIE_TALKIES
246952                             CALCULATORS
246953                             DINING_SETS
246954                           WASTE_BASKETS
Length: 246955, dtype: object

In [8]:
# Preview the first titles only; dumping the full list of test titles floods
# the notebook output and bloats the saved file.
test_data["tokens"].head(25).tolist()

['kit maternidade bolsa mala baby bebe vinho menina',
 'trocador de fraldas fisher price feminino rosa portátil',
 'motor ventoinha fiat idea palio',
 'amortecedor mola batente dir new civic',
 'cadeirinha de carro bebê princesa princess kgs',
 'cabo freio mao tras direito vw up cod',
 'mini pc dell optiplex atom gb ram ssd gb wifi',
 'kit bi xenon lâmpada',
 'protetor pé botinha kickboxing karate taekwondo competição',
 'disco rigido externo western digital elements tb',
 'picadora de carne fineschi legitima',
 'saída maternidade manta brinde',
 'jogo vela ignicao ngk mercedes benz clc kompressor',
 'modulo de injeção fox flex aa',
 'set barreta automotor bremen unid cm',
 'miel organica tacc',
 'sax tenor coon',
 'sapato seguranca vaqueta bidensidade com biqueira de pvc',
 'adaptador para cartão de memória micro sd frete grátis',
 'bandeja giradiscos omnitronic',
 'tonfa militar',
 'kit almohadones funda respaldos caramelos eco cuero',
 'parlante portatil hp mini roar bluetooth rojo 