In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from spacy.lang.es import Spanish
from spacy.lang.pt import Portuguese
from spacy.tokenizer import Tokenizer
from sklearn.metrics import balanced_accuracy_score
from multiprocessing import  Pool
import numpy as np
import fasttext
import csv

In [16]:
# Load the labelled training set; later cells read its `title`, `language`
# and `category` columns.
data_raw = pd.read_csv('../data/train.csv')

In [2]:
## Preprocess

In [17]:
def preprocess(df):
    """Tokenize training titles per language and build fastText labels.

    Every title is tokenized with the spaCy tokenizer that matches its
    ``language`` value (``'spanish'`` or ``'portuguese'``). Tokens are
    lower-cased and kept only when they are alphabetic, are not stop words
    and are longer than one character; the survivors are joined with single
    spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``title``, ``language`` and ``category``.

    Returns
    -------
    pandas.DataFrame
        New frame on the index of ``df`` with the columns ``label``
        (``'__label__' + category``) and ``tokens``. Rows whose language is
        neither Spanish nor Portuguese get ``NaN`` tokens.

    Notes
    -----
    ``df`` itself is left untouched: the result is assembled in a new frame
    instead of adding columns to the input.
    """
    tokenizers = {
        'spanish': Spanish().tokenizer,
        'portuguese': Portuguese().tokenizer,
    }

    def clean(title, tokenizer):
        # Keep alphabetic, non-stop-word tokens longer than one character.
        return ' '.join(
            tok.text.lower()
            for tok in tokenizer(title)
            if tok.is_alpha and not (tok.is_digit or tok.is_stop or len(tok.text) == 1)
        )

    tokens = pd.Series(np.nan, index=df.index, dtype=object)
    for language, tokenizer in tokenizers.items():
        mask = df["language"] == language
        if mask.any():
            # Tokenize only the rows written in this language; running each
            # tokenizer over every title would double the work.
            cleaned = df.loc[mask, "title"].apply(clean, tokenizer=tokenizer)
            tokens.loc[mask] = cleaned.values

    labels = df["category"].map(lambda category: '__label__' + category)
    return pd.DataFrame({"label": labels, "tokens": tokens})

In [32]:
def preprocess_test(df):
    """Tokenize test-set titles per language, keeping the row ids.

    Applies the same cleaning as the training preprocessing: each title is
    tokenized with the spaCy tokenizer that matches its ``language`` value
    (``'spanish'`` or ``'portuguese'``), tokens are lower-cased and kept only
    when alphabetic, not stop words and longer than one character, then
    joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``id``, ``title`` and ``language``.

    Returns
    -------
    pandas.DataFrame
        New frame on the index of ``df`` with the columns ``id`` and
        ``tokens``. Rows whose language is neither Spanish nor Portuguese
        get ``NaN`` tokens.

    Notes
    -----
    ``df`` itself is left untouched: the result is assembled in a new frame
    instead of adding a column to the input.
    """
    tokenizers = {
        'spanish': Spanish().tokenizer,
        'portuguese': Portuguese().tokenizer,
    }

    def clean(title, tokenizer):
        # Keep alphabetic, non-stop-word tokens longer than one character.
        return ' '.join(
            tok.text.lower()
            for tok in tokenizer(title)
            if tok.is_alpha and not (tok.is_digit or tok.is_stop or len(tok.text) == 1)
        )

    tokens = pd.Series(np.nan, index=df.index, dtype=object)
    for language, tokenizer in tokenizers.items():
        mask = df["language"] == language
        if mask.any():
            # Tokenize only the rows written in this language; running each
            # tokenizer over every title would double the work.
            cleaned = df.loc[mask, "title"].apply(clean, tokenizer=tokenizer)
            tokens.loc[mask] = cleaned.values

    return pd.DataFrame({"id": df["id"], "tokens": tokens})

In [18]:
def parallelize_dataframe(df, func, n_cores=8):
    """Apply ``func`` to row-wise chunks of ``df`` in parallel.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    func : callable
        Picklable function that takes a DataFrame chunk and returns a
        DataFrame (or Series) accepted by ``pd.concat``.
    n_cores : int, default 8
        Number of chunks and of worker processes.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-chunk results, in chunk order.
    """
    # Split by row position. Chunk sizes match np.array_split(df, n_cores),
    # but this avoids calling it on the DataFrame itself, which goes through
    # DataFrame.swapaxes (deprecated in recent pandas releases).
    row_groups = np.array_split(np.arange(len(df)), n_cores)
    chunks = [df.iloc[rows] for rows in row_groups]

    # The context manager tears the pool down even when func raises in a
    # worker, so no worker processes are leaked.
    with Pool(n_cores) as pool:
        parts = pool.map(func, chunks)
    return pd.concat(parts)

In [19]:
# Tokenize the training titles in parallel with the default number of worker
# processes; the recorded run below took about 24 minutes of wall time.
%time train = parallelize_dataframe(data_raw, preprocess)

CPU times: user 31.6 s, sys: 15.9 s, total: 47.5 s
Wall time: 24min 3s


In [21]:
# Hold out 5% of the rows for evaluation, stratified by label so both splits
# share the same class mix; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    train["tokens"],
    train["label"],
    test_size=0.05,
    random_state=42,
    stratify=train["label"],
)

# Two-column frames with the label first and the tokenized title second.
train_fasttext, test_fasttext = (
    pd.concat([labels, texts], axis=1)
    for labels, texts in ((y_train, X_train), (y_test, X_test))
)

In [22]:
# Write both splits as space-separated "label tokens" lines without header,
# index or quoting, using the same writer options for each file.
fasttext_csv_options = dict(index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")
for split_frame, split_path in (
    (train_fasttext, '../data/train_fasttext.csv'),
    (test_fasttext, '../data/test_fasttext.csv'),
):
    split_frame.to_csv(split_path, **fasttext_csv_options)

In [32]:
## Training

In [None]:
# Exploratory: print the documentation of fasttext.FastText. Nothing later in
# the notebook uses the output of this cell.
help(fasttext.FastText)

In [27]:
# Train a supervised fastText classifier on the training split: 25 epochs,
# learning rate 0.5, word n-grams up to length 2, loss 'hs' (hierarchical
# softmax per the fastText docs) and 8 threads.
%time model = fasttext.train_supervised(input="../data/train_fasttext.csv", epoch=25, lr=0.5, wordNgrams=2, loss='hs', thread=8)

CPU times: user 1h 34min 25s, sys: 28.4 s, total: 1h 34min 53s
Wall time: 12min 19s


In [28]:
# Evaluate on the held-out split. The result is a 3-tuple; the recorded run
# shows (number of examples, precision@1, recall@1).
%time model.test('../data/test_fasttext.csv')

CPU times: user 14.1 s, sys: 144 ms, total: 14.2 s
Wall time: 14.1 s


(1000000, 0.820198, 0.820198)

In [29]:
# Sanity check: request up to 5 labels for a sample title. The result is a
# pair (labels, probabilities); the recorded run returned 3 labels.
model.predict('bici playera',5)

(('__label__BICYCLES', '__label__STATIONARY_BICYCLES', '__label__SWAY_BARS'),
 array([9.99983549e-01, 1.03018981e-04, 2.20278780e-05]))

In [None]:
## Test

In [30]:
# Load the unlabelled test set (columns: id, title, language).
data_test = pd.read_csv('../data/test.csv')

In [33]:
# Tokenize the test titles with the same parallel helper used for training.
%time test = parallelize_dataframe(data_test, preprocess_test)

CPU times: user 243 ms, sys: 2.42 s, total: 2.67 s
Wall time: 21.5 s


In [65]:
def print_results(N, p, r):
    """Print a fastText evaluation triple as three tab-separated lines.

    Parameters
    ----------
    N : int
        Number of evaluated examples.
    p : float
        Precision at 1, printed with three decimals.
    r : float
        Recall at 1, printed with three decimals.
    """
    report_lines = [
        "N\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    ]
    print("\n".join(report_lines))

# Re-run the held-out evaluation and print its three values in readable form.
print_results(*model.test('../data/test_fasttext.csv'))

N	1000000
P@1	0.820
R@1	0.820


In [68]:
# Dump the tokenized test titles, one per line, always terminated by '\n'.
# The lines are written directly instead of through
# Series.to_csv(line_terminator=...), because pandas renamed that keyword to
# `lineterminator` (1.5) and removed the old spelling (2.0).
# Missing or empty token strings are written as blank lines.
with open("../data/TEST.txt", "w", encoding="utf-8", newline="\n") as test_tokens_file:
    test_tokens_file.writelines(line + "\n" for line in test["tokens"].fillna(""))

In [73]:
# Predict a label for every tokenized test title in one batch call.
# predictions[0] holds the per-title label tuples that later cells read.
predictions = model.predict(test["tokens"].values.tolist())

In [None]:
# Exploratory: first predicted label of every title, still carrying the
# '__label__' prefix. NOTE(review): this displays the whole list.
[x[0] for x in predictions[0]]

In [81]:
# Inspect which columns the raw test set provides.
data_test.columns

Index(['id', 'title', 'language'], dtype='object')

In [95]:
# Strip the fastText label prefix from the first predicted label of each row,
# leaving the bare category name.
LABEL_PREFIX = '__label__'
submission_1 = pd.Series([labels[0][len(LABEL_PREFIX):] for labels in predictions[0]])

In [97]:
# Write "row index,category" lines. `header` is passed explicitly because the
# default of Series.to_csv changed between pandas versions (older releases
# wrote no header row and warned about the upcoming change); header=False
# keeps the file free of a header line on every version.
submission_1.to_csv("submission1.txt", header=False)

  """Entry point for launching an IPython kernel.


In [96]:
# Display the predicted categories (pandas truncates the long Series).
submission_1

0                                  DIAPER_BAGS
1                           BABY_CHANGING_PADS
2                    ENGINE_COOLING_FAN_MOTORS
3         AUTOMOTIVE_SHOCK_ABSORBER_BUMP_STOPS
4                               BABY_CAR_SEATS
                          ...                 
246950                     VEHICLE_BRAKE_DISCS
246951                          WALKIE_TALKIES
246952                             CALCULATORS
246953                           DINING_TABLES
246954                           WASTE_BASKETS
Length: 246955, dtype: object