In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from spacy.lang.es import Spanish
from spacy.lang.pt import Portuguese
from spacy.tokenizer import Tokenizer
from sklearn.metrics import balanced_accuracy_score
from multiprocessing import  Pool
import numpy as np
import fasttext
import csv

## Preprocess

In [2]:
# Load the raw CSV files: the first 1,000,000 training rows and the whole test set.
data_train = pd.read_csv(
    '../data/train.csv',
    nrows=1000000,
)
data_test = pd.read_csv(
    '../data/test.csv',
)

In [3]:
def parallelize_dataframe(df, func, n_cores=8):
    """Apply ``func`` to row-wise chunks of ``df`` in parallel and re-assemble them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is split by row position into ``n_cores`` chunks of
        near-equal size (chunks may be empty when ``len(df) < n_cores``).
    func : callable
        Receives one chunk and returns a pandas object. It is sent to worker
        processes through ``Pool.map``, so it must be picklable.
    n_cores : int, default 8
        Number of chunks and of worker processes.

    Returns
    -------
    The per-chunk results concatenated with ``pd.concat`` in chunk order.
    """
    # Split the row *positions* instead of the frame itself: np.array_split on a
    # DataFrame relies on DataFrame.swapaxes, which pandas 2.x deprecates.
    row_chunks = np.array_split(np.arange(len(df)), n_cores)
    df_split = [df.iloc[rows] for rows in row_chunks]

    pool = Pool(n_cores)
    try:
        result = pd.concat(pool.map(func, df_split))
    finally:
        # Release the worker processes even when func raises in a worker.
        pool.close()
        pool.join()
    return result

In [4]:
def preprocess(df):
    """Turn raw training rows into fastText-ready ``label`` / ``tokens`` columns.

    Every title is tokenized with the spaCy tokenizer matching its ``language``
    value (``'spanish'`` or ``'portuguese'``). Only alphabetic tokens that are
    not stop words and are longer than one character are kept; they are
    lower-cased and joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``title``, ``language`` and ``category``.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns ``["label", "tokens"]`` on the same index.
        ``label`` is ``'__label__'`` followed by the category. Rows whose
        language is neither of the two supported values get NaN tokens.
        The input frame is left unmodified.
    """
    tokenizers = {
        'spanish': Spanish().tokenizer,
        'portuguese': Portuguese().tokenizer,
    }

    def clean(title, tokenizer):
        # Keep informative word tokens only, normalised to lower case.
        return ' '.join(
            tok.text.lower()
            for tok in tokenizer(title)
            if tok.is_alpha and not (tok.is_digit or tok.is_stop or len(tok.text) == 1)
        )

    # Object dtype lets the column hold strings next to the NaN placeholder.
    tokens = pd.Series(np.nan, index=df.index, dtype=object)
    for language, tokenizer in tokenizers.items():
        mask = df["language"] == language
        if mask.any():
            # Tokenize only the rows written in this language; running both
            # tokenizers over every title would double the work for nothing.
            tokens.loc[mask] = df.loc[mask, "title"].apply(clean, tokenizer=tokenizer)

    labels = df["category"].apply(lambda x: '__label__' + x)
    return pd.DataFrame({"label": labels, "tokens": tokens})

In [5]:
def preprocess_test(df):
    """Tokenize raw test rows, keeping their ``id`` for the submission.

    Every title is tokenized with the spaCy tokenizer matching its ``language``
    value (``'spanish'`` or ``'portuguese'``). Only alphabetic tokens that are
    not stop words and are longer than one character are kept; they are
    lower-cased and joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``id``, ``title`` and ``language``.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns ``["id", "tokens"]`` on the same index.
        Rows whose language is neither of the two supported values get NaN
        tokens. The input frame is left unmodified.
    """
    tokenizers = {
        'spanish': Spanish().tokenizer,
        'portuguese': Portuguese().tokenizer,
    }

    def clean(title, tokenizer):
        # Keep informative word tokens only, normalised to lower case.
        return ' '.join(
            tok.text.lower()
            for tok in tokenizer(title)
            if tok.is_alpha and not (tok.is_digit or tok.is_stop or len(tok.text) == 1)
        )

    # Object dtype lets the column hold strings next to the NaN placeholder.
    tokens = pd.Series(np.nan, index=df.index, dtype=object)
    for language, tokenizer in tokenizers.items():
        mask = df["language"] == language
        if mask.any():
            # Tokenize only the rows written in this language; running both
            # tokenizers over every title would double the work for nothing.
            tokens.loc[mask] = df.loc[mask, "title"].apply(clean, tokenizer=tokenizer)

    return pd.DataFrame({"id": df["id"], "tokens": tokens})

In [6]:
def create_fasttext_split_files(train_df, test_df):
    """Preprocess both frames and write the fastText input files to ../data/small.

    The training frame is preprocessed in parallel, split 95% / 5% into train
    and validation sets (stratified on the label, ``random_state=42``) and
    written as ``train_fasttext.csv`` / ``val_fasttext.csv`` with one
    ``<label> <tokens>`` line per row. The test frame is preprocessed in
    parallel and its tokens are written, one row per line, to
    ``test_fasttext.txt``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Raw training rows, passed to ``preprocess``.
    test_df : pandas.DataFrame
        Raw test rows, passed to ``preprocess_test``.

    Returns
    -------
    None. The function only writes files.
    """
    import os  # local import: only needed to create the output directory

    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs('../data/small', exist_ok=True)

    # train and validation set files
    train = parallelize_dataframe(train_df, preprocess)
    X_train, X_val, y_train, y_val = train_test_split(
        train["tokens"],
        train["label"],
        test_size=0.05,
        random_state=42,
        stratify=train["label"],
    )
    train_fasttext = pd.concat([y_train, X_train], axis=1)
    val_fasttext = pd.concat([y_val, X_val], axis=1)

    # Label and tokens are separated by a space and nothing is quoted. Spaces
    # inside the token string are "escaped" with another space, so the output
    # stays plain whitespace-separated text.
    fasttext_csv_options = dict(
        index=False,
        sep=' ',
        header=False,
        quoting=csv.QUOTE_NONE,
        quotechar="",
        escapechar=" ",
    )
    train_fasttext.to_csv('../data/small/train_fasttext.csv', **fasttext_csv_options)
    val_fasttext.to_csv('../data/small/val_fasttext.csv', **fasttext_csv_options)

    # test set file
    test = parallelize_dataframe(test_df, preprocess_test)
    # `lineterminator` is the pandas >= 1.5 spelling; the former
    # `line_terminator` keyword was removed in pandas 2.0.
    test["tokens"].to_csv(
        "../data/small/test_fasttext.txt",
        index=False,
        header=False,
        lineterminator='\n',
    )

In [8]:
# Build the train / validation / test fastText files from the loaded frames (timed).
%time create_fasttext_split_files(data_train, data_test)

CPU times: user 9.3 s, sys: 1.32 s, total: 10.6 s
Wall time: 1min 39s


## Training

In [11]:
# Train a supervised fastText classifier on the training split: 25 epochs,
# learning rate 0.5, word n-grams up to length 2, 8 threads (timed).
%time model = fasttext.train_supervised(input="../data/small/train_fasttext.csv", epoch=25, lr=0.5, wordNgrams=2, thread=8)

CPU times: user 3h 14min 18s, sys: 4.97 s, total: 3h 14min 23s
Wall time: 24min 23s


In [12]:
# Evaluate on the held-out validation split. Per the fastText docs, test()
# returns (number of examples, precision@1, recall@1).
%time model.test('../data/small/val_fasttext.csv')

CPU times: user 13.4 s, sys: 60 ms, total: 13.4 s
Wall time: 13.3 s


(50000, 0.82412, 0.82412)

In [None]:
# Sanity check: show the 5 best labels for a sample title.
model.predict('bici playera', k=5)

In [None]:
# Persist the trained model to disk.
# NOTE(review): presumably ../models/small must already exist — confirm.
model.save_model("../models/small/model1.bin")

## Predict

In [None]:
# Read back the test tokens written by create_fasttext_split_files, which saves
# them as test_fasttext.txt (not .csv). keep_default_na=False keeps tokens such
# as "nan" or "null", and empty token rows, as plain strings instead of NaN.
test_data = pd.read_csv(
    '../data/small/test_fasttext.txt',
    names=['tokens'],
    keep_default_na=False,
)

In [None]:
# Predict the best label for every test row. The frame loaded in this section
# is `test_data`; `test` only exists as a local inside
# create_fasttext_split_files and raised a NameError here.
predictions = model.predict(test_data["tokens"].values.tolist())

In [None]:
# predictions[0] holds the predicted labels per row: keep the first label of
# each row and drop its '__label__' prefix.
submission = pd.Series(
    [labels[0][len('__label__'):] for labels in predictions[0]]
)

In [None]:
# Write the predicted categories to the submission file.
# NOTE(review): this path is relative to the notebook directory ("./"), while
# the data and model paths use "../" — confirm the intended location.
# NOTE(review): the Series index is written as the first column; verify this
# matches the expected submission format.
submission.to_csv("./submissions/small/submission1.txt")