In [1]:
import csv
import inspect
import unicodedata
from collections import defaultdict
from multiprocessing import  Pool

import fasttext
import numpy as np
import pandas as pd
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from spacy.lang.es import Spanish
from spacy.lang.pt import Portuguese
from spacy.tokenizer import Tokenizer

In [145]:
pd.set_option('display.max_rows', 200)

## Preprocess

In [3]:
# Load Dataset
data_train = pd.read_csv('../data/train.csv')
data_test = pd.read_csv('../data/test.csv')

In [9]:
def normalize_text(text,nlp):
    """Return the normalized, space-joined tokens of `text`.

    `text` is split with `nlp.tokenizer`. A token is kept only when its
    `is_alpha` flag is set and it is neither `is_digit`, `is_stop`, nor a
    single character. Kept tokens are lower-cased; tokens that are not
    `is_ascii` are additionally NFD-decomposed and stripped of combining
    marks (Unicode category 'Mn'), which removes accents.

    Parameters
    ----------
    text : str
        Raw text to normalize.
    nlp : object
        Object exposing a `tokenizer(text)` callable that yields tokens with
        `text`, `is_alpha`, `is_digit`, `is_stop` and `is_ascii` attributes.

    Returns
    -------
    str
        Kept tokens joined by single spaces ('' when nothing is kept).
    """
    kept = []
    for token in nlp.tokenizer(text):
        if not token.is_alpha:
            continue
        if token.is_digit or token.is_stop or len(token.text) == 1:
            continue
        lowered = token.text.lower()
        if not token.is_ascii:
            # Decompose accented characters, then drop the combining marks.
            decomposed = unicodedata.normalize('NFD', lowered)
            lowered = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
        kept.append(lowered)
    return ' '.join(kept)

In [10]:
def parallelize_dataframe(df, func, n_cores=8):
    """Apply `func` to row-wise chunks of `df` and concatenate the results.

    `df` is cut into `n_cores` consecutive chunks (the first
    `len(df) % n_cores` chunks get one extra row, the same sizes
    `np.array_split` produces). With more than one core each chunk is sent
    to a `multiprocessing.Pool` worker; with a single core the chunk is
    processed in the current process, so no worker is spawned and `func`
    does not need to be picklable.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    func : callable
        Called with one chunk (a DataFrame) and must return an object that
        `pd.concat` accepts. Must be picklable when `n_cores > 1`.
    n_cores : int, default 8
        Number of chunks / worker processes.

    Returns
    -------
    pandas.DataFrame
        `pd.concat` of the per-chunk results, in chunk order.

    Raises
    ------
    ValueError
        If `n_cores` is smaller than 1.
    """
    if n_cores < 1:
        raise ValueError("n_cores must be a positive integer, got {!r}".format(n_cores))

    # Slice with iloc instead of np.array_split: on a DataFrame the latter
    # goes through DataFrame.swapaxes, which recent pandas versions deprecate.
    base, extra = divmod(len(df), n_cores)
    chunks = []
    start = 0
    for position in range(n_cores):
        stop = start + base + (1 if position < extra else 0)
        chunks.append(df.iloc[start:stop])
        start = stop

    if n_cores == 1:
        # Hand over a copy so `func` may mutate its chunk, exactly as it can
        # with the pickled copies that pool workers receive.
        parts = [func(chunk.copy()) for chunk in chunks]
    else:
        # The context manager tears the pool down even when a worker raises.
        with Pool(n_cores) as pool:
            parts = pool.map(func, chunks)
    return pd.concat(parts)

In [11]:
def preprocess(df):
    """Build the fastText training columns for a chunk of the training data.

    Reads the "language", "title" and "category" columns of `df`. Each title
    is normalized with `normalize_text` using the tokenizer pipeline of its
    own language only, so every row is tokenized once. Rows whose language is
    neither 'spanish' nor 'portuguese' keep NaN as their tokens. `df` itself
    is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "language", "title" and "category" columns.

    Returns
    -------
    pandas.DataFrame
        Columns "label" ('__label__' + category) and "tokens", indexed like
        `df`.
    """
    # Tokenizer pipeline for each supported value of the "language" column.
    pipelines = {"spanish": Spanish(), "portuguese": Portuguese()}

    tokens = pd.Series(np.nan, index=df.index, dtype=object)
    for language, nlp in pipelines.items():
        mask = df["language"] == language
        # Restrict the apply to this language's rows instead of running both
        # pipelines over every title.
        tokens.loc[mask] = df.loc[mask, "title"].apply(normalize_text, args=(nlp,))

    labels = df["category"].apply(lambda x: '__label__'+ x)
    return pd.DataFrame({"label": labels, "tokens": tokens}, columns=["label", "tokens"])

In [12]:
def preprocess_test(df):
    """Build the normalized tokens for a chunk of the test data.

    Reads the "language", "title" and "id" columns of `df`. Each title is
    normalized with `normalize_text` using the tokenizer pipeline of its own
    language only, so every row is tokenized once. Rows whose language is
    neither 'spanish' nor 'portuguese' keep NaN as their tokens. `df` itself
    is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "language", "title" and "id" columns.

    Returns
    -------
    pandas.DataFrame
        Columns "id" and "tokens", indexed like `df`.
    """
    # Tokenizer pipeline for each supported value of the "language" column.
    pipelines = {"spanish": Spanish(), "portuguese": Portuguese()}

    tokens = pd.Series(np.nan, index=df.index, dtype=object)
    for language, nlp in pipelines.items():
        mask = df["language"] == language
        # Restrict the apply to this language's rows instead of running both
        # pipelines over every title.
        tokens.loc[mask] = df.loc[mask, "title"].apply(normalize_text, args=(nlp,))

    return pd.DataFrame({"id": df["id"], "tokens": tokens}, columns=["id", "tokens"])

In [13]:
def create_fasttext_split_files(train_df, test_df, outputfiles):
    """Preprocess the train/test frames and write the three fastText input files.

    The training frame is preprocessed in parallel with `preprocess`, split
    95% / 5% into train and validation sets (stratified on the label,
    `random_state=42`) and written as space-separated "label tokens" lines.
    The test frame is preprocessed with `preprocess_test` and only its
    "tokens" column is written, one row per line.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data, in the shape `preprocess` expects.
    test_df : pandas.DataFrame
        Test data, in the shape `preprocess_test` expects.
    outputfiles : sequence of str
        Destination paths, in order: train file, validation file, test file.

    Raises
    ------
    IndexError
        If `outputfiles` has fewer than three entries.
    """
    # Resolve all three paths before the expensive preprocessing so that a
    # short `outputfiles` fails immediately rather than after the work is done.
    train_path, val_path, test_path = outputfiles[0], outputfiles[1], outputfiles[2]

    # train and validation set files
    train = parallelize_dataframe(train_df, preprocess)
    X_train, X_val, y_train, y_val = train_test_split(train["tokens"], train["label"], test_size=0.05, random_state=42, stratify=train["label"])
    train_fasttext = pd.concat([y_train,X_train], axis=1)
    val_fasttext = pd.concat([y_val,X_val], axis=1)
    # NOTE(review): with sep=' ' and escapechar=' ' the csv writer appears to
    # escape each space inside the tokens with another space; the output is
    # still whitespace-separated, so the options are kept as they were.
    fasttext_csv_options = dict(index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")
    train_fasttext.to_csv(train_path, **fasttext_csv_options)
    val_fasttext.to_csv(val_path, **fasttext_csv_options)

    #test set file
    test = parallelize_dataframe(test_df, preprocess_test)
    # pandas renamed `line_terminator` to `lineterminator` and later removed
    # the old spelling; use whichever keyword the installed version accepts.
    to_csv_parameters = inspect.signature(test["tokens"].to_csv).parameters
    terminator_keyword = "lineterminator" if "lineterminator" in to_csv_parameters else "line_terminator"
    test["tokens"].to_csv(test_path, index=False, header=False, **{terminator_keyword: '\n'})

In [10]:
%time create_fasttext_split_files(data_train, data_test)

CPU times: user 3min 38s, sys: 21.1 s, total: 3min 59s
Wall time: 26min 7s


## Training

In [2]:
%time model = fasttext.train_supervised(input="../data/train_fasttext.csv", epoch=5, lr=0.5, wordNgrams=2, thread=8)

CPU times: user 12h 55min 58s, sys: 22.4 s, total: 12h 56min 20s
Wall time: 1h 37min 34s


In [3]:
%time model.test('../data/val_fasttext.csv')

CPU times: user 4min 26s, sys: 1.38 s, total: 4min 28s
Wall time: 4min 25s


(1000000, 0.881847, 0.881847)

In [None]:
model.predict('bici playera',5)

In [4]:
model.save_model("../models/model2.bin")

In [None]:
## Test

In [None]:
[x[0] for x in predictions[0]]

In [None]:
data_test.columns

In [None]:
%time model2 = fasttext.train_supervised(input="../data/train_fasttext.csv", epoch=10, lr=0.5, wordNgrams=2, loss='hs', thread=8)

In [None]:
def print_results(N, p, r):
    """Print the (N, p, r) triple of an evaluation run.

    Emits three tab-separated lines: "N" with `N`, "P@1" with `p` and "R@1"
    with `r`, the latter two formatted with three decimals.
    """
    print("N\t" + str(N))
    for template, value in (("P@{}\t{:.3f}", p), ("R@{}\t{:.3f}", r)):
        print(template.format(1, value))

print_results(*model2.test('../data/test_fasttext.csv'))

In [None]:
%time model2 = fasttext.train_supervised(input="../data/train_fasttext.csv", epoch=5, lr=0.5, wordNgrams=2, loss='hs', thread=8)

In [None]:
# `print_results` is already defined in an earlier cell; re-declaring an
# identical copy here only invites the copies drifting apart.
print_results(*model2.test('../data/test_fasttext.csv'))

In [None]:
%time model3 = fasttext.train_supervised(input="../data/train_fasttext.csv", epoch=5, lr=0.8, wordNgrams=2, loss='hs', thread=8)

In [None]:
# `print_results` is already defined in an earlier cell. Evaluate `model3`,
# the model trained in the cell just above, not `model2`.
print_results(*model3.test('../data/test_fasttext.csv'))

## Predict

In [18]:
model = fasttext.load_model("../models/model_norm1.bin")




In [14]:
test_data = pd.read_csv('../data/test_fasttext_norm.txt',names=['tokens'])

In [15]:
test_data.replace(np.nan, 'notitle',inplace=True)

In [19]:
%time predictions = model.predict(test_data["tokens"].values.tolist())

CPU times: user 1min 6s, sys: 164 ms, total: 1min 6s
Wall time: 1min 6s


In [20]:
# Keep each row's top predicted label and strip the "__label__" prefix from it.
label_prefix = "__label__"
top_labels = (row_labels[0] for row_labels in predictions[0])
submission = pd.Series([label[len(label_prefix):] for label in top_labels])

In [24]:
submission.to_csv("./submissions/submission_test.csv",header=["category"],index_label="id")

In [22]:
submission

0                                  DIAPER_BAGS
1                           BABY_CHANGING_PADS
2                    ENGINE_COOLING_FAN_MOTORS
3         AUTOMOTIVE_SHOCK_ABSORBER_BUMP_STOPS
4                               BABY_CAR_SEATS
                          ...                 
246950                     VEHICLE_BRAKE_DISCS
246951                          WALKIE_TALKIES
246952                             CALCULATORS
246953                             DINING_SETS
246954                           WASTE_BASKETS
Length: 246955, dtype: object

## Models By Language

In [26]:
data_train.sample()

Unnamed: 0,title,label_quality,language,category
14794562,Vital Cat V43 Overweight X 750 Gr Mascota Food,unreliable,spanish,CATS_AND_DOGS_FOODS


In [4]:
mask_spanish    = data_train["language"] == 'spanish'
mask_portuguese = data_train["language"] == 'portuguese'

In [5]:
spanish_df = data_train[data_train["language"] == 'spanish']
portuguese_df = data_train[data_train["language"] == 'portuguese']

In [6]:
spanish_test = data_test[data_test["language"] == 'spanish']
portuguese_test = data_test[data_test["language"] == 'portuguese']

In [7]:
print(len(portuguese_df),len(spanish_df))

10000000 10000000


In [14]:
%time create_fasttext_split_files(spanish_df, spanish_test, ["../data/train_fasttext_spanish_norm.csv","../data/val_fasttext_spanish_norm.csv","../data/test_fasttext_spanish_norm.csv"])

CPU times: user 1min 35s, sys: 12.1 s, total: 1min 47s
Wall time: 12min 3s


In [15]:
%time create_fasttext_split_files(portuguese_df, portuguese_test, ["../data/train_fasttext_portuguese_norm.csv","../data/val_fasttext_portuguese_norm.csv","../data/test_fasttext_portuguese_norm.csv"])

CPU times: user 1min 46s, sys: 11.8 s, total: 1min 58s
Wall time: 11min 26s


In [23]:
data_test = pd.read_csv('../data/test.csv')

In [28]:
data_test[3:4]["id"].values[0]

3

In [40]:
model_es = fasttext.load_model("../models/model_spanish_norm.bin")
model_pt = fasttext.load_model("../models/model_portuguese_norm.bin")





In [47]:
model_pt.predict("testing")[0][0]

'__label__DIGITAL_PORTABLE_MEDIA_PLAYERS'

In [68]:
nlp_es = Spanish()
nlp_pt = Portuguese()

# Tokenizer pipeline and fastText model to use for each value of the "language" column.
predictors = {
    "portuguese": (nlp_pt, model_pt),
    "spanish": (nlp_es, model_es),
}

# Fail before the long loop if some row has a language without a model;
# otherwise such a row would silently get the previous row's category.
unsupported = set(data_test["language"].unique()) - set(predictors)
if unsupported:
    raise ValueError("no model for language(s): {}".format(sorted(unsupported, key=str)))

label_prefix_len = len("__label__")
data = {"id": [], "category": []}

# Iterate over the three needed columns directly instead of materialising a
# Series per row with iterrows().
for row_id, raw_title, language in zip(data_test["id"], data_test["title"], data_test["language"]):
    nlp, language_model = predictors[language]
    title = normalize_text(raw_title, nlp)
    category = language_model.predict(title)[0][0]
    data["id"].append(row_id)
    data["category"].append(category[label_prefix_len:])
        

In [69]:
submission = pd.DataFrame.from_dict(data)

In [70]:
submission

Unnamed: 0,id,category
0,0,DIAPER_BAGS
1,1,BABY_CHANGING_PADS
2,2,ENGINE_COOLING_FAN_MOTORS
3,3,AUTOMOTIVE_SHOCK_ABSORBER_BUMP_STOPS
4,4,BABY_CAR_SEATS
...,...,...
246950,246950,VEHICLE_BRAKE_DISCS
246951,246951,WALKIE_TALKIES
246952,246952,CALCULATORS
246953,246953,DINING_SETS


In [71]:
submission.to_csv("./submissions/submission_multiple_lang_1.csv",header=["id","category"],index=False)

## Error Analysis

In [74]:
best_model=fasttext.load_model("../models/model_norm1.bin")




In [105]:
val_data = pd.read_csv("../data/val_fasttext_norm.csv",nrows=200000,header=None,names=["tokens"])

In [106]:
val_data["category"] = val_data["tokens"].apply(lambda x: x.split()[0][9:])
val_data["title"] = val_data["tokens"].apply(lambda x:  ' '.join(x.split()[1:]))

In [114]:
val_data["predicted"] = ''
val_data["score"] = ''

In [136]:
# Predict the whole validation sample in one batch call rather than once per
# row via iterrows(); the inputs and the collected columns are unchanged.
batch_labels, batch_scores = best_model.predict(val_data["tokens"].tolist())

label_prefix_len = len("__label__")
predictions = {
    "id": val_data.index.tolist(),
    "tokens": val_data["tokens"].tolist(),
    "category": val_data["category"].tolist(),
    # Top label per row, without the "__label__" prefix.
    "predicted": [row_labels[0][label_prefix_len:] for row_labels in batch_labels],
    # Score of that top label.
    "score": [row_scores[0] for row_scores in batch_scores],
}

prediction_data = pd.DataFrame.from_dict(predictions)

In [None]:
prediction_data

In [None]:
data_train = pd.read_csv('../data/train.csv')

In [198]:
X_train, X_val, y_train, y_val = train_test_split(data_train[["title","label_quality","language"]], data_train["category"], test_size=0.05, random_state=42, stratify=data_train["category"])

In [200]:
# Attach the raw columns for the first 200000 validation rows. All three are
# needed: later cells select "original_title" and "label_quality" from
# `prediction_data`, so leaving them commented out raises KeyError on a fresh run.
# NOTE(review): this assumes X_val is in the same row order as the validation
# file read into `val_data` (same split parameters) — confirm.
prediction_data["original_title"] = X_val[:200000]["title"].values
prediction_data["label_quality"] = X_val[:200000]["label_quality"].values
prediction_data["language"] = X_val[:200000]["language"].values

In [None]:
prediction_data 

In [189]:
prediction_data.rename(columns={'title':'tokens'}, inplace=True)

In [212]:
errors_data = prediction_data[prediction_data["category"] != prediction_data["predicted"]][["id","original_title","tokens","category","predicted","label_quality","language","score"]].copy()

In [206]:
len(errors_data["category"].unique())

1556

In [205]:
errors_data["language"].value_counts(normalize=True)

spanish       0.507781
portuguese    0.492219
Name: language, dtype: float64

In [None]:
errors_data.head(100)[["original_title","tokens","category","predicted","label_quality"]]

In [207]:
data_train["label_quality"].value_counts(normalize=True)

unreliable    0.940788
reliable      0.059212
Name: label_quality, dtype: float64

In [208]:
errors_data["label_quality"].value_counts(normalize=True)

unreliable    0.960193
reliable      0.039807
Name: label_quality, dtype: float64

In [213]:
errors_data.sample(10)[["original_title","tokens","category","predicted","label_quality","score"]]

Unnamed: 0,original_title,tokens,category,predicted,label_quality,score
120636,Coxim Axial Bucha Pivo Rolamento Da Roda Fies...,coxim axial bucha pivo rolamento da roda fiest...,SUSPENSION_BALL_JOINTS,SHOCK_MOUNT_INSOLATORS,unreliable,0.363446
186122,Correntinha + Pingente Folheado A Ouro 46 Cm,correntinha pingente folheado ouro cm,NECKLACES,CHARMS_AND_MEDALS,unreliable,0.939457
4115,Pés De Mesa / Aparador ( Monte Facil ),pes de mesa aparador monte facil,STOOLS,TV_AND_MONITOR_MOUNTS,unreliable,0.305697
60545,Tubo Do Escapamento Motorm Sailor Yamaha 15hp 2t,tubo do escapamento motorm sailor yamaha,MEMORY_CARDS,MOTORCYCLE_EXHAUSTS,unreliable,0.830055
158944,Lote X 10 Codos Tigre 25 Mm A 45 Grados,lote codos tigre mm grados,CONNECTING_COUPLERS,PIPES_AND_TUBES,unreliable,0.845858
8472,Eixo Bmw 116i - 2014 - Sucata Peças,eixo bmw sucata pecas,AUTOMOTIVE_FRONT_BUMPERS,REAR_WHEEL_HUBS_BEARING_ASSEMBLY,unreliable,0.996451
128025,Fix-30p-d8 Baja Puntuación De La Pantalla De L...,baja puntuacion de la pantalla de la linea de,LCD_DISPLAYS,LAPTOP_LCD_SCREENS,unreliable,0.04392
39842,Niebla Fabricante 12 Led Fogger Niebla Fuente ...,niebla fabricante led fogger niebla fuente agu...,DEHUMIDIFIERS,INSECTICIDES,unreliable,0.069133
44770,Borracha Do Bojo Do Farol Olho De Boi E Parala...,borracha do bojo do farol olho de boi paralama...,AUTOMOTIVE_EMBLEMS,AUTOMOTIVE_WEATHERSTRIPS,reliable,0.596206
35632,Valv Agulha Weber 40,valv agulha weber,TURNTABLE_NEEDLES,CAR_CARBURETORS,unreliable,0.961341


In [None]:
errors_data.tail(10)[["original_title","tokens","category","predicted","label_quality"]]