In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from spacy.lang.es import Spanish
from spacy.lang.pt import Portuguese

from spacy.tokenizer import Tokenizer


In [27]:
# Load Dataset
data_raw = pd.read_csv('./train_sample.csv',index_col='index')

### Basic statistics

In [None]:
data_raw.dtypes

In [None]:
data_raw.describe()

In [None]:
data_raw.sample(5)

In [None]:
data_raw.label_quality.value_counts()

In [None]:
data_raw.language.value_counts()

In [None]:
data_raw.category.value_counts()

### Preprocessing

#### Tasks:
    1. Lowercase all words
    2. Tokenize
    3. Remove stop words
    4. Remove special characters

In [51]:
nlp_es = Spanish()
nlp_pt = Portuguese()

In [68]:
data = data_raw[0:1000].copy()

In [73]:
data.sample(20)

Unnamed: 0_level_0,title,label_quality,language,category,tokens
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
13235265,crique botella c/tope registrable 5 tns.,unreliable,spanish,HYDRAULIC_VEHICLE_JACKS,"[crique, botella, c, /, tope, registrable, 5, ..."
6403881,filtro canister sunsun hw-402a 1000l/h complet...,unreliable,portuguese,AQUARIUM_FILTERS,"[filtro, canister, sunsun, hw-402a, 1000l, /, ..."
7665646,visor 360 realidad virtual,unreliable,spanish,VR_HEADSETS,"[visor, 360, realidad, virtual]"
4526863,adaptador wi fi lg smart tv netgear - original...,unreliable,portuguese,TV_TUNERS,"[adaptador, wi, fi, lg, smart, tv, netgear, -,..."
16421654,interruptor 1 tecla simples stylus ilumi,unreliable,portuguese,ELECTRICAL_OUTLETS,"[interruptor, 1, tecla, simples, stylus, ilumi]"
16083581,granulado sanitário para gatos/areia higiênica,unreliable,portuguese,CATS_LITTER,"[granulado, sanitário, para, gatos, /, areia, ..."
16993866,espectacular biblioteca thompson,unreliable,spanish,BOOKCASES,"[espectacular, biblioteca, thompson]"
390209,kit c/ 2 tesoura ricca aco inox reta,unreliable,portuguese,HAIRDRESSING_SCISSORS,"[kit, c/, 2, tesoura, ricca, aco, inox, reta]"
6581138,pulsador doble con señalización aro cromado baw,unreliable,spanish,CIRCUIT_BREAKERS,"[pulsador, doble, con, señalización, aro, crom..."
9424560,motor de portão basculante 1/4hp bv home - ppa,unreliable,portuguese,GATE_MOTORS,"[motor, de, portão, basculante, 1/4hp, bv, hom..."


In [70]:
# 1. Lowercase
data['title'] = data['title'].str.lower()

In [71]:
mask_spanish    = data["language"] == 'spanish'
mask_portuguese = data["language"] == 'portuguese'

In [72]:
# 2. Tokenize
data.loc[mask_spanish, "tokens"] = data["title"].apply(lambda x: list(nlp_es.tokenizer(x)))
data.loc[mask_portuguese, "tokens"] = data["title"].apply(lambda x: list(nlp_pt.tokenizer(x)))

In [74]:
data['tokens'].apply(lambda x: [item for item in x if item in stopwords])

index
18338098    []
15228812    []
5120904     []
2725410     []
17647653    []
            ..
11431579    []
14438459    []
19002399    []
14857628    []
6853406     []
Name: tokens, Length: 1000, dtype: object

In [91]:
stopwords(0)

TypeError: 'set' object is not callable

In [84]:
for item in data['tokens'].values:
    for i in item:
        
        if i in s:
            print(i)

TypeError: Argument 'other' has incorrect type (expected spacy.tokens.token.Token, got str)

In [77]:
stopwords

{'acerca',
 'actualmente',
 'acuerdo',
 'adelante',
 'ademais',
 'ademas',
 'además',
 'adeus',
 'adrede',
 'afirmó',
 'agora',
 'agregó',
 'ahi',
 'ahora',
 'ahí',
 'ainda',
 'al',
 'algo',
 'algumas',
 'alguna',
 'algunas',
 'alguno',
 'algunos',
 'alguns',
 'algún',
 'ali',
 'alli',
 'allí',
 'alrededor',
 'além',
 'ambas',
 'ambos',
 'ampleamos',
 'antano',
 'antaño',
 'ante',
 'anterior',
 'antes',
 'ao',
 'aos',
 'apenas',
 'apoia',
 'apoio',
 'apontar',
 'aproximadamente',
 'após',
 'aquel',
 'aquela',
 'aquelas',
 'aquele',
 'aqueles',
 'aquella',
 'aquellas',
 'aquello',
 'aquellos',
 'aqui',
 'aquilo',
 'aquél',
 'aquélla',
 'aquéllas',
 'aquéllos',
 'aquí',
 'arriba',
 'arribaabajo',
 'as',
 'aseguró',
 'asi',
 'assim',
 'así',
 'atras',
 'através',
 'atrás',
 'até',
 'aun',
 'aunque',
 'ayer',
 'aí',
 'añadió',
 'aún',
 'baixo',
 'bajo',
 'bastante',
 'bem',
 'bien',
 'boa',
 'bom',
 'breve',
 'buen',
 'buena',
 'buenas',
 'bueno',
 'buenos',
 'cada',
 'caminho',
 'casi',
 

In [14]:
X_train, X_test, y_train, y_test = train_test_split(data_raw["title"], data_raw["category"], test_size=0.10, random_state=42)

In [None]:
tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)
X_train_tfidf.shape

In [5]:
spacy_stopwords = spacy.lang.es.stop_words.STOP_WORDS
spacy_stopwords_pt = spacy.lang.pt.stop_words.STOP_WORDS

In [7]:
print(len(spacy_stopwords))
print(len(spacy_stopwords_pt))
print(type(spacy_stopwords))

551
413
<class 'set'>


In [9]:
stopwords = spacy.lang.es.stop_words.STOP_WORDS.union(spacy.lang.pt.stop_words.STOP_WORDS)
len(stopwords)

887

In [34]:
stopwords

{'acerca',
 'actualmente',
 'acuerdo',
 'adelante',
 'ademais',
 'ademas',
 'además',
 'adeus',
 'adrede',
 'afirmó',
 'agora',
 'agregó',
 'ahi',
 'ahora',
 'ahí',
 'ainda',
 'al',
 'algo',
 'algumas',
 'alguna',
 'algunas',
 'alguno',
 'algunos',
 'alguns',
 'algún',
 'ali',
 'alli',
 'allí',
 'alrededor',
 'além',
 'ambas',
 'ambos',
 'ampleamos',
 'antano',
 'antaño',
 'ante',
 'anterior',
 'antes',
 'ao',
 'aos',
 'apenas',
 'apoia',
 'apoio',
 'apontar',
 'aproximadamente',
 'após',
 'aquel',
 'aquela',
 'aquelas',
 'aquele',
 'aqueles',
 'aquella',
 'aquellas',
 'aquello',
 'aquellos',
 'aqui',
 'aquilo',
 'aquél',
 'aquélla',
 'aquéllas',
 'aquéllos',
 'aquí',
 'arriba',
 'arribaabajo',
 'as',
 'aseguró',
 'asi',
 'assim',
 'así',
 'atras',
 'através',
 'atrás',
 'até',
 'aun',
 'aunque',
 'ayer',
 'aí',
 'añadió',
 'aún',
 'baixo',
 'bajo',
 'bastante',
 'bem',
 'bien',
 'boa',
 'bom',
 'breve',
 'buen',
 'buena',
 'buenas',
 'bueno',
 'buenos',
 'cada',
 'caminho',
 'casi',
 

### Classifiers

In [None]:
clf = DecisionTreeClassifier()
clf.fit(X_train_tfidf, y_train)

In [None]:
#SPACY
nlp = Spanish()

In [None]:
nlp

In [None]:
test = data_raw[:50]['title'].values

In [None]:
test

In [None]:
my_doc = nlp(test)

# Create list of word tokens
token_list = []

for token in my_doc:
    token_list.append(token.text)
print(token_list)