In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from spacy.lang.es import Spanish

In [2]:
# Load Dataset
data_raw = pd.read_csv('./train_sample.csv',index_col='index')

### Basic statistics

In [4]:
data_raw.dtypes

title            object
label_quality    object
language         object
category         object
dtype: object

In [6]:
data_raw.describe()

Unnamed: 0,title,label_quality,language,category
count,1000000,1000000,1000000,1000000
unique,999975,2,2,1588
top,Bateria Motorola,unreliable,portuguese,PANTS
freq,2,940691,500258,1867


In [7]:
data_raw.sample(5)

Unnamed: 0_level_0,title,label_quality,language,category
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8084741,Sillón Cama De 1 Plaza Con Colchón,unreliable,spanish,HEADBOARDS
878981,Lifeproof Fre Series Caso Impermeable Para Sam...,unreliable,spanish,CELLPHONE_TABLET_AND_GPS_SCREEN_PROTECTORS
12875359,Servilletero Portaservileta De Madera Estilo V...,unreliable,spanish,NAPKIN_HOLDERS
4289716,Kit Xenon Moto Yamaha Ybr 125 Lampada 8000k H4...,unreliable,portuguese,XENON_KITS
3382783,Modem Arescom 800 Con Salida Usb Y Red,unreliable,spanish,MODEMS


In [85]:
data_raw.label_quality.value_counts()

unreliable    940691
reliable       59309
Name: label_quality, dtype: int64

In [87]:
data_raw.language.value_counts()

portuguese    500258
spanish       499742
Name: language, dtype: int64

In [10]:
data_raw.category.value_counts()

PANTS                                   1867
COFFEE_MAKERS                           1782
MUSICAL_KEYBOARDS                       1679
BABY_CAR_SEATS                          1671
MATTRESSES                              1660
                                        ... 
FORCE_GAUGES                               9
HONEY_EXTRACTORS                           8
COLD_FOOD_AND_DRINK_VENDING_MACHINES       7
COMMERCIAL_POPCORN_MACHINES                7
HAMBURGER_FORMERS                          3
Name: category, Length: 1588, dtype: int64

### Preprocessing

In [5]:
X_train, X_test, y_train, y_test = train_test_split(data_raw["title"], data_raw["category"], test_size=0.10, random_state=42)

In [6]:
tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)
X_train_tfidf.shape

(900000, 325956)

### Classifiers

In [None]:
clf = DecisionTreeClassifier()
clf.fit(X_train_tfidf, y_train)

In [18]:
#SPACY
nlp = Spanish()

In [19]:
nlp

<spacy.lang.es.Spanish at 0x7fa35c08bc90>

In [93]:
test = data_raw[:50]['title'].values

In [94]:
test

array(['Chaleco De Pesca Talle M C/manchas Ver Fotos. Cordoba',
       'Mocos Nasal Seguro Higinico Limpiador Deaspirador',
       'Mascara De Pestañas Extra Volumen 3 En 1 Ricosti',
       'Batidora De Mano Oster 2532 6 Velocidades 250 W Garantia ',
       'Kirkland Signature Pañales Supremo Tamaño 5, 150 Cantidad',
       'Balança Digital Upx 15 Kg',
       'Nexgard Pastilla Para Pulgas Y Garrapatas Hasta 25kg.  ',
       'Vara De Pesca Reel Combo Telesc\x9d\x9dpica P\x9d\x9dlo De Pesca De Fibr',
       'Serpentina Condensador Carrier 18000 Btuh',
       'Kit 10 Carimbos Premium 60 (só Carcaça)',
       'Braco Curvo Suspensao Traseira Esquerda Bmw 550 2011 **5626',
       'Pantalla Gigante Con Trípode  Loch 100 Pulg  2.00 X 1.50',
       'Colete Salva Vidas Xfloat Infantil Monster',
       'Navegador Gps Automotivo Mercedes C250 Tela 4.3 Touch Voz Tv',
       'Mochila Hidratante Camelback Incluye Bolsa De Agua 2.5 L ',
       'Cámara De Fotos Powershot Canon Sx40 En Excelente Estado'

In [67]:
my_doc = nlp(test)

# Create list of word tokens
token_list = []

for token in my_doc:
    token_list.append(token.text)
print(token_list)

['Chaleco', 'De', 'Pesca', 'Talle', 'M', 'C', '/', 'manchas', 'Ver', 'Fotos', '.', 'Cordoba']


ValueError: [E029] noun_chunks requires the dependency parse, which requires a statistical model to be installed and loaded. For more info, see the documentation:
https://spacy.io/usage/models