In [157]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from spacy.lang.es import Spanish
from spacy.lang.pt import Portuguese
from sklearn.metrics import balanced_accuracy_score
from spacy.tokenizer import Tokenizer
from sklearn.svm import LinearSVC

In [27]:
# Load Dataset
data_raw = pd.read_csv('./train_sample.csv',index_col='index')

### Basic statistics

In [None]:
data_raw.dtypes

In [None]:
data_raw.describe()

In [None]:
data_raw.sample(5)

In [None]:
data_raw.label_quality.value_counts()

In [None]:
data_raw.language.value_counts()

In [None]:
data_raw.category.value_counts()

### Preprocessing

#### Tasks:
    1. Lowercase all words
    2. Tokenize
    3. Remove stop words
    4. Remove special characters

In [51]:
nlp_es = Spanish()
nlp_pt = Portuguese()

In [175]:
#data = data_raw[0:100000].copy()
data = data_raw.copy()

In [176]:
data.sample(20)

Unnamed: 0_level_0,title,label_quality,language,category
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5764698,Cilindro O Bombin De Embrague De Isuzu,unreliable,spanish,CLUTCH_SLAVE_CYLINDERS
1894117,Paleta De Padel Souler Troyer,unreliable,spanish,PADDLE_TENNIS_RACKETS
7837243,Ap Out.ubiquiti Rocket Ac Prism 29,unreliable,spanish,WIRELESS_ANTENNAS
3198224,Touca De Nylon Descartável Talge Preta Pct Co...,unreliable,portuguese,HAIRDRESSING_CAPS
15357805,Microondas Gourmet,unreliable,portuguese,MICROWAVES
87799,Carimbo Em Borracha Litoarte Clp-024 Cantoneir...,unreliable,portuguese,RUBBER_STAMPS
12253466,Parafusadeira Pneumatica 1/2 Com Jogo De Soque...,unreliable,portuguese,IMPACT_WRENCHES
15907143,Placa T-con 6870c-0442b Panasonic/philips/toshiba,unreliable,portuguese,TV_SMPS
586701,Mix Sin Pasas De Uva! 80grs,unreliable,spanish,DRIED_FRUITS
16113309,Canillera Nike Charge 2019731-dx,unreliable,spanish,SHIN_GUARDS


In [177]:
# 1. Lowercase
data['title'] = data['title'].str.lower()

In [167]:
# 2. Tokenize
# 3. Remove Stopwords & Punctuation

In [178]:
mask_spanish    = data["language"] == 'spanish'
mask_portuguese = data["language"] == 'portuguese'

In [179]:
data.loc[mask_spanish, "tokens"] = data["title"].apply(lambda x: ' '.join([tok.text for tok in nlp_es.tokenizer(x) if not (tok.is_punct or tok.is_stop)]))
data.loc[mask_portuguese, "tokens"] = data["title"].apply(lambda x: ' '.join([tok.text for tok in nlp_pt.tokenizer(x) if not (tok.is_punct or tok.is_stop)]))

In [180]:
X_train, X_test, y_train, y_test = train_test_split(data["tokens"], data["category"], test_size=0.10, random_state=42)

In [181]:
tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)
X_train_tfidf.shape

(900000, 321011)

### Classifiers

In [154]:
clf = DecisionTreeClassifier()
clf.fit(X_train_tfidf, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [155]:
yTrainPredict = clf.predict(X_train_tfidf)
yPrediction = clf.predict(X_test_tfidf)
print("Balanced Accuracy Score: %.2f" % balanced_accuracy_score(y_train, yTrainPredict))
print("Balanced Accuracy Score: %.2f" % balanced_accuracy_score(y_test, yPrediction))

Balanced Accuracy Score: 1.00
Balanced Accuracy Score: 0.32




In [159]:
clf_svm = LinearSVC()

In [None]:
clf_svm.fit(X_train_tfidf, y_train)

In [None]:
#TrainPredict = clf_svm.predict(X_train_tfidf)
yPrediction = clf_svm.predict(X_test_tfidf)
#print("Balanced Accuracy Score: %.2f" % balanced_accuracy_score(y_train, yTrainPredict))
print("Balanced Accuracy Score: %.2f" % balanced_accuracy_score(y_test, yPrediction))