In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import xgboost, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

  from numpy.core.umath_tests import inner1d
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [41]:
# Load the competition CSVs from the `ndsc-beginner` folder that sits next to
# the notebook's working directory.
# Path components are passed to os.path.join separately so the separator is
# chosen by the OS; a hard-coded backslash only resolves on Windows.
currentdir = os.getcwd()
data_dir = os.path.join(currentdir, 'ndsc-beginner')

input_data = pd.read_csv(os.path.join(data_dir, 'train.csv'))
test = pd.read_csv(os.path.join(data_dir, 'test.csv'))

# Only the last expression of a cell is rendered, so preview the test frame here.
test.head(5)

Unnamed: 0,itemid,title,image_path
0,370855998,flormar 7 white cream bb spf 30 40ml,beauty_image/1588591395c5a254bab84042005f2a9f.jpg
1,637234604,maybelline clear smooth all in one bb cream sp...,beauty_image/920985ed9587ea20f58686ea74e20f93.jpg
2,690282890,murah innisfree eco natural green tea bb cream...,beauty_image/90b40e5710f54352b243fcfb0f5d1d7f.jpg
3,930913462,loreal white perfect day cream spf 17 pa white...,beauty_image/289c668ef3d70e1d929d602d52d5d78a.jpg
4,1039280071,hada labo cc cream ultimate anti aging spf 35 ...,beauty_image/d5b3e652c5822d2306f4560488ec30c6.jpg


In [29]:
## Dataset preparation ###

# Text cleaning is deliberately skipped: punctuation, numbers and stopwords
# cannot be removed (stopword removal could drop words like 'make up').

# Keep only the product title (input text) and its Category (target).
wanted_columns = ['title', 'Category']
train = input_data.loc[:, wanted_columns]
print(train.shape)
train.head()  # not the cell's last expression, so nothing is rendered for it

# The test set only provides titles to predict on.
test_x = test.loc[:, 'title']
print(test_x.shape)

(666615, 2)
(172402,)


0                 flormar 7 white cream bb spf 30 40ml
1    maybelline clear smooth all in one bb cream sp...
2    murah innisfree eco natural green tea bb cream...
3    loreal white perfect day cream spf 17 pa white...
4    hada labo cc cream ultimate anti aging spf 35 ...
Name: title, dtype: object

In [5]:
# Split the labelled data into training and validation parts.
# A fixed random_state makes the split (and every score computed from it)
# reproducible across kernel restarts.
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    train['title'], train['Category'], random_state=42
)

# Encode the labels as consecutive integers (some sklearn functions only
# accept integer targets).
# The encoder is fitted exactly once, on every label in `train`, and then only
# *applied* to each split. Re-fitting it on the validation labels would build a
# second, independent mapping that can disagree with the training one whenever
# the two splits do not contain exactly the same set of classes.
encoder = preprocessing.LabelEncoder()
encoder.fit(train['Category'])
train_y = encoder.transform(train_y)
valid_y = encoder.transform(valid_y)

In [30]:
## Feature Engineering ##

# 1 Count Vector - sparse matrix with one row per document (title), one column
#   per term, and each cell holding the count of that term in that document.
# NOTE(review): the vocabulary is learned from all of `train`, which also
# contains the rows held out as validation data.
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}').fit(train['title'])

# Project every split onto the same learned vocabulary.
xtrain_count, xvalid_count, xtest_count = (
    count_vect.transform(documents) for documents in (train_x, valid_x, test_x)
)

xtrain_count.shape  # (documents, vocabulary size); recorded run: 499961 x 80091

(499961, 80091)

In [8]:
# 2 TFIDF Vector
# Settings shared by both TF-IDF vectorizers below.
tfidf_settings = dict(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)

# word level tf-idf: single tokens, capped at the 5000 most frequent features
tfidf_vect = TfidfVectorizer(**tfidf_settings).fit(train['title'])
xtrain_tfidf, xvalid_tfidf = (
    tfidf_vect.transform(documents) for documents in (train_x, valid_x)
)

# ngram level tf-idf: bigrams and trigrams, same feature cap
tfidf_vect_ngram = TfidfVectorizer(ngram_range=(2,3), **tfidf_settings).fit(train['title'])
xtrain_tfidf_ngram, xvalid_tfidf_ngram = (
    tfidf_vect_ngram.transform(documents) for documents in (train_x, valid_x)
)

# Report the shape of each training matrix.
for tfidf_matrix in (xtrain_tfidf, xtrain_tfidf_ngram):
    print(tfidf_matrix.shape)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(499961, 5000)
(499961, 5000)


In [13]:
## Model building ##

def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False, label_valid=None):
    """Fit ``classifier`` and return its accuracy on the validation features.

    Parameters
    ----------
    classifier
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    feature_vector_train
        Training features, passed to ``classifier.fit``.
    label
        Training labels, passed to ``classifier.fit``.
    feature_vector_valid
        Validation features, passed to ``classifier.predict``.
    is_neural_net : bool, default False
        When True, the output of ``predict`` is reduced with
        ``argmax(axis=-1)`` before it is scored.
    label_valid : optional
        Ground-truth labels matching ``feature_vector_valid``. When omitted,
        the notebook-level ``valid_y`` is used, which keeps existing calls
        working unchanged.

    Returns
    -------
    The value returned by ``metrics.accuracy_score`` for the validation
    predictions.
    """
    # Resolve the ground truth up front so a missing `valid_y` fails before
    # any (potentially slow) training happens.
    if label_valid is None:
        label_valid = valid_y

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    # accuracy_score expects (y_true, y_pred).
    return metrics.accuracy_score(label_valid, predictions)

In [14]:
#1 Naive Bayes classifier
# Score a fresh MultinomialNB on each feature representation in turn:
# count vectors, word-level TF-IDF, then n-gram-level TF-IDF.
nb_experiments = [
    ("NB, Count Vectors: ", xtrain_count, xvalid_count),
    ("NB, WordLevel TF-IDF: ", xtrain_tfidf, xvalid_tfidf),
    ("NB, N-Gram Vectors: ", xtrain_tfidf_ngram, xvalid_tfidf_ngram),
]

for caption, features_train, features_valid in nb_experiments:
    accuracy = train_model(naive_bayes.MultinomialNB(), features_train, train_y, features_valid)
    print(caption, accuracy)

NB, Count Vectors:  0.6627443685720115
NB, WordLevel TF-IDF:  0.6592341017917361
NB, N-Gram Vectors:  0.5842524031826419


In [18]:
# Train Naive Bayes on the count vectors and keep the fitted model and its
# validation predictions around for inspection.
nbcount_model = naive_bayes.MultinomialNB()
nbcount_model.fit(xtrain_count, train_y)

nbcount_predictions = nbcount_model.predict(xvalid_count)

# Fraction of validation titles whose predicted label matches valid_y.
nbcount_accuracy = metrics.accuracy_score(nbcount_predictions, valid_y)
print(nbcount_accuracy)

0.6627443685720115


In [31]:
# Fit Naive Bayes on the training count vectors, then label the test titles.
nbcount_model = naive_bayes.MultinomialNB()
nbcount_model.fit(xtrain_count, train_y)

test_label = nbcount_model.predict(xtest_count)

In [42]:
# Attach the predicted labels to the test frame and export the two columns
# needed for the submission file.
# NOTE(review): test_label is in the LabelEncoder's integer space; if the raw
# Category values are not already 0..K-1, map them back with
# encoder.inverse_transform before writing — confirm.
test['Category'] = test_label

submission_columns = ['itemid', 'Category']
submission = test.loc[:, submission_columns]
submission.to_csv('submission.csv', index=False)