In [1]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize, sent_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics import classification_report
from sklearn import svm 

In [2]:
df=pd.read_csv("/Users/yuesongyang/Desktop/NDSC/train.csv")

In [3]:
df.shape

(666615, 4)

In [4]:
# change data type to categorical
df['Category'] = df['Category'].astype('category')

In [5]:
df.dtypes

itemid           int64
title           object
Category      category
image_path      object
dtype: object

In [None]:
# sample some rows for training, dataset too big to train locally
train_df=df.sample(frac=0.005)
# sample validation data set
test_df=df.sample(frac=0.003)

title=df["title"]
title.head(5)

df['tokenized'] = df['title'].apply(word_tokenize) 

In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
# count the number of labels in train set
df.Category.unique()

In [None]:
# count the number of labels in validation set
train_df.Category.unique()

In [None]:
# get features and labels
train_data=train_df.title
train_label=train_df.Category
test_data=test_df.title
test_label=test_df.Category

In [None]:
# Create feature vectors
vectorizer = TfidfVectorizer(min_df=4, max_df=0.9)
# Train the feature vectors
train_vectors = vectorizer.fit_transform(train_data)
# Apply model on validation data 
test_vectors = vectorizer.transform(test_data)

# Perform classification with SVM, kernel=linear 
model = svm.SVC(kernel='linear') 
model.fit(train_vectors, train_label) 
prediction = model.predict(test_vectors)

In [None]:
# print results
print (classification_report(test_label, prediction))

In [None]:
df2=pd.read_csv("/Users/yuesongyang/Desktop/NDSC/test.csv")

In [None]:
df2.head(5)

In [None]:
df2.shape

In [None]:
df2_train_data=df2.title

In [None]:
# preduct the test set
df2_test_vectors = vectorizer.transform(df2_train_data)
prediction = model.predict(df2_test_vectors)

In [None]:
print(prediction[0:10])

In [None]:
df2["Category"]=prediction

In [None]:
df2.head(5)

In [None]:
# configure the structure as required
result= df2[['itemid','Category']]

In [None]:
result.head(10)

In [None]:
result.Category.unique()

In [None]:
result.to_csv('submission.csv', index=False)

In [25]:
# sample some rows for training, dataset too big to train locally
train_df=df.sample(frac=0.0005)
# sample validation data set
test_df=df.sample(frac=0.00003)

In [26]:
train_df.head()

Unnamed: 0,itemid,title,Category,image_path
88818,1786261325,dijual cepat bb cushion wardah,4,beauty_image/318966bcc1567397ede9c97638d1adb3.jpg
327863,1511974545,beauty s dress tanpa lengan motif print natal ...,18,fashion_image/6ca504a75b74bcb7f1fdc5eb4531beac
136097,2575044,etude house cc cream,5,beauty_image/22c145eb30e95ac5026a084e47e7f431.jpg
20294,1575207543,natural green raspberry cleansing oil,1,beauty_image/2085e20907a53725badc5c8de83391a6.jpg
625066,844263048,terlaris iphone 6 32gb garansi ibox new,35,mobile_image/95e4bd9ed63a74b16200ec458d2a8b76.jpg


In [27]:
feature=train_df['title'].tolist()

In [28]:
len(feature)

333

In [29]:
label=train_df['Category'].tolist()

In [30]:
training_corpus=[]
for i in range (0,len(label)):
    training_corpus.append((feature[i],label[i]))

In [31]:
training_corpus

[('dijual cepat bb cushion wardah', 4),
 ('beauty s dress tanpa lengan motif print natal ada ukuran besar', 18),
 ('etude house cc cream', 5),
 ('natural green raspberry cleansing oil', 1),
 ('terlaris iphone 6 32gb garansi ibox new', 35),
 ('dress maxi v neck lengan pendek motif bunga aksen tali pinggang untuk wanita',
  20),
 ('dress wanita model off shoulder bahan sifon versi korea untuk pantai', 21),
 ('gaun midi casual wanita lengan pendek dengan potongan a line motif floral dan bergaya retro',
  18),
 ('dress midi bodycon bandage sexy elegan lengan panjang elastis', 18),
 ('mirabella chic twcj', 3),
 ('kertas minyak clean clear kode el0836 za', 4),
 ('ready iphone 6 64 gb garansi distributor', 31),
 ('kusus hari ini laneige bb cushion pore control spf 50+pa diskon', 5),
 ('samsung j6 garansi resmi', 35),
 ('bless face powder beige 25 gr', 3),
 ('iphone 7 32gb mulus seken original bergaransi', 31),
 ('promo diskon murah oppo f7 pro ram 6gb rom 128gb garansi resmi bisa all indonesi

In [32]:
from textblob.classifiers import NaiveBayesClassifier as NBC
from textblob import TextBlob

In [33]:
model = NBC(training_corpus) 

In [34]:
test_df_data=test_df['title'].tolist()

In [35]:
test_df_label=test_df['Category'].tolist()

In [36]:
validation_corpus=[]
for i in range (0,len(test_df_label)):
    validation_corpus.append((test_df_data[i],test_df_label[i]))

In [37]:
validation_corpus

[('natural 99 king set whitening asli grosir', 4),
 ('nokia dual sim 130 putih', 38),
 ('xiaomi redmi a2 lite 3 32 garansi resmi tam', 34),
 ('multiflora women long maxi dresses bohemia v neck three quarter sleeve floral',
  18),
 ('harga termurah eos 214 brown normal only', 4),
 ('ready termurah lcd iphone x ori new oled screen', 35),
 ('grosir viva queen refill compact powder', 3),
 ('hot sale inez 900 lustrous pressed powder bedak padat exclusive paling unik',
  3),
 ('iphone 4 cdma front camera module', 31),
 ('csz 03 lakme 9 to 5 reinvent p+m compact powder foundation', 3),
 ('bioaqua perfect make up water beautiful light loose powder 02', 3),
 ('women bandage bodycon slim sleeveless evening party cocktail pencil mini dress',
  22),
 ('ertos erto s baked powder original bedak', 3),
 ('dress midi slim fit lengan pendek desain patchwork renda motif natal santa reindeer',
  18),
 ('ready bioaqua soothing gel aloe vera 92 160ml', 4),
 ('nyx soft matte lip cream ory', 12),
 ('atasan bl

In [38]:
for i in range (0,len(validation_corpus)):
    print(model.classify(validation_corpus[i][0]))

3
3
3
18
3
3
3
3
3
3
3
18
3
18
3
5
18
18
18
3
