## 1. Data Spliting

In [1]:
import pandas as pd
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

import matplotlib.pyplot as plt
import os
import numpy as np

In [2]:
#The samples in test_csv will not be used for training & validation
train_csv=pd.read_csv('Data/train.csv')
train_csv.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069


In [3]:
#The samples in test_csv will not be used during training
test_csv=pd.read_csv('Data/test.csv')
test_csv.head()

Unnamed: 0,posting_id,image,image_phash,title
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng


In [4]:
#Count every label_group
train_csv.label_group.value_counts().reset_index().rename(columns={'index':'label_group','label_group':'count'}).head()

Unnamed: 0,label_group,count
0,994676122,51
1,1163569239,51
2,1141798720,51
3,159351600,51
4,562358068,51


In [5]:
#Only sample those label_group which has more than 20 samples
test_ratio=0.1
label_count=train_csv.label_group.value_counts().reset_index().rename(columns={'index':'label_group','label_group':'count'})
sample_num={}

for i in range(label_count.shape[0]):
    num_to_sample=label_count['count'].iloc[i]*test_ratio
    if num_to_sample<2:continue
    sample_num[label_count['label_group'].iloc[i]]=int(num_to_sample)

sample_num

{994676122: 5,
 1163569239: 5,
 1141798720: 5,
 159351600: 5,
 562358068: 5,
 3113678103: 5,
 3627744656: 5,
 3206118280: 4,
 1733221456: 4,
 1166650192: 4,
 1091404026: 4,
 3489985175: 4,
 1237550763: 3,
 1746655739: 3,
 452508504: 3,
 731330024: 3,
 4184214313: 3,
 2008989859: 3,
 1575763518: 3,
 66247839: 3,
 871679006: 3,
 952183003: 3,
 2259957740: 3,
 3001123709: 3,
 2999794436: 3,
 2935813666: 3,
 3926100920: 3,
 656698835: 3,
 1226500780: 3,
 997220911: 3,
 1201602115: 2,
 821583868: 2,
 3717044186: 2,
 3868183614: 2,
 1135976474: 2,
 3326267479: 2,
 3040690230: 2,
 927285629: 2,
 1306578136: 2,
 2156459496: 2,
 4141124289: 2,
 2123332638: 2,
 418991059: 2,
 1569494229: 2,
 3441184770: 2,
 777596345: 2,
 1744240905: 2,
 2956941947: 2,
 1065450055: 2,
 1544174053: 2,
 1088754866: 2,
 2911646536: 2,
 4277487223: 2,
 3433277712: 2,
 962477933: 2,
 1285119273: 2,
 4038613836: 2,
 2748623227: 2}

In [6]:
valid_idx=[list(np.where(train_csv['label_group']==label)[0][:sample_num[label]]) for label in sample_num]
valid_idx=[idx for group in valid_idx for idx in group]

train_idx=[i for i in range(train_csv.shape[0]) if i not in valid_idx]

print('TrainingSet Size:{}'.format(len(train_idx)))
print('ValidationSet Size:{}'.format(len(valid_idx)))

TrainingSet Size:34085
ValidationSet Size:165


In [7]:
X_train,y_train=train_csv.iloc[train_idx,:-1],train_csv.iloc[train_idx,-1]
X_valid,y_valid=train_csv.iloc[valid_idx,:-1],train_csv.iloc[valid_idx,-1]

## 2. Data Cleaning

In [25]:
# Code Reference: https://towardsdatascience.com/cleaning-text-data-with-python-b69b47b97b76
import re
import nltk
import string
from nltk.corpus import stopwords
# # In case of any corpus are missing 
stop_words = stopwords.words("english")
def text_preproc(x):
    x = x.lower()
    x = ' '.join([word for word in x.split(' ') if word not in stop_words])
    x = x.encode('ascii', 'ignore').decode()
    x = re.sub(r'https*\S+', ' ', x)
    x = re.sub(r'@\S+', ' ', x)
    x = re.sub(r'#\S+', ' ', x)
    x = re.sub(r'\'\w+', '', x)
    x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)
    x = re.sub(r'\w*\d+\w*', '', x)
    x = re.sub(r'\s{2,}', ' ', x)
    return x

In [26]:
def clean_apply(docs):
    for i, doc in enumerate(docs):
        docs[i] = text_preproc(doc)

In [30]:
train_text = np.array(X_train['title'])

In [33]:
test_text = np.array(X_valid['title'])

In [31]:
clean_apply(train_text)

In [32]:
train_text

array(['paper bag victoria secret',
       'double tape vhb mm x original double foam tape',
       'maling tts canned pork luncheon meat gr', ...,
       'khanzaacc robot subwoofer bass metal wired headset',
       'kaldu non msg halal mama kamu ayam kampung sapi lokal jamur bkn alsultan biocell ',
       'flex tape pelapis bocor isolasi ajaib anti bocor'], dtype=object)

In [34]:
clean_apply(test_text)

#### Concatenation

In [36]:
train_data = np.c_[train_text,np.array(y_train)]

In [37]:
train_data.shape

(34085, 2)

In [38]:
test_data = np.c_[test_text,np.array(y_valid)]

In [39]:
test_data.shape

(165, 2)

In [178]:
train_data[:5,:]

array([['paper bag victoria secret', 249114794],
       ['double tape vhb mm x original double foam tape', 2937985045],
       ['maling tts canned pork luncheon meat gr', 2395904891],
       ['daster batik lengan pendek motif acak campur leher kancing batik karakter alhadi',
        4093212188],
       ['nescafe latte ', 3648931069]], dtype=object)

In [179]:
test_data[:5,:]

array([['karet kucir premium', 994676122],
       [' karet ikat rambut elastis untuk wanita', 994676122],
       [' pcs ikat rambut karet polos elastis gaya korea untuk wanita',
        994676122],
       [' pcs ikat rambut korea karet polos elastis gaya korea untuk wanita',
        994676122],
       ['korea women children hair tie head rope karet gelang elastis',
        994676122]], dtype=object)

## 3. Vectorizing: Self-trained document embedding

In [42]:
# Vectorizing
# Code Reference: ANLY 580 Natural Language Processing Lab5
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

K = 64
word_frequency_threshold = 2
epochs = 10
lr = 0.01
model = Doc2Vec(vector_size=K, min_count=word_frequency_threshold, epochs=epochs)
text_list = []
for i, line in enumerate(train_data[:,0]):
    tokens = gensim.utils.simple_preprocess(line)
    text_list.append(gensim.models.doc2vec.TaggedDocument(tokens, [i]))
model.build_vocab(text_list)
model.train(text_list, total_examples=model.corpus_count, epochs=model.epochs)

In [43]:
# train_embedding_vector
embedding_vector = np.array([model.infer_vector(i.split()) for i in train_data[:,0]])

In [44]:
embedding_vector.shape

(34085, 64)

In [48]:
# show the first three documents/titles embedding vector
embedding_vector[:3,:]

array([[ 0.00122813, -0.02922335,  0.01609793,  0.0035479 ,  0.0135329 ,
         0.00858887,  0.00972958, -0.01786955, -0.00997095, -0.00200482,
        -0.02523364, -0.0041339 , -0.02168355, -0.00059627, -0.01850777,
         0.03147927, -0.01674778, -0.0033539 , -0.04341891,  0.0438098 ,
         0.03923074,  0.03614574, -0.02088674, -0.05020263, -0.04269053,
         0.01221717, -0.06456788,  0.00186756,  0.0286314 , -0.03083886,
         0.0176802 , -0.01986532, -0.00576735, -0.01072335, -0.03602539,
         0.06110099,  0.03023976, -0.04990058,  0.01029437,  0.0385104 ,
         0.03461859, -0.02080715,  0.02517138, -0.03622035,  0.02491518,
         0.01342856,  0.03866169, -0.0948117 ,  0.02131087,  0.00050993,
         0.00973766, -0.01686111, -0.00440951,  0.0214761 ,  0.02242767,
        -0.0215011 , -0.00707173, -0.0392006 , -0.01723917, -0.01290667,
        -0.00744743, -0.02927901, -0.02212649, -0.00427793],
       [ 0.0415769 , -0.06845216,  0.07252624,  0.01788334,  0.

In [160]:
# test_embedding_vector
embedding_vector_test = np.array([model.infer_vector(i.split()) for i in test_data[:,0]])

In [161]:
embedding_vector_test.shape

(165, 64)

## 4. Find Similar items based on embedding space

In [129]:
def find_similar_product(product_title_description, top_n=11):
    doc_vector = model.infer_vector(product_title_description)
    sims = model.dv.most_similar([doc_vector], topn=top_n)
    return sims

In [51]:
train_data[0,0].split()

['paper', 'bag', 'victoria', 'secret']

In [132]:
find_similar_product(train_data[0,0].split())

[(0, 0.8270717263221741),
 (4860, 0.7933609485626221),
 (32996, 0.7849490642547607),
 (5189, 0.7684443593025208),
 (21839, 0.7584372162818909),
 (26230, 0.7549862265586853),
 (30633, 0.7474336624145508),
 (31336, 0.7390131950378418),
 (3807, 0.7383760809898376),
 (6716, 0.738276481628418),
 (24113, 0.7340051531791687)]

In [168]:
def estimator(data):
    counts = 0
    for index in range(len(data)):
        similar_indexs = [i[0] for i in find_similar_product(data[index,0].split()) if i[0]!=index][:10] # Top 10 similar indexs of A after exclude A itself
        predicted_labels = train_data[similar_indexs,1]    # Top 10 predicted labels of A except A itself
        if data[index,1] in predicted_labels:       # if the true label of A is in Top 10 similar predicted labels, it is true
            counts += 1
    accuracy = counts/len(data)
    return accuracy

In [164]:
# show estimation accuracy of train_data
estimator(train_data)

0.3154173390054276

In [169]:
# show estimation accuracy of test_data
estimator(test_data)

0.5212121212121212

In [172]:
def Top10_similar_index(data):
    similar_list = []
    for index in range(len(data)):
        similar_indexs = [i[0] for i in find_similar_product(data[index,0].split()) if i[0]!=index][:10] # Top 10 similar indexs of A after exclude A itself
        predicted_labels = train_data[similar_indexs,1]    # Top 10 predicted labels of A except A itself
        similar_list.append(predicted_labels)
    return similar_list

In [173]:
train_imilar_list = Top10_similar_index(train_data)
test_imilar_list = Top10_similar_index(test_data)

In [174]:
import pandas as pd
train_df = pd.DataFrame(train_data, columns = ['title','label'])

In [175]:
train_df['Top10_similar_indexs'] = train_imilar_list

In [176]:
train_df.head()

Unnamed: 0,title,label,Top10_similar_indexs
0,paper bag victoria secret,249114794,"[249114794, 1000106726, 1125219914, 1420214137..."
1,double tape vhb mm x original double foam tape,2937985045,"[475342649, 475342649, 475342649, 475342649, 2..."
2,maling tts canned pork luncheon meat gr,2395904891,"[2775619691, 449938131, 1839153978, 1902043911..."
3,daster batik lengan pendek motif acak campur l...,4093212188,"[3150867956, 2453599242, 2560881468, 264184112..."
4,nescafe latte,3648931069,"[191946414, 2220491818, 4025391613, 3740972720..."


In [177]:
test_df = pd.DataFrame(test_data, columns = ['title','label'])
test_df['Top10_similar_indexs'] = test_imilar_list
test_df.head()

Unnamed: 0,title,label,Top10_similar_indexs
0,karet kucir premium,994676122,"[2253967885, 46287381, 3606023430, 3308317766,..."
1,karet ikat rambut elastis untuk wanita,994676122,"[2665537113, 3952236600, 426500726, 1897963080..."
2,pcs ikat rambut karet polos elastis gaya kore...,994676122,"[3601571834, 3740972720, 665979983, 1179056926..."
3,pcs ikat rambut korea karet polos elastis gay...,994676122,"[3601571834, 2220491818, 665979983, 3889297911..."
4,korea women children hair tie head rope karet ...,994676122,"[3437394605, 513750353, 3048199336, 1251733888..."
