In [1]:
import os
import numpy as np
import pandas as pd
import re

In [2]:
from gensim.models import Word2Vec
from sklearn import model_selection, preprocessing
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers, utils

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Load Training data

In [3]:
currentdir = os.getcwd()
input_data = pd.read_csv(os.path.join(currentdir, r'ndsc-beginner\train.csv'))
input_data.head(5)

Unnamed: 0,itemid,title,Category,image_path
0,307504,nyx sex bomb pallete natural palette,0,beauty_image/6b2e9cbb279ac95703348368aa65da09.jpg
1,461203,etude house precious mineral any cushion pearl...,1,beauty_image/20450222d857c9571ba8fa23bdedc8c9.jpg
2,3592295,milani rose powder blush,2,beauty_image/6a5962bed605a3dd6604ca3a4278a4f9.jpg
3,4460167,etude house baby sweet sugar powder,3,beauty_image/56987ae186e8a8e71fcc5a261ca485da.jpg
4,5853995,bedak revlon color stay aqua mineral make up,3,beauty_image/9c6968066ebab57588c2f757a240d8b9.jpg


In [7]:
# Load training data
train_data = input_data[['title', 'Category']]
print(train_data.shape)
print(train_data.head())

data = train_data.sample(frac=0.5)
print(data.shape)
# data['Category'].value_counts()

(666615, 2)
                                               title  Category
0               nyx sex bomb pallete natural palette         0
1  etude house precious mineral any cushion pearl...         1
2                           milani rose powder blush         2
3                etude house baby sweet sugar powder         3
4       bedak revlon color stay aqua mineral make up         3
(333308, 2)


In [52]:
# Load and prepare test data
test = pd.read_csv(os.path.join(currentdir, r'ndsc-beginner\test.csv'))
print(test.head(5))
print(test.shape)

       itemid                                              title  \
0   370855998               flormar 7 white cream bb spf 30 40ml   
1   637234604  maybelline clear smooth all in one bb cream sp...   
2   690282890  murah innisfree eco natural green tea bb cream...   
3   930913462  loreal white perfect day cream spf 17 pa white...   
4  1039280071  hada labo cc cream ultimate anti aging spf 35 ...   

                                          image_path  
0  beauty_image/1588591395c5a254bab84042005f2a9f.jpg  
1  beauty_image/920985ed9587ea20f58686ea74e20f93.jpg  
2  beauty_image/90b40e5710f54352b243fcfb0f5d1d7f.jpg  
3  beauty_image/289c668ef3d70e1d929d602d52d5d78a.jpg  
4  beauty_image/d5b3e652c5822d2306f4560488ec30c6.jpg  
(172402,)


# Custom Word embeddings

In [108]:
text_df = train_data['title'].append(test['title'], ignore_index=True)
print(text_df.shape)
text_df.head()

0                 nyx sex bomb pallete natural palette
1    etude house precious mineral any cushion pearl...
2                             milani rose powder blush
3                  etude house baby sweet sugar powder
4         bedak revlon color stay aqua mineral make up
Name: title, dtype: object

In [109]:
# Data cleaning on title to prepare custom word embedding 
text_clean = []
for text in text_df:
    t = re.sub('[^a-zA-Z ]', '', text) # removes numbers from text: etu67de 64gb' = 'etude gb'
    t1 = re.sub(r'\b\w{1,1}\b', '', t) # removes words with length<1    
    text_clean.append(t1.split())

In [111]:
# Gensim custom word embedding model
embedding_dim = 100 # size of word_vector

word2vec_model = Word2Vec(text_clean, min_count=2, sg=1, size=embedding_dim ,workers=4)

word2vec_model.wv.save_word2vec_format('custom_embed_100d.txt')

print("Found %s unique vectors" %len(word2vec_model.wv.vocab.keys())) #29431 vectors

Found 78918 unique vectors


# Data Preparation - Training data

In [14]:
# Converting text data and labels to list of lists
texts =[]
labels = []

for i, line in data.iterrows():
    texts.append(line['title'])
    labels.append(line['Category'])

print("Found %s texts to train" % len(texts))

print(set(labels))

Found 333308 texts
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57}


In [16]:
# create a tokenizer 
tokenize = text.Tokenizer()
# fit tokenizer on texts
tokenize.fit_on_texts(texts)
# creates a dict of unique words and their ID
word_index = tokenize.word_index

print("Found %s unique tokens" % len(word_index))

Found 55552 unique tokens


In [36]:
# integer encode documents using word_index
encoded_texts = tokenize.texts_to_sequences(texts)

#to find max_sequence_length
def find_max_list(listoflist):
    list_len = [len(i) for i in listoflist]
    return max(list_len)

max_sequence_length = find_max_list(encoded_texts)+1

# pad the text vectors to make them of equal length
padded_texts = sequence.pad_sequences(encoded_texts ,padding='post',maxlen=max_sequence_length)

In [38]:
print("Texts:",texts[0:5])
print("\nEncoded Texts:",encoded_texts[0:5])
print("\nPadded Texts:",padded_texts[0:5])

Texts: ['sexy depp v neck women bodycon over hip dress', 'xiaomi mi 8 6 64 blue 2018', 'sony xperia x', 'iui fashion womens deep v black long sleeve cocktail evening gown party', 'bf kaos t shirt model longgar lengan panjang motif print untuk wanita']

Encoded Texts: [[17, 25796, 12, 7, 33, 102, 275, 934, 2], [92, 174, 191, 94, 249, 221, 142], [234, 288, 277], [25797, 80, 585, 276, 12, 42, 36, 49, 214, 360, 859, 110], [2040, 27, 46, 26, 10, 58, 1, 8, 13, 32, 5, 3]]

Padded Texts: [[   17 25796    12     7    33   102   275   934     2     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0]
 [   92   174   191    94   249   221   142     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0]
 [  234   288   277     0     0     0     0     0     0     0     0     0
      0     0     0     0     0

In [39]:
num_classes = np.max(labels)+1
labels= utils.to_categorical(labels, num_classes)

print("Shape of text data :", padded_texts.shape)
print("Shape of labels :", labels.shape)

Shape of text data : (333308, 33)
Shape of labels : (333308, 58)


# Data Preparation - Test data

In [2]:
# Converting text data to list of lists
test_texts =[]

for i, line in test.iterrows():
    test_texts.append(line['title'])

print("Found %s texts to test" % len(test_texts)) #172402 texts

In [96]:
# integer encode documents using word_index
test_encoded_texts = tokenize.texts_to_sequences(test_texts)

#to find max_sequence_length
def find_max_list(listoflist):
    list_len = [len(i) for i in listoflist]
    return max(list_len)

# test_max_sequence_length = find_max_list(test_encoded_texts)+1

# pad the text vectors to make them of equal length
x_test = sequence.pad_sequences(test_encoded_texts ,padding='post',maxlen=max_sequence_length)

In [97]:
# Split data and labels into training and validation
x_train, x_valid, y_train, y_valid = model_selection.train_test_split(padded_texts, labels, random_state =42)

print("Shape of x_train", x_train.shape)
print("Shape of x_valid", x_valid.shape)
print("Shape of y_train", y_train.shape)
print("Shape of y_valid", y_valid.shape)
print("Shape of x_test", x_test.shape)

Shape of x_train (249981, 33)
Shape of x_valid (83327, 33)
Shape of y_train (249981, 58)
Shape of y_valid (83327, 58)
Shape of x_test (172402, 33)



# Preparing for embedding layer(custom)

In [41]:
embedding_dim = 100
vocabulary_size = len(word_index)+1

In [112]:
# load the custom pre-trained word-embedding vectors 
embeddings_index = {}
f = open('custom_embed_100d.txt', encoding="utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.array(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors' % len(embeddings_index)) #29432 word vectors

Found 78919 word vectors


In [113]:
# create token-embedding mapping
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# Conv1D model

In [117]:
sequence_input = layers.Input(shape=(max_sequence_length,), dtype='int32')

embedding_layer = layers.Embedding(vocabulary_size,
                                    embedding_dim,
                                    weights=[embedding_matrix],
                                    input_length=max_sequence_length,
                                    trainable=True)

embedded_sequences = embedding_layer(sequence_input)
x = layers.Convolution1D(256, 2, activation='relu')(embedded_sequences)
x = layers.MaxPooling1D(2)(x)
x = layers.Convolution1D(128, 2, activation='relu')(x)
x = layers.MaxPooling1D(2)(x)
x = layers.Convolution1D(128, 2, activation='relu')(x)
x = layers.MaxPooling1D(2)(x) 
x = layers.Flatten()(x)
x = layers.Dense(128, activation='relu')(x)
x = layers.Dropout(0.25)(x)
preds = layers.Dense(num_classes, activation='softmax')(x)

model = models.Model(sequence_input, preds)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 33)                0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 33, 100)           5555300   
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 32, 256)           51456     
_________________________________________________________________
max_pooling1d_16 (MaxPooling (None, 16, 256)           0         
_________________________________________________________________
conv1d_17 (Conv1D)           (None, 15, 128)           65664     
_________________________________________________________________
max_pooling1d_17 (MaxPooling (None, 7, 128)            0         
_________________________________________________________________
conv1d_18 (Conv1D)           (None, 6, 128)            32896     
__________

In [118]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam', #rmsprop
              metrics=['acc'])

history = model.fit(x_train, y_train, validation_data=(x_valid, y_valid),
          epochs=5, batch_size=128, verbose=1 ) 

Train on 249981 samples, validate on 83327 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


# Plot Accuracy vs Loss

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
    plt.show()

plot_history(history)

# Submission files

In [122]:
test_label = model.predict(x_test)

In [124]:
y_test =[]
for i in test_label:
    max_id = np.where(i== np.amax(i))
    y_test.append(max_id[0][0])

print(y_test)

[5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 2, 3, 4, 5, 5, 3, 7, 5, 3, 3, 3, 5, 5, 3, 5, 3, 5, 4, 5, 5, 5, 3, 3, 5, 3, 3, 3, 3, 3, 3, 7, 5, 3, 5, 2, 3, 7, 5, 5, 3, 3, 5, 5, 4, 3, 3, 3, 1, 3, 4, 5, 5, 3, 4, 3, 5, 3, 4, 5, 5, 5, 5, 3, 3, 3, 5, 3, 3, 5, 5, 5, 3, 5, 3, 3, 3, 1, 5, 3, 5, 5, 5, 3, 5, 1, 3, 3, 5, 5, 5, 5, 3, 5, 1, 3, 1, 5, 5, 5, 5, 3, 5, 1, 3, 3, 5, 1, 3, 3, 5, 3, 1, 5, 5, 1, 5, 7, 3, 3, 5, 3, 5, 1, 3, 5, 4, 5, 5, 3, 5, 7, 5, 4, 3, 1, 3, 5, 3, 7, 3, 5, 5, 7, 3, 3, 5, 3, 7, 5, 3, 3, 1, 5, 5, 5, 3, 1, 3, 5, 5, 5, 5, 5, 3, 5, 1, 3, 3, 3, 5, 5, 2, 3, 3, 5, 3, 3, 3, 5, 5, 4, 3, 3, 5, 5, 3, 5, 5, 5, 5, 5, 1, 3, 5, 5, 5, 3, 3, 3, 5, 5, 5, 1, 5, 5, 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 5, 1, 7, 3, 5, 5, 5, 5, 3, 5, 5, 4, 5, 3, 3, 1, 4, 5, 3, 1, 5, 1, 5, 5, 5, 5, 5, 9, 3, 4, 3, 5, 5, 3, 3, 5, 3, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 1, 5, 3, 5, 5, 1, 5, 3, 5, 5, 5, 1, 3, 5, 5, 5, 3, 3, 3, 3, 3, 6, 5, 7, 5, 5, 5, 3, 5, 5, 5, 3, 5, 1, 4, 5, 5, 3, 1, 5, 5, 3, 5, 5, 7, 5, 3, 1, 3, 7, 5, 3, 3, 5, 5, 5, 5, 3, 1, 

In [125]:
test['Category'] = y_test
submission_df = test[['itemid', 'Category']]
submission_df.to_csv('submission.csv', index=False)