In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten, LSTM
from keras.layers import Embedding
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import GlobalMaxPooling1D
from keras.models import Model
from sklearn.model_selection import train_test_split
from keras.layers import Input
from keras.layers.merge import Concatenate

Using TensorFlow backend.


In [5]:
data = pd.read_csv("occasion_tag_all.csv", index_col=0)
for i in data.columns:
    data[i] = data[i].str.lower()
print(data.shape)
data.head()

(21263, 9)


Unnamed: 0,product_id,brand,name,description,brand_category,details,product_color_id,attribute_name,attribute_value
2,01dtjcerf6f4nrz2wsjffa1eys,theory,teah stretch-silk camisole,"beige stretch-silk slips on 93% silk, 7% spand...",clothing / tops / tanks and camis,"fits true to size, take your normal size\ncut ...",01dtjcergehdmq5fajqs2pz6nj,occasion,weekend
3,01dtjcerf6f4nrz2wsjffa1eys,theory,teah stretch-silk camisole,"beige stretch-silk slips on 93% silk, 7% spand...",clothing / tops / tanks and camis,"fits true to size, take your normal size\ncut ...",01dtjcergehdmq5fajqs2pz6nj,occasion,work
12,01dtjcerf6f4nrz2wsjffa1eys,theory,teah stretch-silk camisole,"beige stretch-silk slips on 93% silk, 7% spand...",clothing / tops / tanks and camis,"fits true to size, take your normal size\ncut ...",01dtjcergehdmq5fajqs2pz6nj,occasion,day to night
15,01dtjcerf6f4nrz2wsjffa1eys,theory,teah stretch-silk camisole,"beige stretch-silk slips on 93% silk, 7% spand...",clothing / tops / tanks and camis,"fits true to size, take your normal size\ncut ...",01dtjcergehdmq5fajqs2pz6nj,occasion,night out
30,01dtjcerf6f4nrz2wsjffa1eys,theory,teah stretch-silk camisole,"beige stretch-silk slips on 93% silk, 7% spand...",clothing / tops / tanks and camis,"fits true to size, take your normal size\ncut ...",01dtjcergehdmq5fajqs2pz6nj,occasion,work


In [6]:
data['attribute_value'].unique()

array(['weekend', 'work', 'day to night', 'night out', 'nightout',
       'daytonight', 'vacation', 'workout', 'coldweather'], dtype=object)

In [9]:
def regex_cleaning(value):
    new_value = re.sub("nightout",'night out', value)
    new_value1 = re.sub("daytonight", 'day to night', new_value)
    new_value2 = re.sub("coldweather", 'cold weather', new_value1)
    return new_value2
data['attribute_value'] = data['attribute_value'].apply(regex_cleaning)

In [10]:
data['attribute_value'].unique()

array(['weekend', 'work', 'day to night', 'night out', 'vacation',
       'workout', 'cold weather'], dtype=object)

In [11]:
# Check whether we have duplicate records
data.groupby('product_id')['attribute_value'].count()

product_id
01dpc9gstt72khnn0mndnkh7rd    12
01dpcb2keavxxkfvm7fxbne4vy    20
01dpcg1c1p0mqav9nms3n1tdaa     8
01dpchnew5f2rhjq3njmvpk6se    96
01dpchnqm0pa0sxzzzx85pf2zj     4
                              ..
01e6076gtce5p3vh76vwjh4my9     3
01e6078g3gratf2c96vkyywsgd     2
01e6079dg58yw9k78d57c6j2y1     3
01e6079qfkh4hpzfq31t6wdrrx     1
01e607bhrqajdz76mjfn7rprk1     2
Name: attribute_value, Length: 3914, dtype: int64

In [12]:
# remove duplicate records
data.drop_duplicates(subset =["product_id","product_color_id",'attribute_name','attribute_value'],
                     keep = "first",inplace = True) 
data.shape

(12625, 9)

In [13]:
#distribution of each occasion attribute value
data['attribute_value'].value_counts(normalize=True)

weekend         0.329426
day to night    0.301307
work            0.148277
night out       0.095842
vacation        0.077069
cold weather    0.027564
workout         0.020515
Name: attribute_value, dtype: float64

In [23]:
# drop records with both null value in description and details
data.dropna(subset=["description", "details"], how='all',inplace =True)

In [24]:
data.isnull().sum()

product_id             0
brand                  0
name                   0
description          888
brand_category       715
details             1101
product_color_id       0
attribute_name         0
attribute_value        0
dtype: int64

In [37]:
# Combine details and description into one "Text" column
data= data.replace(np.nan, '', regex=True)
data['text'] = data['description']+' '+data['details']
data['id'] = data['product_id']+data['product_color_id']

In [26]:
dummies = pd.get_dummies(data['attribute_value'])
dummies.head()

Unnamed: 0,cold weather,day to night,night out,vacation,weekend,work,workout
2,0,0,0,0,1,0,0
3,0,0,0,0,0,1,0
12,0,1,0,0,0,0,0
15,0,0,1,0,0,0,0
94,0,0,1,0,0,0,0


In [27]:
data = pd.concat([data, dummies], axis=1, sort=False)

In [38]:
data.head()

Unnamed: 0,product_id,brand,name,description,brand_category,details,product_color_id,attribute_name,attribute_value,text,cold weather,day to night,night out,vacation,weekend,work,workout,id
2,01dtjcerf6f4nrz2wsjffa1eys,theory,teah stretch-silk camisole,"beige stretch-silk slips on 93% silk, 7% spand...",clothing / tops / tanks and camis,"fits true to size, take your normal size\ncut ...",01dtjcergehdmq5fajqs2pz6nj,occasion,weekend,"beige stretch-silk slips on 93% silk, 7% spand...",0,0,0,0,1,0,0,01dtjcerf6f4nrz2wsjffa1eys01dtjcergehdmq5fajqs...
3,01dtjcerf6f4nrz2wsjffa1eys,theory,teah stretch-silk camisole,"beige stretch-silk slips on 93% silk, 7% spand...",clothing / tops / tanks and camis,"fits true to size, take your normal size\ncut ...",01dtjcergehdmq5fajqs2pz6nj,occasion,work,"beige stretch-silk slips on 93% silk, 7% spand...",0,0,0,0,0,1,0,01dtjcerf6f4nrz2wsjffa1eys01dtjcergehdmq5fajqs...
12,01dtjcerf6f4nrz2wsjffa1eys,theory,teah stretch-silk camisole,"beige stretch-silk slips on 93% silk, 7% spand...",clothing / tops / tanks and camis,"fits true to size, take your normal size\ncut ...",01dtjcergehdmq5fajqs2pz6nj,occasion,day to night,"beige stretch-silk slips on 93% silk, 7% spand...",0,1,0,0,0,0,0,01dtjcerf6f4nrz2wsjffa1eys01dtjcergehdmq5fajqs...
15,01dtjcerf6f4nrz2wsjffa1eys,theory,teah stretch-silk camisole,"beige stretch-silk slips on 93% silk, 7% spand...",clothing / tops / tanks and camis,"fits true to size, take your normal size\ncut ...",01dtjcergehdmq5fajqs2pz6nj,occasion,night out,"beige stretch-silk slips on 93% silk, 7% spand...",0,0,1,0,0,0,0,01dtjcerf6f4nrz2wsjffa1eys01dtjcergehdmq5fajqs...
94,01dvpbj6464ykygvae0a1hmkgn,alexander wang,layered velvet mini dress,black velvet concealed hook and zip fastening ...,clothing / dresses / mini,"fits true to size, take your normal size \ndes...",01dvpbj64g517982dcfq9wsfn0,occasion,night out,black velvet concealed hook and zip fastening ...,0,0,1,0,0,0,0,01dvpbj6464ykygvae0a1hmkgn01dvpbj64g517982dcfq...


## Description & Details

In [39]:
text = data.iloc[:,[-1,9,10,11,12,13,14,15,16]]
text.head()

Unnamed: 0,id,text,cold weather,day to night,night out,vacation,weekend,work,workout
2,01dtjcerf6f4nrz2wsjffa1eys01dtjcergehdmq5fajqs...,"beige stretch-silk slips on 93% silk, 7% spand...",0,0,0,0,1,0,0
3,01dtjcerf6f4nrz2wsjffa1eys01dtjcergehdmq5fajqs...,"beige stretch-silk slips on 93% silk, 7% spand...",0,0,0,0,0,1,0
12,01dtjcerf6f4nrz2wsjffa1eys01dtjcergehdmq5fajqs...,"beige stretch-silk slips on 93% silk, 7% spand...",0,1,0,0,0,0,0
15,01dtjcerf6f4nrz2wsjffa1eys01dtjcergehdmq5fajqs...,"beige stretch-silk slips on 93% silk, 7% spand...",0,0,1,0,0,0,0
94,01dvpbj6464ykygvae0a1hmkgn01dvpbj64g517982dcfq...,black velvet concealed hook and zip fastening ...,0,0,1,0,0,0,0


In [40]:
text = text.groupby(['id','text']).sum().reset_index()
text.head()

Unnamed: 0,id,text,cold weather,day to night,night out,vacation,weekend,work,workout
0,01dpc9gstt72khnn0mndnkh7rd01dpc9gstzypv54n9b1t...,"your new wear-to-work essential, this roomy to...",0,1,0,0,0,1,0
1,01dpc9gstt72khnn0mndnkh7rd01dpc9gtp1qdcw3exqtt...,"your new wear-to-work essential, this roomy to...",0,1,0,0,0,1,0
2,01dpc9gstt72khnn0mndnkh7rd01dpc9gvg1zn8rrax77s...,"your new wear-to-work essential, this roomy to...",0,1,0,0,0,1,0
3,01dpcb2keavxxkfvm7fxbne4vy01dpcb2kef1cfywfwa97...,a classic fall silhouette in a classic fall-fr...,0,1,0,0,1,1,0
4,01dpcb2keavxxkfvm7fxbne4vy01dpcb2mmtpctv3rnjv2...,a classic fall silhouette in a classic fall-fr...,0,1,0,0,1,1,0


In [46]:
# check one product
text[text["id"]=="01dtjcerf6f4nrz2wsjffa1eys01dtjcergehdmq5fajqs2pz6nj"]

Unnamed: 0,id,text,cold weather,day to night,night out,vacation,weekend,work,workout
877,01dtjcerf6f4nrz2wsjffa1eys01dtjcergehdmq5fajqs...,"beige stretch-silk slips on 93% silk, 7% spand...",0,1,1,0,1,1,0


## Creating Embedding: Description

In [47]:
docs = text['text']
labels = text.iloc[:,2:]

In [48]:
# Preprocessing: remove stopwords/punctuation, lemmatization
import spacy
nlp = spacy.load('en_core_web_md')

def preprocess_text(sen):
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sen)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    
    #remove stopwords and do lemmatization
    doc = nlp(sentence)
    tokens = [token.lemma_ for token in doc if not token.is_stop]
    
    return " ".join(tokens)

In [49]:
X=[]
for sen in list(docs):
    X.append(preprocess_text(sen))

y=labels.values

In [50]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2,random_state=42)

## Convert text inputs to embedded vectors

In [None]:
tokenizer = Tokenizer(num_words=5000, oov_token="UNKNOWN_TOKEN")
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1

In [52]:
from keras.preprocessing.text import text_to_word_sequence
from typing import List
def get_max_token_length_per_doc(docs: List[List[str]])-> int:
    return max(list(map(lambda x: len(x.split()), docs)))
max_length = get_max_token_length_per_doc(docs)
max_length

256

In [53]:
X_train = pad_sequences(X_train, padding='post', maxlen=max_length)
X_test = pad_sequences(X_test, padding='post', maxlen=max_length)

In [54]:
X_train.shape

(4319, 256)

In [55]:
# using GloVe word embeddings to convert text inputs 
# to their numeric counterparts.

def load_glove_vectors():
    embeddings_index = {}
    with open('glove.6B.100d.txt') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('Loaded %s word vectors.' % len(embeddings_index))
    return embeddings_index

embeddings_index = load_glove_vectors()

Loaded 400000 word vectors.


In [56]:
EMBEDDING_SIZE = 100
embedding_matrix = zeros((vocab_size, EMBEDDING_SIZE))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: 
        embedding_matrix[i] = embedding_vector

## Predictive Model: LSTM

In [62]:
deep_inputs = Input(shape=(max_length,))
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=False)(deep_inputs)
LSTM_Layer_1 = LSTM(128)(embedding_layer)
dense_layer_1 = Dense(7, activation='sigmoid')(LSTM_Layer_1)
model_lstm = Model(inputs=deep_inputs, outputs=dense_layer_1)

model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [63]:
model_lstm.fit(X_train, y_train, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x1a77c9eed0>

In [64]:
# Predict on test dataset
score = model_lstm.evaluate(X_test, y_test, verbose=1)

print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Test Score: 0.4656688606297528
Test Accuracy: 0.7973544597625732


In [65]:
## Try another random state
X_train2, X_test2, y_train2, y_test2 = \
    train_test_split(X, y, test_size=0.2,random_state=101)

In [66]:
tokenizer = Tokenizer(num_words=5000, oov_token="UNKNOWN_TOKEN")
tokenizer.fit_on_texts(X_train2)

X_train2 = tokenizer.texts_to_sequences(X_train2)
X_test2 = tokenizer.texts_to_sequences(X_test2)

vocab_size = len(tokenizer.word_index) + 1

X_train2 = pad_sequences(X_train2, padding='post', maxlen=max_length)
X_test2 = pad_sequences(X_test2, padding='post', maxlen=max_length)

In [67]:
model_lstm.fit(X_train2, y_train2, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x1a77364c50>

In [68]:
score2 = model_lstm.evaluate(X_test2, y_test2, verbose=1)

print("Test Score:", score2[0])
print("Test Accuracy:", score2[1])

Test Score: 0.4697401214528967
Test Accuracy: 0.7961640954017639
