## Predicting Category
According to Womens+Attributes.xlsx, general category has 6 attributes: top, bottom, onepiece, shoe, handbag and scarf. From tagged data, attributes include top, bottom, onepiece, shoe, sweater, accessory, blazer, hoodie etc. To combine these two categorization, I chose to keep top, bottom, onepiece, shoe, accessory only and blazer, sweater and hoodie are included in top.

In [1]:
import pandas as pd
import numpy as np
import base64
import re
from sklearn.feature_extraction.text import CountVectorizer
import string
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
import sklearn.metrics as metrics
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import spacy
max_iter = 20000
import warnings
warnings.filterwarnings("ignore")
import pickle

from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.models import load_model
from keras.utils import to_categorical

Using TensorFlow backend.


In [2]:
punctuations = string.punctuation
nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS
parser = English()

def clean_text(text):
    '''
    use regular expression to clean text 
    replace numbers and units to variables
    '''
    p = re.compile(r'<.*?>')
    text = p.sub('', text)
    text = text.lower()
    text = re.sub('\xa0', '',text)
    text = re.sub(r'\d{1,3}(\.|\’)?\d{1,3}?(\"|\”)',"length_val", text)
    text = re.sub(r'\d{1,3}\s*?%',"percentage_val", text)
    text = text.strip(string.punctuation).replace("\n", " ").replace("\r", " ")
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub(r'\d{1,3}\s*?mm',"mm_val", text)
    text = re.sub(r'\d{1,3}\s*?cm',"cm_val", text)
    text = re.sub(r'\d{1,3}\s*?(inches|inch)',"inches_val", text)
    text = re.sub(r'\d{1,3}\s*?(lbs|kg)',"weight_val", text)
    text = re.sub(r'size\s*?\d{1,3}\s*?',"size_val", text)
    text = re.sub(r'\b\d+\b',' ',text)
    text = re.sub(r'\s+',' ',text)
    return text

def spacy_tokenizer(sentence):
    '''
    Tokenize and lemmatize texts and remove stopwords
    '''
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]
    return mytokens

In [3]:
def clean_cate_train(inputFile):
    '''
    Input training dataset file
    Keep relevant columns 
    Basic cleaning
    Output cleaned training dataset for category in a dataframe
    '''
    import pandas as pd
    train_data = pd.read_csv(inputFile)
    train_data = train_data[['product_id', 'product_color_id','brand', 'brand_category','product_full_name', 'description',  'details','attribute_name', 'attribute_value']]
    train_data.drop_duplicates(inplace = True)
    train_data_cate = train_data[train_data.attribute_name.str.lower() == 'category']
    train_data_cate.attribute_value = train_data_cate.attribute_value.str.lower()
    attribute_value = ['top'   if i in ['blazerscoatsjackets', 'blazers, coats & jackets','sweatshirthoodie', 'sweater', 'sweatshirt & hoodie','sweatshirt & hoodie'] else i.lower() for i in train_data_cate.attribute_value]
    attribute_value = ['onepiece'   if i in ['one piece', 'one-piece'] else i.lower() for i in attribute_value]
    train_data_cate.attribute_value = attribute_value
    train_data_cate.fillna('Unknown_token', inplace = True)    
    dummies = pd.get_dummies(train_data_cate['attribute_value'])
    train_data_cate = pd.concat([train_data_cate, dummies], axis = 1)
    train_data_cate.reset_index(inplace = True)
    train_data_cate = train_data_cate.drop('index', axis = 1)
    X = train_data_cate['description'] + ' '+train_data_cate['product_full_name'] + ' '+train_data_cate['details'] + ' '+train_data_cate['brand_category']
    df_y = pd.DataFrame(train_data_cate.iloc[:,-5:], index = train_data_cate.index)
    X = [clean_text(i) for i in X]
    return train_data_cate, X, df_y


In [4]:
punctuations = string.punctuation
nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS
parser = English()

def clean_text(text):
    '''
    use regular expression to clean text 
    replace numbers and units to variables
    '''
    p = re.compile(r'<.*?>')
    text = p.sub('', text)
    text = text.lower()
    text = re.sub('\xa0', '',text)
    text = re.sub(r'\d{1,3}(\.|\’)?\d{1,3}?(\"|\”)',"length_val", text)
    text = re.sub(r'\d{1,3}\s*?%',"percentage_val", text)
    text = text.strip(string.punctuation).replace("\n", " ").replace("\r", " ")
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub(r'\d{1,3}\s*?mm',"mm_val", text)
    text = re.sub(r'\d{1,3}\s*?cm',"cm_val", text)
    text = re.sub(r'\d{1,3}\s*?(inches|inch)',"inches_val", text)
    text = re.sub(r'\d{1,3}\s*?(lbs|kg)',"weight_val", text)
    text = re.sub(r'size\s*?\d{1,3}\s*?',"size_val", text)
    text = re.sub(r'\b\d+\b',' ',text)
    text = re.sub(r'\s+',' ',text) 
    mytokens = parser(text)
    mytokens = [ word.lemma_.lower().strip() for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]
    return " ".join(mytokens)

In [5]:
def clean_cate_test(inputFile):
    '''
    Input testing dataset that is going to be labeled
    Keep relevant columns 
    Basic cleaning
    Output cleaned testing dataset for category in a dataframe
    '''
    full_test_data = pd.read_csv(inputFile)
    test_data = full_test_data[['product_full_name', 'details','description', 'brand_category']]
    test_data.fillna('Unknown_token', inplace = True)
    X_test = test_data['product_full_name'] + ' '+ test_data['details'] + ' '+test_data['description']+ ' '+test_data['brand_category']
    return test_data, X_test

In [6]:
def integer_encode_documents(docs, tokenizer):
    return tokenizer.texts_to_sequences(docs)

In [7]:
def get_max_token_length_per_doc(docs):
    return max(list(map(lambda x: len(x.split()), docs)))

In [8]:
def get_pred_classes(mat):
    pred = list(map(lambda v: list(np.argsort(v))[-1:], mat))
    return pred

In [18]:
def get_true_classes(df):
    tclas=list()
    for v in df.values:
        tl = []
        for i,a in enumerate(v):
            if a == 1:
                tl.append(i)
        tclas.append(tl)
    return tclas

In [24]:
def compare(l1,l2):
    
    m=0
    for i in range(len(l2)):
        pred = set(l1[i])
        true = set(l2[i])
        
        if len(pred.intersection(true)) != 0:
            m += 1
    return m/len(l1)

### Testing accuracy

In [None]:
train_data_cate, X_clean, y_clean = clean_cate_train('new_merged_15W.csv')

In [10]:
max_length = get_max_token_length_per_doc(X_clean)
tk = Tokenizer(oov_token = 'Unknown_token')
tk.fit_on_texts(X_clean)
vocab_size = len(tk.word_index) + 1
vector_text = tk.texts_to_sequences(X_clean)
padded_token_lists = pad_sequences(vector_text, maxlen=max_length, padding='post')

In [12]:
df_x = pd.DataFrame(padded_token_lists, index = train_data_cate.index)
df_y = pd.DataFrame(train_data_cate.iloc[:,-5:], index = train_data_cate.index)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(df_x,df_y, 
                                                    test_size=0.3,
                                                    random_state = 20)

In [14]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=max_length))
model.add(Flatten())
model.add(Dense(50,activation='tanh'))
model.add(Dense(5, activation='sigmoid'))
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.fit(X_train,y_train,batch_size=200,epochs=100,verbose=0)

<keras.callbacks.callbacks.History at 0x1f82e643c88>

In [15]:
pred_vectors_train = model.predict(X_train)
pred_vectors_test = model.predict(X_test)

In [16]:
train_pred_classes = get_pred_classes(pred_vectors_train)
test_pred_classes = get_pred_classes(pred_vectors_test)

In [20]:
train_true_classes = get_true_classes(y_train)
test_true_classes = get_true_classes(y_test)

In [25]:
compare(train_pred_classes,train_true_classes)

0.9991616013414378

In [26]:
compare(test_pred_classes,test_true_classes)

0.9931573802541545

## Using all tagged data to train model

In [31]:
train_data_cate, X_train, y_train = clean_cate_train('new_merged_15W.csv')

In [32]:
max_length = get_max_token_length_per_doc(X_train)
tk = Tokenizer(oov_token = 'Unknown_token')
tk.fit_on_texts(X_train)
vocab_size = len(tk.word_index) + 1
vector_text = tk.texts_to_sequences(X_train)
padded_token_lists = pad_sequences(vector_text, maxlen=max_length, padding='post')

In [33]:
df_x = pd.DataFrame(padded_token_lists, index = train_data_cate.index)
df_y = pd.DataFrame(train_data_cate.iloc[:,-5:], index = train_data_cate.index)

In [34]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=max_length))
model.add(Flatten())
model.add(Dense(50,activation='tanh'))
model.add(Dense(5, activation='sigmoid'))
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.fit(df_x,df_y,batch_size=200,epochs=100,verbose=0)

<keras.callbacks.callbacks.History at 0x1f82c95d448>

In [54]:
Pkl_Filename = "category_model.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(model, file)

In [55]:
Pkl_Filename = "category_token.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(tk, file)

## Predict full data set using trained model

In [35]:
full_test_data, X_test = clean_cate_test('full_data_final version.csv')
X_test = X_test.apply(clean_text)

In [37]:
vector_text_test = tk.texts_to_sequences(X_test)
padded_token_lists_test = pad_sequences(vector_text_test, maxlen=max_length, padding='post')
X_test = pd.DataFrame(padded_token_lists_test, index = full_test_data.index)

In [38]:
pred_vectors_test = model.predict(X_test)

In [39]:
test_pred_classes = get_pred_classes(pred_vectors_test)

In [40]:
categories = list(y_train.columns)

In [41]:
cate_pred = [categories[i[0]] for i in test_pred_classes]

In [42]:
predicted_test = pd.Series(cate_pred).str.capitalize() 
full_test_data['category']  = predicted_test

In [43]:
full_test_data

Unnamed: 0,product_full_name,details,description,brand_category,category
0,Ankle-Strap Pump,"A modern pump, in a rounded silhouette with an...","A modern pump, in a rounded silhouette with an...",Unknown,Shoe
1,Petite Tie-Neck Top,Dress it down with jeans and sneakers or dress...,Dress it down with jeans and sneakers or dress...,Unknown,Onepiece
2,52MM Padded Leather Round Sunglasses,100% UV protection Case and cleaning cloth inc...,Padded leather covers classic round sunglasses.,JewelryAccessories/SunglassesReaders/RoundOval...,Accessory
3,Baby's & Little Kid's All-Star Two-Tone Mid-To...,Canvas upper Round toe Lace-up vamp SmartFOAM ...,The iconic mid-top design gets an added dose o...,"JustKids/Shoes/Baby024Months/BabyGirl,JustKids...",Shoe
4,64MM Rimless Sunglasses,100% UV protection Gradient lenses Adjustable ...,Hexagonal shades offer a rimless view with int...,JewelryAccessories/SunglassesReaders/RoundOval,Accessory
...,...,...,...,...,...
48974,Baby's Hooded Jacket,Cozy double breasted jacket crafted from cotto...,Unknown_token,JustKids/Baby024months/InfantGirls/Outerwear,Top
48975,Flawless Fusion Ultra-Longwear Foundation,"WHAT IT ISA 15-hour long wearing, water resist...","WHAT IT ISA 15-hour long wearing, water resist...",SaksBeautyPlace/ForHer/Color/Foundation/Liquid...,Top
48976,Baby Girl's 2-Piece Ruffle Sweatshirt & Stripe...,"Crewneck Long sleeves Rib-knit neck, cuffs and...",Ruffled-trim sweatshirt lends romance to this ...,"JustKids/Baby024months/InfantGirls/Tops,JustKi...",Top
48977,Little Girl's Plaid & Velvet Dress,Peter Pan collar Short sleeves Back zipper Two...,Pretty plaid dress with velvet collar and velv...,"JustKids/Girls214/ToddlerGirls24/Dresses,JustK...",Onepiece


In [57]:
full_test_data.to_csv('predicted_category.csv')