In [1]:
import pandas as pd
import numpy as np
import pickle
from keras.models import load_model
import warnings
import nltk
nltk.download('averaged_perceptron_tagger')
warnings.filterwarnings('ignore')

def save_obj(obj, name):
    """Pickle ``obj`` into the file ``<name>.pkl`` with the highest pickle protocol."""
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object stored in the pickle file ``<name>.pkl``.

    Only call this on files you trust: unpickling can execute arbitrary code.
    """
    source_path = name + '.pkl'
    with open(source_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

Using TensorFlow backend.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\YuYao\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [2]:
df = pd.read_csv('full_data_final version.csv')

In [3]:
# Keep the identifier plus the six text columns consumed by the models below.
# Columns are selected by name (instead of by position) so the cell does not
# silently pick the wrong fields if the CSV column order ever changes.
MODEL_INPUT_COLUMNS = [
    'product_id', 'brand', 'product_full_name', 'description',
    'brand_category', 'brand_canonical_url', 'details',
]
df = (
    df.loc[:, MODEL_INPUT_COLUMNS]
      .drop_duplicates()           # drop fully identical rows
      .set_index('product_id')     # product_id becomes the row label
)

## I. Style - Nanchun (Aslan) Shi

In [4]:
df1 = df.copy()

### 1.1 Embedding

In [5]:
## select columns to be used for embedding model

emb_df = df1.loc[:,['description','details']]

In [6]:
## import from self-created module; check Preprocessing.py for details

from Preprocessing import embedding_preprocessing
emb_pre = embedding_preprocessing()

In [7]:
## preprocessing

emb_vector_df = pd.DataFrame(emb_pre.preprocess(emb_df), index = emb_df.index)

In [8]:
## load embedding model

emb_model = load_model('style_embedding_model.h5')

OSError: Unable to open file (unable to open file: name = 'style_embedding_model.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [None]:
## predict

emb_pred_vectors = emb_model.predict(emb_vector_df)

### 1.2 TF-IDF

In [None]:
## select columns to be used for tf-idf model

tfidf_df = df1.loc[:,['brand','product_full_name','brand_category','brand_canonical_url']]

In [None]:
## import from self-created module; check Preprocessing.py for details

from Preprocessing import tfidf_preprocessing
tfidf_pre = tfidf_preprocessing()

In [None]:
## preprocessing

tfidf_vector_df = tfidf_pre.preprocess(tfidf_df).set_index(tfidf_df.index)

In [None]:
## load tf-idf model

tfidf_model = load_model('style_tfidf_model.h5')

In [None]:
## predict

tfidf_pred_vectors = tfidf_model.predict(tfidf_vector_df)

### 1.3 Prediction

In [None]:
def get_pred_classes(mat):
    """Return, for every row of ``mat``, the indices of its two largest values.

    Each row is argsorted in ascending order and the last two positions are
    kept, so the index of the largest value comes second. The result is a
    NumPy array with one row per input row.

    Note: a function with the same name is defined again in section 5.1 and
    keeps only one index per row; that later definition replaces this one
    once its cell has been executed.
    """
    top_two = [np.argsort(row)[-2:] for row in mat]
    return np.array(top_two)

label_dict = load_obj('style_label_dict_rev')

In [None]:
final_vectors = 0.4*emb_pred_vectors + 0.6*tfidf_pred_vectors

In [None]:
final_pred_classes = get_pred_classes(final_vectors)

In [None]:
df1['style_prediction'] = list(map(lambda x: [label_dict[x[0]], label_dict[x[1]]], final_pred_classes))

In [None]:
df1.head(3)

## II. Fit - Xinyi (Alex) Guo

In [9]:
import string 
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
import pickle
import warnings
warnings.filterwarnings('ignore')

In [10]:
df2 = pd.read_csv('full_data_final version.csv')

### 2.1 Preprocessing Functions

In [11]:
def removePunctuation(text, punctuations=string.punctuation+"``"+"’"+"”"):
    """Tokenize ``text`` with NLTK and drop tokens found in ``punctuations``.

    A token is dropped when its lowercase form satisfies ``in punctuations``;
    with the default (a string) that is a substring test. The surviving
    tokens are joined with single spaces.
    """
    kept_tokens = []
    for token in nltk.word_tokenize(text):
        if token.lower() in punctuations:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [12]:
# The language name is resolved to a file of the NLTK stopwords corpus; the
# lowercase spelling is used so the lookup also works on case-sensitive file systems.
nltk_stopwords = set(stopwords.words("english"))
def removeStopwords(text, stopwords=nltk_stopwords):
    """Tokenize ``text`` with NLTK and remove stop words.

    Parameters
    ----------
    text : str
        Text to clean.
    stopwords : collection of str
        Words to remove; a token is dropped when its lowercase form is in
        this collection. Defaults to the NLTK English stop word list.

    Returns
    -------
    str
        The remaining tokens (original casing) joined with single spaces.
    """
    words = nltk.word_tokenize(text)
    return " ".join(word for word in words if word.lower() not in stopwords)

In [13]:
def lemmatize(text):
    """Lowercase every NLTK token of ``text``, lemmatize it with WordNet and re-join with spaces."""
    wordnet = WordNetLemmatizer()
    lowered_tokens = [token.lower() for token in nltk.word_tokenize(text)]
    return " ".join(wordnet.lemmatize(token) for token in lowered_tokens)

In [14]:
def preprocessing(df, columns = ["brand", "product_full_name", "description", "details"]):
    """Clean the text columns of ``df`` in place and return ``df``.

    Newlines are deleted from ``details``; missing values in ``brand``,
    ``description``, ``details`` and ``product_full_name`` become
    'UNKNOWN_TOKEN'; every column listed in ``columns`` then goes through
    punctuation removal, stop word removal and lemmatization, in that order.
    """
    df['details'] = df['details'].str.replace("\n", "")
    # Placeholder for missing text so the string helpers below always receive a str
    for text_col in ('brand', 'description', 'details', 'product_full_name'):
        df[text_col] = df[text_col].fillna('UNKNOWN_TOKEN')
    for col in columns:
        df[col] = (
            df[col]
            .apply(removePunctuation)
            .apply(removeStopwords)
            .apply(lemmatize)
        )
    return df

### 2.2 Keras Modeling Functions

In [15]:
def integer_encode_documents(docs, tokenizer):
    """Encode ``docs`` by delegating to ``tokenizer.texts_to_sequences``."""
    encoded_docs = tokenizer.texts_to_sequences(docs)
    return encoded_docs

In [16]:
def get_max_token_length_per_doc(docs):
    """Return the largest number of whitespace-separated tokens found in any document of ``docs``."""
    longest = max(len(doc.split()) for doc in docs)
    return longest

In [17]:
def predict(X_test_df, target, max_length):
    """Score every row of ``X_test_df`` with the saved model for ``target``.

    The brand, product_full_name, description and details columns are joined
    with spaces into an ``input_doc`` column, which is written onto
    ``X_test_df`` itself. The model is read from ``<target>_model.h5`` and the
    tokenizer from ``<target>_tokenizer.pickle``; documents are integer
    encoded, post-padded to ``max_length`` and passed to ``model.predict``,
    whose output is returned unchanged.
    """
    # One space-joined document per row (stored on the caller's frame)
    X_test_df["input_doc"] = (
        X_test_df.brand + " " + X_test_df.product_full_name + " "
        + X_test_df.description + " " + X_test_df.details
    )
    test_docs = list(X_test_df.loc[:, "input_doc"].values)

    # Saved model and the tokenizer pickled alongside it
    model = load_model("{}_model.h5".format(target))
    with open('{}_tokenizer.pickle'.format(target), 'rb') as handle:
        tokenizer = pickle.load(handle)

    # Encode, pad at the end of each sequence, then score
    encoded = integer_encode_documents(test_docs, tokenizer)
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    return model.predict(padded, verbose = 0)

### 2.3 Main Function

In [18]:
def main(df):
    '''
    Predict the fit of each clothing item.

    Parameters
    ----------
    df : DataFrame
        Must contain the "brand", "product_full_name", "description" and
        "details" columns. Any row index is accepted.

    Returns
    -------
    DataFrame
        The same object that was passed in, with a "fit" column added in
        place. Each value is the name of the fit model that gave the row the
        highest score.
    '''
    fullData = df
    #Preprocess data
    print("Start preprocessing data...")
    # Work on an independent copy of the text columns: preprocessing() and
    # predict() both write columns onto the frame they receive.
    testData = fullData.loc[:, ["brand", "product_full_name", "description", "details"]].copy()
    testData = preprocessing(testData)
    print("Start predicting fit...")
    #Predict fit
    # Padding length to use with each fit model
    maxLengthDict = {'straightregular': 185,
                 'semifitted': 185,
                 'relaxed': 202,
                 'oversized': 202,
                 'fittedtailored': 202}
    fitType = ['straightregular', 'semifitted', 'relaxed', 'oversized', 'fittedtailored']
    # Share the input's row labels so scores stay attached to their rows
    prob_df = pd.DataFrame(index=testData.index)
    for fit in fitType:
        prediction_proba = predict(testData, target = fit, max_length = maxLengthDict[fit])
        prob_df[fit] = prediction_proba.flatten()
        print(fit, "fit prediction done")
    predicted_fit = prob_df.idxmax(axis=1)
    # Positional assignment: rows are in the same order as the input, so this
    # works for any index instead of relying on label alignment with a 0..n-1 index.
    fullData['fit'] = predicted_fit.values
    return fullData

In [19]:
df2 = main(df2)

Start preprocessing data...
Start predicting fit...


OSError: Unable to open file (unable to open file: name = 'straightregular_model.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [None]:
df2.head()

## III. Occasion - Bingru Xue

In [20]:
df3 = df.copy()

### 3.1 Preprocess

In [21]:
# Lowercase every column through the .str accessor
# (assumes all remaining columns hold text — TODO confirm).
for i in df3.columns:
    df3[i] = df3[i].str.lower()
# Missing values are filled only after lowercasing, so the placeholder keeps
# its uppercase spelling at this point.
df3= df3.replace(np.nan, 'UNKNOWN_TOKEN', regex=True)
# Delete newline characters from details (nothing is inserted in their place).
df3['details'] = df3['details'].str.replace("\n", "")
# Free-text field fed to the embedding model: description followed by details.
df3['text'] = df3['description']+' '+df3['details']

In [22]:
import spacy
import re
nlp = spacy.load('en_core_web_md')

def preprocess_text(sen):
    """Normalize ``sen`` and return its non-stop-word lemmas joined by spaces.

    Characters other than ASCII letters are replaced by spaces, isolated
    single letters are removed, whitespace runs are collapsed, and the result
    is processed with the spaCy pipeline ``nlp``.
    """
    # Anything that is not an ASCII letter becomes a space
    cleaned = re.sub('[^a-zA-Z]', ' ', sen)
    # Single letters surrounded by whitespace are dropped
    cleaned = re.sub(r"\s+[a-zA-Z]\s+", ' ', cleaned)
    # Whitespace runs shrink to one space
    cleaned = re.sub(r'\s+', ' ', cleaned)

    lemmas = []
    for token in nlp(cleaned):
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

df3['text'] = df3['text'].apply(preprocess_text)
df3['product_full_name'] = df3['product_full_name'].apply(preprocess_text)
df3['brand_category'] = df3['brand_category'].apply(preprocess_text)

In [23]:
def preprocess_url(url):
    """Turn a product URL into a space-separated string of lemmas.

    The 'https://www.' prefix and '.com' are removed, '/' and '-' become
    spaces, digit runs and isolated single letters are dropped, whitespace is
    collapsed, and the remaining text is processed with the spaCy pipeline
    ``nlp``; lemmas of non-stop-word tokens are joined with spaces.
    """
    # Dots are escaped so they only match a literal '.'. Unescaped, '.com'
    # matched ANY character followed by "com" and therefore also removed
    # e.g. "-com" from "-comfort".
    # NOTE(review): if the pickled occasion vectorizer was fitted on text
    # produced with the unescaped patterns, confirm that this correction does
    # not shift its inputs.
    url = re.sub(r'https://www\.', '', url)
    url = re.sub(r'\.com', '', url)
    url = re.sub('/', ' ', url)
    url = re.sub('-', ' ', url)
    url = re.sub(r'[0-9]+', ' ', url)
    # Isolated single letters, then whitespace runs
    url = re.sub(r"\s+[a-zA-Z]\s+", ' ', url)
    url = re.sub(r'\s+', ' ', url)

    doc = nlp(url)
    tokens = [token.lemma_ for token in doc if not token.is_stop]

    return " ".join(tokens)

df3['brand_canonical_url'] = df3['brand_canonical_url'].apply(preprocess_url)

In [24]:
df3['brand_info'] = df3['brand']+' '+df3['product_full_name']+' '+\
                    df3['brand_category']+' '+df3['brand_canonical_url']

### 3.2 Embedding Model: Description & Detail

In [25]:
docs = df3['text']

In [26]:
from nltk import word_tokenize
tokenizer = load_obj("occasion_tokeniver")
new_doc = []
def replace_oov(sentence):
    """Replace each NLTK token of ``sentence`` that is not a key of ``tokenizer.word_index`` with "UNKNOWN_TOKEN"."""
    known_words = tokenizer.word_index.keys()
    return " ".join(
        word if word in known_words else "UNKNOWN_TOKEN"
        for word in word_tokenize(sentence)
    )
# NOTE(review): the Series returned by apply() is only displayed here, never
# stored, so the `docs` passed to the tokenizer in the next cell still holds the
# unreplaced text — confirm whether `docs = docs.apply(replace_oov)` was intended.
docs.apply(replace_oov)

product_id
01DSE9TC2DQXDG6GWKW9NMJ416    modern pump rounded silhouette ankle strap ext...
01DSE9SKM19XNA6SJP36JZC065    dress jean sneaker dress tailor trouser heel t...
01DSJX8GD4DSAP76SPR85HRCMN    padded leather cover classic round sunglass uv...
01DSJVKJNS6F4KQ1QM6YYK9AW2    iconic mid design get add dose support padded ...
01DSK15ZD4D5A0QXA8NSD25YXE    UNKNOWN_TOKEN shade offer UNKNOWN_TOKEN view i...
                                                    ...                        
01DSNVXY8EJ9FQAJ3MPDMPASHD    unknown token cozy double breasted jacket craf...
01DSGYHA3RMCHENBJVQPBGXM97    UNKNOWN_TOKEN hour long wear water resistant U...
01DSJT8H12CAFQQH07SQSQWJ8C    ruffled trim sweatshirt lend romance stripe le...
01DSH2PF9J7QZ44D842B3GMCFN    pretty plaid dress velvet collar velvet bow po...
01DSH54D3PWHKFZK5A8A2JE3RQ    unknown token corduroy dress bow cotton cordur...
Name: text, Length: 48090, dtype: object

In [27]:
from keras.preprocessing.sequence import pad_sequences
token = tokenizer.texts_to_sequences(docs)
pad = pad_sequences(token, padding='post', maxlen=165, truncating='post')

In [28]:
embedding_model = load_model('occasion_embedding_model.h5')

In [29]:
occasion_type = ["cold weather","day to night","night out","vacation","weekend","work","workout"]
pred = embedding_model.predict(pad)
embedding_df = pd.DataFrame(data= pred, columns = occasion_type,index=df3.index)

### 3.3 Vectorization Model: Brand, Name, URL, Brand Category

In [30]:
info = df3['brand_info']

In [31]:
# Load the pickled vectorizer and transform the combined brand / name / category / url text.
vectorizer = load_obj("occasion_vectorizer")
vector = vectorizer.transform(info)
# Dense copy of the transformed matrix, one column per vectorizer feature.
# NOTE(review): tf_idf_df is not referenced again in the visible cells (the
# per-occasion models below are fed `vector`) — confirm it is still needed,
# since toarray() materialises the whole matrix in memory.
tf_idf_df = pd.DataFrame(vector.toarray(), columns=vectorizer.get_feature_names())

In [32]:
occasion_type = ["cold weather","day to night","night out","vacation","weekend","work","workout"]
vector_df = pd.DataFrame(columns = occasion_type, index = df3.index)

for label in occasion_type:
    filename = "{}".format(label)+"_vector_model"
    vector_model = load_obj(filename)
    prob = vector_model.predict_proba(vector)[:,1]
    vector_df[label] = prob

### 3.4 Combine embedding model and vector model

In [33]:
occasion_result_df = 0.4*embedding_df + 0.6*vector_df

In [34]:
def decision(probs):
    """Binarize a vector of scores in place and return it.

    If at least one score exceeds 0.5, scores above 0.5 become 1 and scores
    less than or equal to 0.5 become 0. Otherwise the entries equal to the
    maximum become 1 and all other entries become 0. ``probs`` itself is
    modified.
    """
    above = probs > 0.5
    if above.any():
        at_or_below = probs <= 0.5
        probs[above] = 1
        probs[at_or_below] = 0
        return probs
    # Nothing clears the threshold: keep only the best-scoring label(s)
    is_top = probs == np.max(probs)
    probs[is_top] = 1
    # The maximum is re-read after the write above
    probs[probs != np.max(probs)] = 0
    return probs

# Store the binarised rows explicitly. The next cell reads 0/1 flags from
# occasion_result_df; without the assignment it only sees them if decision()'s
# in-place writes happen to reach the frame through apply(), which depends on
# the pandas version. Each row is copied first because decision() modifies its argument.
occasion_result_df = occasion_result_df.apply(lambda row: decision(row.copy()), axis=1)
occasion_result_df

Unnamed: 0_level_0,cold weather,day to night,night out,vacation,weekend,work,workout
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
01DSE9TC2DQXDG6GWKW9NMJ416,0.0,1.0,0.0,0.0,0.0,0.0,0.0
01DSE9SKM19XNA6SJP36JZC065,0.0,1.0,0.0,0.0,1.0,1.0,0.0
01DSJX8GD4DSAP76SPR85HRCMN,0.0,0.0,0.0,1.0,1.0,0.0,0.0
01DSJVKJNS6F4KQ1QM6YYK9AW2,0.0,1.0,0.0,0.0,1.0,0.0,0.0
01DSK15ZD4D5A0QXA8NSD25YXE,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...
01DSNVXY8EJ9FQAJ3MPDMPASHD,1.0,1.0,0.0,0.0,1.0,0.0,0.0
01DSGYHA3RMCHENBJVQPBGXM97,0.0,0.0,0.0,1.0,1.0,0.0,0.0
01DSJT8H12CAFQQH07SQSQWJ8C,0.0,1.0,0.0,0.0,1.0,0.0,0.0
01DSH2PF9J7QZ44D842B3GMCFN,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [35]:
# For every product row, list the occasions whose flag is non-zero.
# - Only the occasion columns are scanned, so the cell can be re-run after the
#   list-valued 'Occasion' column has been added.
# - Rows are read positionally from the underlying array: one pass instead of a
#   label lookup per cell, and rows sharing a product_id are no longer merged.
flag_matrix = occasion_result_df[occasion_type].values
occasion = [
    [label for label, flag in zip(occasion_type, row) if flag]
    for row in flag_matrix
]
occasion_result_df['Occasion'] = occasion

In [36]:
occasion_result_df

Unnamed: 0_level_0,cold weather,day to night,night out,vacation,weekend,work,workout,Occasion
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
01DSE9TC2DQXDG6GWKW9NMJ416,0.0,1.0,0.0,0.0,0.0,0.0,0.0,[day to night]
01DSE9SKM19XNA6SJP36JZC065,0.0,1.0,0.0,0.0,1.0,1.0,0.0,"[day to night, weekend, work]"
01DSJX8GD4DSAP76SPR85HRCMN,0.0,0.0,0.0,1.0,1.0,0.0,0.0,"[vacation, weekend]"
01DSJVKJNS6F4KQ1QM6YYK9AW2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,"[day to night, weekend]"
01DSK15ZD4D5A0QXA8NSD25YXE,0.0,0.0,0.0,0.0,1.0,0.0,0.0,[weekend]
...,...,...,...,...,...,...,...,...
01DSNVXY8EJ9FQAJ3MPDMPASHD,1.0,1.0,0.0,0.0,1.0,0.0,0.0,"[cold weather, day to night, weekend]"
01DSGYHA3RMCHENBJVQPBGXM97,0.0,0.0,0.0,1.0,1.0,0.0,0.0,"[vacation, weekend]"
01DSJT8H12CAFQQH07SQSQWJ8C,0.0,1.0,0.0,0.0,1.0,0.0,0.0,"[day to night, weekend]"
01DSH2PF9J7QZ44D842B3GMCFN,0.0,0.0,0.0,0.0,1.0,0.0,0.0,[weekend]


In [37]:
occasion_final = occasion_result_df.loc[:,"Occasion"]

## IV. Patterns/Prints - Jiayue (Daniel) Chen

In [66]:
import pandas as pd
import string 
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
import pickle

In [67]:
full_data = pd.read_csv('full_data_final version.csv')

### 4.1 Preprocessing

In [68]:
#replace null values with UNKNOWN_TOKEN
# brand and product_full_name are filled as well: both are tokenized and
# concatenated into input_doc further down, and a missing value there is not a
# string, so it cannot be tokenized.
for col in ['brand', 'product_full_name', 'description', 'details', 'brand_category']:
    full_data[col] = full_data[col].fillna('UNKNOWN_TOKEN')
# Delete newline characters from details (nothing is inserted in their place).
full_data['details'] = full_data['details'].str.replace("\n", "")

In [69]:
import string 
# define a function to remove punctuation
def removePunctuation(text, punctuations=string.punctuation+"``"+"’"+"”"):
    """Tokenize ``text`` with NLTK and drop tokens found in ``punctuations``.

    A token is dropped when its lowercase form satisfies ``in punctuations``;
    with the default (a string) that is a substring test. The surviving
    tokens are joined with single spaces.
    """
    kept_tokens = []
    for token in nltk.word_tokenize(text):
        if token.lower() in punctuations:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# The language name is resolved to a file of the NLTK stopwords corpus; the
# lowercase spelling is used so the lookup also works on case-sensitive file systems.
nltk_stopwords = set(stopwords.words("english"))

# define a function to remove stopwords
def removeStopwords(text, stopwords=nltk_stopwords):
    """Tokenize ``text`` with NLTK and remove stop words.

    A token is dropped when its lowercase form is in ``stopwords`` (by default
    the NLTK English list). The remaining tokens keep their casing and are
    joined with single spaces.
    """
    words = nltk.word_tokenize(text)
    return " ".join(word for word in words if word.lower() not in stopwords)

# define a function to lemmatize all texts
def lemmatize(text):
    """Lowercase every NLTK token of ``text``, lemmatize it with WordNet and re-join with spaces."""
    wordnet = WordNetLemmatizer()
    lowered_tokens = [token.lower() for token in nltk.word_tokenize(text)]
    return " ".join(wordnet.lemmatize(token) for token in lowered_tokens)

columns = ["brand", "product_full_name", "description", "details"]
for col in columns: 
    full_data[col] = full_data[col].apply(removePunctuation)
    full_data[col] = full_data[col].apply(removeStopwords)
    full_data[col] = full_data[col].apply(lemmatize)

In [70]:
# One zero-initialised placeholder column per pattern label. The order is kept
# as-is because later cells address these columns by position.
for pattern_name in [
    'abstract', 'animal', 'camouflage', 'colorblock', 'dots',
    'floral', 'geometric', 'graphic', 'houndstooth', 'logo',
    'monogram', 'multiprint', 'paisley', 'pinstripe', 'plaid',
    'stripe', 'stripehorizontal', 'stripevertical', 'tiedye', 'tropical',
]:
    full_data[pattern_name] = 0
# Model input: brand, name, description and details joined by single spaces.
full_data["input_doc"] = (
    full_data["brand"] + " " + full_data["product_full_name"]
    + " " + full_data["description"] + " " + full_data["details"]
)

### 4.2 Modeling & Prediction

In [72]:
def integer_encode_documents(docs, tokenizer):
    """Encode ``docs`` by delegating to ``tokenizer.texts_to_sequences``."""
    encoded_docs = tokenizer.texts_to_sequences(docs)
    return encoded_docs

def get_max_token_length_per_doc(docs):
    """Return the largest number of whitespace-separated tokens found in any document of ``docs``."""
    longest = max(len(doc.split()) for doc in docs)
    return longest

In [73]:
# From the complete pattern's file, we generate a list of max_length
max_length_list = [150, 150, 124, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150]

In [74]:
# Names of the 20 pattern placeholder columns (positions 13..32 of full_data).
pattern_values = list(full_data.columns.values[13:33])
# The documents are identical for every pattern model, so build the list once
# instead of once per iteration.
X_test = full_data["input_doc"].values
test_docs = list(X_test)
for pattern, max_length in zip(pattern_values, max_length_list):
    # Each pattern has its own saved model and pickled tokenizer
    model = load_model("{}_model.h5".format(pattern))
    with open('{}_tokenizer.pickle'.format(pattern), 'rb') as handle:
        tokenizer = pickle.load(handle)

    #predict
    encoded_test_docs = integer_encode_documents(test_docs, tokenizer)
    padded_test_docs = pad_sequences(encoded_test_docs, maxlen = max_length, padding = 'post')
    prediction = model.predict(padded_test_docs, verbose = 0)
    # Replace the integer placeholder column by name with a flat float vector,
    # rather than writing a 2-D float array into an int column through iloc.
    full_data[pattern] = prediction.ravel()

In [75]:
# Copy the pattern score columns so that adding the label column below writes
# to an independent frame instead of to a slice of full_data.
df4 = full_data.iloc[:, 13:33].copy()
# Predicted pattern = name of the column holding the highest score in each row.
df4['patterns/prints'] = df4.idxmax(axis = 1)

In [76]:
full_data = pd.read_csv('full_data_final version.csv')
full_data['patterns/prints'] = df4['patterns/prints']
full_data.head()

Unnamed: 0,product_id,brand,mpn,product_full_name,description,brand_category,created_at,updated_at,deleted_at,brand_canonical_url,details,labels,bc_product_id,patterns/prints
0,01DSE9TC2DQXDG6GWKW9NMJ416,Banana Republic,514683.0,Ankle-Strap Pump,"A modern pump, in a rounded silhouette with an...",Unknown,2019-11-11 22:37:15.719107+00,2019-12-19 20:40:30.786144+00,,https://bananarepublic.gap.com/browse/product....,"A modern pump, in a rounded silhouette with an...","{""Needs Review""}",,animal
1,01DSE9SKM19XNA6SJP36JZC065,Banana Republic,526676.0,Petite Tie-Neck Top,Dress it down with jeans and sneakers or dress...,Unknown,2019-11-11 22:36:50.682513+00,2019-12-19 20:40:30.786144+00,,https://bananarepublic.gap.com/browse/product....,Dress it down with jeans and sneakers or dress...,"{""Needs Review""}",,floral
2,01DSJX8GD4DSAP76SPR85HRCMN,Loewe,400100000000.0,52MM Padded Leather Round Sunglasses,Padded leather covers classic round sunglasses.,JewelryAccessories/SunglassesReaders/RoundOval...,2019-11-13 17:33:59.581661+00,2019-12-19 20:40:30.786144+00,,https://www.saksfifthavenue.com/loewe-52mm-pad...,100% UV protection Case and cleaning cloth inc...,"{""Needs Review""}",,logo
3,01DSJVKJNS6F4KQ1QM6YYK9AW2,Converse,400012000000.0,Baby's & Little Kid's All-Star Two-Tone Mid-To...,The iconic mid-top design gets an added dose o...,"JustKids/Shoes/Baby024Months/BabyGirl,JustKids...",2019-11-13 17:05:05.203733+00,2019-12-19 20:40:30.786144+00,,https://www.saksfifthavenue.com/converse-babys...,Canvas upper Round toe Lace-up vamp SmartFOAM ...,"{""Needs Review""}",,logo
4,01DSK15ZD4D5A0QXA8NSD25YXE,Alexander McQueen,400011000000.0,64MM Rimless Sunglasses,Hexagonal shades offer a rimless view with int...,JewelryAccessories/SunglassesReaders/RoundOval,2019-11-13 18:42:30.941321+00,2019-12-19 20:40:30.786144+00,,https://www.saksfifthavenue.com/alexander-mcqu...,100% UV protection Gradient lenses Adjustable ...,"{""Needs Review""}",,animal


## V. Category - Yuyao Shen 


In [53]:
df5 = df.copy()

### 5.1 Preprocess

In [54]:
punctuations = string.punctuation
nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS
from spacy.lang.en import English
parser = English()

def clean_text(text):
    '''
    Clean a text with regular expressions and return it as a space-joined
    string of lemmas.

    Measurements are replaced by placeholder words (length_val,
    percentage_val, mm_val, cm_val, inches_val, weight_val, size_val), other
    standalone numbers are removed, and stop words and punctuation tokens are
    dropped.

    NOTE(review): this function is not called in the visible cells
    (clean_cate_test builds X_test from the raw columns) - confirm whether it
    should be applied before vectorization.
    '''
    # Remove anything enclosed in angle brackets (non-greedy)
    p = re.compile(r'<.*?>')
    text = p.sub('', text)
    text = text.lower()
    # Delete non-breaking space characters
    text = re.sub('\xa0', '',text)
    # A number followed by a straight or curly double quote -> length placeholder
    text = re.sub(r'\d{1,3}(\.|\’)?\d{1,3}?(\"|\”)',"length_val", text)
    # Percentages are replaced before punctuation (including '%') is removed below
    text = re.sub(r'\d{1,3}\s*?%',"percentage_val", text)
    # Trim punctuation from both ends; newlines / carriage returns become spaces
    text = text.strip(string.punctuation).replace("\n", " ").replace("\r", " ")
    # Delete every character that is neither a word character nor whitespace
    text = re.sub(r'[^\w\s]','',text)
    # Unit placeholders: 1-3 digits followed by the unit
    text = re.sub(r'\d{1,3}\s*?mm',"mm_val", text)
    text = re.sub(r'\d{1,3}\s*?cm',"cm_val", text)
    text = re.sub(r'\d{1,3}\s*?(inches|inch)',"inches_val", text)
    text = re.sub(r'\d{1,3}\s*?(lbs|kg)',"weight_val", text)
    text = re.sub(r'size\s*?\d{1,3}\s*?',"size_val", text)
    # Remaining standalone digit runs become a space
    text = re.sub(r'\b\d+\b',' ',text)
    # Collapse whitespace runs
    text = re.sub(r'\s+',' ',text) 
    # Tokenize with the spaCy English() object created above
    mytokens = parser(text)
    mytokens = [ word.lemma_.lower().strip() for word in mytokens ]
    # `punctuations` is a string, so the second test is a substring check
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]
    return " ".join(mytokens)

In [55]:
def clean_cate_test(inputFile):
    '''
    Build the category model input from the notebook-level DataFrame ``df5``.

    Parameters
    ----------
    inputFile : str
        Kept for backward compatibility and currently ignored: the data is
        taken from ``df5`` rather than read from this path.

    Returns
    -------
    test_data : DataFrame
        The 'product_full_name', 'details', 'description' and 'brand_category'
        columns of ``df5`` with missing values replaced by 'Unknown_token'.
    X_test : Series
        Those four columns joined with single spaces, one string per row.
    '''
    full_test_data = df5
    ### keep relevant columns only
    relevant_columns = ['product_full_name', 'details','description', 'brand_category']
    ### fill null values with 'Unknown_token'
    # fillna returns a new frame, which avoids filling a column subset in place
    # (the source of pandas' SettingWithCopyWarning).
    test_data = full_test_data[relevant_columns].fillna('Unknown_token')
    X_test = (
        test_data['product_full_name'] + ' ' + test_data['details']
        + ' ' + test_data['description'] + ' ' + test_data['brand_category']
    )
    return test_data, X_test

In [56]:
def get_pred_classes(mat):
    """Return, for every row of ``mat``, a one-element list holding the index of its largest value.

    Note: this definition replaces the function of the same name from
    section 1.3, which keeps two indices per row and returns a NumPy array.
    """
    return [list(np.argsort(row)[-1:]) for row in mat]

In [57]:
full_test_data, X_test = clean_cate_test('full_data_final version.csv')

### 5.2 Vectorization

In [58]:
### loading vectorizer
Pkl_Filename = "category_token.pkl"  
with open(Pkl_Filename, 'rb') as file:  
    tk = pickle.load(file)

In [59]:
### vectorize incoming data
from keras.preprocessing.sequence import pad_sequences
vector_text_test = tk.texts_to_sequences(X_test)
padded_token_lists_test = pad_sequences(vector_text_test, maxlen=175, padding='post')
X_test = pd.DataFrame(padded_token_lists_test, index = full_test_data.index)

### 5.3  Modeling 

In [60]:
### loading model
Pkl_Filename = "category_model.pkl"  
with open(Pkl_Filename, 'rb') as file:  
    model = pickle.load(file)

In [61]:
### use trained model to predict incoming data
pred_vectors_test = model.predict(X_test)
test_pred_classes = get_pred_classes(pred_vectors_test)
categories = ['accessory', 'bottom', 'onepiece', 'shoe', 'top']
cate_pred = [categories[i[0]] for i in test_pred_classes]
predicted_test = pd.Series(cate_pred).str.capitalize() 
df5['category']  = list(predicted_test)

In [80]:
df5.head()

Unnamed: 0_level_0,brand,product_full_name,description,brand_category,brand_canonical_url,details,category
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
01DSE9TC2DQXDG6GWKW9NMJ416,Banana Republic,Ankle-Strap Pump,"A modern pump, in a rounded silhouette with an...",Unknown,https://bananarepublic.gap.com/browse/product....,"A modern pump, in a rounded silhouette with an...",Shoe
01DSE9SKM19XNA6SJP36JZC065,Banana Republic,Petite Tie-Neck Top,Dress it down with jeans and sneakers or dress...,Unknown,https://bananarepublic.gap.com/browse/product....,Dress it down with jeans and sneakers or dress...,Onepiece
01DSJX8GD4DSAP76SPR85HRCMN,Loewe,52MM Padded Leather Round Sunglasses,Padded leather covers classic round sunglasses.,JewelryAccessories/SunglassesReaders/RoundOval...,https://www.saksfifthavenue.com/loewe-52mm-pad...,100% UV protection Case and cleaning cloth inc...,Accessory
01DSJVKJNS6F4KQ1QM6YYK9AW2,Converse,Baby's & Little Kid's All-Star Two-Tone Mid-To...,The iconic mid-top design gets an added dose o...,"JustKids/Shoes/Baby024Months/BabyGirl,JustKids...",https://www.saksfifthavenue.com/converse-babys...,Canvas upper Round toe Lace-up vamp SmartFOAM ...,Shoe
01DSK15ZD4D5A0QXA8NSD25YXE,Alexander McQueen,64MM Rimless Sunglasses,Hexagonal shades offer a rimless view with int...,JewelryAccessories/SunglassesReaders/RoundOval,https://www.saksfifthavenue.com/alexander-mcqu...,100% UV protection Gradient lenses Adjustable ...,Accessory


## Summary

In [None]:
def _by_product_id(frame, column):
    """Return ``column`` of ``frame`` as a Series keyed by unique product_id (first row kept per id)."""
    return frame.drop_duplicates(subset='product_id').set_index('product_id')[column]

# df1 and df5 are copies of df and share its product_id index, so their
# columns can be assigned directly.
df['style'] = df1['style_prediction']
df['category'] = df5['category']
# The occasion labels live in occasion_final (taken from occasion_result_df,
# which is indexed like df3 / df); df3 itself has no 'Occasion' column.
df['occasion'] = occasion_final
# df2 and full_data were read straight from the CSV and carry a 0..n-1 index,
# so aligned assignment into the product_id-indexed df would produce only NaN.
# Look the predictions up by product_id instead.
df['fit'] = df.index.map(_by_product_id(df2, 'fit'))
df['patterns/prints'] = df.index.map(_by_product_id(full_data, 'patterns/prints'))

In [None]:
df.head()