In [1]:
import pandas as pd
import numpy as np
import pickle
from keras.models import load_model
import warnings
warnings.filterwarnings('ignore')

def save_obj(obj, name):
    """Serialize ``obj`` with pickle into the file ``<name>.pkl``.

    The ``.pkl`` suffix is appended to ``name`` and the file is opened in
    binary write mode, so an existing file at that path is overwritten.
    The highest pickle protocol available to the interpreter is used.
    """
    pickle_path = name + '.pkl'
    with open(pickle_path, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object stored in the pickle file ``<name>.pkl``.

    The ``.pkl`` suffix is appended to ``name`` before the file is opened.
    """
    # NOTE: unpickling can execute arbitrary code - only load files you trust.
    pickle_path = name + '.pkl'
    with open(pickle_path, 'rb') as source:
        restored = pickle.load(source)
    return restored

Using TensorFlow backend.


In [2]:
# Load the raw product catalogue; the path is relative to the notebook's working directory.
df = pd.read_csv('full_data_final version.csv')

In [3]:
# Keep the columns at positions 0-1, 3-5 and 9-10 (product_id must be among
# them for the set_index call), drop exact duplicate rows, then index by product_id.
# NOTE: the selection is positional, so this cell is only valid on the freshly loaded frame.
df = (
    df.iloc[:, np.r_[0:2, 3:6, 9:11]]
      .drop_duplicates()
      .set_index(['product_id'])
)

## I. Style - Nanchun (Aslan) Shi

In [4]:
# Work on a copy so the column added later in this section does not touch df.
df1 = df.copy()

### 1.1 Embedding

In [5]:
## free-text columns that are fed to the embedding model

emb_df = df1[['description', 'details']]

In [6]:
## embedding_preprocessing comes from the project's own Preprocessing.py module;
## see that file for what the preprocessing does

from Preprocessing import embedding_preprocessing
emb_pre = embedding_preprocessing()

In [7]:
## preprocessing: run the embedding preprocessor on the selected text columns and
## wrap the result in a DataFrame that reuses emb_df's index (product_id)

emb_vector_df = pd.DataFrame(emb_pre.preprocess(emb_df), index = emb_df.index)

In [8]:
## load the saved Keras embedding model from style_embedding_model.h5
## (path is relative to the notebook's working directory)

emb_model = load_model('style_embedding_model.h5')

In [9]:
## predict: one output vector per product from the embedding model
## (later blended with the TF-IDF model's output)

emb_pred_vectors = emb_model.predict(emb_vector_df)

### 1.2 TF-IDF

In [10]:
## columns that are fed to the tf-idf model

tfidf_df = df1[['brand', 'product_full_name', 'brand_category', 'brand_canonical_url']]

In [11]:
## tfidf_preprocessing comes from the project's own Preprocessing.py module;
## see that file for what the preprocessing does

from Preprocessing import tfidf_preprocessing
tfidf_pre = tfidf_preprocessing()

In [12]:
## preprocessing: run the tf-idf preprocessor and re-attach tfidf_df's index
## (product_id) to the result

tfidf_vector_df = tfidf_pre.preprocess(tfidf_df).set_index(tfidf_df.index)

In [13]:
## load the saved Keras tf-idf model from style_tfidf_model.h5
## (path is relative to the notebook's working directory)

tfidf_model = load_model('style_tfidf_model.h5')

In [14]:
## predict: one output vector per product from the tf-idf model
## (later blended with the embedding model's output)

tfidf_pred_vectors = tfidf_model.predict(tfidf_vector_df)

### 1.3 Prediction

In [15]:
def get_pred_classes(mat):
    """Return, for every row of ``mat``, the positions of its two largest values.

    Each row is sorted with ``np.argsort`` (ascending) and the last two
    positions are kept, so within a pair the runner-up comes first and the
    top-scoring position comes last.

    Parameters
    ----------
    mat : iterable of 1-D score sequences (e.g. a 2-D array).

    Returns
    -------
    numpy.ndarray with one pair of positions per row of ``mat``.
    """
    top_two_per_row = [np.argsort(scores)[-2:] for scores in mat]
    return np.array(top_two_per_row)

# Lookup table read from style_label_dict_rev.pkl; it is indexed below with the
# predicted class positions to obtain the style label names.
label_dict = load_obj('style_label_dict_rev')

In [16]:
# Blend the two models' outputs; the tf-idf model gets the larger weight.
EMB_WEIGHT, TFIDF_WEIGHT = 0.4, 0.6
final_vectors = EMB_WEIGHT*emb_pred_vectors + TFIDF_WEIGHT*tfidf_pred_vectors

In [17]:
# Positions of the two highest blended scores for every product.
final_pred_classes = get_pred_classes(final_vectors)

In [18]:
# Translate each pair of predicted class positions into its two style labels.
df1['style_prediction'] = [
    [label_dict[pair[0]], label_dict[pair[1]]] for pair in final_pred_classes
]

In [19]:
# Preview the first three products together with their predicted style pair.
df1.head(3)

Unnamed: 0_level_0,brand,product_full_name,description,brand_category,brand_canonical_url,details,style_prediction
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
01DSE9TC2DQXDG6GWKW9NMJ416,Banana Republic,Ankle-Strap Pump,"A modern pump, in a rounded silhouette with an...",Unknown,https://bananarepublic.gap.com/browse/product....,"A modern pump, in a rounded silhouette with an...","[modern, businesscasual]"
01DSE9SKM19XNA6SJP36JZC065,Banana Republic,Petite Tie-Neck Top,Dress it down with jeans and sneakers or dress...,Unknown,https://bananarepublic.gap.com/browse/product....,Dress it down with jeans and sneakers or dress...,"[businesscasual, classic]"
01DSJX8GD4DSAP76SPR85HRCMN,Loewe,52MM Padded Leather Round Sunglasses,Padded leather covers classic round sunglasses.,JewelryAccessories/SunglassesReaders/RoundOval...,https://www.saksfifthavenue.com/loewe-52mm-pad...,100% UV protection Case and cleaning cloth inc...,"[casual, classic]"


## II. Fit - Xinyi (Alex) Guo

In [2]:
import string 
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
import pickle
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Re-read the full catalogue for the fit section (all columns, default integer index).
df2 = pd.read_csv('full_data_final version.csv')

### 2.1 Preprocessing Functions

In [4]:
def removePunctuation(text, punctuations=string.punctuation+"``"+"’"+"”"):
    """Tokenize ``text`` and drop every token found in ``punctuations``.

    Tokens come from ``nltk.word_tokenize``; a token is discarded when its
    lower-cased form is ``in punctuations``. With the default (a string) that
    is a substring test. The surviving tokens are re-joined with single spaces.
    """
    kept_tokens = []
    for token in nltk.word_tokenize(text):
        if token.lower() in punctuations:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [5]:
# NLTK stores its stop-word lists under lowercase file ids ("english"); the
# capitalised spelling only resolves on case-insensitive filesystems.
nltk_stopwords = set(stopwords.words("english"))
def removeStopwords(text, stopwords=nltk_stopwords):
    """Tokenize ``text`` and drop every token whose lower-cased form is a stop word.

    Parameters
    ----------
    text : str
        The text to clean.
    stopwords : container of str
        Words to remove; defaults to NLTK's English stop-word list. Inside this
        function the name refers to this argument, not to the imported
        ``nltk.corpus.stopwords`` module.

    Returns
    -------
    str
        The remaining tokens joined with single spaces.
    """
    words = nltk.word_tokenize(text)
    return " ".join(word for word in words if word.lower() not in stopwords)

In [6]:
def lemmatize(text):
    """Lower-case and lemmatize every token of ``text``.

    The text is split with ``nltk.word_tokenize``; each token is lower-cased,
    passed through ``WordNetLemmatizer.lemmatize`` and the results are
    re-joined with single spaces.
    """
    wordnet = WordNetLemmatizer()
    lowered_tokens = [token.lower() for token in nltk.word_tokenize(text)]
    return " ".join(wordnet.lemmatize(token) for token in lowered_tokens)

In [7]:
def preprocessing(df, columns = ["brand", "product_full_name", "description", "details"]):
    """Clean the text columns of ``df`` in place and return the same frame.

    Steps:
      1. remove newline characters from ``details``;
      2. replace nulls with ``'UNKNOWN_TOKEN'`` in ``brand``, ``description``,
         ``details`` and ``product_full_name`` and in every other column
         listed in ``columns``, so the tokenizer never receives a non-string;
      3. for each column in ``columns``: strip punctuation, strip stop words,
         then lemmatize.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``brand``, ``product_full_name``, ``description`` and
        ``details`` plus every column named in ``columns``. It is modified
        in place.
    columns : list of str
        Columns to run the text cleaning on.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    df['details'] = df['details'].str.replace("\n", "")
    # replace null values with UNKNOWN_TOKEN (the four core columns first, then
    # any additional column the caller asked to clean)
    always_filled = ['brand', 'description', 'details', 'product_full_name']
    extra_columns = [col for col in columns if col not in always_filled]
    for col in always_filled + extra_columns:
        df[col] = df[col].fillna('UNKNOWN_TOKEN')
    # remove punctuation and stopwords then lemmatize
    for col in columns:
        df[col] = (
            df[col]
            .apply(removePunctuation)
            .apply(removeStopwords)
            .apply(lemmatize)
        )
    return df

### 2.2 Keras Modeling Functions

In [8]:
def integer_encode_documents(docs, tokenizer):
    """Return ``tokenizer.texts_to_sequences(docs)``.

    ``tokenizer`` is any object exposing a ``texts_to_sequences`` method; the
    result of that call is passed back unchanged.
    """
    encoded_docs = tokenizer.texts_to_sequences(docs)
    return encoded_docs

In [9]:
def get_max_token_length_per_doc(docs):
    """Return the largest whitespace-token count among ``docs``.

    Each document is split with ``str.split()``; an empty ``docs`` makes
    ``max`` raise ``ValueError``.
    """
    token_counts = [len(doc.split()) for doc in docs]
    return max(token_counts)

In [10]:
def predict(X_test_df, target, max_length):
    """Run the saved model for ``target`` on the rows of ``X_test_df``.

    The ``brand``, ``product_full_name``, ``description`` and ``details``
    columns are concatenated (space separated) into an ``input_doc`` column
    that is written onto ``X_test_df`` itself. The documents are then encoded
    with the tokenizer pickled in ``<target>_tokenizer.pickle``, post-padded
    to ``max_length`` and fed to the Keras model stored in ``<target>_model.h5``.

    Returns whatever ``model.predict`` returns for the padded documents.
    """
    # build the model input text (side effect: adds/overwrites "input_doc")
    combined_text = X_test_df.brand
    for text_column in (X_test_df.product_full_name, X_test_df.description, X_test_df.details):
        combined_text = combined_text + " " + text_column
    X_test_df["input_doc"] = combined_text
    test_docs = list(X_test_df.loc[:, "input_doc"].values)

    # load the model and its tokenizer
    # NOTE: unpickling can execute arbitrary code - only load trusted tokenizer files.
    model = load_model("{}_model.h5".format(target))
    with open('{}_tokenizer.pickle'.format(target), 'rb') as tokenizer_file:
        tokenizer = pickle.load(tokenizer_file)

    # encode, pad and predict
    padded_test_docs = pad_sequences(
        integer_encode_documents(test_docs, tokenizer),
        maxlen=max_length,
        padding='post',
    )
    return model.predict(padded_test_docs, verbose = 0)

### 2.3 Main Function

In [11]:
def main(df):
    '''
    Predict the fit of every product in ``df``.

    ``df`` must contain the "brand", "product_full_name", "description" and
    "details" columns. One model per fit type is run on a preprocessed copy of
    those columns, and the fit type with the highest score for each row is
    written to a new "fit" column.

    The "fit" column is added to ``df`` itself, and that same object is
    returned. Predictions are assigned by row position, so ``df`` may carry
    any index (e.g. product ids), not only the default integer index.
    '''
    fullData = df
    #Preprocess data
    print("Start preprocessing data...")
    testData = fullData.loc[:, ["brand", "product_full_name", "description", "details"]].copy()
    testData = preprocessing(testData)
    print("Start predicting fit...")
    #Predict fit: padding length passed to each fit model, in the order the models are run
    maxLengthDict = {'straightregular': 185,
                 'semifitted': 185,
                 'relaxed': 202,
                 'oversized': 202,
                 'fittedtailored': 202}
    fit_scores = {}
    for fit, max_length in maxLengthDict.items():
        prediction_proba = predict(testData, target = fit, max_length = max_length)
        fit_scores[fit] = prediction_proba.flatten()
        print(fit, "fit prediction done")
    prob_df = pd.DataFrame(fit_scores)
    # prob_df has a fresh 0..n-1 index; take the raw values so the result is
    # assigned row by row instead of being aligned on fullData's index labels.
    fullData['fit'] = prob_df.idxmax(axis=1).values
    return fullData

In [12]:
# Run the fit pipeline; main adds a 'fit' column to the frame it is given and returns it.
df2 = main(df2)

Start preprocessing data...
Start predicting fit...
straightregular fit prediction done
semifitted fit prediction done
relaxed fit prediction done
oversized fit prediction done
fittedtailored fit prediction done


In [13]:
# Preview the first rows, now including the predicted 'fit' column.
df2.head()

Unnamed: 0,product_id,brand,mpn,product_full_name,description,brand_category,created_at,updated_at,deleted_at,brand_canonical_url,details,labels,bc_product_id,fit
0,01DSE9TC2DQXDG6GWKW9NMJ416,Banana Republic,514683.0,Ankle-Strap Pump,"A modern pump, in a rounded silhouette with an...",Unknown,2019-11-11 22:37:15.719107+00,2019-12-19 20:40:30.786144+00,,https://bananarepublic.gap.com/browse/product....,"A modern pump, in a rounded silhouette with an...","{""Needs Review""}",,straightregular
1,01DSE9SKM19XNA6SJP36JZC065,Banana Republic,526676.0,Petite Tie-Neck Top,Dress it down with jeans and sneakers or dress...,Unknown,2019-11-11 22:36:50.682513+00,2019-12-19 20:40:30.786144+00,,https://bananarepublic.gap.com/browse/product....,Dress it down with jeans and sneakers or dress...,"{""Needs Review""}",,semifitted
2,01DSJX8GD4DSAP76SPR85HRCMN,Loewe,400100000000.0,52MM Padded Leather Round Sunglasses,Padded leather covers classic round sunglasses.,JewelryAccessories/SunglassesReaders/RoundOval...,2019-11-13 17:33:59.581661+00,2019-12-19 20:40:30.786144+00,,https://www.saksfifthavenue.com/loewe-52mm-pad...,100% UV protection Case and cleaning cloth inc...,"{""Needs Review""}",,semifitted
3,01DSJVKJNS6F4KQ1QM6YYK9AW2,Converse,400012000000.0,Baby's & Little Kid's All-Star Two-Tone Mid-To...,The iconic mid-top design gets an added dose o...,"JustKids/Shoes/Baby024Months/BabyGirl,JustKids...",2019-11-13 17:05:05.203733+00,2019-12-19 20:40:30.786144+00,,https://www.saksfifthavenue.com/converse-babys...,Canvas upper Round toe Lace-up vamp SmartFOAM ...,"{""Needs Review""}",,semifitted
4,01DSK15ZD4D5A0QXA8NSD25YXE,Alexander McQueen,400011000000.0,64MM Rimless Sunglasses,Hexagonal shades offer a rimless view with int...,JewelryAccessories/SunglassesReaders/RoundOval,2019-11-13 18:42:30.941321+00,2019-12-19 20:40:30.786144+00,,https://www.saksfifthavenue.com/alexander-mcqu...,100% UV protection Gradient lenses Adjustable ...,"{""Needs Review""}",,relaxed
