## Overview

This notebook contains the code I wrote for the DSO 560 Text Analytics & NLP final project to predict **style** for women's clothing. The client is ThreadTogether, an Australian non-profit organization.

Created on: 5.2.2020

Created by: Nanchun (Aslan) Shi

In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from gensim.models.phrases import Phraser, Phrases
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import one_hot
from keras.models import load_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import SimpleRNN
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers.embeddings import Embedding
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer

Using TensorFlow backend.


In [2]:
## functions:

## Keep the stop-word set under its own name. Rebinding `stopwords` would shadow the
## imported nltk corpus reader and make this cell fail with AttributeError when re-run.
stop_words = set(stopwords.words('english'))
## every punctuation character except '-', which is handled separately below
punc = string.punctuation.replace('-','')

def remove_punc_sw(text):
    """Strip punctuation and English stop words from ``text``.

    Steps, in order: every character of ``punc`` becomes a space, hyphens are
    deleted (so hyphenated words are glued together), the typographic
    apostrophe becomes a space, the text is lower-cased, tokenized with
    ``nltk.word_tokenize`` and tokens found in ``stop_words`` are dropped.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    for p in punc:
        text = text.replace(p,' ')
    text = text.replace('-', '')
    text = text.replace("’", ' ')
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    return " ".join(token for token in tokens if token not in stop_words)

def remove_numbers(text):
    """Blank out stand-alone digit runs and collapse whitespace.

    Only numbers delimited by word boundaries are removed; digits that are
    part of a longer token are kept. Every run of whitespace is then reduced
    to a single space.
    """
    without_numbers = re.sub(r'\b\d+\b', ' ', text)
    return re.sub(r'\s+', ' ', without_numbers)

lemmatizer = WordNetLemmatizer()

def lemma_pos_2(text):
    """Lazily yield the tokens of ``text``, lemmatized according to their POS tag.

    ``text`` is split on whitespace and tagged with ``pos_tag``. Tags starting
    with N / V / J / R are lemmatized as noun / verb / adjective / adverb;
    tokens with any other tag are yielded unchanged.
    """
    for token, tag in pos_tag(text.split()):
        initial = tag[:1]
        if initial == "N":
            pos = wordnet.NOUN
        elif initial == 'V':
            pos = wordnet.VERB
        elif initial == 'J':
            pos = wordnet.ADJ
        elif initial == 'R':
            pos = wordnet.ADV
        else:
            # no WordNet category for this tag: pass the token through
            yield token
            continue
        yield lemmatizer.lemmatize(token, pos)
            
def combine_bigrams(doc):
    """Return a new list with every underscore removed from each token of ``doc``.

    Used after the phraser so that a detected bigram such as ``a_b`` becomes
    the single token ``ab``.
    """
    return [token.replace('_','') for token in doc]

## function created by Professor Yu Chen

def get_max_token_length_per_doc(docs):
    """Return the largest whitespace-token count among the strings in ``docs``.

    Raises ``ValueError`` when ``docs`` is empty.
    """
    return max(len(doc.split()) for doc in docs)

def get_pred_lists(model,df,thre = 0.7):
    """Turn the rows returned by ``model.predict(df)`` into lists of class indices.

    For each predicted row, every position whose value is at least ``thre`` is
    kept; when none qualifies, the position of the row's maximum is used.

    Returns a list with one entry per row. Each entry is a one-element list
    wrapping that row's list of indices.
    """
    probabilities = model.predict(df)
    results = []
    for row in probabilities:
        chosen = [idx for idx, value in enumerate(row) if value >= thre]
        if not chosen:
            # nothing cleared the threshold: fall back to the most likely class
            chosen.append(row.argmax())
        results.append([chosen])
    return results

def check(x):
    """Return 1 if the first two entries of ``x`` share at least one element, else 0.

    Parameters
    ----------
    x : sequence or pandas row
        First entry: iterable of predicted labels; second entry: iterable of
        true labels.

    Returns
    -------
    int
        1 when the two label sets intersect, 0 otherwise.
    """
    # Read the entries by position from a plain list. `x[0]` on a label-indexed
    # pandas row (as produced by DataFrame.apply(axis=1)) relies on a deprecated
    # positional fallback.
    entries = list(x)
    pred = set(entries[0])
    true = set(entries[1])

    return 1 if pred & true else 0
    
def accuracy(df):
    """Share of rows in ``df`` whose predicted labels overlap the true labels.

    ``df`` must have a two-level index and a ``pred`` column. The true labels
    are looked up in the global ``mapper2`` through the (level 0, level 1)
    key, and each row is scored with ``check``. ``df`` itself is not modified.
    """
    scored = df.copy()
    first_level = scored.index.get_level_values(0)
    second_level = scored.index.get_level_values(1)
    scored['p_id_color_id'] = list(zip(first_level, second_level))
    scored['true'] = scored.p_id_color_id.map(mapper2)

    hits = scored[['pred','true']].apply(check,axis=1)
    return hits.sum()/len(scored)

## every punctuation character except '-', which is deleted rather than replaced by a space
punc2 = string.punctuation.replace('-','')

def remove_punc_sw_combine(x):
    """Clean the first four text fields of ``x`` and join them into one string.

    Each field is lower-cased; newlines, every character of ``punc2`` and the
    typographic apostrophe become spaces; hyphens are deleted. Despite the
    name, no stop words are removed here.

    Parameters
    ----------
    x : sequence or pandas row
        At least four string entries; only the first four are used.

    Returns
    -------
    str
        The four cleaned fields joined by single spaces.
    """
    # Read the fields by position from a plain list. `x[0]` on a label-indexed
    # pandas row (as produced by DataFrame.apply(axis=1)) relies on a deprecated
    # positional fallback.
    values = list(x)
    cols = [values[0], values[1], values[2], values[3]]
    cleaned_cols = []

    for col in cols:
        col = col.lower()
        col = col.replace('\n',' ')
        for p in punc2:
            col = col.replace(p,' ')
        col = col.replace('-', '')
        col = col.replace("’", ' ')
        cleaned_cols.append(col)

    return " ".join(cleaned_cols)

def lemma(text):
    """Tokenize ``text`` with ``nltk.word_tokenize`` and return the list of lemmatized tokens.

    The lemmatizer is called without a POS argument.
    """
    return [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]

def get_pred_classes(mat):
    """Return, for each row of ``mat``, the indices of its two largest values.

    Indices are in ascending order of value, so the last index in each pair
    points at the row maximum. The result is a NumPy array with one pair per row.
    """
    # To output a single class instead, keep only the last index: [np.argsort(v)[-1]]
    top_two = [list(np.argsort(row))[-2:] for row in mat]
    return np.array(top_two)

def get_true_classes(df):
    """Return, for each row of ``df``, the list of column positions whose value equals 1."""
    return [
        [position for position, flag in enumerate(row) if flag == 1]
        for row in df.values
    ]

def compare(l1,l2):
    """Fraction of positions where ``l1[i]`` and ``l2[i]`` share at least one element.

    Positions are taken over ``range(len(l2))``; the count of overlapping
    positions is divided by ``len(l1)``.
    """
    # A stricter criterion would require one set to be a subset of the other.
    matches = sum(
        1 for i in range(len(l2))
        if set(l1[i]) & set(l2[i])
    )
    return matches/len(l1)

def save_obj(obj, name):
    """Pickle ``obj`` to the file ``name`` + '.pkl' using the highest pickle protocol."""
    target = name + '.pkl'
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Load and return the object pickled in the file ``name`` + '.pkl'.

    NOTE: unpickling can execute arbitrary code; only load files you created yourself.
    """
    source = name + '.pkl'
    with open(source, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [3]:
## column names for the tagged-data CSV; they are passed through `names=`,
## so pandas uses this list instead of taking names from the file
## NOTE(review): assumes the CSV has no header row of its own -- confirm
columns = ['p_id','brand','mpn','p_full_name','description',
           'brand_category','created_at','updated_at',
           'deleted_at','brand_canonical_url','details',
           'labels','bc_p_id','p_id_2','p_color_id','attribute_name',
           'attribute_value','file']
df = pd.read_csv('Tagged_data_2.csv',names=columns)

In [4]:
## keep important fields
## (selected by name, so re-running this cell on the already-reduced frame is harmless;
##  same columns, same order as positions 0:2, 3:6, 9:11, 14:17 of the raw frame)

df = df.loc[:, ['p_id','brand','p_full_name','description','brand_category',
                'brand_canonical_url','details','p_color_id','attribute_name',
                'attribute_value']]

In [5]:
## remove fully duplicated rows
df = df.drop_duplicates()

## Style

In [6]:
## select style tags: rows whose attribute_name is 'style'
## (.copy() so the in-place edits below operate on an independent frame)

sty_df = df[df.attribute_name=='style'].copy()

In [7]:
## The tag values are inconsistently cased/spaced.
## Build a lookup from each raw value to its normalized form (lower-case, spaces removed).

dic1 = {key:key.lower().replace(' ','') for key in sty_df.attribute_value.unique()}

In [8]:
## replace every attribute value by its normalized form from dic1

sty_df['attribute_value'] = sty_df['attribute_value'].map(dic1)

In [9]:
## use the (p_id, p_color_id) pair as the index; it serves as the product key from here on

sty_df.set_index(['p_id','p_color_id'], inplace=True)

The following sections cover modeling. Given the different characteristics of **description, details**, and the other text fields, I decided to treat them differently. Specifically, I train an embedding neural network on description and details, and a plain MLP on the other text fields. Lastly, I take a weighted average of the predictions from both models and output the final predictions by sorting the probabilities.

NOTE: for the style attribute, a product can have multiple values. Therefore, when training the neural networks, the activation on the output layer is "sigmoid", since it lets each output neuron be "independent" of the others, and the loss function is "binary_crossentropy". The output has shape (X, 11), where X is the number of training set observations and 11 is the number of unique style classes. For each observation, each of the 11 numbers is the predicted probability of belonging to the corresponding class.

## 1. Embedding

Part I is the modeling for embedding neural networks.

In [10]:
## select useful columns: the two free-text fields plus the style tag

emb_df = sty_df.loc[:,['description','details','attribute_value']]

In [11]:
## drop a row ONLY if both description and details are null (how='all')

emb_df.dropna(subset=['description','details'],how = 'all',inplace = True)

In [12]:
## fill remaining nulls with the placeholder string 'NOINFO'

emb_df.fillna('NOINFO',inplace=True)

In [13]:
## create a unique key column for mapping later:
## one tuple per row, built from the two index levels

emb_df['p_id_color_id'] = list(zip(emb_df.index.get_level_values(0)
                                           ,emb_df.index.get_level_values(1)))

In [14]:
## one-hot encode the attribute values
## sum the one-hot rows that share the same index key: such rows are the same product
## carrying several tags, so the sum is a multi-hot row per product
## merge the summed rows back onto the original dataframe through the tuple key
## (the code expects reset_index() to expose the grouped key as a column named 'index')

dummies = pd.get_dummies(emb_df.attribute_value)
cum_dumm = dummies.groupby(dummies.index)[dummies.columns].sum().reset_index()
emb_df = emb_df.merge(cum_dumm,left_on = 'p_id_color_id',right_on = 'index').drop('index',axis=1)

In [15]:
## after the merge a key with several tags still has several rows, and the merged
## style columns are identical on each of them -- keep one row per key
## (attribute_value on the surviving row is only the first tag of that key)
## then use the tuple key as the index and drop the helper column

emb_df.drop_duplicates(subset=['p_id_color_id'],inplace=True)
emb_df.set_index(pd.Index(list(emb_df['p_id_color_id'])),inplace=True)
emb_df.drop('p_id_color_id',axis=1,inplace=True)

## everything after the three leading columns (description, details, attribute_value)
## is a style column
ys = emb_df.iloc[:,3:].values

In [16]:
## note: an entry can be bigger than 1
## this happens when a key had several rows with the same tag in the original dataframe
## cap such entries at 1, since the targets of the loss function must be 0 or 1

emb_df.iloc[:,3:] = np.where(ys > 1, 1, ys)

In [17]:
## combine description and details into one text, separated by a space
## (vectorised string concatenation; avoids positional x[0]/x[1] access on
##  label-indexed rows inside a row-wise apply)

emb_df['combined_text'] = emb_df['description'] + ' ' + emb_df['details']

#### 1.1 Punctuation & Stopwords

In [18]:
## strip punctuation and stop words from the combined text
emb_df['cleaned_text'] = emb_df.combined_text.apply(remove_punc_sw)

In [19]:
## remove stand-alone numbers and collapse repeated whitespace
emb_df['cleaned_text'] = emb_df.cleaned_text.apply(remove_numbers)

#### 1.2 Lemmatization & POS

In [20]:
## lemmatize according to POS
## lemma_pos_2 is a generator, so it is materialized into a list of tokens per document

emb_df['lemma_text'] = emb_df['cleaned_text'].apply(lambda text: list(lemma_pos_2(text)))

#### 1.3 Find possible bi-grams

In [21]:
## learn candidate bigrams from the lemmatized token lists (min_count=30)
## and wrap the result in a Phraser for applying it

phrases = Phrases(emb_df.lemma_text, min_count=30)
bi_gram = Phraser(phrases)

In [22]:
## apply the phraser to every document's token list
emb_df['bigram_text'] = list(bi_gram[emb_df.lemma_text])

In [23]:
## bigrams are linked by "_"; we don't want them to be separated again later by the tokenizer,
## therefore remove the "_" and connect the two words with no space

emb_df['bigram_text'] = emb_df['bigram_text'].apply(combine_bigrams)

In [24]:
## save the phraser object for later use

# save_obj(bi_gram, 'embedding_bigram')

#### 1.4 Modeling

In [25]:
## create a mapper where each unique key is associated with the list of ALL its true labels
## will be used later
## emb_df holds one row per key by now, so `attribute_value` only keeps the first tag of a
## product; the complete label set is read from the multi-hot style columns instead

check_df = emb_df.copy()
style_cols = list(emb_df.columns[3:-4])
true_label_lists = [[col for col, flag in zip(style_cols, row) if flag == 1]
                    for row in emb_df[style_cols].values]
mapper = pd.Series(true_label_lists, index=emb_df.index)

## create a label dictionary where keys are labels, and values are their column positions
## (0 .. number of classes - 1)

label_dic = dict(zip(style_cols, range(len(style_cols))))
labels = emb_df.attribute_value.map(label_dic)

In [26]:
## save label dictionary

# save_obj(label_dic,'style_label_dict')

In [27]:
## second mapper: same keys as `mapper`, with every label name replaced by its number from label_dic

num_labels = [[label_dic[name] for name in names] for names in mapper]

mapper2 = pd.Series(num_labels,index=mapper.index)

#### Train own embeddings using NN

In [28]:
## turn each token list back into one space-separated string

emb_df['joined_text'] = emb_df.bigram_text.apply(" ".join)

In [29]:
## maximum text length: token count of the longest document, used as the padding length

max_length = get_max_token_length_per_doc(emb_df.joined_text)

In [30]:
## NOT USED

# all_words = []

# for l in emb_df.bigram_text:
#     for token in l:
#         all_words.append(token)

# unique_words = int(len(set(all_words))*1.2)

In [31]:
## instantiate the Keras tokenizer and fit it on our corpus
## NOTE(review): the tokenizer is fitted on all rows, i.e. before the train/test split below

tk = Tokenizer(oov_token = 'UNKNOWN_TOKEN')
tk.fit_on_texts(emb_df.joined_text)
## presumably +1 because Keras word indices start at 1, leaving 0 for padding -- TODO confirm
vocab_size = len(tk.word_index) + 1

## transform the texts to integer sequences and pad each one at the end ('post') to max_length

vector_text = tk.texts_to_sequences(emb_df.joined_text)
padded_token_lists = pad_sequences(vector_text, maxlen=max_length, padding='post')

In [32]:
## NOT USED
## integer encodin & padding

# vector_text = [one_hot(v, unique_words) for v in emb_df.joined_text]
# padded_token_lists = pad_sequences(vector_text, max_length, padding='post')

In [33]:
## create dataframes so that the indices are kept after sklearn's train/test split
## df_own_y: columns 3 up to the last five, i.e. the style columns
## (the last five are the text columns added in the cells above)

df_own_x = pd.DataFrame(padded_token_lists, index = emb_df.index)
df_own_y = pd.DataFrame(emb_df.iloc[:,3:-5], index = emb_df.index)

In [34]:
## fix the random state because the tf-idf model must be trained on the same documents
## (70% train / 30% test)

X_train, X_test, y_train, y_test = train_test_split(df_own_x,df_own_y, 
                                                    test_size=0.3,
                                                    random_state = 20)

In [35]:
## build sequential model with embedding layer:
## 100-dimensional embedding -> flatten -> dense(100, tanh) -> dense(11, sigmoid)
## the 11 sigmoid outputs correspond to the 11 style classes

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=max_length))
# model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=max_length, mask_zero = True))
model.add(Flatten())
# model.add(LSTM(100))
model.add(Dense(100,activation='tanh'))
model.add(Dense(11, activation='sigmoid'))

In [36]:
## compile and fit (adam, binary cross-entropy; 100 epochs, batch size 200, silent)
## NOTE(review): no validation data or callbacks are passed to fit(), although EarlyStopping is imported

model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.fit(X_train,y_train,batch_size=200,epochs=100,verbose=0)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


<keras.callbacks.callbacks.History at 0x15b0e6390>

In [37]:
## keep the predicted probabilities for the train and test sets;
## they are averaged with the tf-idf model's predictions in section 3

emb_pred_vectors_train = model.predict(X_train)
emb_pred_vectors_test = model.predict(X_test)

In [38]:
## retrieve the weights of the first (embedding) layer
## and map each token of the tokenizer to its row in that weight matrix

embeddings = model.layers[0].get_weights()[0]
embedding_dict = {w:embeddings[idx] for w, idx in tk.word_index.items()}

In [39]:
## save embedding weights and tokenizer object

# save_obj(embedding_dict,'emb_dict')
# save_obj(tk,'tokenizer')

In [40]:
## save model

# model.save('embedding_model.h5')

## 2. TF-IDF

In [41]:
## select columns: the four short text fields used for tf-idf, plus description/details
## (needed only for the null filter in the next cell) and the style tag

tfidf_df = sty_df.loc[:,['brand','p_full_name','brand_category','brand_canonical_url','description','details','attribute_value']]

In [42]:
## because we dropped rows that miss both description and details in section 1,
## we need to do the same here in order to keep the two datasets consistent;
## afterwards the two columns are no longer needed

tfidf_df.dropna(subset=['description','details'],how = 'all',inplace = True)
tfidf_df.drop(['description','details'],axis=1,inplace=True)

In [43]:
## fill remaining nulls with the placeholder string 'NOINFO'
tfidf_df.fillna('NOINFO',inplace=True)

In [44]:
## the following few cells repeat the steps of the previous section (key column, multi-hot
## labels, one row per key); refer to section 1 for the explanation

tfidf_df['p_id_color_id'] = list(zip(tfidf_df.index.get_level_values(0)
                                           ,tfidf_df.index.get_level_values(1)))

In [45]:
## one-hot encode the tags, sum them per index key, merge back on the tuple key
dummies = pd.get_dummies(tfidf_df.attribute_value)
cum_dumm = dummies.groupby(dummies.index)[dummies.columns].sum().reset_index()
tfidf_df = tfidf_df.merge(cum_dumm,left_on = 'p_id_color_id',right_on = 'index').drop('index',axis=1)

In [46]:
## keep one row per key, use the tuple key as the index, drop the helper column
tfidf_df.drop_duplicates(subset=['p_id_color_id'],inplace=True)
tfidf_df.set_index(pd.Index(list(tfidf_df['p_id_color_id'])),inplace=True)
tfidf_df.drop('p_id_color_id',axis=1,inplace=True)

In [47]:
## style columns: everything after the five leading columns
## (brand, p_full_name, brand_category, brand_canonical_url, attribute_value)
ys = tfidf_df.iloc[:,5:].values

In [48]:
## cap summed entries above 1 at 1, so the targets are 0/1
tfidf_df.iloc[:,5:] = np.where(ys > 1, 1, ys)

#### 2.1 Punctuation

In [49]:
## clean the four short text fields row by row and join them into a single string
tfidf_df['all_text'] = tfidf_df[['brand','p_full_name','brand_category','brand_canonical_url']].apply(remove_punc_sw_combine,axis=1)

#### 2.2 Remove numbers & website characters

In [50]:
## remove stand-alone numbers, then the tokens "www" / "http" / "https" that the URLs leave behind
## regex=True is explicit: the default of Series.str.replace became literal matching in pandas 2.0
tfidf_df['all_text'] = tfidf_df['all_text'].str.replace(r'(\b\d+\b)','', regex=True)
tfidf_df['all_text'] = tfidf_df['all_text'].str.replace(r'(\swww\s|\shttps*\s)','', regex=True)

#### 2.3 Lemmatization

In [51]:
## tokenize and lemmatize (no POS tagging here); the result is a token list per row
tfidf_df['lemma_text'] = tfidf_df.all_text.apply(lemma)

#### 2.4 Bi-grams

In [52]:
## learn candidate bigrams from the lemmatized token lists (min_count=10)
phrases3 = Phrases(tfidf_df.lemma_text, min_count=10)
bi_gram3 = Phraser(phrases3)

In [53]:
## apply the phraser to every row's token list
tfidf_df['bigram_text']=list(bi_gram3[tfidf_df.lemma_text])

In [54]:
## glue detected bigrams into single tokens by removing the "_" separator
tfidf_df['bigram_text'] = tfidf_df['bigram_text'].apply(combine_bigrams)

In [55]:
# save_obj(bi_gram3, 'tfidf_bigram')

#### 2.5 Vectorization

In [56]:
## turn each token list back into one space-separated string
tfidf_df['joined_text'] = tfidf_df.bigram_text.apply(" ".join)

In [57]:
## instantiate TfidfVectorizer from sklearn; max_features=500 reduces dimensionality
## and keeps the important tokens

vectorizer = TfidfVectorizer(max_features=500)

## fit and transform

X = vectorizer.fit_transform(tfidf_df.joined_text)

## get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
## when it exists and fall back to the old method on older releases
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()

In [58]:
## dense tf-idf matrix as a dataframe, one column per term (default integer row index)
tf_idf = pd.DataFrame(X.toarray(), columns = terms)

#### 2.6 Modeling

In [59]:
## SAME random state and test size as the embedding split
## targets: columns 5 up to the last four, i.e. the style columns
## this re-binds X_train / X_test / y_train / y_test from section 1
## NOTE(review): the averaging in section 3 assumes this split picks the same rows, in the
## same order, as the embedding split -- confirm emb_df and tfidf_df have identical row order

X_train, X_test, y_train, y_test = train_test_split(tf_idf.values, tfidf_df.iloc[:,5:-4], 
                                                    test_size=0.3,
                                                    random_state = 20)

In [60]:
## build a sequential model: dense(100, relu) -> dense(n_classes, sigmoid)
## layer sizes are read from the data: max_features=500 is only an upper bound on the
## number of tf-idf columns, so a hard-coded input size breaks on a smaller vocabulary

model2 = Sequential()
model2.add(Dense(100,activation='relu',input_shape = (X_train.shape[1],)))
model2.add(Dense(y_train.shape[1], activation='sigmoid'))

In [61]:
## compile and fit (adam, binary cross-entropy; 100 epochs, batch size 100, silent)

model2.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model2.fit(X_train,y_train,batch_size=100,epochs=100,verbose=0)

<keras.callbacks.callbacks.History at 0x15d0aa050>

In [62]:
## keep the predicted probabilities for the train and test sets for the averaging in section 3

tfidf_pred_vectors_train = model2.predict(X_train)
tfidf_pred_vectors_test = model2.predict(X_test)

In [63]:
## save model

# model2.save('tfidf_model.h5')

In [64]:
## save vectorizer object

# save_obj(vectorizer, 'tfidf_vectorizer')

## 3. Combine them together
#### Average & Voting

In [65]:
## take the weighted average of the two predicted vectors for train and test
## (weight 0.4 for the embedding model, 0.6 for the tf-idf model)

final_vectors_train = 0.4*emb_pred_vectors_train + 0.6*tfidf_pred_vectors_train
final_vectors_test = 0.4*emb_pred_vectors_test + 0.6*tfidf_pred_vectors_test

In [66]:
## convert the final predicted vectors to the two highest-scoring classes per product
## see get_pred_classes at the beginning for details

train_pred_classes = get_pred_classes(final_vectors_train)
test_pred_classes = get_pred_classes(final_vectors_test)

In [67]:
## get the true classes for each observation
## (y_train / y_test are the ones produced by the tf-idf split above)
## see get_true_classes at the beginning for details

train_true_classes = get_true_classes(y_train)
test_true_classes = get_true_classes(y_test)

In [68]:
## note: compare() only checks whether the predicted and true classes intersect,
## so a prediction counts as correct even if it also contains a wrong class;
## after checking, this does not happen too often
## if only one class is output, training accuracy is around 96% and test accuracy around 90%

## if we output two classes for each product
## training accuracy

compare(train_pred_classes,train_true_classes)

0.9997362869198312

In [69]:
## if we output two classes for each product
## test accuracy

compare(test_pred_classes,test_true_classes)

0.9618696186961869