In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten, LSTM
from keras.layers import Embedding
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import GlobalMaxPooling1D
from keras.models import Model
from sklearn.model_selection import train_test_split
from keras.layers import Input
from keras.layers.merge import Concatenate
import tensorflow as tf
import sys
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

Using TensorFlow backend.


In [2]:
import pickle
from keras.models import load_model

def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

## 1. Import data: Combine & Merge Tables

In [3]:
fd = pd.read_csv("full_data.csv")
fd = fd.iloc[:,[0,1,3,4,5,9,10]]
print(fd.shape)

(48979, 7)


In [4]:
tag = pd.read_excel("tagged.xlsx")
extra_tag = pd.read_csv("additional_tags.csv")
all_tag = pd.concat([tag,extra_tag])

In [5]:
merge = fd.merge(all_tag,left_on='product_id',right_on='product_id',how="inner")
merge.shape

(159013, 10)

## 2. Prepare Data

#### Filter out records for Attribute "Occasion"

In [6]:
data = merge[merge["attribute_name"]=="occasion"]
for i in data.columns:
    data[i] = data[i].str.lower()
print(data.shape)

(21263, 10)


In [7]:
# Check unique attribute value for occasion
data['attribute_value'].unique()

array(['work', 'day to night', 'daytonight', 'weekend', 'vacation',
       'nightout', 'coldweather', 'night out', 'workout'], dtype=object)

In [8]:
def regex_cleaning(value):
    new_value = re.sub("nightout",'night out', value)
    new_value1 = re.sub("daytonight", 'day to night', new_value)
    new_value2 = re.sub("coldweather", 'cold weather', new_value1)
    return new_value2
data['attribute_value'] = data['attribute_value'].apply(regex_cleaning)

In [9]:
data['attribute_value'].unique()
# Check again, we got in total 7 different occasion categories

array(['work', 'day to night', 'weekend', 'vacation', 'night out',
       'cold weather', 'workout'], dtype=object)

In [10]:
# Create entity group: Combine product_id and product_color_id
data['id'] = data['product_id']+data['product_color_id']

In [11]:
# Check whether we have duplicate records
data.groupby('id')['attribute_value'].count()

id
01dpc9gstt72khnn0mndnkh7rd01dpc9gstzypv54n9b1tbb6h1x     4
01dpc9gstt72khnn0mndnkh7rd01dpc9gtp1qdcw3exqttr6ad94     4
01dpc9gstt72khnn0mndnkh7rd01dpc9gvg1zn8rrax77s27jrzs     4
01dpcb2keavxxkfvm7fxbne4vy01dpcb2kef1cfywfwa97d3vn11    10
01dpcb2keavxxkfvm7fxbne4vy01dpcb2mmtpctv3rnjv2bbaf28    10
                                                        ..
01e6078g3gratf2c96vkyywsgd01e6078g42kzpg57ncp4yg1tbb     1
01e6078g3gratf2c96vkyywsgd01e6078m3mmsaetkp9pd5dprr2     1
01e6079dg58yw9k78d57c6j2y101e6079dgrr4chkcpkny96g15p     3
01e6079qfkh4hpzfq31t6wdrrx01e6079qg5gtp5jyrcwgrwy4zc     1
01e607bhrqajdz76mjfn7rprk101e607bhsbcwh034nx5tmjh3z7     2
Name: attribute_value, Length: 5411, dtype: int64

In [12]:
# remove duplicate records
data.drop_duplicates(subset =["id",'attribute_name','attribute_value'],
                     keep = "first",inplace = True) 
data.shape

(12625, 11)

#### Create dummies for response variable

In [13]:
dummies = pd.get_dummies(data['attribute_value'])
dummies.head()

Unnamed: 0,cold weather,day to night,night out,vacation,weekend,work,workout
6,0,0,0,0,0,1,0
20,0,1,0,0,0,0,0
31,0,0,0,0,0,1,0
44,0,1,0,0,0,0,0
47,0,0,0,0,0,1,0


In [14]:
data = pd.concat([data, dummies], axis=1, sort=False)

In [15]:
#distribution of each occasion attribute value
data['attribute_value'].value_counts()

weekend         4159
day to night    3804
work            1872
night out       1210
vacation         973
cold weather     348
workout          259
Name: attribute_value, dtype: int64

In [16]:
# A same product may belongs to multiple occasions
# Therefore, we sum dummies together for each product
response = data.groupby('id').agg({'cold weather':'sum','day to night':'sum',
                                 'night out':'sum','vacation':'sum',
                                 'weekend':'sum','work':'sum','workout':'sum'}).reset_index()
print(response.shape)

(5411, 8)


In [17]:
combine = data.iloc[:,:11].merge(response,left_on='id',right_on='id')

In [18]:
combine.drop_duplicates(subset =["id"],keep = "first",inplace = True) 
combine.shape

(5411, 18)

In [19]:
combine = combine.set_index('id')

#### Preprocess Feature columns

In [20]:
# drop records with both null value in description and details
combine.dropna(subset=["description", "details"], how='all',inplace =True)

In [21]:
combine.isnull().sum()

product_id               0
brand                    0
product_full_name        0
description            380
brand_category         337
brand_canonical_url      0
details                397
product_color_id         0
attribute_name           0
attribute_value          0
cold weather             0
day to night             0
night out                0
vacation                 0
weekend                  0
work                     0
workout                  0
dtype: int64

In [22]:
# Combine details and description into one "Text" column
combine= combine.replace(np.nan, 'UNKNOWN_TOKEN', regex=True)
combine['details'] = combine['details'].str.replace("\n", "")
combine['text'] = combine['description']+' '+combine['details']

In [23]:
# Preprocess for Text info
import spacy
nlp = spacy.load('en_core_web_md')

def preprocess_text(sen):
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sen)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    
    #remove stopwords and do lemmatization
    doc = nlp(sentence)
    tokens = [token.lemma_ for token in doc if not token.is_stop]
    
    return " ".join(tokens)

In [24]:
combine['text'] = combine['text'].apply(preprocess_text)
combine['product_full_name'] = combine['product_full_name'].apply(preprocess_text)
combine['brand_category'] = combine['brand_category'].apply(preprocess_text)

In [25]:
def preprocess_url(url):
    # Remove http//:www.
    url = re.sub('https://www.', '', url)
    # Remove .com
    url = re.sub('.com', '', url)
    # Remove separator "/"
    url = re.sub('/', ' ', url)
    # Remove separator "-"
    url = re.sub('-', ' ', url)
    # Remove numbers
    url = re.sub(r'[0-9]+', ' ', url)
    # Remove Single character
    url = re.sub(r"\s+[a-zA-Z]\s+", ' ', url)
    # Removing multiple spaces
    url = re.sub(r'\s+', ' ', url)
    # Remove stopwords and do lemmatization
    doc = nlp(url)
    tokens = [token.lemma_ for token in doc if not token.is_stop]
    
    return " ".join(tokens)

In [26]:
combine['brand_canonical_url'] = combine['brand_canonical_url'].apply(preprocess_url)

In [27]:
# Combine all brand info together
combine['brand_info'] = combine['brand']+' '+combine['product_full_name']+' '+\
                        combine['brand_category']+' '+combine['brand_canonical_url']

In [28]:
# Filter out useful columns
df = combine.loc[:,["text","brand_info","cold weather","day to night","night out",
                 "vacation","weekend","work","workout"]]

In [29]:
# check one product
df[df.index=="01dtjcerf6f4nrz2wsjffa1eys01dtjcergehdmq5fajqs2pz6nj"]

Unnamed: 0_level_0,text,brand_info,cold weather,day to night,night out,vacation,weekend,work,workout
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
01dtjcerf6f4nrz2wsjffa1eys01dtjcergehdmq5fajqs2pz6nj,beige stretch silk slip silk spandex dry clean...,theory teah stretch silk camisole clothing top...,0,1,1,0,1,1,0


____

## 3. Embedding Model: Description & Detail

In [30]:
docs = df['text']
labels = df.iloc[:,2:]
X = docs
y = labels.values

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.3,random_state=42)

In [31]:
tokenizer = Tokenizer(num_words=5000, oov_token="UNKNOWN_TOKEN")
tokenizer.fit_on_texts(X_train)

X_train_token = tokenizer.texts_to_sequences(X_train)
X_test_token = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1
vocab_size

4487

In [32]:
save_obj(tokenizer, "occasion_tokeniver")

In [33]:
from keras.preprocessing.text import text_to_word_sequence
from typing import List
def get_max_token_length_per_doc(docs: List[List[str]])-> int:
    return max(list(map(lambda x: len(x.split()), docs)))
max_length = get_max_token_length_per_doc(docs)
max_length

165

In [34]:
X_train_pad = pad_sequences(X_train_token, padding='post', maxlen=max_length)
X_test_pad = pad_sequences(X_test_token, padding='post', maxlen=max_length)

In [35]:
X_train_pad.shape

(3779, 165)

In [36]:
# using GloVe word embeddings to convert text inputs 
# to their numeric counterparts.

def load_glove_vectors():
    embeddings_index = {}
    with open('glove.6B.100d.txt') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('Loaded %s word vectors.' % len(embeddings_index))
    return embeddings_index

embeddings_index = load_glove_vectors()

Loaded 400000 word vectors.


In [37]:
EMBEDDING_SIZE = 100
embedding_matrix = zeros((vocab_size, EMBEDDING_SIZE))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: 
        embedding_matrix[i] = embedding_vector

In [38]:
embedding_matrix.shape

(4487, 100)

In [39]:
# Predictive model
embedding_model = Sequential()
embedding_model.add(Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length, trainable=False))
embedding_model.add(Flatten())
embedding_model.add(Dense(100,activation='relu'))
embedding_model.add(Dense(7, activation='sigmoid'))

embedding_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [40]:
embedding_model.fit(X_train_pad, y_train, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x1a76400a90>

In [42]:
# Predict on test dataset
score = embedding_model.evaluate(X_test_pad, y_test, verbose=1)

print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Test Score: 0.4790511895844966
Test Accuracy: 0.8691357970237732


In [43]:
cols_target = ["cold weather","day to night","night out","vacation","weekend","work","workout"]
pred = embedding_model.predict(X_test_pad)
embedding_result = pd.DataFrame(data= pred, columns = cols_target,index=X_test.index)
embedding_result

Unnamed: 0_level_0,cold weather,day to night,night out,vacation,weekend,work,workout
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
01e600eckpav431bzqjem5v4wy01e600eg7dzgakzjghgkrfvyb3,0.008177,0.999841,0.000030,3.502961e-03,0.999482,0.006648,7.354227e-08
01e1jm16ncm11qc1gsmkh3erch01e1jm16ns1wtkcph80zeswkx3,0.019867,0.881802,0.320390,2.367457e-03,0.940802,0.087279,5.381181e-05
01e2ky9h2267fxc9d382f077jd01e2ky9h4j0ffd2zcf826e8zm5,0.790867,0.169226,0.030774,3.894449e-03,0.512444,0.992360,1.501262e-03
01e4edbbqfaxnjdz93scvnnw8n01e4edbbqsx9zk71wd5vqe5yqe,0.000035,0.000059,0.002021,9.968250e-01,0.999457,0.000567,1.653355e-06
01e2m0qx4j85htkq2136q0tbvb01e2m0qx4xx4zpm06rj5888gzk,0.000018,0.069089,0.026928,3.313382e-01,0.681317,0.018257,3.606007e-01
...,...,...,...,...,...,...,...
01e2m4heymjm0f3ygprsxz8c0d01e2m4hez04zkv0h81824s23zt,0.001494,0.995764,0.936879,1.095006e-02,0.823323,0.201838,7.376524e-04
01e2kyfmf75neaxyhxc8vypad801e2kyfsrwnxya6y39c6d7gvk8,0.156844,0.953654,0.090436,8.644703e-05,0.024798,0.989490,3.447092e-04
01e4rts300d1qvsgtedt3kp7wz01e4rts309b93wvg0vj2krscwc,0.000005,0.999988,0.000761,4.908202e-04,0.982979,0.001584,1.334264e-04
01e6032gd90g1s3n5e8gdbz13201e6032gdy370ygmw992m7cphm,0.000002,0.883102,0.199940,3.925728e-03,0.977595,0.003012,9.642282e-06


In [44]:
embedding_model.save("occasion_embedding_model.h5")

____

## 4. Vectorization Model: Brand, Name, URL, Brand Category

In [45]:
info = df['brand_info']
labels = df.iloc[:,2:]
X = info
y = labels

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.3,random_state=42)

In [46]:
## TF-IDF Vectorizer
train_X = list(X_train.values)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=3, 
                             max_df=0.9,
                             token_pattern=r'\w{3,}',
                             stop_words='english')
train_vector = vectorizer.fit_transform(train_X)
tfidf_df = pd.DataFrame(train_vector.toarray(), columns=vectorizer.get_feature_names())
tfidf_df

Unnamed: 0,accessory,acg,achille,acid,acler,aeyde,agence,agni,agolde,air,...,yolie,york,zadig,zebra,zermatt,zeynep,zimmermann,zip,zoom,étoile
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3775,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3777,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
save_obj(vectorizer, "occasion_vectorizer")

In [48]:
test_vector = vectorizer.transform(X_test)

In [49]:
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

vector_model = LogisticRegression(C=12.0)
cols_target = ["cold weather","day to night","night out","vacation","weekend","work","workout"]
vector_result = pd.DataFrame(columns = cols_target, index = X_test.index)

for label in cols_target:
    print('Predicting {}'.format(label))
    train_y = y_train[label]
    vector_model.fit(train_vector, train_y)
    pred_train = vector_model.predict(train_vector)
    print('Training accuracy is {}'.format(accuracy_score(train_y, pred_train)))
    
    test_y_prob = vector_model.predict_proba(test_vector)[:,1]
    test_y = y_test[label]
    pred_test = vector_model.predict(test_vector)
    print('Testing accuracy is {}'.format(accuracy_score(test_y, pred_test)))
    vector_result[label] = test_y_prob
    filename = "{}".format(label)+"_vector_model"
    save_obj(vector_model,filename)

Predicting cold weather
Training accuracy is 0.9790949986768986
Testing accuracy is 0.971604938271605
Predicting day to night
Training accuracy is 0.9010320190526594
Testing accuracy is 0.821604938271605
Predicting night out
Training accuracy is 0.9065890447208256
Testing accuracy is 0.8296296296296296
Predicting vacation
Training accuracy is 0.9367557554908706
Testing accuracy is 0.8814814814814815
Predicting weekend
Training accuracy is 0.9097644879597777
Testing accuracy is 0.8364197530864198
Predicting work
Training accuracy is 0.9203492987562847
Testing accuracy is 0.8401234567901235
Predicting workout
Training accuracy is 0.9936491135220958
Testing accuracy is 0.9839506172839506


In [50]:
vector_result

Unnamed: 0_level_0,cold weather,day to night,night out,vacation,weekend,work,workout
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
01e600eckpav431bzqjem5v4wy01e600eg7dzgakzjghgkrfvyb3,0.510627,0.956440,0.328873,0.001694,0.762661,0.902612,0.001063
01e1jm16ncm11qc1gsmkh3erch01e1jm16ns1wtkcph80zeswkx3,0.007869,0.530110,0.311398,0.072296,0.797711,0.094975,0.003338
01e2ky9h2267fxc9d382f077jd01e2ky9h4j0ffd2zcf826e8zm5,0.519297,0.937468,0.543377,0.003200,0.911415,0.302649,0.045014
01e4edbbqfaxnjdz93scvnnw8n01e4edbbqsx9zk71wd5vqe5yqe,0.001768,0.014783,0.003960,0.482873,0.881111,0.092727,0.001885
01e2m0qx4j85htkq2136q0tbvb01e2m0qx4xx4zpm06rj5888gzk,0.000805,0.775752,0.027485,0.122726,0.868422,0.030798,0.003711
...,...,...,...,...,...,...,...
01e2m4heymjm0f3ygprsxz8c0d01e2m4hez04zkv0h81824s23zt,0.000422,0.931602,0.070340,0.023082,0.450124,0.461679,0.000338
01e2kyfmf75neaxyhxc8vypad801e2kyfsrwnxya6y39c6d7gvk8,0.005459,0.933230,0.225142,0.028371,0.307340,0.682354,0.002641
01e4rts300d1qvsgtedt3kp7wz01e4rts309b93wvg0vj2krscwc,0.006875,0.802226,0.008146,0.017612,0.964517,0.019886,0.010089
01e6032gd90g1s3n5e8gdbz13201e6032gdy370ygmw992m7cphm,0.001570,0.780271,0.140752,0.106139,0.727104,0.185718,0.014595


_____

## 5. Combine embedding model and vector model together

In [51]:
combined_result = 0.4*embedding_result+0.6*vector_result
combined_result

Unnamed: 0_level_0,cold weather,day to night,night out,vacation,weekend,work,workout
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
01e600eckpav431bzqjem5v4wy01e600eg7dzgakzjghgkrfvyb3,0.309647,0.973800,0.197336,0.002417,0.857389,0.544226,0.000638
01e1jm16ncm11qc1gsmkh3erch01e1jm16ns1wtkcph80zeswkx3,0.012669,0.670787,0.314995,0.044324,0.854948,0.091896,0.002024
01e2ky9h2267fxc9d382f077jd01e2ky9h4j0ffd2zcf826e8zm5,0.627925,0.630172,0.338336,0.003478,0.751827,0.578534,0.027609
01e4edbbqfaxnjdz93scvnnw8n01e4edbbqsx9zk71wd5vqe5yqe,0.001075,0.008893,0.003184,0.688454,0.928450,0.055863,0.001131
01e2m0qx4j85htkq2136q0tbvb01e2m0qx4xx4zpm06rj5888gzk,0.000491,0.493087,0.027262,0.206171,0.793580,0.025782,0.146467
...,...,...,...,...,...,...,...
01e2m4heymjm0f3ygprsxz8c0d01e2m4hez04zkv0h81824s23zt,0.000851,0.957267,0.416956,0.018229,0.599403,0.357742,0.000498
01e2kyfmf75neaxyhxc8vypad801e2kyfsrwnxya6y39c6d7gvk8,0.066013,0.941400,0.171259,0.017057,0.194323,0.805208,0.001722
01e4rts300d1qvsgtedt3kp7wz01e4rts309b93wvg0vj2krscwc,0.004127,0.881331,0.005192,0.010764,0.971902,0.012565,0.006107
01e6032gd90g1s3n5e8gdbz13201e6032gdy370ygmw992m7cphm,0.000943,0.821403,0.164427,0.065254,0.827300,0.112635,0.008761


In [52]:
# Get weighted-avg predicted probability from 2 models
combined_result = pd.DataFrame(index=vector_result.index,
                              columns=cols_target)
for label in cols_target:
    combined_result[label] = 0.4*embedding_result[label]+0.6*vector_result[label]

combined_result

Unnamed: 0_level_0,cold weather,day to night,night out,vacation,weekend,work,workout
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
01e600eckpav431bzqjem5v4wy01e600eg7dzgakzjghgkrfvyb3,0.309647,0.973800,0.197336,0.002417,0.857389,0.544226,0.000638
01e1jm16ncm11qc1gsmkh3erch01e1jm16ns1wtkcph80zeswkx3,0.012669,0.670787,0.314995,0.044324,0.854948,0.091896,0.002024
01e2ky9h2267fxc9d382f077jd01e2ky9h4j0ffd2zcf826e8zm5,0.627925,0.630172,0.338336,0.003478,0.751827,0.578534,0.027609
01e4edbbqfaxnjdz93scvnnw8n01e4edbbqsx9zk71wd5vqe5yqe,0.001075,0.008893,0.003184,0.688454,0.928450,0.055863,0.001131
01e2m0qx4j85htkq2136q0tbvb01e2m0qx4xx4zpm06rj5888gzk,0.000491,0.493087,0.027262,0.206171,0.793580,0.025782,0.146467
...,...,...,...,...,...,...,...
01e2m4heymjm0f3ygprsxz8c0d01e2m4hez04zkv0h81824s23zt,0.000851,0.957267,0.416956,0.018229,0.599403,0.357742,0.000498
01e2kyfmf75neaxyhxc8vypad801e2kyfsrwnxya6y39c6d7gvk8,0.066013,0.941400,0.171259,0.017057,0.194323,0.805208,0.001722
01e4rts300d1qvsgtedt3kp7wz01e4rts309b93wvg0vj2krscwc,0.004127,0.881331,0.005192,0.010764,0.971902,0.012565,0.006107
01e6032gd90g1s3n5e8gdbz13201e6032gdy370ygmw992m7cphm,0.000943,0.821403,0.164427,0.065254,0.827300,0.112635,0.008761


In [53]:
# Final Decision Threshold:
## If a product's probability of belonging to an occasion is >0.5, assign occasion value
## If none of occasion probability is >0.5 for a product, assign with occasion with highest probability

def decision(probs):
    if sum(probs>0.5)>0:
        probs[probs > 0.5] = 1
        probs[probs <= 0.5] = 0
    else:
        probs[probs == np.max(probs)] = 1
        probs[probs != np.max(probs)] = 0
    return probs

In [54]:
combined_result.apply(decision, axis=1)

Unnamed: 0_level_0,cold weather,day to night,night out,vacation,weekend,work,workout
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
01e600eckpav431bzqjem5v4wy01e600eg7dzgakzjghgkrfvyb3,0.0,1.0,0.0,0.0,1.0,1.0,0.0
01e1jm16ncm11qc1gsmkh3erch01e1jm16ns1wtkcph80zeswkx3,0.0,1.0,0.0,0.0,1.0,0.0,0.0
01e2ky9h2267fxc9d382f077jd01e2ky9h4j0ffd2zcf826e8zm5,1.0,1.0,0.0,0.0,1.0,1.0,0.0
01e4edbbqfaxnjdz93scvnnw8n01e4edbbqsx9zk71wd5vqe5yqe,0.0,0.0,0.0,1.0,1.0,0.0,0.0
01e2m0qx4j85htkq2136q0tbvb01e2m0qx4xx4zpm06rj5888gzk,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...
01e2m4heymjm0f3ygprsxz8c0d01e2m4hez04zkv0h81824s23zt,0.0,1.0,0.0,0.0,1.0,0.0,0.0
01e2kyfmf75neaxyhxc8vypad801e2kyfsrwnxya6y39c6d7gvk8,0.0,1.0,0.0,0.0,0.0,1.0,0.0
01e4rts300d1qvsgtedt3kp7wz01e4rts309b93wvg0vj2krscwc,0.0,1.0,0.0,0.0,1.0,0.0,0.0
01e6032gd90g1s3n5e8gdbz13201e6032gdy370ygmw992m7cphm,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [55]:
combined_result.columns = [str(col) + '_pred' for col in combined_result.columns]
combined_result = combined_result.astype(int)
combined_result

Unnamed: 0_level_0,cold weather_pred,day to night_pred,night out_pred,vacation_pred,weekend_pred,work_pred,workout_pred
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
01e600eckpav431bzqjem5v4wy01e600eg7dzgakzjghgkrfvyb3,0,1,0,0,1,1,0
01e1jm16ncm11qc1gsmkh3erch01e1jm16ns1wtkcph80zeswkx3,0,1,0,0,1,0,0
01e2ky9h2267fxc9d382f077jd01e2ky9h4j0ffd2zcf826e8zm5,1,1,0,0,1,1,0
01e4edbbqfaxnjdz93scvnnw8n01e4edbbqsx9zk71wd5vqe5yqe,0,0,0,1,1,0,0
01e2m0qx4j85htkq2136q0tbvb01e2m0qx4xx4zpm06rj5888gzk,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...
01e2m4heymjm0f3ygprsxz8c0d01e2m4hez04zkv0h81824s23zt,0,1,0,0,1,0,0
01e2kyfmf75neaxyhxc8vypad801e2kyfsrwnxya6y39c6d7gvk8,0,1,0,0,0,1,0
01e4rts300d1qvsgtedt3kp7wz01e4rts309b93wvg0vj2krscwc,0,1,0,0,1,0,0
01e6032gd90g1s3n5e8gdbz13201e6032gdy370ygmw992m7cphm,0,1,0,0,1,0,0


In [56]:
compare = pd.concat([combined_result, y_test], axis=1, sort=False)
compare

Unnamed: 0_level_0,cold weather_pred,day to night_pred,night out_pred,vacation_pred,weekend_pred,work_pred,workout_pred,cold weather,day to night,night out,vacation,weekend,work,workout
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
01e600eckpav431bzqjem5v4wy01e600eg7dzgakzjghgkrfvyb3,0,1,0,0,1,1,0,0,1,0,0,1,0,0
01e1jm16ncm11qc1gsmkh3erch01e1jm16ns1wtkcph80zeswkx3,0,1,0,0,1,0,0,0,1,0,0,1,0,0
01e2ky9h2267fxc9d382f077jd01e2ky9h4j0ffd2zcf826e8zm5,1,1,0,0,1,1,0,1,0,0,0,1,0,1
01e4edbbqfaxnjdz93scvnnw8n01e4edbbqsx9zk71wd5vqe5yqe,0,0,0,1,1,0,0,0,0,0,1,1,0,0
01e2m0qx4j85htkq2136q0tbvb01e2m0qx4xx4zpm06rj5888gzk,0,0,0,0,1,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
01e2m4heymjm0f3ygprsxz8c0d01e2m4hez04zkv0h81824s23zt,0,1,0,0,1,0,0,0,1,0,0,0,1,0
01e2kyfmf75neaxyhxc8vypad801e2kyfsrwnxya6y39c6d7gvk8,0,1,0,0,0,1,0,0,1,0,0,0,1,0
01e4rts300d1qvsgtedt3kp7wz01e4rts309b93wvg0vj2krscwc,0,1,0,0,1,0,0,0,1,0,0,1,0,0
01e6032gd90g1s3n5e8gdbz13201e6032gdy370ygmw992m7cphm,0,1,0,0,1,0,0,0,1,0,0,0,1,0


In [57]:
def accuracy(table):
    total_actual = len(compare)*7
    acc=0
    for index, row in table.iterrows():
        if row['cold weather_pred'] == row['cold weather']:
            acc+=1
        if row['day to night_pred'] == row['day to night']:
            acc+=1
        if row['night out_pred'] == row['night out']:
            acc+=1
        if row['vacation_pred'] == row['vacation']:
            acc+=1
        if row['weekend_pred'] == row['weekend']:
            acc+=1
        if row['work_pred'] == row['work']:
            acc+=1
        if row['workout_pred'] == row['workout']:
            acc+=1
    acc_rate = acc/total_actual
    return acc,acc_rate

In [58]:
accuracy(compare)

(10180, 0.8977072310405644)