# Create the vdb

vector database creation


In [1]:
import pandas as pd 
import random
import numpy as np
from annoy import AnnoyIndex

In [2]:
PATH_DATA = "../data/"
PATH_ARTIFACTS = "../artifacts/"

## Get Data

In [3]:
df_raw = pd.read_csv(PATH_DATA+'basket_data.csv')

In [4]:
df_raw.sample(5)

Unnamed: 0,order_id,order_number,user_id,alcohol,babies,bakery,beverages,breakfast,bulk,canned goods,...,household,international,meat seafood,missing,other,pantry,personal care,pets,produce,snacks
1418176,1509643,2,61631,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2105582,2240972,21,152566,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,0.0
118875,126523,3,204444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,2.0,1.0,0.0,10.0,0.0
2248319,2392851,1,9748,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6.0,2.0
2364957,2516795,5,152283,0.0,0.0,0.0,0.0,2.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,2.0,0.0,0.0,3.0,0.0


In [5]:
df_raw[df_raw.user_id==82654].sort_values("order_number")

Unnamed: 0,order_id,order_number,user_id,alcohol,babies,bakery,beverages,breakfast,bulk,canned goods,...,household,international,meat seafood,missing,other,pantry,personal care,pets,produce,snacks
421261,448426,1,82654,0.0,0.0,0.0,2.0,2.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
2537093,2699982,2,82654,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3167545,3370660,3,82654,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1318934,1404014,4,82654,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1075308,1144799,5,82654,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3024782,3218833,6,82654,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,2.0
2862769,3046461,7,82654,0.0,0.0,1.0,1.0,0.0,0.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1920142,2043688,8,82654,0.0,0.0,1.0,2.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,0.0,0.0


In [6]:
df_raw.shape

(3214874, 24)

In [7]:
df_raw.columns

Index(['order_id', 'order_number', 'user_id', 'alcohol', 'babies', 'bakery',
       'beverages', 'breakfast', 'bulk', 'canned goods', 'dairy eggs', 'deli',
       'dry goods pasta', 'frozen', 'household', 'international',
       'meat seafood', 'missing', 'other', 'pantry', 'personal care', 'pets',
       'produce', 'snacks'],
      dtype='object')

In [8]:
# x = df_raw.sum()
# x.sort_values()

In [9]:
for c in ["bulk","other","missing"]:
    df_raw.drop(c,axis=1,inplace=True)

In [10]:
df_raw.columns

Index(['order_id', 'order_number', 'user_id', 'alcohol', 'babies', 'bakery',
       'beverages', 'breakfast', 'canned goods', 'dairy eggs', 'deli',
       'dry goods pasta', 'frozen', 'household', 'international',
       'meat seafood', 'pantry', 'personal care', 'pets', 'produce', 'snacks'],
      dtype='object')

In [11]:
entity_cols = list(set(df_raw.columns) - set(["order_id","order_number","user_id"]))
entity_cols.sort()

In [12]:
entity_cols

['alcohol',
 'babies',
 'bakery',
 'beverages',
 'breakfast',
 'canned goods',
 'dairy eggs',
 'deli',
 'dry goods pasta',
 'frozen',
 'household',
 'international',
 'meat seafood',
 'pantry',
 'personal care',
 'pets',
 'produce',
 'snacks']

In [13]:
df_raw["total_item"] = df_raw[entity_cols].sum(axis=1)
df_raw = df_raw[df_raw.total_item>1]

In [14]:
entity_cols2 = []
for c in entity_cols:
    df_raw[c+"_v"] = df_raw[c]/df_raw["total_item"]
    entity_cols2.append(c+"_v") 

In [15]:
df_raw

Unnamed: 0,order_id,order_number,user_id,alcohol,babies,bakery,beverages,breakfast,canned goods,dairy eggs,...,dry goods pasta_v,frozen_v,household_v,international_v,meat seafood_v,pantry_v,personal care_v,pets_v,produce_v,snacks_v
0,2,3,202279,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.555556,0.000000,0.0,0.333333,0.000000
1,3,16,205970,0.0,0.0,1.0,0.0,0.0,0.0,3.0,...,0.000000,0.000000,0.000000,0.000000,0.125000,0.000000,0.000000,0.0,0.375000,0.000000
2,4,36,178520,0.0,0.0,1.0,3.0,4.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.076923,0.0,0.000000,0.307692
3,5,42,156122,0.0,0.0,0.0,1.0,0.0,0.0,3.0,...,0.076923,0.000000,0.115385,0.038462,0.038462,0.076923,0.038462,0.0,0.269231,0.153846
4,6,4,22352,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.000000,0.000000,0.666667,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3214868,3421078,10,70170,0.0,0.0,0.0,0.0,2.0,0.0,1.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.222222,0.333333
3214870,3421080,2,52726,0.0,0.0,0.0,1.0,0.0,0.0,5.0,...,0.000000,0.111111,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.222222,0.000000
3214871,3421081,1,117076,0.0,0.0,1.0,1.0,0.0,1.0,1.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.142857,0.000000,0.0,0.000000,0.142857
3214872,3421082,23,175185,0.0,0.0,0.0,0.0,1.0,0.0,2.0,...,0.000000,0.000000,0.000000,0.000000,0.142857,0.000000,0.000000,0.0,0.285714,0.142857


## Data Split

In [16]:
len(df_raw.user_id.unique())

205491

In [17]:
user_more_3 = df_raw.groupby("user_id").order_id.count()
user_more_3 = user_more_3[user_more_3>3].index

In [18]:
len(user_more_3)

175948

In [19]:
# random.seed(129)
myrand = random.Random(129)
all_users = df_raw.user_id.unique()
n_train = int(len(user_more_3)*0.85)+1
user_train = myrand.choices(user_more_3,k=n_train)
user_test = set(user_more_3) - set(user_train)

In [20]:
len(user_train), len(user_test)

(149556, 75019)

In [21]:
df_train = df_raw[df_raw.user_id.isin(user_train)]
df_test = df_raw[df_raw.user_id.isin(user_test)]
df_train.shape, df_test.shape

((1702404, 40), (1272240, 40))

In [22]:
df_train["order_number_reversed"] = df_train.sort_values(['order_number'], ascending=[False]) \
             .groupby(['user_id']) \
             .cumcount() + 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["order_number_reversed"] = df_train.sort_values(['order_number'], ascending=[False]) \


In [23]:
# df_test.to_csv(PATH_DATA+"order_level_test_diff_users.csv",index=False)
# df_train2.to_csv(PATH_DATA+"order_level_test_same_users.csv",index=False)

## Vector Creation

In [24]:
users_embedding = df_train.groupby("user_id")[entity_cols2].mean()
users_embedding.shape

(100929, 18)

In [25]:
users_embedding.to_csv(PATH_DATA+"users_embedding.csv",index=False)

In [26]:
users_embedding.sample(10)

Unnamed: 0_level_0,alcohol_v,babies_v,bakery_v,beverages_v,breakfast_v,canned goods_v,dairy eggs_v,deli_v,dry goods pasta_v,frozen_v,household_v,international_v,meat seafood_v,pantry_v,personal care_v,pets_v,produce_v,snacks_v
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
96428,0.0,0.0,0.0,0.026316,0.0,0.013158,0.25,0.013158,0.041667,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.614035,0.0
129620,0.0,0.0,0.047434,0.03171,0.023529,0.033981,0.176997,0.06938,0.011345,0.041439,0.020407,0.0,0.016112,0.054517,0.008289,0.0,0.329685,0.135175
105263,0.0,0.0,0.185185,0.166667,0.0,0.0,0.188889,0.283333,0.0,0.0,0.0,0.0,0.0,0.037037,0.0,0.0,0.0,0.138889
197289,0.0,0.0,0.136508,0.195397,0.0,0.014286,0.35254,0.036508,0.0,0.104127,0.089841,0.022222,0.0,0.0,0.0,0.0,0.048571,0.0
82797,0.0,0.0,0.036951,0.05865,0.024074,0.018981,0.116209,0.026436,0.03761,0.012446,0.0,0.002778,0.0,0.01399,0.0,0.0,0.569659,0.082217
174869,0.0,0.254986,0.0,0.0,0.0,0.032389,0.075877,0.019231,0.0,0.23772,0.0,0.0,0.015625,0.0,0.0,0.0,0.195244,0.168928
201552,0.0,0.0,0.054945,0.057692,0.0,0.143887,0.074176,0.019231,0.105426,0.0,0.019231,0.038462,0.019231,0.019231,0.0,0.0,0.29533,0.153159
129602,0.0,0.009804,0.150659,0.011905,0.0,0.047304,0.218166,0.009804,0.080159,0.040476,0.0,0.0,0.0,0.040359,0.0,0.0,0.391363,0.0
36224,0.0,0.0,0.03,0.226667,0.155556,0.0,0.161111,0.0,0.0,0.191111,0.0,0.0,0.0,0.0,0.0,0.0,0.038889,0.196667
8200,0.0,0.02381,0.0,0.037879,0.0,0.011905,0.312822,0.011905,0.087302,0.099052,0.0,0.020408,0.0,0.02886,0.0,0.0,0.237425,0.128633


In [27]:
dimension_length = users_embedding.shape[1]
vdb = AnnoyIndex(dimension_length, 'angular')
for i,v in users_embedding.iterrows():
    vdb.add_item(i, v.values)

vdb.build(15)


True

In [28]:
vdb.get_nns_by_item(39,5)

[39, 106987, 113409, 44992, 149916]

In [29]:
v_input = df_train[df_train.order_id == 25][entity_cols2].values[0]
# 59897

vdb.get_nns_by_vector(v_input,5)

[28210, 112499, 75994, 115014, 104613]

In [30]:
vdb.save(PATH_ARTIFACTS+'vdb_demo.ann')

True

## Generate Recommendation

In [31]:
def mark_items(dchoose,n_mark=1):
    dtemp = dchoose.transpose()
    cols = dtemp.columns[0]
    dtemp = dtemp.sort_values(cols,ascending=False)
    not_zero_dept = dtemp[dtemp[cols]>0].index
    make_cols_zero = random.choices(not_zero_dept,k=n_mark)
    dchoose[make_cols_zero] = 0
    denom = dchoose.values.sum()
    dchoose = dchoose/denom
    return make_cols_zero,dchoose.fillna(0)

def department_recommend(dchoose):
    '''
    dchoose : dataframe of particular order with column of department only
    entity_cols : list of department only columns
    vdb : Annoy object
    '''

    v_input = dchoose.values[0]
    near_users = vdb.get_nns_by_vector(v_input,15)
    x = dchoose.reset_index(drop=True).transpose()[0]
    non_zero_department = x[x>0].index
    allv = []
    for v in near_users:
        allv.append(vdb.get_item_vector(v))
    allv = np.array(allv)
    recommend = pd.Series(allv.mean(axis=0), 
                          index=entity_cols).sort_values(
                            ascending=False)
    return recommend.drop(labels=non_zero_department)[:5]

In [32]:
order_sample = df_train.sample().order_id.values[0]
sample_eval = df_train[df_train.order_id == order_sample][entity_cols2]
true_answer, sample_input = mark_items(sample_eval)
pred = department_recommend(sample_input)
pred_answer = pred.index.tolist()

true_answer[0] in pred_answer

True

## Evaluations

In [33]:
# def bulk_process(order_samples):
#     eval1 = []
#     for order_sample in order_samples:
#         sample_eval = df_train2[df_train2.order_id == order_sample][entity_cols2]
#         try:
#             true_answer, sample_input = mark_items(sample_eval)
#             pred = department_recommend(sample_input)
#             pred_answer = pred.index.tolist()
#             score = 1 if true_answer[0] in pred_answer else 0
#         except:
#             print(order_sample)
#             score = 0
#             pass
#         eval1.append(score)
#     return eval1

In [34]:
df_train.order_id.values.shape

(1702404,)

In [35]:
# 4 ms 

In [78]:
rand_list = np.ceil((df_train[entity_cols2].mean(axis=0).sort_values(ascending=False)*300))
rand_dept_list = []
for d in rand_list.index:
    tmp = [d]*int(rand_list[d])
    rand_dept_list.extend(tmp)

In [80]:
# zz = []
# for i in range(50000):
#     m = []
#     while len(m)<5:
#         m0 = random.choices(rand_dept_list,k=5-len(m))
#         m = list(set(m) | set(m0))
#     zz.extend(m)

# xx = pd.Series(zz).value_counts()
# xx/xx.sum()

In [90]:
eval2 = []
eval2_rand = []
for order_sample in df_test.sample(7500).order_id.values:
    sample_eval = df_test[df_test.order_id == order_sample][entity_cols2]
    try:
        true_answer, sample_input = mark_items(sample_eval)
        pred = department_recommend(sample_input)
        pred_answer = pred.index.tolist()
        score = 1 if true_answer[0] in pred_answer else 0

        m = []
        while len(m)<5:
            m0 = random.choices(list(set(rand_dept_list)-set(m)),k=5-len(m))
            m = list(set(m) | set(m0))

        score_rand = 1 if true_answer[0] in m else 0
    except:
        print(order_sample)
        score = 0
        score_rand = 0
        pass
    
    eval2.append(score)
    eval2_rand.append(score_rand)

In [None]:
np.mean(eval2)

0.696

In [None]:
np.mean(eval2_rand)

0.286