In [105]:
import numpy as np
import pandas as pd
import utils
import pickle
from gensim.models import word2vec
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import SimpleRNN, Dropout
from keras.optimizers import RMSprop, Adam
from keras.utils import np_utils
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score
from scipy.sparse import csr_matrix
import random
import time

In [2]:
np.random.seed(7)

In [3]:
utils.data_directory()

On AWS instance
/home/ec2-user/data


In [4]:
ls

[0m[38;5;34maisles.csv[0m*                 [38;5;34morders.csv[0m*             test1.model.bin
[38;5;34mdepartments.csv[0m*            [38;5;34mproducts.csv[0m*           test1.model.txt
[38;5;34morder_products__prior.csv[0m*  [38;5;34msample_submission.csv[0m*  w2v_corpus_order
[38;5;34morder_products__train.csv[0m*  test1.model


In [5]:
order_products__prior = pd.read_csv("order_products__prior.csv")
order_products__train = pd.read_csv("order_products__train.csv")
order = pd.read_csv("orders.csv")

In [6]:
order.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [7]:
order_products__prior = pd.merge(order_products__prior[["order_id", "product_id", "reordered"]], 
                               order[["order_id", "user_id","order_number"]], on="order_id")
order_products__prior["product_id"] = order_products__prior["product_id"].astype("unicode")
#order_products__prior = order_products__prior[order_products__prior["reordered"]==1]

In [8]:
order_products__train = pd.merge(order_products__train, order[["order_id", "user_id","order_number"]],
                                 how="left", on="order_id")
order_products__train["product_id"] = order_products__train["product_id"].astype("unicode")

order_products__train_1 = order_products__train[order_products__train["reordered"]==1]
del order_products__train_1["add_to_cart_order"]
del order_products__train_1["reordered"]
order_products__train_1 = order_products__train_1[["user_id","order_id","order_number","product_id"]]

In [9]:
order_products__train_1.head()

Unnamed: 0,user_id,order_id,order_number,product_id
0,112108,1,4,49302
1,112108,1,4,11109
4,112108,1,4,43633
7,112108,1,4,22035
9,79431,36,23,19660


In [10]:
liste_user = order_products__train_1["user_id"].unique()
order_products__train_2 = order_products__train[order_products__train["reordered"]!=1].groupby(["user_id","order_id","order_number"]).sum().reset_index()
order_products__train_2 = order_products__train_2[~order_products__train_2["user_id"].isin(liste_user)]
order_products__train_2 = order_products__train_2.ix[:,:3]
order_products__train_2["product_id"] = "None"

In [11]:
order_products__train_2.head()

Unnamed: 0,user_id,order_id,order_number,product_id
5,10,1822501,6,
19,44,2436259,4,
20,47,2906490,6,
45,93,1179185,15,
57,112,3032922,27,


In [12]:
order_products__train = pd.concat([order_products__train_1, order_products__train_2], axis=0)

In [13]:
# on trie df sur les user_train present encore dans le train
liste_user = order.ix[order["eval_set"]=="train", "user_id"].unique()
order_products__prior = order_products__prior[order_products__prior["user_id"].isin(liste_user)]
order_products__train = order_products__train[order_products__train["user_id"].isin(liste_user)]

In [14]:
print(len(order_products__prior["user_id"].unique()))
print(len(order_products__train["user_id"].unique()))

131209
131209


In [15]:
order_products__prior = order_products__prior.sort_values(["user_id","order_number"])
order_products__train = order_products__train.sort_values(["user_id"])

In [16]:
#order_products__prior["user_id"].unique()
#order_products__train["user_id"].unique()

In [17]:
# load corpus
with open('w2v_corpus_order','rb') as f:
    sentences = pickle.load(f)

In [18]:
# load model 
model = word2vec.Word2Vec.load("test1.model")

In [19]:
# Create matrix items
index_N = model.wv.index2word
matrix_N = model.wv.syn0

In [20]:
print("Nombre de mot dans le vocab : ", len(model.wv.vocab.keys()))

Nombre de mot dans le vocab :  33500


In [21]:
# exemple corpus
sentences[0]

['49302', '11109', '43633', '22035']

In [22]:
def average_pooling(sentence, model):
    matrix_pooling = list(map(lambda x: model[x], filter(lambda x: x in model.wv.vocab.keys(),sentence)))
    return np.mean(matrix_pooling, axis=0)

def max_pooling(sentence, model):
    matrix_pooling = list(map(lambda x: model[x], filter(lambda x: x in model.wv.vocab.keys(),sentence)))
    return np.max(matrix_pooling, axis=0)

def items_to_latent(sentences, model, pooling_function=average_pooling):
    latent_list = list(map(lambda x : pooling_function(x, model) ,sentences))
    return np.array(latent_list)
    

In [23]:
# try to items_to_latent pour tout les user

In [24]:
sequences = order_products__prior.groupby(["user_id",'order_number']).apply(
    lambda order : average_pooling(order['product_id'].tolist(), model))

sequences = sequences.rename("sequence")

sequences = sequences.reset_index().groupby(["user_id"])["sequence"].apply(
    lambda x: list(filter(lambda x:  ~np.all(np.isnan(x)), list(x))) ).reset_index()

sequences["sequence"] = sequences["sequence"].map(lambda x: np.array(x, dtype = object))



In [25]:
print(sequences.shape)

(131209, 2)


In [27]:
label = order_products__train.groupby(["user_id",'order_number']).apply(
    lambda order : order['product_id'].tolist())
label = label.rename("label")
label = label.reset_index()
del label["order_number"]
label.head()

Unnamed: 0,user_id,label
0,1,"[46149, 49235, 26088, 13032, 10258, 39657, 264..."
1,2,"[33957, 7963, 41787, 22963, 22825, 24852, 3279..."
2,5,"[21616, 15349, 21413, 40706]"
3,7,"[17638, 37999, 13198, 43967, 45066, 29894, 472..."
4,8,"[41540, 21903, 23165, 15937]"


In [28]:
mlb =  MultiLabelBinarizer(sparse_output=True)
y = mlb.fit_transform(label["label"])

In [29]:
nb_classes = len(mlb.classes_)

In [30]:
#y[0,:].toarray()

In [43]:
n_index = label.index.values
n_sample = int(len(label.index.values)*0.7)

In [45]:
user_index = np.random.choice(n_index, n_sample, replace=False)

In [47]:
user_index_test = list(filter(lambda x: x not in user_index, n_index))

In [115]:
model_rnn = Sequential()
model_rnn.add(SimpleRNN(20, return_sequences=False, input_shape=(None, 20))) 
model_rnn.add(Dropout(0.2))
model_rnn.add(Dense(nb_classes))
#model.add(TimeDistributedDense(in_out_neurons))  
model_rnn.add(Activation("sigmoid"))
#optimizer = RMSprop(lr=0.01,decay=1e-6)
#optimizer = Adam(lr=0.01, decay=1e-6)
model_rnn.compile(loss='binary_crossentropy', optimizer="RMSprop")

In [None]:
n_epochs = 10 
for i in range(n_epochs) :
    print("Epochs : ",  str(i))
    i_user = 0
    start_time = time.time()
    for index in user_index :
        if i_user == 5000:
            print("Progress train 5000 user")
            print("--- %s seconds ---" % (time.time() - start_time))
            start_time = time.time()
            i_user = 0 

        X_train = sequences.ix[index, "sequence"]
        X_train = X_train.reshape((1, X_train.shape[0],X_train.shape[1]))
        y_train = y[index,:].toarray()
        model_rnn.fit(X_train, y_train, batch_size=1, epochs=1,shuffle=False)
        i_user += 1

Epochs :  0
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/

In [59]:
label_test = label.loc[user_index_test]
label_test["predict"] = np.nan

In [99]:
for index_test in user_index_test[0]:
    X_test = sequences.loc[4, "sequence"]
    X_test = X_test.reshape((1, X_test.shape[0],X_test.shape[1]))
    y_pred = model_rnn.predict_proba(X_test, batch_size=1)

TypeError: 'numpy.int64' object is not iterable

In [100]:
X_test = sequences.loc[30, "sequence"]
X_test = X_test.reshape((1, X_test.shape[0],X_test.shape[1]))
y_pred = model_rnn.predict_proba(X_test)



In [101]:
y_pred = model_rnn.predict_proba(X_test)



In [87]:
label_test

Unnamed: 0,user_id,label,predict
4,8,"[41540, 21903, 23165, 15937]",
7,13,"[19934, 4210, 27086, 27435]",
12,23,"[33819, 49306, 42372, 13544, 48205, 22021]",
13,24,[31222],
14,27,"[33787, 4920, 46676, 45446, 32263, 1323, 44932...",
20,41,"[25890, 29180, 4605, 28985]",
22,43,"[38154, 9360, 33780, 42431, 46981, 15599, 2252...",
24,46,"[42987, 24852, 21781, 13733, 4605, 24097]",
30,53,"[11210, 377]",
38,66,"[42014, 28842, 8143]",


In [102]:
sorted(list(zip(y_pred.ravel(), mlb.classes_)) , reverse=True)

[(0.12902252, '24852'),
 (0.10358275, '13176'),
 (0.066629983, '21137'),
 (0.06417571, 'None'),
 (0.059494253, '21903'),
 (0.047089182, '47209'),
 (0.046869606, '47766'),
 (0.044714596, '47626'),
 (0.036873486, '16797'),
 (0.032277092, '26209'),
 (0.031521376, '27966'),
 (0.030383805, '27845'),
 (0.02442502, '39275'),
 (0.023908634, '22935'),
 (0.023893155, '45007'),
 (0.022953767, '30391'),
 (0.022513544, '24964'),
 (0.021646071, '4920'),
 (0.020987296, '40706'),
 (0.019958138, '44632'),
 (0.019900527, '45066'),
 (0.019243505, '4605'),
 (0.018685006, '42265'),
 (0.018589795, '46979'),
 (0.0184313, '5876'),
 (0.01823967, '8518'),
 (0.017902575, '31717'),
 (0.017695101, '28204'),
 (0.016714137, '19057'),
 (0.016695024, '43352'),
 (0.016687887, '5450'),
 (0.01650566, '30489'),
 (0.016077494, '37646'),
 (0.015997456, '21616'),
 (0.014559191, '49235'),
 (0.01455443, '27086'),
 (0.014550415, '28985'),
 (0.013695003, '26604'),
 (0.013598111, '27104'),
 (0.013408944, '5077'),
 (0.013307924, '

In [118]:
df_user_order = order_products__prior[order_products__prior["user_id"]==156122]
df_user_train = order_products__train[order_products__train["user_id"]==156122]
sentences_user = df_user_order.groupby(['order_id']).apply(lambda order : order['product_id'].tolist()).values
sentences_user_train = df_user_train.groupby(['order_id']).apply(lambda order : order['product_id'].tolist()).values

In [119]:
sentences_user[0]

['13176',
 '15005',
 '47329',
 '27966',
 '23909',
 '48370',
 '13245',
 '9633',
 '27360',
 '6348',
 '40878',
 '6184',
 '48002',
 '20914',
 '37011',
 '12962',
 '45698',
 '24773',
 '18569',
 '41176',
 '48366']

In [120]:
df_user_order = order_products__prior[order_products__prior["user_id"]== 79431]
df_user_train = order_products__train[order_products__train["user_id"]== 79431]
sentences_user_2 = df_user_order.groupby(['order_id']).apply(lambda order : order['product_id'].tolist()).values
sentences_user_train_2 = df_user_train.groupby(['order_id']).apply(lambda order : order['product_id'].tolist()).values

X_2 = items_to_latent(sentences_user_2, model)
print(X_2.shape)

(20, 20)


In [16]:
len(sentences_user)

51

In [15]:
X = items_to_latent(sentences_user, model)
X

array([[ 0.43584523,  1.39446092, -1.76994991, ...,  3.45118093,
         0.39852539,  1.77256918],
       [-0.17984129,  2.27678227, -0.82998675, ...,  3.6141355 ,
        -0.36553669,  1.4399761 ],
       [ 0.54715687,  1.43413687, -2.65205169, ...,  4.18293715,
        -0.69035918,  2.10804296],
       ..., 
       [-0.14512646,  2.15235662, -0.79283273, ...,  3.04687262,
         2.7541945 ,  2.0695343 ],
       [ 0.46491203,  2.13337159, -0.44208297, ...,  2.63004231,
         0.24224021,  1.76829875],
       [ 0.37093857,  1.91117966, -2.78093529, ...,  5.40671968,
         0.81816739,  3.53505659]], dtype=float32)

In [216]:
#X_train_1 = X[:50].reshape((1, 50, 20))
#X_train_2 = X[:20].reshape((1, 20, 20))
#X_test_1 = X[50]
#X_test_2 = X[20]

In [217]:
#X_train_1.shape

(1, 50, 20)

In [219]:
#np.append(X_train_1, X_train_2, axis=1).shape

(1, 70, 20)

In [174]:
X_train = X[:50]
print(X_train.shape)
X_test = X[50]
print(X_test.shape)

(50, 20)
(20,)


In [158]:
X_train_2 = X_2[:19]
X_test_2 = X_2[19]

In [108]:
#np.append(sentences_user,sentences_user_train)

In [129]:
# On binarize le y et on le lag de 1 
yse = np.append(sentences_user[1:],sentences_user_train)
yse = np.append(yse, sentences_user_2[1:])
yse = np.append(yse, sentences_user_train_2)
mlb =  MultiLabelBinarizer()
y = mlb.fit_transform(yse)

In [130]:
y.shape

(71, 144)

In [148]:
sentences_user.shape

(51,)

In [147]:
y_train = y[:50]
print(y_train.shape)
y_test = y[50]
print(y_test.shape)

y_train_2 = y[51:70]
print(y_train_2.shape)
y_test_2 = y[-1]
print(y_test_2.shape)

(50, 144)
(144,)
(19, 144)
(144,)


In [166]:
nb_classes = len(mlb.classes_)

In [112]:
#model_rnn = Sequential()
#model_rnn.add(SimpleRNN(20, return_sequences=True, input_shape=(None, 20)))
#model_rnn.add(Activation("sigmoid"))

#X_predict = model_rnn.predict(X_train)
#X_predict = X_predict.reshape((X_predict.shape[1], X_predict.shape[2]))
#print(X_predict.shape)

#print("Shape matrice items : ", matrix_N.shape)
#print("Dernier bakset shape : ", X_predict.T[:, 50].shape)

#resultat = np.dot(matrix_N,  X_predict.T[:, 50])
#resultat

In [159]:
X_train = X_train.reshape((1,50,20))
y_train = y_train.reshape((1,50,144))

X_train_2 = X_train_2.reshape((1, 19, 20))
y_train_2 = y_train_2.reshape((1, 19, 144))

In [206]:
model_rnn = Sequential()
model_rnn.add(SimpleRNN(20, return_sequences=True, input_shape=(None, 20))) 
model_rnn.add(Dense(nb_classes))
#model.add(TimeDistributedDense(in_out_neurons))  
model_rnn.add(Activation("sigmoid"))
optimizer = RMSprop(lr=0.01,decay=1e-6)
#optimizer = Adam(lr=0.01, decay=1e-6)
model_rnn.compile(loss='binary_crossentropy', optimizer=optimizer)

In [207]:
#model_rnn.fit(X_train,y_train, batch_size=1, epochs=20, shuffle=False)

In [208]:
model_rnn.fit(X_train_2,y_train_2, batch_size=1, epochs=20, shuffle=False)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7ff3870d5ef0>

In [117]:
#X_test = X.reshape(1,51,20)

In [178]:
test = np.append(X_test, X_test_2)

In [181]:
test = test.reshape((2,1,20))

In [184]:
test

array([[[ 0.37093857,  1.91117966, -2.78093529, -1.37817276, -4.67820406,
          2.85568571,  3.63907647, -4.3820529 , -4.31110048,  0.55233264,
          1.33035266, -3.56414008,  1.45224214, -1.00300229, -2.47890902,
         -2.01374626,  0.0957862 ,  5.40671968,  0.81816739,  3.53505659]],

       [[ 0.08813759, -1.48052728, -0.67194152,  3.28099799,  3.07976317,
         -0.70134038,  0.06600093,  0.13747589,  2.22590089,  0.66900951,
         -1.53777313,  1.53165567,  0.24686305,  1.56487381,  1.87486517,
          2.23701453,  0.8863644 , -0.7513147 ,  2.65052652,  0.28793874]]], dtype=float32)

In [209]:
test_resultat = model_rnn.predict_proba(X_test_2.reshape((1,1,20)))



In [210]:
test_resultat.shape

(1, 1, 144)

In [211]:
mlb.inverse_transform(y_test_2.reshape(1, 144))

[('19660', '34497', '43086', '46620', '46979', '48679')]

In [212]:
sorted(list(zip(test_resultat.ravel(), mlb.classes_)) , reverse=True)

[(0.67834133, '19660'),
 (0.42289984, '20119'),
 (0.4019596, '38293'),
 (0.35094213, '42719'),
 (0.33773357, '26629'),
 (0.26741824, '35939'),
 (0.16829433, '45747'),
 (0.16086845, '5450'),
 (0.15349215, '24852'),
 (0.1424506, '32655'),
 (0.13644207, '33716'),
 (0.13625342, '44359'),
 (0.13334516, '21903'),
 (0.12855712, '43086'),
 (0.12471832, '24964'),
 (0.11474364, '27521'),
 (0.11305163, '581'),
 (0.10760273, '3893'),
 (0.10336071, '48679'),
 (0.087970428, '14129'),
 (0.086337544, '21616'),
 (0.082744598, '45007'),
 (0.081925832, '41950'),
 (0.079819545, '26209'),
 (0.076669909, '47734'),
 (0.075024195, '14704'),
 (0.074792147, '22227'),
 (0.071746416, '44303'),
 (0.071712025, '19048'),
 (0.071035236, '38456'),
 (0.070224389, '20761'),
 (0.069140144, '14197'),
 (0.060909439, '47699'),
 (0.059722438, '12339'),
 (0.059565824, '19878'),
 (0.058683727, '16759'),
 (0.057077084, '29133'),
 (0.055636737, '2763'),
 (0.054829389, '5586'),
 (0.054706421, '1940'),
 (0.054377213, '20985'),
 (0

In [58]:
preds = model_rnn.predict_proba(X_test).ravel()
preds[preds>=0.5] = 1
preds[preds<0.5] = 0



In [59]:
f1_score(y_test, preds)

0.28571428571428575