# Dependencies

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Project imports

In [2]:
from propensity_prediction.tasks.converting_action_prediction.converting_action_prediction import Ensemble_NextAction

# Experiments

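### Load data

The events come from the Kaggle dataset `mkechinov/ecommerce-events-history-in-cosmetics-shop` (file `2019-Dec.csv`). A Kaggle API token file, `kaggle.json`, must be present in the working directory before running the cells below.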

In [3]:
# Install the Kaggle CLI. `%pip` (rather than `!pip`) installs into the
# environment of the running kernel instead of whatever `pip` the shell finds.
%pip install kaggle
# Copy the API token from the working directory into ~/.kaggle/ and make it
# readable/writable by the current user only.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json



In [4]:
# Download only the December 2019 events file from the Kaggle dataset.
!kaggle datasets download mkechinov/ecommerce-events-history-in-cosmetics-shop -f 2019-Dec.csv

import zipfile

# Extract the downloaded archive into the working directory. The `with` block
# closes the archive even if extraction raises.
with zipfile.ZipFile('426888%2F1015349%2Fcompressed%2F2019-Dec.csv.zip', 'r') as zip_ref:
    zip_ref.extractall()

# The archive is no longer needed once its contents are extracted.
!rm 426888%2F1015349%2Fcompressed%2F2019-Dec.csv.zip

Downloading 426888%2F1015349%2Fcompressed%2F2019-Dec.csv.zip to /src/ConvertingActionPrediction
 99%|█████████████████████████████████████▌| 73.0M/73.8M [00:09<00:00, 7.34MB/s]
100%|██████████████████████████████████████| 73.8M/73.8M [00:09<00:00, 7.96MB/s]


In [5]:
# Read every column as a string, then convert only `price` to a numeric dtype.
df = pd.read_csv('./2019-Dec.csv', dtype=str).assign(
    price=lambda events: pd.to_numeric(events['price'])
)
# Keep rows with a non-negative price. Rows with a missing price are dropped as
# well, because a NaN comparison evaluates to False.
df = df.loc[df['price'].ge(0)]
df

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-12-01 00:00:00 UTC,remove_from_cart,5712790,1487580005268456287,,f.o.x,6.27,576802932,51d85cb0-897f-48d2-918b-ad63965c12dc
1,2019-12-01 00:00:00 UTC,view,5764655,1487580005411062629,,cnd,29.05,412120092,8adff31e-2051-4894-9758-224bfa8aec18
2,2019-12-01 00:00:02 UTC,cart,4958,1487580009471148064,,runail,1.19,494077766,c99a50e8-2fac-4c4d-89ec-41c05f114554
3,2019-12-01 00:00:05 UTC,view,5848413,1487580007675986893,,freedecor,0.79,348405118,722ffea5-73c0-4924-8e8f-371ff8031af4
4,2019-12-01 00:00:07 UTC,view,5824148,1487580005511725929,,,5.56,576005683,28172809-7e4a-45ce-bab0-5efa90117cd5
...,...,...,...,...,...,...,...,...,...
3533281,2019-12-31 23:59:39 UTC,view,5683350,1487580005671109489,,masura,2.84,536812729,e4a2d47c-a956-4c46-8176-745f52ea664b
3533282,2019-12-31 23:59:46 UTC,view,5888097,1487580013388628160,,shik,179.05,503658154,2dde9867-9e71-4a64-880d-aa68b66aae6d
3533283,2019-12-31 23:59:51 UTC,view,59975,1487580012096782476,,,7.14,595414541,4c6d80bb-5dd3-4fbb-b592-187b51db2753
3533284,2019-12-31 23:59:52 UTC,view,5775982,1783999063314661546,,,11.90,397780878,7e8a2b85-153a-44eb-a71f-b748fde14fcc


### Split data

In [6]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
data_train, data_test = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

### Config

In [7]:
# Input settings. The values name columns of the events table loaded above
# (presumably the library looks them up by these keys; verify against the library).
INPUT_CONFIG = {
    'user_id': 'user_id',
    'product_id': 'product_id',
    'user_session': 'user_session',
    'event': 'event_type',
    # Event transitions of interest, each given as a source/destination pair.
    'order_actions': [
        {'source': 'view', 'des': 'cart'},
        {'source': 'cart', 'des': 'remove_from_cart'},
        {'source': 'cart', 'des': 'purchase'},
    ],
    'input_features': [],
}

FE_CONFIG = []
MODEL_CONFIG = ['NextAction_BinaryClass', 'NextAction_MultiClass']
PREDICT_CONFIG = {'method': 'gettop', 'ntop': None}

# Bundle every setting into a single dictionary for the pipeline.
PIPELINE_CONFIG = dict(
    task='converting_action_prediction',
    input_config=INPUT_CONFIG,
    fe_config=FE_CONFIG,
    model_config=MODEL_CONFIG,
    predict_config=PREDICT_CONFIG,
)

### Apply model

In [8]:
# Display the assembled configuration so the settings used for this run are recorded.
PIPELINE_CONFIG

{'task': 'converting_action_prediction',
 'input_config': {'user_id': 'user_id',
  'product_id': 'product_id',
  'user_session': 'user_session',
  'event': 'event_type',
  'order_actions': [{'source': 'view', 'des': 'cart'},
   {'source': 'cart', 'des': 'remove_from_cart'},
   {'source': 'cart', 'des': 'purchase'}],
  'input_features': []},
 'fe_config': [],
 'model_config': ['NextAction_BinaryClass', 'NextAction_MultiClass'],
 'predict_config': {'method': 'gettop', 'ntop': None}}

In [9]:
# Build the ensemble from the pipeline settings, passed positionally as:
# model names, feature-engineering config, input config.
model = Ensemble_NextAction(
    PIPELINE_CONFIG['model_config'],
    PIPELINE_CONFIG['fe_config'],
    PIPELINE_CONFIG['input_config'],
)

NextAction_BinaryClass
NextAction_MultiClass


In [10]:
# Train the ensemble on the training split; the log below shows each configured
# model being trained in turn.
# NOTE(review): the SettingWithCopyWarning in the output points at assignments to
# `label_df` inside the library, not at code in this notebook. Presumably
# harmless here, but worth confirming in the library.
model.train(data_train)

...Training model:  NextAction_BinaryClass
Trained mode:  NextAction_BinaryClass
...Training model:  NextAction_MultiClass


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df[self.event] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df[self.event] = np.where(label_df[label] > 0, labels[label], label_df[self.event] )


Trained mode:  NextAction_MultiClass


In [11]:
# Predicted labels for the held-out split, reported per model (see the output below).
model.predict(data_test)

[{'NextAction_BinaryClass': [{'view2cart': array([0, 0, 0, ..., 0, 0, 0])},
   {'cart2remove_from_cart': array([0, 0, 0, ..., 0, 0, 0])},
   {'cart2purchase': array([0, 0, 0, ..., 0, 0, 1])}]},
 {'NextAction_MultiClass': array([0, 0, 0, ..., 0, 0, 3])}]

In [12]:
# Predicted probabilities for the held-out split, reported per model (see the output below).
model.get_probabilities(data_test)

[{'NextAction_BinaryClass': [{'view2cart': array([0.07358817, 0.05644277, 0.07304702, ..., 0.04485092, 0.12914056,
           0.07802456])},
   {'cart2remove_from_cart': array([0.06147493, 0.05374116, 0.0515187 , ..., 0.03374003, 0.03667009,
           0.04846276])},
   {'cart2purchase': array([0.02982722, 0.03414972, 0.03720878, ..., 0.02387373, 0.02387373,
           0.99684254])}]},
 {'NextAction_MultiClass': array([[0.69498775, 0.27894423, 0.02225973, 0.00380829],
         [0.81034441, 0.16579867, 0.02013145, 0.00372546],
         [0.67654311, 0.29257131, 0.02573288, 0.00515269],
         ...,
         [0.87136381, 0.11701361, 0.00902726, 0.00259531],
         [0.90737761, 0.08561549, 0.00590172, 0.00110518],
         [0.00904959, 0.00238071, 0.0020798 , 0.9864899 ]])}]

In [13]:
# Evaluate the ensemble on the held-out split; the result is a nested structure
# of metrics per model (see the output below).
# NOTE(review): the code excerpts printed before the result look like runtime
# warnings from divisions such as the recall computation, presumably caused by
# empty denominators at some thresholds. TODO confirm.
model.evaluate(data_test)

  mean1 = np.cumsum(hist * bin_centers) / weight1
  crit = np.log(((P1_sq[:-1] * P2_sq[1:]) ** -1) *
  (P1[:-1] * (1.0 - P1[:-1])) ** 2)
  recall = 1.0*n_tp/(n_tp+n_fn)
  mean1 = np.cumsum(hist * bin_centers) / weight1
  crit = np.log(((P1_sq[:-1] * P2_sq[1:]) ** -1) *
  (P1[:-1] * (1.0 - P1[:-1])) ** 2)
  recall = 1.0*n_tp/(n_tp+n_fn)
  mean1 = np.cumsum(hist * bin_centers) / weight1
  crit = np.log(((P1_sq[:-1] * P2_sq[1:]) ** -1) *
  (P1[:-1] * (1.0 - P1[:-1])) ** 2)
  recall = 1.0*n_tp/(n_tp+n_fn)
  recall = 1.0*n_tp/(n_tp+n_fn)


[{'NextAction_BinaryClass': [{'view2cart': {'model_performance': {'auc': 0.9451934552967809,
      'predicting': [{'method': 'threshold',
        'results': [{'threshold_method': 'constant',
          'results': {'accuracy': 0.9586547664471791,
           'precision': 0.4217642922818973,
           'recall': 0.4725201478519057,
           'trueneg_rate': 0.9806827713843164,
           'f1_score': 0.4457018747142204}},
         {'threshold_method': 'baseline',
          'results': {'accuracy': 0.965533559428977,
           'precision': 0.21109849099464548,
           'recall': 0.7114473204520598,
           'trueneg_rate': 0.996487171407395,
           'f1_score': 0.3255891553701773}},
         {'threshold_method': 'kmeans',
          'results': {'accuracy': 0.9468625499069541,
           'precision': 0.5557899291470604,
           'recall': 0.38071949909229,
           'trueneg_rate': 0.9629078143238193,
           'f1_score': 0.4518909410729991}},
         {'threshold_method': 'otsu',