In [None]:
#User defined functions
def return_nth_item(string,n):
    """Return the n-th dot-separated piece of ``string``.

    The input is converted with ``str()`` and split on '.'; ``n`` is used as
    an index into the resulting list. When that index does not exist, the
    placeholder 'No_value' is returned instead.
    """
    try:
        pieces = str(string).split('.')
        nth_piece = pieces[n]
    except IndexError:
        # Fewer pieces than requested: fall back to the placeholder.
        nth_piece = 'No_value'
    return nth_piece
def return_weekday_numb(input_value):
    """Return ``input_value.weekday()``, or -99 when it cannot be obtained.

    Parameters
    ----------
    input_value : object
        Any object exposing a ``weekday()`` method (e.g. a date, datetime or
        pandas Timestamp).

    Returns
    -------
    The result of ``input_value.weekday()``, or the sentinel -99 when the
    value has no ``weekday`` method (None, strings, numbers, ...).
    """
    try:
        return input_value.weekday()
    except (AttributeError, IndexError):
        # A missing weekday() raises AttributeError, not IndexError, so
        # catching only IndexError left the -99 fallback unreachable.
        # IndexError stays in the tuple so nothing previously caught escapes.
        return -99
def return_hour(input_value):
    """Return ``input_value.hour``, or -99 when it cannot be obtained.

    Parameters
    ----------
    input_value : object
        Any object exposing an ``hour`` attribute (e.g. a datetime, time or
        pandas Timestamp).

    Returns
    -------
    The value of ``input_value.hour``, or the sentinel -99 when the value has
    no ``hour`` attribute (None, plain dates, strings, ...).
    """
    try:
        return input_value.hour
    except (AttributeError, IndexError):
        # A missing attribute raises AttributeError, not IndexError, so
        # catching only IndexError left the -99 fallback unreachable.
        # IndexError stays in the tuple so nothing previously caught escapes.
        return -99
def top_brand(input_data):
    """Return the most frequent value of the 'brand' column with its count.

    The result is a one-element Series (empty when there is nothing to
    count) whose index is the brand and whose value is the number of rows
    carrying that brand.
    """
    brand_counts = input_data['brand'].value_counts()
    return brand_counts.nlargest(1)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    input_paths = [os.path.join(dirname, filename) for filename in filenames]
    for input_path in input_paths:
        print(input_path)

# Any results you write to the current directory are saved as output.

In [None]:
# Read only the first 10000 rows of the November 2019 event log.
dataset = pd.read_csv(
    "/kaggle/input/ecommerce-behavior-data-from-multi-category-store/2019-Nov.csv",
    nrows=10000,
)

1-EDA and Data Engineering

In this step, I check the health of each feature and create new features.

In [None]:
# Count the missing values in every column.
dataset.isna().sum()

In [None]:
#feature: Event Time

# Parse the timestamps as UTC, then drop the timezone so the column holds naive datetimes.
# utc=True keeps this cell re-runnable: an already-naive column is treated as UTC again
# instead of making tz_convert fail on tz-naive data.
dataset['event_time']=pd.to_datetime(dataset['event_time'],utc=True).dt.tz_convert(None)

# adding new feature: event day (Monday=0 ... Sunday=6)
dataset['event_time_week_day_number']=dataset['event_time'].dt.weekday

# adding new feature:event hour
dataset['event_time_hour']=dataset['event_time'].dt.hour

#adding new feature:time spent in this step
# Timestamp of the following event inside the same session (events ordered by time);
# the last event of a session has no successor, so its value is missing.
next_action_time=dataset.sort_values(by=['user_session','event_time']).groupby(by=['user_session'])['event_time'].shift(-1)
# total_seconds() gives the whole gap in seconds; .dt.seconds is only the seconds
# component of the timedelta and silently drops whole days.
# The result stays float because the last step of each session is NaN.
# The subtraction aligns both Series on the row index.
dataset['time_spent_in_this_step_seconds']=(next_action_time-dataset['event_time']).dt.total_seconds()

In [None]:
#feature: Event Type

# One-hot encode event_type: the column is replaced by one indicator column
# per distinct value, named event_type_<value>.
dataset=pd.get_dummies(dataset,columns=['event_type'])

In [None]:
#feature: Product ID
# Number of distinct products that appear in the loaded events.
product_ids=dataset['product_id']
product_ids.nunique()

In [None]:
#features: category id & category code

#replacing null values in category code feature with other
# Assign the filled column back: fillna(inplace=True) on a column selection is
# chained assignment, which may leave `dataset` itself unchanged.
dataset['category_code']=dataset['category_code'].fillna('Unknown_category')

#adding new features: splitting category feature into subgroups
# Deepest nesting level = largest number of '.' separators in any category code.
# str() is applied to every item so a non-string value cannot break the count.
max_category_number=max((str(item).count('.') for item in dataset['category_code'].unique()),default=0)
for cat in range(max_category_number+1):
    new_col_name='category_'+str(cat)
    # Series.apply forwards n=cat to return_nth_item; codes with fewer levels get 'No_value'.
    dataset[new_col_name]=dataset['category_code'].apply(return_nth_item,n=cat)
# The raw code is fully represented by the category_<level> columns now.
dataset=dataset.drop(columns=['category_code'])

In [None]:
#feature: Brand
#number of brands in each category
# nunique() counts the distinct (non-null) brands per top-level category;
# size() counted rows, i.e. events, rather than brands.
dataset.groupby(by='category_0')['brand'].nunique()

In [None]:
#feature: price
#price distribution in each category
# One horizontal box per top-level category, drawn on an explicitly created axes.
price_fig,price_ax=plt.subplots(figsize=(12,8))
sns.boxplot(data=dataset,x='price',y='category_0',ax=price_ax);

2-Creating session and customer tables

In this step, I create summary tables of sessions and customers.

In [None]:
# One row per (user_session, user_id) pair summarising the events of that session.
# Aggregations are passed by name ('min', 'max', 'sum', 'nunique') instead of Python
# builtins or lambdas: recent pandas versions deprecate builtin callables in agg(),
# and the string aliases select the same groupby implementations.
session_table=dataset.groupby(by=['user_session','user_id']).agg(
    session_beginning=('event_time','min'),
    session_ending=('event_time','max'),
    number_of_pages=('event_time','count'),
    number_of_products=('product_id','nunique'),
    number_of_categories=('category_id','nunique'),
    number_of_brands=('brand','nunique'),
    average_price_of_products=('price','mean'),
    std_price_of_products=('price','std'),
    min_price_of_products=('price','min'),
    max_price_of_products=('price','max'),
    event_time_week_day_number=('event_time_week_day_number','min'),
    event_time_hour=('event_time_hour','min'),
    average_time_spent_on_each_step_in_this_session=('time_spent_in_this_step_seconds','mean'),
    max_time_spent_on_each_step_in_this_session=('time_spent_in_this_step_seconds','max'),
    min_time_spent_on_each_step_in_this_session=('time_spent_in_this_step_seconds','min'),
    std_time_spent_on_each_step_in_this_session=('time_spent_in_this_step_seconds','std'),
    total_item_added_in_the_shopping_cart_in_this_session=('event_type_cart','sum'),
    total_item_purchased_in_this_session=('event_type_purchase','sum'),
    total_item_viewed_in_this_session=('event_type_view','sum'),
    # disabled in the original notebook:
    #total_item_removedfromcart_in_this_session=('event_type_removedfromcart','sum'),
)

# Time between the first and the last event of the session.
session_table['session_duration']=session_table['session_ending']-session_table['session_beginning']
# NOTE(review): both "*_cv" columns are mean/std, the inverse of the usual coefficient of
# variation (std/mean). Kept as in the original; confirm this is intended.
session_table['price_cv']=session_table['average_price_of_products']/session_table['std_price_of_products'] #more focus on budget
session_table['time_spent_on_each_step_in_this_session_cv']= session_table['average_time_spent_on_each_step_in_this_session']/session_table['std_time_spent_on_each_step_in_this_session']# this will show, are there any product that the visitor interested

In [None]:
# One row per user: how many distinct sessions they had and how many purchase events in total.
# Aggregations are passed by name instead of a lambda / the builtin sum, which recent
# pandas versions deprecate inside agg().
customer_table=session_table.reset_index().groupby(by=['user_id']).agg(
    number_of_session=('user_session','nunique'),
    total_item_purchased=('total_item_purchased_in_this_session','sum')
)

**Q&A**

In [None]:
#1-Page views: 
separator='-'*50

#Which brand is the most famous in each category ?
# top_brand is applied to every combination of the four category levels.
category_levels=['category_0','category_1','category_2','category_3']
top_brand_per_category=dataset.groupby(by=category_levels).apply(top_brand)
print('Which brand is the most famous in each category ?')
print(separator)
print(top_brand_per_category)

#What is the number of visited pages/products in each session on average ?
pages_per_session=session_table['number_of_pages']
print('\n\nHow many pages are visited in each session on average ?')
print(separator)
print(pages_per_session.describe())

#plotting the distribution
pages_fig,pages_ax=plt.subplots(figsize=(12,6))
sns.countplot(x='number_of_pages',data=session_table,ax=pages_ax)
pages_ax.set_title('Number of Pages Viewed Distribution');

In [None]:
#2-Repeated users:
divider='-'*50

#How many users,who have visited your website more than once, do you have?
visited_repeatedly=customer_table['number_of_session']>1
users_visited_more_than_one=customer_table.loc[visited_repeatedly]
print('How many users,who have visited your website more than once, do you have?')
print(divider)
print('There are {} people,who visits your website more than once.'.format(len(users_visited_more_than_one)))

#How many users have a purchase total greater than one?
purchased_repeatedly=customer_table['total_item_purchased']>1
users_purchased_more_than_one=customer_table.loc[purchased_repeatedly]
print('\n\nHow many users,who have purchased more than once, do you have?')
print(divider)
print('There are {} people,who purchased more than once.'.format(len(users_purchased_more_than_one)))

# Two side-by-side count plots: sessions per returning user, purchases per repeat buyer.
figure,(fig_1,fig_2)=plt.subplots(nrows=1,ncols=2,figsize=(14,4))

sns.countplot(x='number_of_session',data=users_visited_more_than_one,ax=fig_1)
fig_1.set_title('Number of visits')
fig_1.set_ylabel('Number of users')
fig_1.set_xlabel('Number of times')

sns.countplot(x='total_item_purchased',data=users_purchased_more_than_one,ax=fig_2)
fig_2.set_title('Number of purchase')
fig_2.set_ylabel('Number of users')
fig_2.set_xlabel('Number of times');


In [None]:
#3-Conversion rates:
