### Analyze customer churn (exit) pages using web server log data

In [1]:
import pandas as pd
import numpy as np

web server log data
- ip, time, visited page
- the log format can vary depending on how the web server is configured
- log data is used for web server debugging and data analysis
- format in this project
    - ip, sessionID, userUniqueId, time, request, page, statuscode, bytesize
    - ex. 1.0.0.1 sessionid user59 [16/Dec/2019T02:00:08] GET /checkout 200 1509

In [2]:
# Parse the whitespace-delimited web server log into a DataFrame.
# The file has no header row, so the column names are supplied explicitly.
logs = pd.read_csv('web.log',
                   # Raw string: '\s' inside a normal literal is an invalid escape
                   # sequence (SyntaxWarning on recent Python). '\s+' additionally
                   # tolerates runs of spaces/tabs between fields.
                   sep=r'\s+',
                   engine='python',
                   names=['ip', 'session_id', 'user_id', 'datetime', 'request', 'page', 'status', 'byte_size']
                  )
logs.head()

Unnamed: 0,ip,session_id,user_id,datetime,request,page,status,byte_size
0,4.5.4.5,69de169f-6eed-4e4d-ae5b-ff997b8c889f,user89,[01/Dec/2019T00:47:11],GET,/product_list,200,2107
1,4.5.4.5,69de169f-6eed-4e4d-ae5b-ff997b8c889f,user89,[01/Dec/2019T00:51:21],GET,/product_detail,200,1323
2,3.3.3.3.,3d46aad9-17eb-4af1-bc54-6ca91d7f8f6c,user2,[01/Dec/2019T00:51:43],GET,/product_list,200,2616
3,1.0.1.0,57623182-b78b-4bdc-b977-a2b34612c6d1,user45,[01/Dec/2019T01:04:02],GET,/product_list,200,2303
4,3.3.3.3.,3d46aad9-17eb-4af1-bc54-6ca91d7f8f6c,user2,[01/Dec/2019T01:12:28],GET,/product_detail,200,1830


In [3]:
# Inspect column dtypes and non-null counts of the loaded log.
logs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1290 entries, 0 to 1289
Data columns (total 8 columns):
ip            1290 non-null object
session_id    1290 non-null object
user_id       1290 non-null object
datetime      1290 non-null object
request       1290 non-null object
page          1290 non-null object
status        1290 non-null int64
byte_size     1290 non-null int64
dtypes: int64(2), object(6)
memory usage: 80.8+ KB


In [4]:
# Not very informative here: describe() only summarizes the numeric columns
# (status, byte_size), while this analysis is mostly about the string columns.
logs.describe()

Unnamed: 0,status,byte_size
count,1290.0,1290.0
mean,200.0,2010.652713
std,0.0,461.567919
min,200.0,1200.0
25%,200.0,1615.75
50%,200.0,2018.0
75%,200.0,2408.75
max,200.0,2798.0


### Convert the date format

In [5]:
# Raw values look like "[01/Dec/2019T00:47:11]": strip the surrounding
# brackets, then parse day/month-abbreviation/year, a literal "T", and the time.
DATETIME_FORMAT = '%d/%b/%YT%H:%M:%S'

# Convert only while the column still holds the raw strings, so re-running this
# cell after a successful conversion is a no-op instead of raising on Timestamps.
if not pd.api.types.is_datetime64_any_dtype(logs['datetime']):
    logs['datetime'] = pd.to_datetime(
        logs['datetime'].str.strip('[]'),  # vectorized; removes leading "[" and trailing "]"
        format=DATETIME_FORMAT,
    )

logs.head()

Unnamed: 0,ip,session_id,user_id,datetime,request,page,status,byte_size
0,4.5.4.5,69de169f-6eed-4e4d-ae5b-ff997b8c889f,user89,2019-12-01 00:47:11,GET,/product_list,200,2107
1,4.5.4.5,69de169f-6eed-4e4d-ae5b-ff997b8c889f,user89,2019-12-01 00:51:21,GET,/product_detail,200,1323
2,3.3.3.3.,3d46aad9-17eb-4af1-bc54-6ca91d7f8f6c,user2,2019-12-01 00:51:43,GET,/product_list,200,2616
3,1.0.1.0,57623182-b78b-4bdc-b977-a2b34612c6d1,user45,2019-12-01 01:04:02,GET,/product_list,200,2303
4,3.3.3.3.,3d46aad9-17eb-4af1-bc54-6ca91d7f8f6c,user2,2019-12-01 01:12:28,GET,/product_detail,200,1830


In [6]:
# Re-check the dtypes: `datetime` should now be datetime64[ns] rather than object.
logs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1290 entries, 0 to 1289
Data columns (total 8 columns):
ip            1290 non-null object
session_id    1290 non-null object
user_id       1290 non-null object
datetime      1290 non-null datetime64[ns]
request       1290 non-null object
page          1290 non-null object
status        1290 non-null int64
byte_size     1290 non-null int64
dtypes: datetime64[ns](1), int64(2), object(5)
memory usage: 80.8+ KB


### From which pages do customers exit?
- If we know which page customers leave from, we can analyze that page and guide customers toward the purchase page
- Exits are most likely when the next page has a higher barrier (entering credit card details, personal information, etc.)

In [7]:
# Peek at the first rows again before starting the exit-page analysis.
logs.head()

Unnamed: 0,ip,session_id,user_id,datetime,request,page,status,byte_size
0,4.5.4.5,69de169f-6eed-4e4d-ae5b-ff997b8c889f,user89,2019-12-01 00:47:11,GET,/product_list,200,2107
1,4.5.4.5,69de169f-6eed-4e4d-ae5b-ff997b8c889f,user89,2019-12-01 00:51:21,GET,/product_detail,200,1323
2,3.3.3.3.,3d46aad9-17eb-4af1-bc54-6ca91d7f8f6c,user2,2019-12-01 00:51:43,GET,/product_list,200,2616
3,1.0.1.0,57623182-b78b-4bdc-b977-a2b34612c6d1,user45,2019-12-01 01:04:02,GET,/product_list,200,2303
4,3.3.3.3.,3d46aad9-17eb-4af1-bc54-6ca91d7f8f6c,user2,2019-12-01 01:12:28,GET,/product_detail,200,1830


In [8]:
# List the distinct page paths that appear in the log; these are the
# candidate funnel steps.
logs['page'].unique()

array(['/product_list', '/product_detail', '/cart', '/checkout',
       '/order_complete'], dtype=object)

#### Transform to the format: session_id, product_list, product_detail, cart, checkout, order_complete



### Funnel analysis: create a DataFrame
- lists the pages in the order they appear in the funnel

In [9]:
# Ordered funnel: each page path paired with its 1-based position in the
# purchase flow.
funnel_dict = dict([
    ('/product_list', 1),
    ('/product_detail', 2),
    ('/cart', 3),
    ('/checkout', 4),
    ('/order_complete', 5),
])

# One row per page (index = page path) with a single `step_no` column.
funnel_steps = pd.DataFrame(
    {'step_no': list(funnel_dict.values())},
    index=list(funnel_dict.keys()),
)
funnel_steps

Unnamed: 0,step_no
/product_list,1
/product_detail,2
/cart,3
/checkout,4
/order_complete,5


#### session, page grouping
- A user who logs in with a different session must be treated as a separate visit, so we group by session_id rather than user_id
- For each (session_id, page) pair, keep the earliest visit time

In [26]:
# Earliest timestamp at which each session reached each page, i.e. the minimum
# `datetime` per (session_id, page) pair.
# `.min()` replaces `.agg(np.min)`: newer pandas emits a FutureWarning when a
# NumPy reducer is passed to agg and recommends the built-in reduction.
first_visits = logs.groupby(['session_id', 'page'])['datetime'].min()

# Attach each page's funnel position. 'page' (an index level of the left frame)
# is matched against the page paths stored in funnel_steps' index.
grouped = pd.DataFrame(first_visits).merge(funnel_steps, left_on='page', right_index=True)

grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,datetime,step_no
session_id,page,Unnamed: 2_level_1,Unnamed: 3_level_1
000d99d8-d2d4-4e9a-bb06-69b1ae6442d9,/product_detail,2019-12-01 12:06:39,2
0155049d-32e7-44de-9b0d-4c02f63d6099,/product_detail,2019-12-04 00:22:44,2
020d4536-1341-4de1-87d3-e22ba8611af6,/product_detail,2019-12-19 06:25:48,2
0381411a-78d8-4c27-9622-3210b7ed62d6,/product_detail,2019-12-05 05:09:32,2
06268108-6228-4237-ac1d-7927dd44273d,/product_detail,2019-12-11 04:17:31,2
