In [1]:
#Import modules for Data Exploration
import pandas as pd
from rates import CTR #In src there is a rates.py where I have developed a function to quickly calculate CTR

## Data Exploration

In [3]:
# Load the gzip-compressed event log into a DataFrame.
# Malformed rows are dropped with a warning instead of aborting the load.
# `on_bad_lines='warn'` is the replacement for `error_bad_lines=False` (with the
# default `warn_bad_lines=True`), which was deprecated in pandas 1.3 and removed
# in pandas 2.0 -- the old keyword raises TypeError on current pandas.
df = pd.read_csv('../input/events_log.csv.gz', compression='gzip', on_bad_lines='warn')
df.head()

Unnamed: 0,uuid,timestamp,session_id,group,action,checkin,page_id,n_results,result_position
0,00000736167c507e8ec225bd9e71f9e5,20160300000000.0,78245c2c3fba013a,b,searchResultPage,,cbeb66d1bc1f1bc2,5.0,
1,00000c69fe345268935463abbfa5d5b3,20160310000000.0,c559c3be98dca8a4,a,searchResultPage,,eb658e8722aad674,10.0,
2,00003bfdab715ee59077a3670331b787,20160300000000.0,760bf89817ce4b08,a,checkin,30.0,f99a9fc1f7fdd21e,,
3,0000465cd7c35ad2bdeafec953e08c1a,20160300000000.0,fb905603d31b2071,a,checkin,60.0,e5626962a6939a75,,10.0
4,000050cbb4ef5b42b16c4d2cf69e6358,20160300000000.0,c2bf5e5172a892dc,a,checkin,30.0,787dd6a4c371cbf9,,


In [4]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(400165, 9)

In [5]:
# Per-column non-null counts and dtypes, to spot which columns have missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400165 entries, 0 to 400164
Data columns (total 9 columns):
uuid               400165 non-null object
timestamp          400165 non-null float64
session_id         400165 non-null object
group              400165 non-null object
action             400165 non-null object
checkin            223824 non-null float64
page_id            400165 non-null object
n_results          136234 non-null float64
result_position    230482 non-null float64
dtypes: float64(4), object(5)
memory usage: 27.5+ MB


In [6]:
# Number of logged rows per session_id, most frequent sessions first
df['session_id'].value_counts()

b254341e78af2f1a    484
593638dbb24d903a    305
549c7d24637bc68c    250
4264985570c2b41d    234
b1b9ad0b289027e1    223
                   ... 
8a0c68a4f445af85      1
020777bdfc251cd4      1
f34f50b40807b8f3      1
59d298aee940c999      1
a1bf552b4a1fbcd6      1
Name: session_id, Length: 68028, dtype: int64

In [7]:
# Number of logged rows per page_id, most frequent pages first
df['page_id'].value_counts()

ffeae9d12ad83b25    24
5776ccc6144ff777    20
6ea77e97a39691d2    19
6151bd29e99dae15    19
e6fcfc833df5c687    19
                    ..
82e3ba8415a243a2     1
d4b4421037e41ed8     1
07293eb1d098ed7d     1
72ad6eeccb2f3ee0     1
0d68653f9c2a49dc     1
Name: page_id, Length: 176371, dtype: int64

## CTR calculated by Sessions Tagged as “visitPage”

Before relying on the `CTR` function, check the number of clicks and web visits by hand. The `CTR` function uses the total number of clicks (rows tagged as "visitPage") and the total number
of rows tracked in the dataset.

In [8]:
# A click is any row whose action is "visitPage"; summing the boolean mask counts those rows
n_clicks = (df['action'] == 'visitPage').sum()
print('The total amount of clicks is ', n_clicks)

The total amount of clicks is  40107


In [9]:
# Every row of the log is treated as one web visit, so the visit total is the row count
n_visits = len(df)
print('The total amount of web visits is ', n_visits)

The total amount of web visits is  400165


In [10]:
# Count fully duplicated rows. A result of 0 means every row is a distinct
# record, which is what justifies using the total row count as the number
# of page visits.
df.duplicated().sum()

0

In [11]:
# Compute the overall CTR with the CTR helper imported from rates.py.
# NOTE(review): judging by the counts printed in the previous cells, the helper
# presumably returns 100 * (rows where df['action'] == 'visitPage') / len(df),
# rounded to two decimals -- confirm against rates.py.
ctr = CTR(df,'action', 'visitPage')
print('CTR based on the visitPage tag is up to ', ctr,'%')

CTR based on the visitPage tag is up to  10.02 %


### A and B groups CTR

Calculate the CTR for each group: split the dataset into groups "a" and "b" and apply the same formula as above to each subset, to compare which group performs better.

In [12]:
# Restrict the log to the rows of experiment group 'a' and compute that group's CTR
is_group_a = df['group'].eq('a')
dfa = df.loc[is_group_a]
ctr_a = CTR(dfa, 'action', 'visitPage')
print('CTR for A group is ', ctr_a, '%')

CTR for A group is  11.12 %


In [13]:
# Restrict the log to the rows of experiment group 'b' and compute that group's CTR
is_group_b = df['group'].eq('b')
dfb = df.loc[is_group_b]
ctr_b = CTR(dfb, 'action', 'visitPage')
print('CTR for B group is ', ctr_b, '%')

CTR for B group is  6.76 %
