# Funnel Analysis

In [1]:
# set-up stage

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

### Corporate Styling

In [2]:
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

print("Everything is fine")

Everything is fine


### Loading Dataset

In [3]:
df = pd.read_csv("funnel_dataset.csv")
df

Unnamed: 0,User_ID,Session_ID,Event,Timestamp,Device,Region,Channel,Product_Category,Revenue,Bounce_Flag
0,USR00001,SES00001,Browse,28-10-2025 07:33,Desktop,West,Organic,Home,0.00,Yes
1,USR00001,SES00001,Add to Cart,28-10-2025 07:36,Tablet,East,Social Media,Beauty,0.00,Yes
2,USR00001,SES00001,Checkout,28-10-2025 07:40,Mobile,West,Email,Beauty,0.00,Yes
3,USR00002,SES00002,Browse,19-10-2025 09:15,Desktop,East,Email,Electronics,0.00,No
4,USR00002,SES00002,Add to Cart,19-10-2025 09:18,Mobile,West,Social Media,Fashion,0.00,No
...,...,...,...,...,...,...,...,...,...,...
21658,USR09999,SES09999,Browse,03-10-2025 17:22,Desktop,South,Google Ads,Electronics,0.00,No
21659,USR09999,SES09999,Add to Cart,03-10-2025 17:26,Tablet,West,Social Media,Fashion,0.00,No
21660,USR09999,SES09999,Checkout,03-10-2025 17:29,Mobile,North,Email,Fashion,0.00,No
21661,USR09999,SES09999,Purchase,03-10-2025 17:34,Mobile,East,Email,Fashion,960.65,No


### Basic Steps

In [4]:
print("Top 5 rows\n")
df.head()

Top 5 rows



Unnamed: 0,User_ID,Session_ID,Event,Timestamp,Device,Region,Channel,Product_Category,Revenue,Bounce_Flag
0,USR00001,SES00001,Browse,28-10-2025 07:33,Desktop,West,Organic,Home,0.0,Yes
1,USR00001,SES00001,Add to Cart,28-10-2025 07:36,Tablet,East,Social Media,Beauty,0.0,Yes
2,USR00001,SES00001,Checkout,28-10-2025 07:40,Mobile,West,Email,Beauty,0.0,Yes
3,USR00002,SES00002,Browse,19-10-2025 09:15,Desktop,East,Email,Electronics,0.0,No
4,USR00002,SES00002,Add to Cart,19-10-2025 09:18,Mobile,West,Social Media,Fashion,0.0,No


In [5]:
print("last 5 rows\n")
df.tail()

last 5 rows



Unnamed: 0,User_ID,Session_ID,Event,Timestamp,Device,Region,Channel,Product_Category,Revenue,Bounce_Flag
21658,USR09999,SES09999,Browse,03-10-2025 17:22,Desktop,South,Google Ads,Electronics,0.0,No
21659,USR09999,SES09999,Add to Cart,03-10-2025 17:26,Tablet,West,Social Media,Fashion,0.0,No
21660,USR09999,SES09999,Checkout,03-10-2025 17:29,Mobile,North,Email,Fashion,0.0,No
21661,USR09999,SES09999,Purchase,03-10-2025 17:34,Mobile,East,Email,Fashion,960.65,No
21662,USR10000,SES10000,Browse,15-10-2025 20:32,Mobile,South,Google Ads,Beauty,0.0,Yes


In [6]:
print("Randomly selected row\n")
df.sample()

Randomly selected row



Unnamed: 0,User_ID,Session_ID,Event,Timestamp,Device,Region,Channel,Product_Category,Revenue,Bounce_Flag
21636,USR09987,SES09987,Browse,20-10-2025 01:44,Tablet,East,Organic,Sports,0.0,Yes


In [7]:
# All column names, total number of columns

print(df.columns)
print(f"\nTotal number of columns: {df.columns.nunique()}")

Index(['User_ID', 'Session_ID', 'Event', 'Timestamp', 'Device', 'Region',
       'Channel', 'Product_Category', 'Revenue', 'Bounce_Flag'],
      dtype='str')

Total number of columns: 10


In [8]:
# type of dataset

type(df)

pandas.DataFrame

In [9]:
# data type of all columns

df.dtypes

User_ID                 str
Session_ID              str
Event                   str
Timestamp               str
Device                  str
Region                  str
Channel                 str
Product_Category        str
Revenue             float64
Bounce_Flag             str
dtype: object

In [10]:
# timestamp column must be in date time format

df['Timestamp'] = pd.to_datetime(df['Timestamp'])
print(df.dtypes)

User_ID                        str
Session_ID                     str
Event                          str
Timestamp           datetime64[us]
Device                         str
Region                         str
Channel                        str
Product_Category               str
Revenue                    float64
Bounce_Flag                    str
dtype: object


  df['Timestamp'] = pd.to_datetime(df['Timestamp'])


In [11]:
# info about all the columns

df.info()

<class 'pandas.DataFrame'>
RangeIndex: 21663 entries, 0 to 21662
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   User_ID           21663 non-null  str           
 1   Session_ID        21663 non-null  str           
 2   Event             21663 non-null  str           
 3   Timestamp         21663 non-null  datetime64[us]
 4   Device            21663 non-null  str           
 5   Region            21663 non-null  str           
 6   Channel           21663 non-null  str           
 7   Product_Category  21663 non-null  str           
 8   Revenue           21663 non-null  float64       
 9   Bounce_Flag       21663 non-null  str           
dtypes: datetime64[us](1), float64(1), str(8)
memory usage: 1.7 MB


In [12]:
# imp information about dataset's numeric columns

df.describe()

Unnamed: 0,Timestamp,Revenue
count,21663,21663.0
mean,2025-10-16 19:55:53.300096,54.304841
min,2025-10-01 19:30:00,0.0
25%,2025-10-09 08:31:00,0.0
50%,2025-10-16 19:47:00,0.0
75%,2025-10-24 08:12:30,0.0
max,2025-10-31 19:25:00,1998.51
std,,262.692471


In [13]:
# imp info about all the columns

df.describe(include='object')

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  df.describe(include='object')


Unnamed: 0,User_ID,Session_ID,Event,Device,Region,Channel,Product_Category,Bounce_Flag
count,21663,21663,21663,21663,21663,21663,21663,21663
unique,10000,10000,4,3,4,4,5,2
top,USR00002,SES00002,Browse,Tablet,West,Google Ads,Electronics,Yes
freq,4,4,10000,7237,5462,5435,4405,17343


In [14]:
# shape of dataset

df.shape

(21663, 10)

In [15]:
pd.set_option("display.max_rows", None)

### Data Pre-processing and Cleaning

In [16]:
# check for null and duplicate values

print("\n---Null Values---\n")
null_values = df.isnull().sum()
print(null_values)

print("\n---Duplicate Values---\n")
duplicate_values = df.duplicated().sum()
print(f"Total numbers of duplicate values in this dataset is {duplicate_values}")

print("\n---Unique Data---\n")
unique_data = df.nunique()
print(unique_data)


---Null Values---

User_ID             0
Session_ID          0
Event               0
Timestamp           0
Device              0
Region              0
Channel             0
Product_Category    0
Revenue             0
Bounce_Flag         0
dtype: int64

---Duplicate Values---



Total numbers of duplicate values in this dataset is 0

---Unique Data---

User_ID             10000
Session_ID          10000
Event                   4
Timestamp           16989
Device                  3
Region                  4
Channel                 4
Product_Category        5
Revenue              1078
Bounce_Flag             2
dtype: int64


In [17]:
# fixing the data and extracting time based data
# Creating multiple values for deep analysis

df['Date'] = df['Timestamp'].dt.date
df['DayOfWeek'] = df['Timestamp'].dt.day_name()
df['Hour'] = df['Timestamp'].dt.hour
df['WeekNumber'] = df['Timestamp'].dt.isocalendar().week