In [1]:
import pandas as pd
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
# Make "plotly_white" the default template for the figures created below.
pio.templates.default = "plotly_white"

In [3]:
# Read the bounce-rate export and preview its first rows.
csvPath = "bounce-rate.csv"
data = pd.read_csv(csvPath)
print(data.head())

      Client ID  Sessions Avg. Session Duration Bounce Rate
0  5.778476e+08       367              00:01:35      87.19%
1  1.583822e+09       260              00:01:04      29.62%
2  1.030699e+09       237              00:00:02      99.16%
3  1.025030e+09       226              00:02:22      25.66%
4  1.469968e+09       216              00:01:23      46.76%


In [4]:
# Count missing values per column (isna is the canonical name for isnull).
missingPerColumn = data.isna().sum()
print(missingPerColumn)

Client ID                0
Sessions                 0
Avg. Session Duration    0
Bounce Rate              0
dtype: int64


In [5]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appends a stray "None" line.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Client ID              999 non-null    float64
 1   Sessions               999 non-null    int64  
 2   Avg. Session Duration  999 non-null    object 
 3   Bounce Rate            999 non-null    object 
dtypes: float64(1), int64(1), object(2)
memory usage: 31.3+ KB
None


In [6]:
# Convert the "HH:MM:SS" duration strings to minutes (float).
# The strings are parsed whole: slicing off the first character (as was done
# before) would silently turn e.g. "10:00:00" into "0:00:00".
sessionDuration = pd.to_timedelta(data['Avg. Session Duration'])
data['Avg. Session Duration'] = sessionDuration / pd.Timedelta(minutes=1)
# Strip the trailing percent sign so "87.19%" becomes the float 87.19.
data['Bounce Rate'] = data['Bounce Rate'].str.rstrip('%').astype('float')
print(data)

        Client ID  Sessions  Avg. Session Duration  Bounce Rate
0    5.778476e+08       367               1.583333        87.19
1    1.583822e+09       260               1.066667        29.62
2    1.030699e+09       237               0.033333        99.16
3    1.025030e+09       226               2.366667        25.66
4    1.469968e+09       216               1.383333        46.76
..            ...       ...                    ...          ...
994  1.049263e+09        17               7.733333        41.18
995  1.145806e+09        17               5.616667        47.06
996  1.153811e+09        17               0.200000        94.12
997  1.182133e+09        17               1.216667        88.24
998  1.184187e+09        17               2.566667        64.71

[999 rows x 4 columns]


In [7]:
print(data.describe())

          Client ID    Sessions  Avg. Session Duration  Bounce Rate
count  9.990000e+02  999.000000             999.000000   999.000000
mean   1.036401e+09   32.259259               3.636520    65.307978
std    6.151503e+08   24.658588               4.040562    22.997270
min    1.849182e+05   17.000000               0.000000     4.880000
25%    4.801824e+08   21.000000               0.891667    47.370000
50%    1.029507e+09   25.000000               2.466667    66.670000
75%    1.587982e+09   35.000000               4.816667    85.190000
max    2.063338e+09  367.000000              30.666667   100.000000


In [8]:
# Work on a copy without the identifier column; `data` itself is left intact.
dataWithoutCid = data.drop(columns=['Client ID'])

In [20]:
# Pairwise correlations of the remaining columns, drawn as an annotated heatmap.
correlationMatrix = dataWithoutCid.corr()
correlationFig = px.imshow(
    correlationMatrix,
    text_auto=True,
    color_continuous_scale='Temps',
)
correlationFig.update_layout(title='Correlation Matrix')
correlationFig.show()

In [27]:
# Segment thresholds (percent): below `lowBounceRate` is Low, from
# `highBounceRate` upwards is High, everything in between is Medium.
highBounceRate = 70
lowBounceRate = 30
# Bins are left-closed (right=False): [0, low), [low, high), [high, inf).
# The last edge is open-ended on purpose: with an upper edge of 100 a bounce
# rate of exactly 100 would fall outside every bin and become NaN.
data['Bounce Rate Segment'] = pd.cut(data['Bounce Rate'],
                                     bins=[0, lowBounceRate,
                                           highBounceRate, float('inf')],
                                     labels=['Low', 'Medium', 'High'], right=False)
# sort_index() orders the counts by category order (Low, Medium, High).
segmentCounts = data['Bounce Rate Segment'].value_counts().sort_index()
segmentFig = px.bar(segmentCounts, labels={'index': 'Bounce Rate Segment',
                                             'value': 'Number of Clients'},
                     title='Segmentation of Clients based on Bounce Rates',
                     color = 'value', color_continuous_scale = 'Temps',text_auto = True)
segmentFig.update_layout(title_text='Segmentation of Clients based on Bounce Rates', title_x=0.5)
segmentFig.show()

In [35]:
# Mean session duration (minutes) per bounce-rate segment.
# observed=False is stated explicitly: it keeps every category of the
# categorical key in the result (even one with no rows), so the positional
# colour list below always lines up with the segments, and it avoids the
# pandas FutureWarning about the changing default.
segmentAvgDuration = data.groupby('Bounce Rate Segment', observed=False)['Avg. Session Duration'].mean()
engagementFig = go.Figure(data=go.Bar(
    x=segmentAvgDuration.index,
    y=segmentAvgDuration,
    text=segmentAvgDuration.round(2),
    textposition='auto',
    # One colour per bar, in the order of the grouped index.
    marker=dict(color=['#7FD4C1', '#F7C0BB', '#FF4136'],
    )
))
engagementFig.update_layout(
    title='Comparison of User Engagement by Bounce Rate Segment',title_x=0.5,
    xaxis=dict(title='Bounce Rate Segment'),
    yaxis=dict(title='Average Session Duration (minutes)'),
)

engagementFig.show()

In [36]:
# Total time per client = average session length (minutes) x number of sessions.
data['Total Session Duration'] = data['Avg. Session Duration'] * data['Sessions']
# Rank clients by total time, longest first, and show the top ten.
dataSorted = data.sort_values(by='Total Session Duration', ascending=False)
dataSorted.head(10)

Unnamed: 0,Client ID,Sessions,Avg. Session Duration,Bounce Rate,Bounce Rate Segment,Total Session Duration
20,1884620000.0,93,30.666667,16.13,Low,2852.0
54,1041722000.0,67,20.5,22.39,Low,1373.5
262,875655700.0,34,29.966667,26.47,Low,1018.866667
10,1461865000.0,117,8.45,48.72,Medium,988.65
173,184918.2,40,24.416667,17.5,Low,976.666667
15,1049234000.0,99,9.716667,34.34,Medium,961.95
310,2026953000.0,31,22.116667,35.48,Medium,685.616667
24,1903206000.0,90,7.016667,36.67,Medium,631.5
211,2054569000.0,37,16.25,35.14,Medium,601.25
402,622093500.0,28,21.3,39.29,Medium,596.4


In [39]:
# Scatter of bounce rate against average session duration with an OLS trendline.
scatterFig = px.scatter(
    data,
    x='Bounce Rate',
    y='Avg. Session Duration',
    title='Relationship between Bounce Rate and Avg. Session Duration',
    trendline='ols',
)

axisTitles = {
    'xaxis': {'title': 'Bounce Rate'},
    'yaxis': {'title': 'Avg. Session Duration'},
}
scatterFig.update_layout(**axisTitles)

scatterFig.show()

In [42]:
def getRetentionSegment(row, threshold=32):
    """Label a client as a frequent or occasional user by session count.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a 'Sessions' entry holding the client's session count.
    threshold : int or float, default 32
        Minimum number of sessions (inclusive) for a client to count as
        frequent. The default keeps the notebook's original cut-off
        (presumably chosen near the mean session count of the dataset).

    Returns
    -------
    str
        'Frequent Users' when ``row['Sessions'] >= threshold``, otherwise
        'Occasional Users'.
    """
    if row['Sessions'] >= threshold:
        return 'Frequent Users'
    return 'Occasional Users'

# Label every client row-by-row, then attach the labels as a new column.
retentionLabels = data.apply(getRetentionSegment, axis='columns')
data['Retention Segment'] = retentionLabels
print(data)

        Client ID  Sessions  Avg. Session Duration  Bounce Rate  \
0    5.778476e+08       367               1.583333        87.19   
1    1.583822e+09       260               1.066667        29.62   
2    1.030699e+09       237               0.033333        99.16   
3    1.025030e+09       226               2.366667        25.66   
4    1.469968e+09       216               1.383333        46.76   
..            ...       ...                    ...          ...   
994  1.049263e+09        17               7.733333        41.18   
995  1.145806e+09        17               5.616667        47.06   
996  1.153811e+09        17               0.200000        94.12   
997  1.182133e+09        17               1.216667        88.24   
998  1.184187e+09        17               2.566667        64.71   

    Bounce Rate Segment  Total Session Duration Retention Segment  
0                  High              581.083333    Frequent Users  
1                   Low              277.333333    Frequent

In [53]:
# Average bounce rate for each retention segment, as a two-column frame.
segmentBounceRates = data.groupby('Retention Segment')['Bounce Rate'].mean().reset_index()

barFig = px.bar(segmentBounceRates, x='Retention Segment', y='Bounce Rate',
                 title='Average Bounce Rate by Retention Segment',
                 text_auto = True,
                 labels={'Retention Segment': 'Retention Segment', 'Bounce Rate': 'Average Bounce Rate'})
# The figure is bound to `barFig`; the previous `bar_fig.show()` raised NameError.
barFig.show()

In [54]:
# Number of clients in each retention segment.
segmentCounts = data['Retention Segment'].value_counts()
colors = ['#7FD4C1', '#F7C0BB']

# Pass the series that was just computed; the previous `segment_counts`
# name was never defined and raised NameError.
fig = px.pie(segmentCounts,
             values=segmentCounts.values,
             names=segmentCounts.index,
             color=segmentCounts.index,
             color_discrete_sequence=colors,
             title='User Retention Rate')

# Update layout and show the chart
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(showlegend=False)
fig.show()