# Case Study EDA

### first steps

create dataframe, standardize columns and values


In [5]:
import pandas as pd
import datetime as dt

df = pd.read_csv('website_traffic_data.csv')
df.columns = [c.lower().strip().replace(' ','_').replace('-','_') for c in df.columns]
df['date'] = pd.to_datetime(df['date'], errors='coerce')
for col in ['bounce_rate','conversion_rate']:
    if df[col].dropna().between(1,100).mean() > 0.9: df[col] = df[col]/100

print(df.head())
print(df.columns)
print(df.dtypes)
print(df.isna().sum())

        date  sessions  bounce_rate  conversion_rate traffic_source  \
0 2025-04-01     246.0     0.564769         0.080461    Paid Search   
1 2025-04-02     474.0     0.657921         0.065349       Referral   
2 2025-04-03     524.0     0.453658         0.040685   Social Media   
3 2025-04-04     487.0     0.327508         0.038754    Paid Search   
4 2025-04-05     507.0     0.409198         0.021754       Referral   

      campaign  
0  Spring Sale  
1  Spring Sale  
2  Spring Sale  
3  Spring Sale  
4  Spring Sale  
Index(['date', 'sessions', 'bounce_rate', 'conversion_rate', 'traffic_source',
       'campaign'],
      dtype='object')
date               datetime64[ns]
sessions                  float64
bounce_rate               float64
conversion_rate           float64
traffic_source             object
campaign                   object
dtype: object
date               14
sessions           14
bounce_rate        14
conversion_rate    14
traffic_source     14
campaign           14


### feature engineering

In [6]:
df['conversions'] = (pd.to_numeric(df['sessions'], errors='coerce') *
                     pd.to_numeric(df['conversion_rate'], errors='coerce'))
df['week'] = df['date'].dt.to_period('W').apply(lambda p: p.start_time)
df['month'] = df['date'].dt.to_period('M').astype(str)
df['campaign_flag'] = (~df['campaign'].isna()) & (df['campaign'].str.lower().str.strip().ne("(none)")) & (df['campaign'].str.strip().ne(""))
df['is_weekend'] = df['date'].dt.weekday >= 5

AttributeError: 'NaTType' object has no attribute 'start_time'

### get basic metrics to keep track of
calculate rolling means and sums

In [None]:
daily = (df.dropna(subset=['date'])
        .groupby('date', as_index=False)
        .agg(session=('sessions', 'sum'),
        conversions=('conversions', 'sum'),
        bounce_rate=('bounce_rate', 'mean'),
        conversion_rate=('conversion_rate', 'mean'))
        .sort_values('date'))

daily['session_7d'] = daily['sessions'].rolling(7, min_periods=3).mean()
daily['bounce_rate_7d'] = daily['bounce_rate'].rolling(7, min_periods=3).mean()
daily['conversion_rate_7d'] = daily['conversion_rate'].rolling(7, min_periods=3).mean()

daily.head()

### source scorecard

In [None]:
by_src = (df.groupby('traffic_source', dropna=False)
            .agg(sessions=("sessions", "sum"),
                 conversions=("conversions", "sum"),
                 bounce_rate=("bounce_rate", "mean"),
                 conversion_rate=("conversion_rate", "mean"))
            .reset_index())

by_scr['traffic_share'] = by_src['sessions'] / by_src['sessions'].sum()
by_src.sort_values(['conversion_rate', 'sessions'], ascending=[False, False], inplace=True)
by_src

### campaign scorecard

In [None]:
by_campaign = (df.groupby('campaign', dropna=False)
            .agg(sessions=("sessions", "sum"),
                 conversions=("conversions", "sum"),
                 bounce_rate=("bounce_rate", "mean"),
                 conversion_rate=("conversion_rate", "mean"))
            .reset_index()
            .sort_values(['conversions', 'conversion_rate'], ascending=[False, False]))

by_campaign