In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the exported report from the working directory. No header options are
# passed, so pandas uses the first CSV line as the initial column labels.
df = pd.read_csv("data-export (1).csv")

In [None]:
# Preview the first rows of the freshly loaded export.
df.head()

In [None]:
# Discard row 0 and give the frame explicit column names.
# The original cell first promoted row 0 to the header, which suggests row 0
# holds header text rather than data (presumably the export's real column
# titles -- confirm against the CSV). That promotion step was removed because
# its result was overwritten by the explicit list below before being used.
# NOTE: this cell is not idempotent -- every re-run drops one more row, so run
# it exactly once after loading the CSV.
df = df.drop(index=0).reset_index(drop=True)
df.columns = [
    "channel group",
    "DateHour",
    "Users",
    "Sessions",
    "Engaged Sessions",
    "Average engagement time per session",
    "Engaged sessions per user",
    "Events per session",
    "Engagement rate",
    "Event count",
]
df.head()

In [None]:
# Show column dtypes and non-null counts before any type conversion.
df.info()

In [None]:
# Parse "DateHour" as YYYYMMDDHH timestamps. Values that do not match the
# format become NaT instead of raising (errors='coerce').
df["DateHour"] = pd.to_datetime(df["DateHour"], format="%Y%m%d%H", errors='coerce')


In [None]:
# Re-check dtypes and non-null counts after parsing "DateHour".
df.info()

In [None]:
# Preview the rows with the parsed "DateHour" values.
df.head()

In [None]:
# Every column except the channel label and the timestamp is treated as a metric.
numeric_cols = df.columns.drop(["channel group", "DateHour"])

# Convert each metric column to a numeric dtype (unparseable entries become NaN),
# then derive the hour of day (0-23) from the parsed timestamp.
df = (
    df
    .assign(**{col: pd.to_numeric(df[col], errors="coerce") for col in numeric_cols})
    .assign(Hour=lambda frame: frame["DateHour"].dt.hour)
)

## Final Data for analysis after cleaning.

In [None]:
# Preview the cleaned frame, including the derived "Hour" column.
df.head()

In [None]:
# Column dtypes and non-null counts of the cleaned frame.
df.info()

In [None]:
# Summary statistics (count, mean, spread, quantiles) for the cleaned frame.
df.describe()

## What patterns or trends can you observe in website sessions and users over time?

In [None]:
# Apply seaborn's theme with the "whitegrid" style; this affects every figure
# drawn after this cell runs.
sns.set(style="whitegrid")

In [None]:
# Total sessions and users per "DateHour" timestamp, summed over all rows that
# share that timestamp (i.e. across channel groups).
fig, ax = plt.subplots(figsize=(10, 5))
df.groupby("DateHour")[["Sessions", "Users"]].sum().plot(ax=ax)

# Title and axis labels added so the figure stands on its own, matching the
# other charts in this notebook.
ax.set_title("Sessions and Users Over Time")
ax.set_xlabel("DateHour")
ax.set_ylabel("Count")
plt.show()

## Which marketing channel brought the highest number of users to the website,
## and how can we use this insight to improve traffic from other sources?

In [None]:
# One bar per channel group; bar height is the sum of "Users" over that
# channel's rows (estimator=np.sum).
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    data=df,
    x="channel group",
    y="Users",
    estimator=np.sum,
    palette="viridis",
    ax=ax,
)
ax.set_title("Total Users by Channel")
# Tilt the channel names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

## Which channel has the highest average engagement time,
## and what does that tell us about user behaviour and content effectiveness?

In [None]:
# One bar per channel group; bar height is the unweighted mean of the per-row
# "Average engagement time per session" values (estimator=np.mean).
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    data=df,
    x="channel group",
    y="Average engagement time per session",
    estimator=np.mean,
    palette="magma",
    ax=ax,
)
ax.set_title("Average Engagement Time by Channel")
# Tilt the channel names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

## How does engagement rate vary across different traffic channels?

In [None]:
# Preview the frame again before plotting engagement rate by channel.
df.head()

In [None]:
# Box plot of the per-row "Engagement rate" values, one box per channel group.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(
    data=df,
    x="channel group",
    y="Engagement rate",
    palette="coolwarm",
    ax=ax,
)
ax.set_title("Engagement Rate Distribution by Channel")
# Tilt the channel names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

## Which channels are driving more engaged sessions compared to non-engaged ones,
## and what strategies can improve engagement in underperforming channels?

In [None]:
# Per-channel totals of all sessions and engaged sessions; the non-engaged
# count is derived as the difference between the two.
session_df = (
    df.groupby("channel group")[["Sessions", "Engaged Sessions"]]
    .sum()
    .reset_index()
    .assign(**{"Non-Engaged": lambda totals: totals["Sessions"] - totals["Engaged Sessions"]})
)

# Long format: one row per (channel, session type), as needed for a grouped bar chart.
session_df_melted = pd.melt(
    session_df,
    id_vars="channel group",
    value_vars=["Engaged Sessions", "Non-Engaged"],
)

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=session_df_melted, x="channel group", y="value", hue="variable", ax=ax)
ax.set_title("Engaged vs Non-Engaged Sessions")
# Tilt the channel names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

## At what hour of the day does each channel drive the most traffic?

In [None]:
# Total sessions for every (hour of day, channel group) pair. Pairs that never
# occur in the data are filled with 0 so the heatmap has no blank cells.
heatmap_data = (
    df.groupby(["Hour", "channel group"])["Sessions"]
    .sum()
    .unstack()
    .fillna(0)
)

plt.figure(figsize=(12, 6))
sns.heatmap(heatmap_data, cmap="YlGnBu", linewidths=.5, annot=True, fmt='.0f')
# Title corrected: it was previously copied from the engaged/non-engaged bar
# chart and did not describe this figure.
plt.title("Sessions by Hour of Day and Channel Group")
plt.xlabel("Channel Group")
plt.ylabel("Hour of Day")
plt.show()


## Is there any correlation between high traffic (sessions) and high engagement rate over time?

In [None]:
# Per-timestamp averages: the unweighted mean of "Engagement rate" and of
# "Sessions" over all rows sharing the same "DateHour".
df_plot = df.groupby("DateHour")[["Engagement rate", "Sessions"]].mean().reset_index()

# Engagement rate is a ratio while Sessions is a count, so the two series
# presumably differ by orders of magnitude. Sharing one y-axis would flatten
# the smaller series, so each gets its own y-axis on a shared time axis.
fig, rate_ax = plt.subplots(figsize=(10, 5))
sessions_ax = rate_ax.twinx()

rate_line, = rate_ax.plot(
    df_plot["DateHour"], df_plot["Engagement rate"],
    label="Engagement Rate", color="purple",
)
sessions_line, = sessions_ax.plot(
    df_plot["DateHour"], df_plot["Sessions"],
    label="Sessions", color="green",
)

rate_ax.set_title("Engagement Rate vs Session Over Time")
rate_ax.set_xlabel("DateHour")
rate_ax.set_ylabel("Engagement Rate")
sessions_ax.set_ylabel("Sessions")

# Keep a single grid (from the left axis) so the two grids do not overlap.
rate_ax.grid(True)
sessions_ax.grid(False)

# One combined legend, attached to the top-most axes so no line is drawn over it.
sessions_ax.legend(handles=[rate_line, sessions_line])
plt.show()