In [1]:
from pyspark.sql import SparkSession
from datetime import datetime
from pyspark.sql import Row

spark = SparkSession.builder \
    .appName("Web Traffic Analysis") \
    .getOrCreate()
spark

web_data = [
Row(UserID=1, Page="Home", Timestamp="2024-04-10 10:00:00", Duration=35,
Device="Mobile", Country="India"),
Row(UserID=2, Page="Products", Timestamp="2024-04-10 10:02:00", Duration=120,
Device="Desktop", Country="USA"),
Row(UserID=3, Page="Cart", Timestamp="2024-04-10 10:05:00", Duration=45,
Device="Tablet", Country="UK"),
Row(UserID=1, Page="Checkout", Timestamp="2024-04-10 10:08:00", Duration=60,
Device="Mobile", Country="India"),
Row(UserID=4, Page="Home", Timestamp="2024-04-10 10:10:00", Duration=15,
Device="Mobile", Country="Canada"),
Row(UserID=2, Page="Contact", Timestamp="2024-04-10 10:15:00", Duration=25,
Device="Desktop", Country="USA"),
Row(UserID=5, Page="Products", Timestamp="2024-04-10 10:20:00", Duration=90,
Device="Desktop", Country="India"),
]
df_web = spark.createDataFrame(web_data)
df_web.show(truncate=False)

+------+--------+-------------------+--------+-------+-------+
|UserID|Page    |Timestamp          |Duration|Device |Country|
+------+--------+-------------------+--------+-------+-------+
|1     |Home    |2024-04-10 10:00:00|35      |Mobile |India  |
|2     |Products|2024-04-10 10:02:00|120     |Desktop|USA    |
|3     |Cart    |2024-04-10 10:05:00|45      |Tablet |UK     |
|1     |Checkout|2024-04-10 10:08:00|60      |Mobile |India  |
|4     |Home    |2024-04-10 10:10:00|15      |Mobile |Canada |
|2     |Contact |2024-04-10 10:15:00|25      |Desktop|USA    |
|5     |Products|2024-04-10 10:20:00|90      |Desktop|India  |
+------+--------+-------------------+--------+-------+-------+



# **Data Exploration & Preparation**

In [4]:
#1. Display the schema
df_web.printSchema()

#2. Convert Timestamp column
from pyspark.sql import Row
from pyspark.sql.functions import col, to_timestamp, minute
df_web = df_web.withColumn("Timestamp", to_timestamp(col("Timestamp")))
df_web.printSchema()

#3. Add SessionMinute column
df_web = df_web.withColumn("SessionMinute", minute(col("Timestamp")))
df_web.show()

root
 |-- UserID: long (nullable = true)
 |-- Page: string (nullable = true)
 |-- Timestamp: timestamp (nullable = true)
 |-- Duration: long (nullable = true)
 |-- Device: string (nullable = true)
 |-- Country: string (nullable = true)

root
 |-- UserID: long (nullable = true)
 |-- Page: string (nullable = true)
 |-- Timestamp: timestamp (nullable = true)
 |-- Duration: long (nullable = true)
 |-- Device: string (nullable = true)
 |-- Country: string (nullable = true)

+------+--------+-------------------+--------+-------+-------+-------------+
|UserID|    Page|          Timestamp|Duration| Device|Country|SessionMinute|
+------+--------+-------------------+--------+-------+-------+-------------+
|     1|    Home|2024-04-10 10:00:00|      35| Mobile|  India|            0|
|     2|Products|2024-04-10 10:02:00|     120|Desktop|    USA|            2|
|     3|    Cart|2024-04-10 10:05:00|      45| Tablet|     UK|            5|
|     1|Checkout|2024-04-10 10:08:00|      60| Mobile|  India|  

# **Filtering and Conditions**

In [5]:
#4. Mobile users who visited Checkout
mobile_checkout = df_web.filter((col("Device") == "Mobile") & (col("Page") == "Checkout"))
mobile_checkout.show()

#5. Entries with Duration > 60 seconds
long_sessions = df_web.filter(col("Duration") > 60)
long_sessions.show()

#6. Indian users who visited Products page
india_products = df_web.filter((col("Country") == "India") & (col("Page") == "Products"))
india_products.show()

+------+--------+-------------------+--------+------+-------+-------------+
|UserID|    Page|          Timestamp|Duration|Device|Country|SessionMinute|
+------+--------+-------------------+--------+------+-------+-------------+
|     1|Checkout|2024-04-10 10:08:00|      60|Mobile|  India|            8|
+------+--------+-------------------+--------+------+-------+-------------+

+------+--------+-------------------+--------+-------+-------+-------------+
|UserID|    Page|          Timestamp|Duration| Device|Country|SessionMinute|
+------+--------+-------------------+--------+-------+-------+-------------+
|     2|Products|2024-04-10 10:02:00|     120|Desktop|    USA|            2|
|     5|Products|2024-04-10 10:20:00|      90|Desktop|  India|           20|
+------+--------+-------------------+--------+-------+-------+-------------+

+------+--------+-------------------+--------+-------+-------+-------------+
|UserID|    Page|          Timestamp|Duration| Device|Country|SessionMinute|
+-

# **Aggregation and Grouping**

In [6]:
from pyspark.sql.functions import avg, count, desc
#7. Average duration per device type
avg_duration = df_web.groupBy("Device").agg(avg("Duration").alias("AvgDuration"))
avg_duration.show()

#8. Sessions count per country
sessions_per_country = df_web.groupBy("Country").agg(count("*").alias("SessionCount"))
sessions_per_country.show()

#9. Most visited page overall
most_visited = df_web.groupBy("Page").agg(count("*").alias("VisitCount")) \
    .orderBy(desc("VisitCount")).limit(1)
most_visited.show()

+-------+------------------+
| Device|       AvgDuration|
+-------+------------------+
| Mobile|36.666666666666664|
| Tablet|              45.0|
|Desktop| 78.33333333333333|
+-------+------------------+

+-------+------------+
|Country|SessionCount|
+-------+------------+
|  India|           3|
|    USA|           2|
|     UK|           1|
| Canada|           1|
+-------+------------+

+----+----------+
|Page|VisitCount|
+----+----------+
|Home|         2|
+----+----------+



# **Window Functions**

In [8]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number,sum
#10. Rank each user’s pages by timestamp (oldest to newest).
window_spec = Window.partitionBy("UserID").orderBy("Timestamp")
ranked_pages = df_web.withColumn("PageRank", row_number().over(window_spec))
ranked_pages.show()

#11. Find the total duration of all sessions per user using groupBy .
user_duration = df_web.groupBy("UserID").agg(sum("Duration").alias("TotalDuration"))
user_duration.show()

+------+--------+-------------------+--------+-------+-------+-------------+--------+
|UserID|    Page|          Timestamp|Duration| Device|Country|SessionMinute|PageRank|
+------+--------+-------------------+--------+-------+-------+-------------+--------+
|     1|    Home|2024-04-10 10:00:00|      35| Mobile|  India|            0|       1|
|     1|Checkout|2024-04-10 10:08:00|      60| Mobile|  India|            8|       2|
|     2|Products|2024-04-10 10:02:00|     120|Desktop|    USA|            2|       1|
|     2| Contact|2024-04-10 10:15:00|      25|Desktop|    USA|           15|       2|
|     3|    Cart|2024-04-10 10:05:00|      45| Tablet|     UK|            5|       1|
|     4|    Home|2024-04-10 10:10:00|      15| Mobile| Canada|           10|       1|
|     5|Products|2024-04-10 10:20:00|      90|Desktop|  India|           20|       1|
+------+--------+-------------------+--------+-------+-------+-------------+--------+

+------+-------------+
|UserID|TotalDuration|
+------

# **Spark SQL Tasks**

In [9]:
#12. Create temporary view
df_web.createOrReplaceTempView("traffic_view")

#13. Top 2 longest sessions
spark.sql("select * from traffic_view order by Duration desc limit 2").show()

#14. Unique users per page
spark.sql("select Page, count(DISTINCT UserID) as UniqueUsers from traffic_view group by Page").show()

+------+--------+-------------------+--------+-------+-------+-------------+
|UserID|    Page|          Timestamp|Duration| Device|Country|SessionMinute|
+------+--------+-------------------+--------+-------+-------+-------------+
|     2|Products|2024-04-10 10:02:00|     120|Desktop|    USA|            2|
|     5|Products|2024-04-10 10:20:00|      90|Desktop|  India|           20|
+------+--------+-------------------+--------+-------+-------+-------------+

+--------+-----------+
|    Page|UniqueUsers|
+--------+-----------+
|    Cart|          1|
|    Home|          2|
|Checkout|          1|
|Products|          2|
| Contact|          1|
+--------+-----------+



**Export & Save**

In [10]:
#15. Save the final DataFrame to CSV.
df_web.write.csv("web_traffic_data.csv", header=True)

#16. Save partitioned by Country in Parquet format.
df_web.write.partitionBy("Country").parquet("web_traffic_partitioned.parquet")