In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Obtain the notebook's Spark session (reusing an already-running one if
# present), running locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("SparkSQL_Review")
    .getOrCreate()
)

In [40]:
# Column layout of the tab-separated access log, in file order.
# Every field keeps the StructField default of being nullable.
log_columns = [
    ("host", StringType()),
    ("logname", StringType()),
    ("time", LongType()),
    ("method", StringType()),
    ("url", StringType()),
    ("response", StringType()),
    ("bytes", LongType()),
    ("referer", StringType()),
    ("useragent", StringType()),
]
schema = StructType([StructField(name, dtype) for name, dtype in log_columns])

# Read the log with the explicit schema above instead of inferring types;
# the first line of the file is treated as a header row.
log_reader = (
    spark.read
    .option("delimiter", "\t")
    .option("header", "true")
    .schema(schema)
)
nasa_log = log_reader.csv("nasa_19950801.tsv")

# Number of requests and mean payload size for each HTTP response code.
request_count = count("*").alias("count")
mean_bytes = avg("bytes").alias("avg_bytes")
df = nasa_log.groupBy("response").agg(request_count, mean_bytes)
df.show()

+--------+-----+------------------+
|response|count|         avg_bytes|
+--------+-----+------------------+
|     200|27972|17230.604247104246|
|     302|  355| 73.25352112676056|
|     404|  221|               0.0|
|     304| 2421|               0.0|
+--------+-----+------------------+



In [68]:
# Lookup table of HTTP response classes. The join and grouping below rely on
# it having the columns `responsecode` (e.g. "2xx") and `responsedesc`.
http_response = (
    spark.read
    .option("delimiter", ",")
    .option("header", "true")
    .csv("http_response.csv")
)

# Collapse each concrete status code to its class by replacing the last two
# digits with "xx" (200 -> 2xx). The pattern is a raw string because "\d" in
# a normal string literal is an invalid escape sequence that Python warns
# about.
response_class = regexp_replace("response", r"\d{2}$", "xx")

# Left join from the lookup table so that classes with no matching log rows
# still appear in the result; count("response") only counts non-null values,
# so those classes report 0 and a null average.
df = (
    http_response
    .join(nasa_log, col("responsecode") == response_class, "left")
    .groupBy("responsecode", "responsedesc")
    .agg(count("response").alias("count"), avg("bytes").alias("avg_bytes"))
    .orderBy("responsecode")
)
df.show()

+------------+------------+-----+------------------+
|responsecode|responsedesc|count|         avg_bytes|
+------------+------------+-----+------------------+
|         1xx|    informal|    0|              null|
|         2xx|     success|27972|17230.604247104246|
|         3xx| redirection| 2776| 9.367795389048991|
|         4xx|client error|  221|               0.0|
|         5xx|server error|    0|              null|
+------------+------------+-----+------------------+



In [64]:
# Express the per-class average payload in kilobytes (1 KB = 1024 bytes).
# Rows whose avg_bytes is null stay null after the division.
avg_kilobytes_col = col("avg_bytes") / 1024
df = df.withColumn("avg_kilobytes", avg_kilobytes_col)
df.show()

+------------+------------+-----+------------------+--------------------+
|responsecode|responsedesc|count|         avg_bytes|       avg_kilobytes|
+------------+------------+-----+------------------+--------------------+
|         1xx|    informal|    0|              null|                null|
|         2xx|     success|27972|17230.604247104246|   16.82676196006274|
|         3xx| redirection| 2776| 9.367795389048991|0.009148237684618156|
|         4xx|client error|  221|               0.0|                 0.0|
|         5xx|server error|    0|              null|                null|
+------------+------------+-----+------------------+--------------------+

