## CARLOS SÁNCHEZ VEGA

# STRUCTURED STREAMING EXERCISES

<strong>Get the most common URLs in a log file</strong>

We import libraries:

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions as func
from pyspark.sql.functions import col, asc,desc

We create the Spark session:

In [2]:
# Build (or reuse, if one already exists) the SparkSession for this notebook.
spark = (
    SparkSession.builder
    .appName("StructuredStreaming")
    .getOrCreate()
)

In [3]:
# Batch-read the sample access log as an RDD of raw text lines so that a few
# of them can be previewed in the next cell.
lines = spark.sparkContext.textFile(
    "access_log.txt"
)

In [4]:
# Print the first ten raw log lines to see what the format looks like.
for line in lines.take(10):
    print(line)

66.249.75.159 - - [29/Nov/2015:03:50:05 +0000] "GET /robots.txt HTTP/1.1" 200 55 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
66.249.75.168 - - [29/Nov/2015:03:50:06 +0000] "GET /blog/ HTTP/1.1" 200 8083 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
185.71.216.232 - - [29/Nov/2015:03:53:15 +0000] "POST /wp-login.php HTTP/1.1" 200 1691 "http://nohatenews.com/wp-login.php" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0"
54.165.199.171 - - [29/Nov/2015:04:32:27 +0000] "GET /sitemap_index.xml HTTP/1.0" 200 592 "-" "W3 Total Cache/0.9.4.1"
54.165.199.171 - - [29/Nov/2015:04:32:27 +0000] "GET /post-sitemap.xml HTTP/1.0" 200 2502 "-" "W3 Total Cache/0.9.4.1"
54.165.199.171 - - [29/Nov/2015:04:32:27 +0000] "GET /page-sitemap.xml HTTP/1.0" 200 11462 "-" "W3 Total Cache/0.9.4.1"
54.165.199.171 - - [29/Nov/2015:04:32:27 +0000] "GET /category-sitemap.xml HTTP/1.0" 200 585 "-" "W3 Total Cache/0.9.4.1"
54.1

In [5]:
# Regular expressions used to pull individual fields out of each access-log
# line. The sample lines are in the "combined" log format, e.g.:
#   66.249.75.159 - - [29/Nov/2015:03:50:05 +0000] "GET /robots.txt HTTP/1.1" 200 55 "-" "Mozilla/5.0 ..."
# Each pattern exposes the wanted value as capture group 1 (generalExp uses
# groups 1-3 for method, endpoint and protocol).

# Response size: the number that follows the request's closing quote and the
# 3-digit status. Anchoring on end-of-line would never match these lines,
# because they end with the quoted referer and user agent.
contentSizeExp = r'\"\s\d{3}\s(\d+)'
# HTTP status: the first whitespace-delimited 3-digit number.
statusExp = r'\s(\d{3})\s'
# Request line inside double quotes: method, endpoint, protocol.
generalExp = r'\"(\S+)\s(\S+)\s*(\S*)\"'
# Timestamp inside square brackets; accept both "+" and "-" UTC offsets
# (the sample data uses +0000).
timeExp = r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} [+-]\d{4})\]'
# Client host at the start of the line.
hostExp = r'(^\S+\.[\S+\.]+\S+)\s'

We are going to analyse all files created in the logs directory, so that every time a file is added to that folder, the analysis updates its results.

In [6]:
# Streaming source: text files that appear in the "logs" directory are read
# line by line; each line arrives in the 'value' column used by the parsing
# step.
accessLines = (
    spark.readStream
    .text("logs")
)

We parse each log entry into separate columns:

In [7]:
# Split every raw line ('value') into structured columns, one regexp_extract
# per field. The numeric fields are cast to integer.
logsDF = accessLines.select(
    # Client host and bracketed timestamp.
    func.regexp_extract('value', hostExp, 1).alias('host'),
    func.regexp_extract('value', timeExp, 1).alias('timestamp'),
    # The three parts of the quoted request line.
    func.regexp_extract('value', generalExp, 1).alias('method'),
    func.regexp_extract('value', generalExp, 2).alias('endpoint'),
    func.regexp_extract('value', generalExp, 3).alias('protocol'),
    # Numeric fields.
    func.regexp_extract('value', statusExp, 1).cast('integer').alias('status'),
    func.regexp_extract('value', contentSizeExp, 1).cast('integer').alias('content_size'),
)

In [8]:
# Add an "eventTime" column holding the current timestamp. Note that this is
# the time Spark processes the row, not the request time extracted into the
# 'timestamp' column.
logsDF2 = logsDF.withColumn(
    "eventTime",
    func.current_timestamp(),
)

We create a 30-second window that slides every 10 seconds, so the analysis gets recalculated over the most recent 30 seconds of data:

In [9]:
# Count requests per endpoint within 30-second windows that slide every
# 10 seconds, keyed on the "eventTime" column.
endpointCounts = (
    logsDF2
    .groupBy(
        func.window(func.col("eventTime"), "30 seconds", "10 seconds"),
        func.col("endpoint"),
    )
    .count()
)

In [None]:
# Order the (window, endpoint) counts from most to least frequent.
sortedEndpointCounts = endpointCounts.orderBy(func.desc("count"))

# Start the streaming query: the complete result table is written to the
# console sink under the query name "counts".
query = (
    sortedEndpointCounts.writeStream
    .outputMode("complete")
    .format("console")
    .queryName("counts")
    .start()
)

In [None]:
# Block this cell until the streaming query terminates. The finally clause
# stops the query whenever the wait ends (for example when the kernel is
# interrupted), so the query does not keep running in the background.
try:
    query.awaitTermination()
finally:
    query.stop()

In [None]:
If we copy the same file into the "logs" folder twice, we get the following output (note: this sample output was produced by grouping on `status` rather than on `endpoint`):

In [None]:
-------------------------------------------
Batch: 0
-------------------------------------------
+------+-----+
|status|count|
+------+-----+
|   500|10714|
|   301|  271|
|   400|    2|
|   404|   26|
|   200|64971|
|   304|   92|
|   302|    2|
|   405|    1|
+------+-----+

-------------------------------------------
Batch: 1
-------------------------------------------
+------+------+
|status| count|
+------+------+
|   500| 21428|
|   301|   542|
|   400|     4|
|   404|    52|
|   200|129942|
|   304|   184|
|   302|     4|
|   405|     2|
+------+------+


In [None]:
# Stop the SparkSession created at the top of the notebook.
spark.stop()