In [0]:
%fs ls /FileStore/tables

path,name,size,modificationTime
dbfs:/FileStore/tables/apache_logs.txt,apache_logs.txt,2370046,1750045595000
dbfs:/FileStore/tables/flight_time.json,flight_time.json,87150442,1749962413000



Scenarios for working with Rows:

1. Manually creating Rows and DataFrames -- unit testing/dev
2. Collecting DataFrame Rows to the driver -- unit testing/dev
3. Working with a single-column Row in transformations.



Structured/Semi-Structured Data: Define Schema > Load > Transform (ELT)

Unstructured Data: Load > Discover Schema > Transform

In [0]:
# Load the Apache access log with the "text" source: every line of the file
# becomes one row holding the raw line in a single string column.
logs_df = (
    spark.read
    .format("text")
    .load("/FileStore/tables/apache_logs.txt")
)

# Quick preview of the first 10 rows (show() truncates long values).
logs_df.show(n=10)

+--------------------+
|               value|
+--------------------+
|83.149.9.216 - - ...|
|83.149.9.216 - - ...|
|83.149.9.216 - - ...|
|83.149.9.216 - - ...|
|83.149.9.216 - - ...|
|83.149.9.216 - - ...|
|83.149.9.216 - - ...|
|83.149.9.216 - - ...|
|83.149.9.216 - - ...|
|83.149.9.216 - - ...|
+--------------------+
only showing top 10 rows



In [0]:
# Print the column names and types of the raw log DataFrame before any parsing.
logs_df.printSchema()

root
 |-- value: string (nullable = true)



In [0]:
# Render full, untruncated log lines so the record layout can be inspected.
# Only a small sample is needed for that, so cap the rows instead of asking
# the notebook to render the entire log file.
display(logs_df.limit(10))

value
"83.149.9.216 - - [17/May/2015:10:05:03 +0000] ""GET /presentations/logstash-monitorama-2013/images/kibana-search.png HTTP/1.1"" 200 203023 ""http://semicomplete.com/presentations/logstash-monitorama-2013/"" ""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36"""
"83.149.9.216 - - [17/May/2015:10:05:43 +0000] ""GET /presentations/logstash-monitorama-2013/images/kibana-dashboard3.png HTTP/1.1"" 200 171717 ""http://semicomplete.com/presentations/logstash-monitorama-2013/"" ""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36"""
"83.149.9.216 - - [17/May/2015:10:05:47 +0000] ""GET /presentations/logstash-monitorama-2013/plugin/highlight/highlight.js HTTP/1.1"" 200 26185 ""http://semicomplete.com/presentations/logstash-monitorama-2013/"" ""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36"""
"83.149.9.216 - - [17/May/2015:10:05:12 +0000] ""GET /presentations/logstash-monitorama-2013/plugin/zoom-js/zoom.js HTTP/1.1"" 200 7697 ""http://semicomplete.com/presentations/logstash-monitorama-2013/"" ""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36"""
"83.149.9.216 - - [17/May/2015:10:05:07 +0000] ""GET /presentations/logstash-monitorama-2013/plugin/notes/notes.js HTTP/1.1"" 200 2892 ""http://semicomplete.com/presentations/logstash-monitorama-2013/"" ""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36"""
"83.149.9.216 - - [17/May/2015:10:05:34 +0000] ""GET /presentations/logstash-monitorama-2013/images/sad-medic.png HTTP/1.1"" 200 430406 ""http://semicomplete.com/presentations/logstash-monitorama-2013/"" ""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36"""
"83.149.9.216 - - [17/May/2015:10:05:57 +0000] ""GET /presentations/logstash-monitorama-2013/css/fonts/Roboto-Bold.ttf HTTP/1.1"" 200 38720 ""http://semicomplete.com/presentations/logstash-monitorama-2013/"" ""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36"""
"83.149.9.216 - - [17/May/2015:10:05:50 +0000] ""GET /presentations/logstash-monitorama-2013/css/fonts/Roboto-Regular.ttf HTTP/1.1"" 200 41820 ""http://semicomplete.com/presentations/logstash-monitorama-2013/"" ""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36"""
"83.149.9.216 - - [17/May/2015:10:05:24 +0000] ""GET /presentations/logstash-monitorama-2013/images/frontend-response-codes.png HTTP/1.1"" 200 52878 ""http://semicomplete.com/presentations/logstash-monitorama-2013/"" ""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36"""
"83.149.9.216 - - [17/May/2015:10:05:50 +0000] ""GET /presentations/logstash-monitorama-2013/images/kibana-dashboard.png HTTP/1.1"" 200 321631 ""http://semicomplete.com/presentations/logstash-monitorama-2013/"" ""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36"""


Out[5]: '^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] "(\\S+) (\\S+) (\\S+)" (\\d{3}) (\\S+) "(\\S+)" "([^"]*)'

In [0]:
from pyspark.sql.functions import regexp_extract

# Pattern for one access-log line. Capture groups used below:
#   1  -> first whitespace-delimited token (client address)
#   4  -> bracketed timestamp with its +/-hhmm offset
#   6  -> second token of the quoted request (the requested path)
#   10 -> first quoted field after status and size (the referrer)
log_reg = r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\S+) "(\S+)" "([^"]*)'

# Build one extracted column per (capture group, output name) pair; the order
# of the pairs is the column order of the resulting DataFrame.
logs_df_final = logs_df.select(*[
    regexp_extract('value', log_reg, group_idx).alias(column_name)
    for group_idx, column_name in ((1, 'ip'), (4, 'date'), (6, 'image'), (10, 'referrer'))
])
display(logs_df_final)

ip,date,image,referrer
83.149.9.216,17/May/2015:10:05:03 +0000,/presentations/logstash-monitorama-2013/images/kibana-search.png,http://semicomplete.com/presentations/logstash-monitorama-2013/
83.149.9.216,17/May/2015:10:05:43 +0000,/presentations/logstash-monitorama-2013/images/kibana-dashboard3.png,http://semicomplete.com/presentations/logstash-monitorama-2013/
83.149.9.216,17/May/2015:10:05:47 +0000,/presentations/logstash-monitorama-2013/plugin/highlight/highlight.js,http://semicomplete.com/presentations/logstash-monitorama-2013/
83.149.9.216,17/May/2015:10:05:12 +0000,/presentations/logstash-monitorama-2013/plugin/zoom-js/zoom.js,http://semicomplete.com/presentations/logstash-monitorama-2013/
83.149.9.216,17/May/2015:10:05:07 +0000,/presentations/logstash-monitorama-2013/plugin/notes/notes.js,http://semicomplete.com/presentations/logstash-monitorama-2013/
83.149.9.216,17/May/2015:10:05:34 +0000,/presentations/logstash-monitorama-2013/images/sad-medic.png,http://semicomplete.com/presentations/logstash-monitorama-2013/
83.149.9.216,17/May/2015:10:05:57 +0000,/presentations/logstash-monitorama-2013/css/fonts/Roboto-Bold.ttf,http://semicomplete.com/presentations/logstash-monitorama-2013/
83.149.9.216,17/May/2015:10:05:50 +0000,/presentations/logstash-monitorama-2013/css/fonts/Roboto-Regular.ttf,http://semicomplete.com/presentations/logstash-monitorama-2013/
83.149.9.216,17/May/2015:10:05:24 +0000,/presentations/logstash-monitorama-2013/images/frontend-response-codes.png,http://semicomplete.com/presentations/logstash-monitorama-2013/
83.149.9.216,17/May/2015:10:05:50 +0000,/presentations/logstash-monitorama-2013/images/kibana-dashboard.png,http://semicomplete.com/presentations/logstash-monitorama-2013/


In [0]:
# Print the column names and types of the parsed DataFrame to confirm the
# extracted fields and their aliases.
logs_df_final.printSchema()

root
 |-- ip: string (nullable = true)
 |-- date: string (nullable = true)
 |-- image: string (nullable = true)
 |-- referrer: string (nullable = true)



In [0]:
# Count hits per referrer and print the five most frequent ones.
#
# Rows are excluded when the referrer is:
#   * NULL,
#   * the "-" placeholder value, or
#   * the empty string -- regexp_extract returns '' (not NULL) when the
#     pattern does not match a line, so a NULL check alone would let every
#     unparsed line be counted together as one blank "referrer".
(
    logs_df_final
    .filter("referrer IS NOT NULL AND referrer NOT IN ('-', '')")
    .groupBy("referrer")
    .count()
    # Rename the aggregate so it does not share a name with the count() method.
    .withColumnRenamed("count", "count_col")
    .orderBy("count_col", ascending=False)
    .show(5)
)

+--------------------+---------+
|            referrer|count_col|
+--------------------+---------+
|http://semicomple...|      689|
|http://www.semico...|      656|
|http://semicomple...|      406|
|http://www.semico...|      335|
|http://www.semico...|      228|
+--------------------+---------+
only showing top 5 rows

