# Mobile App Log File
The Dataset contains the log files from different components used in the overall telecom application.

In [1]:
from pyspark.sql import SparkSession, SQLContext
from pyspark import SparkContext, SparkConf
from pyspark.sql import Row
from pyspark.sql.functions import isnan, when, count, col, substring

In [2]:
app_name = 'dataFrame'
spark = SparkSession.builder.appName(app_name).getOrCreate()

### Load data into Spark DataFrame
The file access.log is not uploaded to GitHub because the file size is 482 MB.

In [3]:
log_file = sc.textFile('..\\Module 6 - DataFrames and Spark SQL\\case study 2 dataset\\access.log')
rdd = log_file.filter(lambda x: x != '').map(lambda x: x.split('-'))
rows = rdd.map(lambda p: Row(ip=p[0], status=p[2], browser=p[3]))
df = spark.createDataFrame(rows).cache()
df.show(5)

+--------------------+----------------+--------------------+
|             browser|              ip|              status|
+--------------------+----------------+--------------------+
| Mozilla/5.0 (Win...|109.169.248.247 | [12/Dec/2015:18:...|
|raith.at/administ...|109.169.248.247 | [12/Dec/2015:18:...|
| Mozilla/5.0 (Win...|    46.72.177.4 | [12/Dec/2015:18:...|
|raith.at/administ...|    46.72.177.4 | [12/Dec/2015:18:...|
| Mozilla/5.0 (Win...| 83.167.113.100 | [12/Dec/2015:18:...|
+--------------------+----------------+--------------------+
only showing top 5 rows



### Find out how many 404 HTTP codes are in access log


In [4]:
df.createOrReplaceTempView('access')
df_404 = spark.sql('select * from access where status like "%404%"')
df_404.count()

192622

In [5]:
import re

In [6]:
for x in df_404.select(['status']).take(4):
    print(str(x), type(str(x)))

Row(status=' [12/Dec/2015:19:02:36 +0100] GET /templates/_system/css/general.css HTTP/1.1 404 239 http://almhuette') <class 'str'>
Row(status=' [12/Dec/2015:19:44:06 +0100] GET /templates/_system/css/general.css HTTP/1.1 404 239 http://www.almhuette') <class 'str'>
Row(status=' [12/Dec/2015:19:44:15 +0100] GET /favicon.ico HTTP/1.1 404 217 ') <class 'str'>
Row(status=' [13/Dec/2015:01:01:19 +0100] GET /icons/text.gif HTTP/1.1 404 220 ') <class 'str'>


### Find out which URLs are broken


In [7]:
set(x.group(0) for x in 
    [re.search("https?://[^\s]+", str(x)) for x in df_404.select(['status']).collect()] 
    if x != None)

{"http://65.55.252.15/proxy.ashx?h=OJSfmAmQR0zRSPWXXDEhsk147zlR8wM7&a=http%3A%2F%2Fwww.almhuette')",
 'http://91.143.107.26/',
 'http://\\\\xef\\\\xbb\\\\xbfhttp://www.apeloptik.de',
 'http://almhuette")',
 "http://almhuette')",
 'http://aquaplant.com.ua/form/tpl/templates_d/logx???',
 'http://aquaplant.com.ua/form/tpl/templates_d/logx????',
 'http://cirt.net/',
 'http://cirt.net/rfiinc.txt?',
 'http://cirt.net/rfiinc.txt?%00',
 'http://cirt.net/rfiinc.txt?&2093085906=1&995617320=2',
 'http://cirt.net/rfiinc.txt?&cmd=cat/etc/passwd',
 'http://cirt.net/rfiinc.txt?&cmd=dir',
 'http://cirt.net/rfiinc.txt?&cmd=id',
 'http://cirt.net/rfiinc.txt?&cmd=ls',
 'http://cirt.net/rfiinc.txt?&command=cat%20/etc/passwd',
 'http://cirt.net/rfiinc.txt?&file=article&sid=2',
 'http://cirt.net/rfiinc.txt?&filhead=XXpathXX&cmd=id',
 'http://cirt.net/rfiinc.txt?&l=testfile.txt?',
 'http://cirt.net/rfiinc.txt?&mode=[file]',
 'http://cirt.net/rfiinc.txt?&text=Hello%20World',
 "http://cirt.net/rfiinc.txt?'",
 

### Verify there are no null columns in the original dataset


In [8]:
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+-------+---+------+
|browser| ip|status|
+-------+---+------+
|      0|  0|     0|
+-------+---+------+



In [9]:
spark.sql("""
          select count(*) 
            from access 
           where lower(ip) in ("", " ", "null") 
              or lower(status) in ("", " ", "null") 
              or lower(browser) in ("", " ", "null")
          """).show()

+--------+
|count(1)|
+--------+
|  384739|
+--------+



### Replace null values with constants such as 0


In [10]:
df_notNull = spark.sql("""
                       select CASE WHEN lower(ip) in ("", " ", "null") THEN 0 ELSE ip END ip,
                              CASE WHEN lower(status) in ("", " ", "null") THEN 0 ELSE status END status,
                              CASE WHEN lower(browser) in ("", " ", "null") THEN 0 ELSE browser END browser
                         from access
                       """)
df_notNull.show(5)

+----------------+--------------------+--------------------+
|              ip|              status|             browser|
+----------------+--------------------+--------------------+
|109.169.248.247 | [12/Dec/2015:18:...| Mozilla/5.0 (Win...|
|109.169.248.247 | [12/Dec/2015:18:...|raith.at/administ...|
|    46.72.177.4 | [12/Dec/2015:18:...| Mozilla/5.0 (Win...|
|    46.72.177.4 | [12/Dec/2015:18:...|raith.at/administ...|
| 83.167.113.100 | [12/Dec/2015:18:...| Mozilla/5.0 (Win...|
+----------------+--------------------+--------------------+
only showing top 5 rows



### Parse timestamp to readable date


In [11]:
df.withColumn('date', substring(df['status'], 3, 11)).show(5)

+--------------------+----------------+--------------------+-----------+
|             browser|              ip|              status|       date|
+--------------------+----------------+--------------------+-----------+
| Mozilla/5.0 (Win...|109.169.248.247 | [12/Dec/2015:18:...|12/Dec/2015|
|raith.at/administ...|109.169.248.247 | [12/Dec/2015:18:...|12/Dec/2015|
| Mozilla/5.0 (Win...|    46.72.177.4 | [12/Dec/2015:18:...|12/Dec/2015|
|raith.at/administ...|    46.72.177.4 | [12/Dec/2015:18:...|12/Dec/2015|
| Mozilla/5.0 (Win...| 83.167.113.100 | [12/Dec/2015:18:...|12/Dec/2015|
+--------------------+----------------+--------------------+-----------+
only showing top 5 rows



### Describe which HTTP status values appear in data and how many


In [14]:
http_status = [str(x).split(' ')[6] for x in df.select(['status']).collect() if len(str(x).split(' ')) > 6]
for x in set(http_status):
    if x != 'HTTP/1.1':
        print("Status {}: {} times".format(x, http_status.count(x)))

Status 304: 6086 times
Status 406: 53 times
Status 405: 28 times
Status 200: 950769 times
Status 404: 192591 times
Status 403: 1861 times
Status 500: 2555 times
Status 303: 247 times
Status 301: 538 times
Status 501: 93 times
Status 400: 23 times
Status 401: 2 times
Status 206: 1880 times


### How many unique hosts are there in the entire log and their average request


In [13]:
spark.sql("""
          select avg(cnt) 
            from (select ip, count(*) cnt
                    from access
                   group by ip) t          
          """).show()

+-----------------+
|         avg(cnt)|
+-----------------+
|57.25355078851993|
+-----------------+

