In [1]:
import re
from collections import Counter

from pyspark import SparkContext,SQLContext,SparkConf,StorageLevel
from pyspark.sql import Row

In [2]:
# Configure a Spark application named "NASA" that runs against the "local[3]" master.
sparkconf = SparkConf().setAppName("NASA").setMaster("local[3]")
# getOrCreate() returns the already-running context if there is one, so re-executing
# this cell does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=sparkconf)

In [3]:
# NOTE: only a sample of the total data has been processed, because the
# `.collect()` method throws a memory exception on the full dataset.
access = sc.textFile(
    "/user/arun.kpselvam_gmail/gldata/access2",
    minPartitions=6,
    use_unicode=False,
)

In [4]:
# Show the first 10 raw lines of the loaded log file.
access.take(10)

['in24.inetnebr.com - - [01/Aug/1995:00:00:01 -0400] "GET /shuttle/missions/sts-68/news/sts-68-mcc-05.txt HTTP/1.0" 200 1839',
 'uplherc.upl.com - - [01/Aug/1995:00:00:07 -0400] "GET / HTTP/1.0" 304 0',
 'uplherc.upl.com - - [01/Aug/1995:00:00:08 -0400] "GET /images/ksclogo-medium.gif HTTP/1.0" 304 0',
 'uplherc.upl.com - - [01/Aug/1995:00:00:08 -0400] "GET /images/MOSAIC-logosmall.gif HTTP/1.0" 304 0',
 'uplherc.upl.com - - [01/Aug/1995:00:00:08 -0400] "GET /images/USA-logosmall.gif HTTP/1.0" 304 0',
 'ix-esc-ca2-07.ix.netcom.com - - [01/Aug/1995:00:00:09 -0400] "GET /images/launch-logo.gif HTTP/1.0" 200 1713',
 'uplherc.upl.com - - [01/Aug/1995:00:00:10 -0400] "GET /images/WORLD-logosmall.gif HTTP/1.0" 304 0',
 'slppp6.intermind.net - - [01/Aug/1995:00:00:10 -0400] "GET /history/skylab/skylab.html HTTP/1.0" 200 1687',
 'piweba4y.prodigy.com - - [01/Aug/1995:00:00:10 -0400] "GET /images/launchmedium.gif HTTP/1.0" 200 11853',
 'slppp6.intermind.net - - [01/Aug/1995:00:00:11 -0400] "GET

In [5]:
# Regex for one access-log line. The groups used below are 1 (host), 4 (timestamp),
# 6 (requested URL) and 8 (three-digit HTTP status code).
# Written as a raw string so the regex escapes (\S, \d, \[ ...) reach `re` untouched
# instead of being read as invalid Python string escapes.
PATTERN = r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+)(.*)" (\d{3}) (\S+)'

def parseLogLine(log):
    """Parse one access-log line into a ``Row``.

    Args:
        log: One raw log line, either ``str`` or ``bytes``. ``bytes`` is what
            ``textFile(..., use_unicode=False)`` produces on Python 3.

    Returns:
        ``Row(host, timeStamp, url, httpCode)`` (``httpCode`` as ``int``) when
        the line matches ``PATTERN``; an empty list when it does not, so
        callers can drop unparsed lines with a truthiness check.
    """
    # A str pattern cannot be matched against bytes on Python 3 (TypeError), so
    # decode first. On Python 2 `bytes is str`, which makes the second test false
    # and leaves the input untouched.
    if isinstance(log, bytes) and not isinstance(log, str):
        log = log.decode("utf-8", "replace")
    m = re.match(PATTERN, log)
    if not m:
        return []
    return Row(host=m.group(1), timeStamp=m.group(4), url=m.group(6), httpCode=int(m.group(8)))
# Parse every line, then drop the falsy results: parseLogLine returns an empty list
# for lines it cannot parse, and those would break later attribute access such as `.url`.
nasaData = access.map(parseLogLine).filter(lambda record: record)

In [6]:
# Show the first 10 parsed records.
nasaData.take(10)

[Row(host='in24.inetnebr.com', httpCode=200, timeStamp='01/Aug/1995:00:00:01 -0400', url='/shuttle/missions/sts-68/news/sts-68-mcc-05.txt'),
 Row(host='uplherc.upl.com', httpCode=304, timeStamp='01/Aug/1995:00:00:07 -0400', url='/'),
 Row(host='uplherc.upl.com', httpCode=304, timeStamp='01/Aug/1995:00:00:08 -0400', url='/images/ksclogo-medium.gif'),
 Row(host='uplherc.upl.com', httpCode=304, timeStamp='01/Aug/1995:00:00:08 -0400', url='/images/MOSAIC-logosmall.gif'),
 Row(host='uplherc.upl.com', httpCode=304, timeStamp='01/Aug/1995:00:00:08 -0400', url='/images/USA-logosmall.gif'),
 Row(host='ix-esc-ca2-07.ix.netcom.com', httpCode=200, timeStamp='01/Aug/1995:00:00:09 -0400', url='/images/launch-logo.gif'),
 Row(host='uplherc.upl.com', httpCode=304, timeStamp='01/Aug/1995:00:00:10 -0400', url='/images/WORLD-logosmall.gif'),
 Row(host='slppp6.intermind.net', httpCode=200, timeStamp='01/Aug/1995:00:00:10 -0400', url='/history/skylab/skylab.html'),
 Row(host='piweba4y.prodigy.com', httpCod

In [7]:
# collect() brings every parsed record back to the driver as one Python list.
# This is the call the note on the load cell blames for the memory exception
# on the full dataset.
nasaList = nasaData.collect()

In [8]:
# Project each field of the collected records into its own list for counting.
urlList = [record.url for record in nasaList]
hostList = [record.host for record in nasaList]
# Keep only the first 14 characters of the timestamp, e.g. '01/Aug/1995:00'
# (the date plus the hour).
timeList = [record.timeStamp[0:14] for record in nasaList]
httpList = [record.httpCode for record in nasaList]

In [9]:
# Top 10 most requested URLs, each paired with the number of requests it received.
urlCount = Counter(urlList)
urlCount.most_common(10)

[('/images/KSC-logosmall.gif', 236),
 ('/images/NASA-logosmall.gif', 227),
 ('/images/MOSAIC-logosmall.gif', 174),
 ('/images/WORLD-logosmall.gif', 172),
 ('/images/USA-logosmall.gif', 171),
 ('/images/ksclogo-medium.gif', 152),
 ('/', 114),
 ('/history/apollo/images/apollo-logo1.gif', 109),
 ('/images/launch-logo.gif', 105),
 ('/images/ksclogosmall.gif', 95)]

In [10]:
# Top 5 hosts / IPs by number of requests made, with their counts.
hostCount = Counter(hostList)
hostCount.most_common(5)

[('ix-min1-02.ix.netcom.com', 78),
 ('uplherc.upl.com', 71),
 ('piweba3y.prodigy.com', 69),
 ('port26.ts1.msstate.edu', 59),
 ('ottodix.miks3.iao.fhg.de', 57)]

In [11]:
# Top 5 busiest time frames, bucketed by the hour part of the timestamp.
timeCount = Counter(timeList)
timeCount.most_common(5)

[('01/Aug/1995:00', 1642),
 ('01/Aug/1995:01', 1385),
 ('01/Aug/1995:02', 993),
 ('01/Aug/1995:03', 980)]

In [12]:
# The 5 time frames with the least traffic (hourly buckets), quietest first.
timeCount = Counter(timeList)
# most_common() sorts from most to least frequent, so walking back from the
# end with [:-6:-1] yields the last five entries in reverse order.
timeCount.most_common()[:-6:-1]

[('01/Aug/1995:03', 980),
 ('01/Aug/1995:02', 993),
 ('01/Aug/1995:01', 1385),
 ('01/Aug/1995:00', 1642)]

In [13]:
# Every distinct HTTP status code returned by the server, with its count,
# ordered from most to least frequent.
httpCount = Counter(httpList)
httpCount.most_common()

[(200, 4464), (304, 362), (404, 103), (302, 70), (403, 1)]