In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import time

We would like to perform analytics on the server log.

#### Problem 1

Count all the hits to the following JavaScript asset of the-associates site:
    ``/assets/js/the-associates.js``

In [13]:
%%file codes/num_hits.py 
# the %%file cell magic on the first line of this cell writes the cell body to that path
from mrjob.job import MRJob

address = '/assets/js/the-associates.js'


class MRJobAssociates(MRJob):
    """Count the log lines whose request field contains ``address``."""

    def mapper(self, _, line):
        """Emit ``(address, 1)`` for every log line that matches.

        A line is considered only when splitting it on single spaces gives
        exactly 10 fields; the field at index 6 is treated as the request.
        """
        fields = line.strip().split(' ')
        if len(fields) != 10:
            return  # not a 10-field record: emit nothing for this line
        if address in fields[6]:
            yield (address, 1)

    def reducer(self, category, hits):
        """Emit ``address`` paired with the sum of all counts received."""
        total = sum(hits)
        yield address, total

        
if __name__ == "__main__":
    # Executed as a script (not imported): start the MRJobAssociates job.
    MRJobAssociates.run()

Overwriting codes/num_hits.py


In [15]:
tic = time.time()
! python codes/num_hits.py < data/access_log
toc = time.time()

print('Running this query took {} seconds.'.format(toc - tic))

"/assets/js/the-associates.js"	2456
Running this query took 103.5659658908844 seconds.


No configs found; falling back on auto-configuration
No configs specified for inline runner
Running step 1 of 1...
Creating temp directory C:\Users\Amin\AppData\Local\Temp\num_hits.Amin.20190227.045027.898440
reading from STDIN
job output is in C:\Users\Amin\AppData\Local\Temp\num_hits.Amin.20190227.045027.898440\output
Streaming final output from C:\Users\Amin\AppData\Local\Temp\num_hits.Amin.20190227.045027.898440\output...
Removing temp directory C:\Users\Amin\AppData\Local\Temp\num_hits.Amin.20190227.045027.898440...


#### Problem 2 
How many hits came from a specific IP address (here `10.99.99.186`)?


In [45]:
%%file codes/num_activities.py 
# the %%file cell magic on the first line of this cell writes the cell body to that path
from mrjob.job import MRJob

ip = '10.99.99.186'


class MRJobIP(MRJob):
    """Count the log lines whose first field equals ``ip``."""

    def mapper(self, _, line):
        """Emit ``(ip, 1)`` for every log line whose first field is ``ip``.

        A line is considered only when splitting it on single spaces gives
        exactly 10 fields; the field at index 0 is treated as the IP address.
        """
        data = line.strip().split(' ')  # delete whitespace and tokenize
        if len(data) == 10:
            ip_address = data[0]  # unpacking the data
            if ip_address == ip:
                yield (ip, 1)  # key pair to send to reducer

    def reducer(self, ip_address, hits):
        """Emit the IP address paired with its total number of hits.

        ``hits`` is the iterable of 1s emitted by the mapper for this key.
        It has to be summed here: yielding the iterable itself does not
        produce a count.
        """
        yield ip_address, sum(hits)

        
if __name__ == "__main__":
    # Executed as a script (not imported): start the MRJobIP job.
    MRJobIP.run()

Overwriting codes/num_activities.py


In [22]:
tic = time.time()
! python codes/num_activities.py < data/access_log
toc = time.time()

print('Running this query took {} seconds.'.format(toc - tic))

"10.99.99.186"	6
Running this query took 91.42828869819641 seconds.


No configs found; falling back on auto-configuration
No configs specified for inline runner
Running step 1 of 1...
Creating temp directory C:\Users\Amin\AppData\Local\Temp\num_activities.Amin.20190227.050052.337895
reading from STDIN
job output is in C:\Users\Amin\AppData\Local\Temp\num_activities.Amin.20190227.050052.337895\output
Streaming final output from C:\Users\Amin\AppData\Local\Temp\num_activities.Amin.20190227.050052.337895\output...
Removing temp directory C:\Users\Amin\AppData\Local\Temp\num_activities.Amin.20190227.050052.337895...


In [83]:
%%file codes/most_visited.py 
# the %%file cell magic on the first line of this cell writes the cell body to that path
from mrjob.job import MRJob
from mrjob.step import MRStep
prefix = "http://www.the-associates.co.uk"


class MRJobMostFreqRequest(MRJob):
    """Two-step job that reports the request with the highest hit count.

    Step 1 counts the hits per request; step 2 picks the largest count.
    """

    def mapper_get_addresses(self, _, line):
        """Emit ``(request, 1)`` for every 10-field log line.

        The field at index 6 is treated as the request. A leading ``prefix``
        is removed so that a request written with and without it is counted
        under the same key.
        """
        data = line.strip().split(' ')  # delete whitespace and tokenize
        if len(data) == 10:
            request = data[6]
            # Strip the prefix only when the request really starts with it.
            # A plain substring test would also match a prefix that occurs
            # later in the request, and slicing from the front would then
            # cut off unrelated characters.
            if request.startswith(prefix):
                request = request[len(prefix):]
            yield (request, 1)  # key/value pair for the combiner/reducer

    def combiner_count_hits(self, request, hits):
        """Emit a partial sum of the hits seen so far for ``request``."""
        yield (request, sum(hits))

    def reducer_count_hits(self, request, hits):
        """Emit ``(None, (total_hits, request))``.

        Every pair is emitted under the same key (``None``) so that a single
        call of the next step's reducer receives all of them.
        """
        yield None, (sum(hits), request)

    def reducer_find_max_request(self, _, request_hit_pairs):
        """Emit the ``(hits, request)`` pair with the most hits.

        Pairs are compared element by element, so the hit count decides
        first and the request string only breaks ties.
        """
        top_hits, top_request = max(request_hit_pairs)
        yield top_hits, top_request

    def steps(self):
        """Return the two steps of the job: count hits, then find the max."""
        return [
            MRStep(mapper=self.mapper_get_addresses,
                   combiner=self.combiner_count_hits,
                   reducer=self.reducer_count_hits),
            MRStep(reducer=self.reducer_find_max_request)
        ]
if __name__ == "__main__":
    # Executed as a script (not imported): start the MRJobMostFreqRequest job.
    MRJobMostFreqRequest.run()

Overwriting codes/most_visited.py


In [84]:
tic = time.time()
! python codes/most_visited.py < data/samplog.txt
toc = time.time()

print('Running this query took {} seconds.'.format(toc - tic))

54	"/assets/js/the-associates.js"
Running this query took 4.567207098007202 seconds.


No configs found; falling back on auto-configuration
No configs specified for inline runner
Running step 1 of 2...
Creating temp directory C:\Users\Amin\AppData\Local\Temp\most_visited.Amin.20190227.082346.486609
reading from STDIN
Running step 2 of 2...
job output is in C:\Users\Amin\AppData\Local\Temp\most_visited.Amin.20190227.082346.486609\output
Streaming final output from C:\Users\Amin\AppData\Local\Temp\most_visited.Amin.20190227.082346.486609\output...
Removing temp directory C:\Users\Amin\AppData\Local\Temp\most_visited.Amin.20190227.082346.486609...
