https://github.com/allofphysicsgraph/proofofconcept/issues/246

In [None]:
# List the contents of the logs directory.
!ls logs

In [None]:
# Show the first three lines of the access log to see the record format.
!head -n 3 logs/gunicorn_access.log

In [None]:
import ast
import time

import pandas
from matplotlib import pyplot as plt

# load data from disk

In [None]:
# Parse the access log into a list of dicts, one per successfully parsed line.
# Every line is treated as a Python dict literal; see
# https://stackoverflow.com/a/48124263/1164295
this_list = []
number_of_invalid_strings = 0

start_time = time.time()
with open('logs/gunicorn_access.log', 'r') as file_handle:
    for line in file_handle:
        try:
            # literal_eval accepts only Python literals, so text that arrived
            # through a request (path, user agent, ...) cannot run code the
            # way eval() would allow.
            as_dict = ast.literal_eval(line)
            # coerce the numeric fields so they can be compared and plotted
            as_dict["reqtime"] = float(as_dict["reqtime"])
            as_dict["resplen"] = int(as_dict["resplen"])
        except (SyntaxError, ValueError, KeyError, TypeError):
            # Malformed line, non-literal content, or a missing / non-numeric
            # field: count it and keep going instead of aborting the load.
            number_of_invalid_strings += 1
            #print(line)
            continue
        this_list.append(as_dict)

print('elapsed',round(time.time()-start_time,2),'seconds')

In [None]:
# Number of log lines that could not be parsed while loading.
print(number_of_invalid_strings)

# explore the data

In [None]:
# Build one row per parsed log line; the dict keys become the columns.
df = pandas.DataFrame(this_list)
# (rows, columns) of the raw data
df.shape

In [None]:
# First rows, to see which columns the log provides.
df.head()

# histogram of IP addresses

In [None]:
# Number of distinct values in the 'ip' column.
df['ip'].nunique()

In [None]:
# Count the IP addresses that occur exactly once in the log.

requests_per_ip = df['ip'].value_counts()
int((requests_per_ip == 1).sum())

In [None]:
# Uncomment one of the next two lines to change how many rows pandas displays:
#pandas.options.display.max_rows = 50
#pandas.set_option('display.max_rows', 50)

# Requests per IP; value_counts() sorts the most frequent first.
df['ip'].value_counts()

In [None]:
# Distribution of the number of requests per IP, with a log-scaled y axis.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.hist.html

requests_per_ip = df['ip'].value_counts()
ax = requests_per_ip.hist(bins=50)
ax.set_yscale('log')
ax.set_xlabel('number of requests')
ax.set_ylabel('number of IPs making that many requests');

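About 10,000 IPs make 1 request each; 1 IP makes roughly 175,000 requests (TODO: confirm this figure against the `value_counts()` output above — the original note read "17,5000").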

## most popular IP

That most active IP, 157.90.177.215, is ninja-crawler60.webmeup.com

In [None]:
# Sample of the requests made by the most active IP address.
most_active_ip = "157.90.177.215"
df.loc[df['ip'] == most_active_ip].head(3)

# histogram of user agent strings

In [None]:
# Distribution of how often each user agent string occurs, log-scaled y axis.
entries_per_ua = df['ua'].value_counts()
ax = entries_per_ua.hist(bins=50)
ax.set_yscale('log')
ax.set_xlabel('number of entries of a user agent string')
ax.set_ylabel('count');

One user agent string appears 200,000 times; most appear once

There are a lot of crawler bots:

* https://developers.google.com/search/docs/advanced/crawling/googlebot
* https://serpstatbot.com/
* https://webmaster.petalsearch.com/site/petalbot

In [None]:
# The 20 most frequent user agent strings.
df['ua'].value_counts().head(20)

Happily, the name "bot" appears in crawler user agent strings

In [None]:
# Most frequent user agent strings among those containing a crawler marker.
bot_markers = ['bot', 'Bot', 'crawl']
is_crawler = df["ua"].str.contains('|'.join(bot_markers))
df.loc[is_crawler, 'ua'].value_counts().head(30)

In [None]:
# Snapshot the full frame before the in-place cleanup below modifies df.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.copy.html
df_orig = df.copy(deep=True)

In [None]:
# (rows, columns) before any cleanup.
df.shape

# Cleanup

## remove bots and crawlers from the logs

In [None]:
# Remove, in place, every row whose user agent contains a crawler marker.
# https://stackoverflow.com/a/52173171/1164295

crawler_mask = df["ua"].str.contains('|'.join(['bot','Bot','crawl']))
crawler_rows = df.index[crawler_mask]
df.drop(index=crawler_rows, inplace=True)

In [None]:
# (rows, columns) after dropping crawler rows.
df.shape

That's a reduction to about 1/3 of the rows, so roughly 2/3 of my traffic comes from crawlers.

## TODO: remove the "single request" IP entries

In [None]:
# Boolean Series indexed by IP: True where that IP made exactly one request.
s = (df['ip'].value_counts() == 1)
# Number of single-request IPs.
len(s.index[s])

In [None]:
# Expected row count once single-request IPs are removed: total rows minus one
# row per single-request IP. Computed from the live frame instead of pasting
# numbers from earlier output (a recorded run gave 194494-6365), so the value
# stays correct when the log file changes.
len(df) - int((df['ip'].value_counts() == 1).sum())

TODO: the following isn't working 

In [None]:
# Keep only rows whose IP occurs more than once in df at this point:
# duplicated(keep=False) flags every member of a duplicate group, so rows
# belonging to single-request IPs are the ones left unflagged and dropped.
# NOTE(review): the filters applied in later cells remove more rows, which can
# leave an IP with a single row again.
# https://stackoverflow.com/a/44888919/1164295 
# relies on https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.duplicated.html
df = df[df.duplicated(subset=['ip'], keep=False)]
df.shape

what percentage of the non-bot traffic is single request?

In [None]:
# Literals pasted from an earlier run -- presumably (single-request IPs) /
# (rows left after bot removal); TODO confirm and recompute when the log changes.
6365/194494

## remove "response length zero" entries

In [None]:
# Remove, in place, the rows whose response length is zero.
empty_response_rows = df.index[df['resplen'] == 0]
df.drop(index=empty_response_rows, inplace=True)

In [None]:
# (rows, columns) after dropping zero-length responses.
df.shape

## remove PNG/JS/JSON/mathjax/txt/ico/svg requests

In [None]:
# Preview the five most common request lines that match one of the
# asset / front-page patterns below.
# I don't care about requests for PNG files

# str.contains treats the pattern as a regular expression by default, so the
# dots in the file extensions are escaped. Unescaped, '.ico' means "any
# character followed by ico" and would also match e.g. "favicon" or "lexicon".
# https://pandas.pydata.org/docs/reference/api/pandas.Series.str.contains.html

# https://stackoverflow.com/a/26577689/1164295
df[df["statline"].str.contains('|'.join([r'\.png', r'\.js',
                                          r'\.json', 'mathjax',
                                          r'\.txt', r'\.ico',
                                          r'\.svg', 'GET / HTTP']))]['statline'].value_counts().head(5)

In [None]:
# Remove, in place, requests for static assets and for other paths that are
# not of interest here (owa, aspx, php, git, wordpress, admin, xml, login, logon).
# The pattern is a regular expression, so the dots in the file extensions are
# escaped; unescaped, '.js' or '.ico' match any character followed by those
# letters and would drop unrelated rows.
df.drop(df[df["statline"].str.contains('|'.join([r'\.png', r'\.js', 'owa', 'aspx',
                                          r'\.json', 'mathjax', 'php', 'PHP', 'git', 'wordpress', 'admin',
                                          r'\.txt', r'\.ico', 'xml', 'XML',
                                          r'\.svg', 'login', 'logon']))].index, inplace = True)

In [None]:
# (rows, columns) after dropping asset and probe requests.
df.shape

# after cleanup, revisit IP activity

In [None]:
# Number of distinct values in the 'ip' column after cleanup.
df['ip'].nunique()

In [None]:
# Distribution of requests per IP after cleanup, with a log-scaled y axis.
requests_per_ip_clean = df['ip'].value_counts()
ax = requests_per_ip_clean.hist(bins=50)
ax.set_yscale('log')
ax.set_xlabel('number of requests')
ax.set_ylabel('number of IPs making that many requests');

In [None]:
# The 10 IPs with the most remaining requests.
df['ip'].value_counts().head(10)

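TODO: single-entry IPs are still present after cleanup. The single-request filter ran before the zero-length and asset filters, so an IP that had several requests at that point can be left with a single row once the later filters drop its other requests. Re-apply the single-request filter after all other cleanup steps.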

In [None]:
# value_counts() sorts descending, so the tail holds the IPs with the fewest requests.
df['ip'].value_counts().tail(3)

# TODO: time-series analysis per IP

What is the duration of time spent on the website?

# histogram of GETs

In [None]:
# Distribution of how often each distinct request line occurs, log-scaled y axis.
hits_per_statline = df['statline'].value_counts()
ax = hits_per_statline.hist(bins=50)
ax.set_yscale('log')
ax.set_xlabel('number of GET requests')
ax.set_ylabel('count');

In [None]:
# The 40 most frequent request lines.
df['statline'].value_counts().head(40)

In [None]:
# The least frequent request lines (end of the descending count list).
df['statline'].value_counts().tail()

## login attempts

In [None]:
# The cleanup above removed every row whose statline contains 'login' from df,
# so searching df here can only return an empty result. Use the snapshot taken
# before cleanup (df_orig) to see the login attempts.
df_orig[df_orig["statline"].str.contains('login')]['statline'].value_counts().head(20)

# histogram of request times

In [None]:
# Largest request time in the cleaned data.
max(df['reqtime'])

In [None]:
# Histogram of the request times themselves, with a log-scaled y axis.
# Plotting value_counts() here would instead show how often each distinct
# reqtime value occurs, which does not match the axis labels below.
ax = df['reqtime'].hist(bins=50)
ax.set_yscale('log')
ax.set_xlabel('request times')
ax.set_ylabel('number of requests with that time');

In [None]:
# The five most frequent request-time values.
df['reqtime'].value_counts().head(5)

In [None]:
# Counts ordered by request time; the tail shows the largest request times.
df['reqtime'].value_counts().sort_index().tail()

# histogram of response length

In [None]:
# Histogram of the response lengths themselves, with a log-scaled y axis.
# Plotting value_counts() here would instead show how often each distinct
# length occurs, which does not match the 'response length' x-axis label.
ax = df['resplen'].hist(bins=50)
ax.set_yscale('log')
ax.set_xlabel('response length')
ax.set_ylabel('count');

In [None]:
# The 20 most frequent response lengths.
df['resplen'].value_counts().head(20)

In [None]:
# Counts ordered by response length; the head shows the 10 smallest lengths.
df['resplen'].value_counts().sort_index().head(10)

What's the smallest request response length that contains actual content?

In [None]:
# Counts of each response length greater than 500, ordered by length.
longer_than_500 = df['resplen'] > 500
df.loc[longer_than_500, 'resplen'].value_counts().sort_index()

In [None]:
# Counts ordered by response length; the tail shows the 50 largest lengths.
df['resplen'].value_counts().sort_index().tail(50)

In [None]:
# Sample rows with this specific response length.
target_length = 1163881
df.loc[df['resplen'] == target_length].head(3)

In [None]:
# Sample rows with this specific response length.
target_length = 1167324
df.loc[df['resplen'] == target_length].head(3)