In [1]:
import numpy as np
import pandas as pd

In [2]:
# Column names assigned to the seven fields of every access-log line.
fields = ["host", "rfc_id", "user_id", "time", "request", "status", "content_size"]

# pandas 1.3 replaced `error_bad_lines` with `on_bad_lines`, and pandas 2.0 removed
# the old keyword, so choose the spelling the installed version understands.
# Both variants skip malformed lines and report them instead of aborting the load.
pandas_version = tuple(int(part) for part in pd.__version__.split('.')[:2])
bad_line_option = (
    {'on_bad_lines': 'warn'} if pandas_version >= (1, 3) else {'error_bad_lines': False}
)

df = pd.read_csv(
    'access_log_Aug95',
    # Split on whitespace, except whitespace inside a double-quoted string
    # or inside square brackets.
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    engine='python',  # regex separators need the Python parsing engine
    na_values="-",    # "-" is read as a missing value
    header=None,
    names=fields,
    **bad_line_option
)

Skipping line 34435: Expected 7 fields in line 34435, saw 9. Error could possibly be due to quotes being ignored when a multi-char delimiter is used.


In [3]:
# Preview the first rows to sanity-check how the log lines were split into columns.
df.head()

Unnamed: 0,host,rfc_id,user_id,time,request,status,content_size
0,in24.inetnebr.com,,,[01/Aug/1995:00:00:01 -0400],"""GET /shuttle/missions/sts-68/news/sts-68-mcc-...",200.0,1839.0
1,uplherc.upl.com,,,[01/Aug/1995:00:00:07 -0400],"""GET / HTTP/1.0""",304.0,0.0
2,uplherc.upl.com,,,[01/Aug/1995:00:00:08 -0400],"""GET /images/ksclogo-medium.gif HTTP/1.0""",304.0,0.0
3,uplherc.upl.com,,,[01/Aug/1995:00:00:08 -0400],"""GET /images/MOSAIC-logosmall.gif HTTP/1.0""",304.0,0.0
4,uplherc.upl.com,,,[01/Aug/1995:00:00:08 -0400],"""GET /images/USA-logosmall.gif HTTP/1.0""",304.0,0.0


In [4]:
# (rows, columns) of the freshly loaded log.
df.shape

(1569897, 7)

In [5]:
# Per-column non-null counts and dtypes, to spot missing data and wrongly typed fields.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1569897 entries, 0 to 1569896
Data columns (total 7 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   host          1569897 non-null  object 
 1   rfc_id        16 non-null       object 
 2   user_id       16 non-null       object 
 3   time          1569882 non-null  object 
 4   request       1569882 non-null  object 
 5   status        1569881 non-null  float64
 6   content_size  1555719 non-null  float64
dtypes: float64(2), object(5)
memory usage: 83.8+ MB


In [6]:
# Number of missing values in each column of the raw frame.
df.isna().sum()

host                  0
rfc_id          1569881
user_id         1569881
time                 15
request              15
status               16
content_size      14178
dtype: int64

In [7]:
# Replace missing content sizes with 0.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which warns on recent pandas and leaves
# `df` unchanged once Copy-on-Write is enabled.
df['content_size'] = df['content_size'].fillna(0)

In [8]:
# rfc_id and user_id are almost entirely null, so they are removed from the frame.
df = df.drop(columns=['rfc_id', 'user_id'])

In [9]:
# Re-check the missing values left after filling content_size and dropping the id columns.
df.isna().sum()

host             0
time            15
request         15
status          16
content_size     0
dtype: int64

In [10]:
# Only a handful of rows still contain null values (15 without time/request,
# 16 without status). Because that number is so small, dropping those rows
# will not affect the requirements.
df = df.dropna(how='any')

In [11]:
# Confirm that no column contains missing values any more.
df.isna().sum()

host            0
time            0
request         0
status          0
content_size    0
dtype: int64

In [12]:
# df.info() reports the status column as float64;
# cast it to an integer column and leave every other column as it is.
df = df.assign(status=df['status'].astype(int))

In [13]:
# Verify the column dtypes after the status conversion.
df.dtypes

host             object
time             object
request          object
status            int64
content_size    float64
dtype: object

In [14]:
# (rows, columns) of the cleaned frame.
df.shape

(1569881, 5)

In [15]:
# Extract request url from request field
def parse_request(req):
    """Return the URL part of a raw request field.

    The raw field looks like ``"GET /path HTTP/1.0"`` and still carries its
    surrounding double quotes. The URL is the second whitespace-separated token.

    Parameters
    ----------
    req : str
        Raw request field. Non-string values are returned unchanged.

    Returns
    -------
    The URL token, or ``req`` itself when it has fewer than two tokens.
    """
    if not isinstance(req, str):
        # Nothing to parse (e.g. a missing value); pass it through untouched.
        return req
    # Remove the enclosing quotes first, otherwise a request without a protocol
    # token ('"GET /path"') would yield the URL with a trailing quote.
    tokens = req.strip().strip('"').split()
    try:
        return tokens[1]
    except IndexError:
        return req

In [16]:
# Replace each raw request string with the value parse_request returns for it.
df['request'] = df['request'].map(parse_request)

In [17]:
# Percentage of requests (two decimals) whose status lies between 200 and 399 inclusive.
success_mask = df['status'].between(200, 399)
round(success_mask.sum() * 100 / df['status'].count(), 2)

99.35

In [18]:
# Rows with a status of 400 or above, and their percentage of all requests (two decimals).
failure_mask = df['status'] >= 400
unsuccessfull_df = df[failure_mask]
round(failure_mask.sum() * 100 / df['status'].count(), 2)

0.65

In [19]:
# Ten most frequently requested URLs across the whole frame.
request_hits = df.groupby('request')['status'].count().reset_index(name='count')
(
    request_hits
    .sort_values(by='count', ascending=False)
    .head(10)
    .reset_index(drop=True)
)

Unnamed: 0,request,count
0,/images/NASA-logosmall.gif,97293
1,/images/KSC-logosmall.gif,75283
2,/images/MOSAIC-logosmall.gif,67356
3,/images/USA-logosmall.gif,66975
4,/images/WORLD-logosmall.gif,66351
5,/images/ksclogo-medium.gif,62670
6,/ksc.html,43619
7,/history/apollo/images/apollo-logo1.gif,37806
8,/images/launch-logo.gif,35119
9,/,30122


In [20]:
# Ten URLs that appear most often in unsuccessfull_df.
failed_request_hits = (
    unsuccessfull_df.groupby('request')['status'].count().reset_index(name='count')
)
(
    failed_request_hits
    .sort_values(by='count', ascending=False)
    .head(10)
    .reset_index(drop=True)
)

Unnamed: 0,request,count
0,/pub/winvn/readme.txt,1337
1,/pub/winvn/release.txt,1185
2,/shuttle/missions/STS-69/mission-STS-69.html,682
3,/images/nasa-logo.gif,319
4,/shuttle/missions/sts-68/ksc-upclose.gif,251
5,/elv/DELTA/uncons.htm,209
6,/history/apollo/sa-1/sa-1-patch-small.gif,200
7,/://spacelink.msfc.nasa.gov,166
8,/images/crawlerway-logo.gif,160
9,/history/apollo/a-001/a-001-patch-small.gif,154


In [21]:
# Ten hosts with the most rows in the log, ranked by that count.
host_hits = df.groupby('host')['status'].count().reset_index(name='count')
top_hosts = (
    host_hits
    .sort_values(by='count', ascending=False)
    .head(10)
    .reset_index(drop=True)
)
top_hosts

Unnamed: 0,host,count
0,edams.ksc.nasa.gov,6530
1,piweba4y.prodigy.com,4844
2,163.206.89.4,4791
3,piweba5y.prodigy.com,4607
4,piweba3y.prodigy.com,4416
5,www-d1.proxy.aol.com,3889
6,www-b2.proxy.aol.com,3534
7,www-b3.proxy.aol.com,3463
8,www-c5.proxy.aol.com,3423
9,www-b5.proxy.aol.com,3411


In [22]:
# Inner-join the top-host table with the full log, keeping only rows of those hosts
# and carrying each host's total count along.
top_df1 = top_hosts.merge(df, how='inner', on='host')
top_df1

Unnamed: 0,host,count,time,request,status,content_size
0,edams.ksc.nasa.gov,6530,[01/Aug/1995:06:18:20 -0400],/ksc.html,200,7280.0
1,edams.ksc.nasa.gov,6530,[01/Aug/1995:06:18:21 -0400],/images/ksclogo-medium.gif,200,5866.0
2,edams.ksc.nasa.gov,6530,[01/Aug/1995:06:18:21 -0400],/images/NASA-logosmall.gif,200,786.0
3,edams.ksc.nasa.gov,6530,[01/Aug/1995:06:18:21 -0400],/images/MOSAIC-logosmall.gif,200,363.0
4,edams.ksc.nasa.gov,6530,[01/Aug/1995:06:18:22 -0400],/images/USA-logosmall.gif,200,234.0
...,...,...,...,...,...,...
42903,www-b5.proxy.aol.com,3411,[31/Aug/1995:19:30:34 -0400],/shuttle/technology/sts-newsref/stsref-toc.html,304,0.0
42904,www-b5.proxy.aol.com,3411,[31/Aug/1995:19:30:48 -0400],/images/shuttle-patch-small.gif,200,4179.0
42905,www-b5.proxy.aol.com,3411,[31/Aug/1995:19:31:48 -0400],/shuttle/technology/images/sts_spec_6-small.gif,200,47145.0
42906,www-b5.proxy.aol.com,3411,[31/Aug/1995:19:32:36 -0400],/shuttle/technology/images/launch_sites_8-smal...,200,74267.0


In [23]:
# For every host in top_df1, count how often each request URL occurs.
top_df1_grp = top_df1['request'].groupby(top_df1['host']).value_counts()
# Keep the five largest counts per host (index level 0), then discard the outermost
# index level of the result.
# NOTE(review): the grouped nlargest() appears to prepend the host key a second time,
# which is why level 0 is dropped - confirm against the installed pandas version.
df_grp=top_df1_grp.groupby(level=[0]).nlargest(5).reset_index(level=0, drop=True)
# Wrap the Series in a DataFrame so it renders as a table.
pd.DataFrame(df_grp)

Unnamed: 0_level_0,Unnamed: 1_level_0,request
host,request,Unnamed: 2_level_1
163.206.89.4,/images/NASA-logosmall.gif,568
163.206.89.4,/htbin/cdt_main.pl,360
163.206.89.4,/shuttle/countdown/images/countclock.gif,347
163.206.89.4,/ksc.html,251
163.206.89.4,/images/USA-logosmall.gif,237
edams.ksc.nasa.gov,/ksc.html,1020
edams.ksc.nasa.gov,/images/WORLD-logosmall.gif,870
edams.ksc.nasa.gov,/images/NASA-logosmall.gif,869
edams.ksc.nasa.gov,/images/MOSAIC-logosmall.gif,867
edams.ksc.nasa.gov,/images/USA-logosmall.gif,867
