In [158]:
# Import required libraries
import re
import os
from datetime import datetime as dt
from collections import OrderedDict, Counter
import pandas as pd
from pandas.compat import StringIO
import numpy as np

# Define the input log file
file = 'data/nasa-http/NASA_access_log_Aug95 - Copy.log'

# Create regular expression to parse the web log file
log_line_regex = re.compile(''.join([
    r'^(?P<host>[\S]+)\s-\s-\s', r'\[(?P<timestamp>.{26})\]',
    r'\s"(?P<request_method>[A-Z]{3,4})\s(?P<request_url>.{1,100})(\sHTTP/1.0")?',
    r'\s(?P<reply_code>[0-9]{3})\s(?P<reply_bytes>[0-9-]{1,20})$'
]))

In [159]:
  
# Create empty dataframe
nasa = pd.DataFrame([])
loglst = list()

# Set count to loop through lines in the file
cnt = 0
with open(file) as fl:
    for line in fl:
        m = log_line_regex.match(line)
        record = OrderedDict([
            (key, value) 
            for key, value in m.groupdict().items()
        ])

        #record['timestamp'] = datetime.strptime(
            #record['timestamp'], '%d/%b/%Y:%H:%M:%S %z')
        
        loglst.append(record)

In [160]:
# Convert the list of OrderedDict to a dataframe
col = Counter()
for k in loglst:
    col.update(k)

df = pd.DataFrame([k.values() for k in loglst], columns = col.keys())
    
df.head()

Unnamed: 0,host,timestamp,request_method,request_url,reply_code,reply_bytes
0,uplherc.upl.com,01/Aug/1995:00:00:07 -0400,GET,"/ HTTP/1.0""",304,0
1,uplherc.upl.com,01/Aug/1995:23:00:07 -0400,GET,"/ HTTP/1.0""",304,0
2,uplherc.upl.com,01/Aug/1995:20:00:07 -0400,GET,"/ HTTP/1.0""",304,0
3,uplherc.upl.com,02/Aug/1995:20:00:07 -0400,GET,"/ HTTP/1.0""",304,0
4,uplherc.upl.com,02/Aug/1995:20:00:07 -0400,GET,"/ HTTP/1.0""",304,0


In [182]:
# Create date time column from string
df['DateTime'] = df['timestamp'].str.split(' ').str[0]
df['DateTime'] = pd.to_datetime(df['DateTime'], format='%d/%b/%Y:%H:%M:%S')

# Extract date from datetime
df['Date'] = df['DateTime'].dt.date

#df['Week'] = df['Date'].apply(lambda x: (x + pd.Timedelta(days=1)).week)

# Extract day names from datetime
df['DayName'] = df['DateTime'].dt.weekday_name

# Extract day of week from datetime
df['DayOfWeek'] = df['DateTime'].dt.dayofweek

# Extract hour from datetime
df['Hour'] = df['DateTime'].dt.hour.apply(pd.to_numeric, errors='coerce')

# Convert reply bytes to int
df['reply_bytes'] = df['reply_bytes'].apply(pd.to_numeric, errors='coerce')

# Add a dummy column for calculating sum later
df['Cnt'] = 1

# Add Index
df['Idx'] = df.index

In [183]:
# Create an hour bin to map the hour in the request to the corresponding bin
bins = pd.DataFrame({'low':[0,4,7,10,13,16,19,22],
                  'high':[3,6,9,12,15,18,21,23],
                  'name':['00:00 to 03:00',
                          '03:00 to 06:00',
                          '06:00 to 09:00',
                          '09:00 to 12:00',
                          '12:00 to 15:00',
                          '15:00 to 18:00',
                          '18:00 to 21:00',
                          '21:00 to 24:00']})

# Create mapping function
def hourmap(x):
    for row in bins.itertuples():
        if row.low <= x <= row.high:
            return row.name

# Apply the mapping to the hour
df['Time of Day'] = df.Hour.map(hourmap)

df.head()

Unnamed: 0,host,timestamp,request_method,request_url,reply_code,reply_bytes,DateTime,Date,DayName,DayOfWeek,Hour,Cnt,Idx,Time of Day
0,uplherc.upl.com,01/Aug/1995:00:00:07 -0400,GET,"/ HTTP/1.0""",304,0,1995-08-01 00:00:07,1995-08-01,Tuesday,1,0,1,0,00:00 to 03:00
1,uplherc.upl.com,01/Aug/1995:23:00:07 -0400,GET,"/ HTTP/1.0""",304,0,1995-08-01 23:00:07,1995-08-01,Tuesday,1,23,1,1,21:00 to 24:00
2,uplherc.upl.com,01/Aug/1995:20:00:07 -0400,GET,"/ HTTP/1.0""",304,0,1995-08-01 20:00:07,1995-08-01,Tuesday,1,20,1,2,18:00 to 21:00
3,uplherc.upl.com,02/Aug/1995:20:00:07 -0400,GET,"/ HTTP/1.0""",304,0,1995-08-02 20:00:07,1995-08-02,Wednesday,2,20,1,3,18:00 to 21:00
4,uplherc.upl.com,02/Aug/1995:20:00:07 -0400,GET,"/ HTTP/1.0""",304,0,1995-08-02 20:00:07,1995-08-02,Wednesday,2,20,1,4,18:00 to 21:00


In [184]:
# Create dataframe for calculating number of requests
dfReqNo = df.groupby(['Date', 'DayName', 'Time of Day'])['Cnt'].agg('sum').reset_index(name='NoOfReq')

dfReqNo.head()

Unnamed: 0,Date,DayName,Time of Day,NoOfReq
0,1995-08-01,Tuesday,00:00 to 03:00,1
1,1995-08-01,Tuesday,18:00 to 21:00,1
2,1995-08-01,Tuesday,21:00 to 24:00,1
3,1995-08-02,Wednesday,18:00 to 21:00,2
4,1995-08-03,Thursday,18:00 to 21:00,1


In [185]:
reqnum_day = dfReqNo.groupby('DayName').agg({'NoOfReq':[np.mean,np.min,np.max]})
reqnum_day.columns = ['Requests (Mean)','Requests (Min)','Requests (Max)']
reqnum_day

Unnamed: 0_level_0,Requests (Mean),Requests (Min),Requests (Max)
DayName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Friday,1.0,1,1
Monday,1.0,1,1
Saturday,1.0,1,1
Sunday,1.0,1,1
Thursday,1.0,1,1
Tuesday,1.5,1,3
Wednesday,2.5,2,3


In [186]:
dfReqnumTime = dfReqNo.groupby('Time of Day').agg({'NoOfReq':[np.mean,np.min,np.max]})
dfReqnumTime.columns = ['Requests (Mean)','Requests (Min)','Requests (Max)']
dfReqnumTime

Unnamed: 0_level_0,Requests (Mean),Requests (Min),Requests (Max)
Time of Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00:00 to 03:00,1.0,1,1
18:00 to 21:00,1.555556,1,3
21:00 to 24:00,1.0,1,1


In [188]:
# Create dataframe for calculating data transfer volume
dfDataVol = df.groupby(['Date', 'DayName'])['reply_bytes'].agg('sum').reset_index(name='DataVolume')

dfDataVol.head()

Unnamed: 0,Date,DayName,DataVolume
0,1995-08-01,Tuesday,0
1,1995-08-02,Wednesday,0
2,1995-08-03,Thursday,0
3,1995-08-04,Friday,0
4,1995-08-05,Saturday,0


In [190]:
datavol_day = dfDataVol.groupby('DayName').agg({'DataVolume':[np.mean,np.min,np.max]})
datavol_day.columns = ['Mean Bandwidth (MB)','Min Bandwidth (MB)','Max Bandwidth (MB)']
datavol_day

Unnamed: 0_level_0,Mean Bandwidth (MB),Min Bandwidth (MB),Max Bandwidth (MB)
DayName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Friday,0,0,0
Monday,0,0,0
Saturday,0,0,0
Sunday,0,0,0
Thursday,0,0,0
Tuesday,0,0,0
Wednesday,0,0,0
