In [1]:
# Import required libraries
import re
import os
from datetime import datetime
from collections import OrderedDict, Counter
import pandas as pd
from pandas.compat import StringIO
import numpy as np

# Define the input log file
file = 'data/nasa-http/NASA_access_log_Aug95 - Copy.log'

# Create regular expression to parse the web log file
log_line_regex = re.compile(''.join([
    r'^(?P<host>[\S]+)\s-\s-\s', r'\[(?P<timestamp>.{26})\]',
    r'\s"(?P<request_method>[A-Z]{3,4})\s(?P<request_url>.{1,100})(\sHTTP/1.0")?',
    r'\s(?P<reply_code>[0-9]{3})\s(?P<reply_bytes>[0-9-]{1,20})$'
]))

In [2]:
  
# Create empty dataframe
nasa = pd.DataFrame([])
loglst = list()

# Set count to loop through lines in the file
cnt = 0
with open(file) as fl:
    for line in fl:
        m = log_line_regex.match(line)
        record = OrderedDict([
            (key, value) 
            for key, value in m.groupdict().items()
        ])

        #record['timestamp'] = datetime.strptime(
            #record['timestamp'], '%d/%b/%Y:%H:%M:%S %z')
        
        loglst.append(record)

In [6]:
# Convert the list of OrderedDict to a dataframe
col = Counter()
for k in loglst:
    col.update(k)

df = pd.DataFrame([k.values() for k in loglst], columns = col.keys())
    
df.head()

Unnamed: 0,host,timestamp,request_method,request_url,reply_code,reply_bytes
0,uplherc.upl.com,01/Aug/1995:00:00:07 -0400,GET,"/ HTTP/1.0""",304,0
1,uplherc.upl.com,10/Aug/1995:23:00:07 -0400,GET,"/ HTTP/1.0""",304,0


In [15]:
# Create date time column from string
df['DateTime'] = df['timestamp'].str.split(' ').str[0]
df['DateTime'] = pd.to_datetime(df['DateTime'], format='%d/%b/%Y:%H:%M:%S')

# Extract day of week from datetime
df['DayName'] = df['DateTime'].dt.weekday_name

# Extract hour from datetime
df['Hour'] = df['DateTime'].dt.hour.apply(pd.to_numeric, errors='coerce')

bins = [0,7]
labels = [1]
df['Time of Day'] = pd.cut(df['Hour'], bins=bins, labels=labels)

#df['Time of Day'] = pd.cut(df['Hour'], cut_points, labels)

In [16]:
df.head()

Unnamed: 0,host,timestamp,request_method,request_url,reply_code,reply_bytes,DateTime,DayName,Hour,Time of Day
0,uplherc.upl.com,01/Aug/1995:00:00:07 -0400,GET,"/ HTTP/1.0""",304,0,1995-08-01 00:00:07,Tuesday,0,
1,uplherc.upl.com,10/Aug/1995:23:00:07 -0400,GET,"/ HTTP/1.0""",304,0,1995-08-10 23:00:07,Thursday,23,


In [22]:
df["a"]= pd.cut(df.Hour, [0, 3, 6], labels=(1,0))

In [23]:
df

Unnamed: 0,host,timestamp,request_method,request_url,reply_code,reply_bytes,DateTime,DayName,Hour,Time of Day,a
0,uplherc.upl.com,01/Aug/1995:00:00:07 -0400,GET,"/ HTTP/1.0""",304,0,1995-08-01 00:00:07,Tuesday,0,,
1,uplherc.upl.com,10/Aug/1995:23:00:07 -0400,GET,"/ HTTP/1.0""",304,0,1995-08-10 23:00:07,Thursday,23,,
