In [None]:
import datetime as dt
import os
import re

import geoip2.database
import geoip2.errors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm

import myLogReader as mlr
import pythonClassExample as pce

%matplotlib inline

## Automate - Read and Prep log data into DF

#### 1-Define functions

In [None]:
def getListOfFiles(dirName):
    """Recursively collect the paths of all files below ``dirName``.

    Entries are visited in the order ``os.listdir`` returns them; when an
    entry is a directory, the files found inside it are placed at that
    position in the result.

    Parameters
    ----------
    dirName : str
        Directory to scan.

    Returns
    -------
    list of str
        ``dirName`` joined with the relative name of every non-directory
        entry found at any depth.
    """
    collected = []
    for entry in os.listdir(dirName):
        candidate = os.path.join(dirName, entry)
        if not os.path.isdir(candidate):
            collected.append(candidate)
            continue
        # Sub-directory: descend and keep its files in place.
        collected.extend(getListOfFiles(candidate))
    return collected

def readLog(file):
    """Load one space-delimited log file into a DataFrame.

    Everything from a ``#`` to the end of a line is treated as a comment,
    ``-`` is read as a missing value, and only the field positions listed
    below are kept, each under the name paired with it.

    Parameters
    ----------
    file : str or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        One row per log line, with the twelve columns named below.
    """
    # (position of the field in the log line, column name in the result)
    keptFields = (
        (0, 'date'),
        (1, 'time'),
        (2, 'server-ip'),
        (5, 'cs-uri-query'),
        (6, 'server-port'),
        (7, 'cs-username'),
        (8, 'client-ip'),
        (9, 'cs(User-Agent)'),
        (10, 'cs(Referer)'),
        (11, 'sc-status'),
        (12, 'sc-substatus'),
        (14, 'time-taken(ms)'),
    )
    positions = [position for position, _ in keptFields]
    labels = [label for _, label in keptFields]
    return pd.read_csv(
        file,
        sep=' ',
        comment='#',
        na_values='-',
        usecols=positions,
        names=labels,
    )

def getDevice(UserAgentResponse):
    """Classify a User-Agent string as ``'Mobile'`` or ``'Desktop'``.

    Parameters
    ----------
    UserAgentResponse : str
        User-Agent text to inspect.

    Returns
    -------
    str
        ``'Mobile'`` when the text contains ``'Mobi'``, otherwise
        ``'Desktop'`` (this includes empty or placeholder strings).
    """
    return 'Mobile' if 'Mobi' in UserAgentResponse else 'Desktop'

def getBrowser(UserAgentResponse):
    """Classify a User-Agent string into a coarse browser family.

    The checks run from most specific to least specific, because many
    browsers advertise the tokens of the engines they derive from (Opera
    also sends ``Chrome`` and ``Safari``, SeaMonkey also sends ``Firefox``).

    Parameters
    ----------
    UserAgentResponse : str
        User-Agent text. Both the plain form and the form with ``+`` in
        place of spaces (as in the logs read by this notebook) are handled.

    Returns
    -------
    str
        One of ``'Seamonkey'``, ``'Firefox'``, ``'Opera'``, ``'Chrome'``,
        ``'Safari'``, ``'IE'`` or ``'Other'``.
    """
    # The real token is spelled "SeaMonkey"; compare case-insensitively so it
    # is not mistaken for plain Firefox.
    if 'seamonkey' in UserAgentResponse.lower():
        return 'Seamonkey'
    if 'Firefox' in UserAgentResponse:
        return 'Firefox'

    # Opera 15+ identifies itself with "OPR/", older versions with "Opera".
    # A user agent carries one or the other, and the modern form also contains
    # "Chrome", so this test must use `or` and run before the Chrome test.
    if 'OPR/' in UserAgentResponse or 'Opera' in UserAgentResponse:
        return 'Opera'

    hasChromium = 'Chromium' in UserAgentResponse
    hasChrome = 'Chrome' in UserAgentResponse
    if hasChrome and not hasChromium:
        return 'Chrome'
    if 'Safari' in UserAgentResponse and not hasChromium and not hasChrome:
        return 'Safari'

    # "MSIE" is matched without surrounding punctuation because the logged
    # user agents have "+" where the original had spaces; IE 11 dropped the
    # MSIE token and is recognised by "Trident/".
    if 'MSIE' in UserAgentResponse or 'Trident/' in UserAgentResponse:
        return 'IE'
    return 'Other'

def GetWebPageSection(x):
    """Return the first capitalised word found in ``x``.

    A "capitalised word" is an upper-case ASCII letter followed by at least
    one word character (letter, digit or underscore).

    Parameters
    ----------
    x : str
        Text to search, e.g. a referer URL.

    Returns
    -------
    str or None
        The matched text, or ``None`` when ``x`` contains no such word.
    """
    # Raw string: '\w' in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    found = re.search(r'([A-Z])\w+', x)
    return found.group() if found is not None else None

def deriveClientDevice(iis_log_df):
    """Add a ``client-device`` column computed from ``cs(User-Agent)``.

    Each user-agent value is converted with ``str`` and passed to
    ``getDevice``. The frame is modified in place and also returned.
    """
    def classify(agent):
        return getDevice(str(agent))

    iis_log_df['client-device'] = iis_log_df['cs(User-Agent)'].apply(classify)
    return iis_log_df

def deriveClientBrowser(iis_log_df):
    """Add a ``client-browser`` column computed from ``cs(User-Agent)``.

    Each user-agent value is converted with ``str`` and passed to
    ``getBrowser``. The frame is modified in place and also returned.
    """
    def classify(agent):
        return getBrowser(str(agent))

    iis_log_df['client-browser'] = iis_log_df['cs(User-Agent)'].apply(classify)
    return iis_log_df

def deriveClientWebPage(iis_log_df):
    """Add a ``client-webPage`` column computed from ``cs(Referer)``.

    Referer values whose type is exactly ``float`` become NaN; every other
    value is passed to ``GetWebPageSection``. The frame is modified in place
    and also returned.
    """
    def sectionOrNaN(referer):
        # Presumably the floats seen here are the NaN placeholders stored for
        # a missing referer; any float yields NaN regardless of its value.
        if type(referer) == float:
            return np.nan
        return GetWebPageSection(referer)

    iis_log_df['client-webPage'] = iis_log_df['cs(Referer)'].apply(sectionOrNaN)
    return iis_log_df

def deriveClientCity(iis_log_df, geoReader=None):
    """Add a ``client-city`` column by geolocating ``client-ip``.

    Parameters
    ----------
    iis_log_df : pandas.DataFrame
        Frame with a ``client-ip`` column; modified in place.
    geoReader : object, optional
        Object exposing ``city(ip_address=...)`` whose result has a
        ``city.name`` attribute (e.g. a ``geoip2.database.Reader``). When
        omitted, the module-level name ``reader`` is used, which must then
        exist.

    Returns
    -------
    pandas.DataFrame
        The same frame, with ``client-city`` holding the city name, or NaN
        when the database has no name for the address or does not contain
        the address at all.
    """
    activeReader = reader if geoReader is None else geoReader

    def lookupCity(ip):
        # A single database lookup per address.
        try:
            name = activeReader.city(ip_address=ip).city.name
        except geoip2.errors.AddressNotFoundError:
            # Address absent from the database: treat it as unknown rather
            # than failing the whole frame.
            return np.nan
        return np.nan if name is None else name

    iis_log_df['client-city'] = iis_log_df['client-ip'].apply(lookupCity)
    return iis_log_df
    
def deriveClientCountry(iis_log_df, geoReader=None):
    """Add a ``client-country`` column by geolocating ``client-ip``.

    Parameters
    ----------
    iis_log_df : pandas.DataFrame
        Frame with a ``client-ip`` column; modified in place.
    geoReader : object, optional
        Object exposing ``city(ip_address=...)`` whose result has a
        ``country.name`` attribute (e.g. a ``geoip2.database.Reader``). When
        omitted, the module-level name ``reader`` is used, which must then
        exist.

    Returns
    -------
    pandas.DataFrame
        The same frame, with ``client-country`` holding the country name, or
        NaN when the database has no name for the address or does not
        contain the address at all.
    """
    activeReader = reader if geoReader is None else geoReader

    def lookupCountry(ip):
        # A single database lookup per address.
        try:
            name = activeReader.city(ip_address=ip).country.name
        except geoip2.errors.AddressNotFoundError:
            # Address absent from the database: treat it as unknown rather
            # than failing the whole frame.
            return np.nan
        return np.nan if name is None else name

    iis_log_df['client-country'] = iis_log_df['client-ip'].apply(lookupCountry)
    return iis_log_df

#### Define a function to loop through the logs and load the data into a DataFrame

In [None]:
def readLogs(logsPath, geoLiteIPDBPath, successDir='../data/success/', errorDir='../data/error/'):
    """Parse every log file under ``logsPath`` into one DataFrame.

    Each file is read with ``readLog``, enriched with the derived
    ``client-*`` columns, and then moved to ``successDir``. A file whose
    processing raises is reported, moved to ``errorDir`` and skipped; the
    remaining files are still processed.

    Parameters
    ----------
    logsPath : str
        Directory scanned recursively for log files.
    geoLiteIPDBPath : str
        Path of the database opened with ``geoip2.database.Reader``.
    successDir : str, optional
        Destination directory for files that were processed successfully.
    errorDir : str, optional
        Destination directory for files whose processing failed.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-file frames (original row labels kept), or
        an empty DataFrame when no file was processed successfully.

    Raises
    ------
    OSError
        If a file cannot be moved to its destination directory.
    """
    # deriveClientCity / deriveClientCountry resolve `reader` as a
    # module-level name, so the open reader has to be published there; a
    # function-local variable is invisible to them.
    global reader
    reader = geoip2.database.Reader(geoLiteIPDBPath)

    os.makedirs(successDir, exist_ok=True)
    os.makedirs(errorDir, exist_ok=True)

    frames = []
    try:
        for file in getListOfFiles(logsPath):
            fileName = os.path.basename(file)
            try:
                log_df = readLog(file)
                log_df = deriveClientCity(log_df)
                log_df = deriveClientCountry(log_df)
                log_df = deriveClientDevice(log_df)
                log_df = deriveClientBrowser(log_df)
                log_df = deriveClientWebPage(log_df)
            except Exception as err:
                # Quarantine only the offending file and keep going.
                print('Moving file ' + file + ' to ' + errorDir + ' (' + repr(err) + ')')
                os.rename(file, os.path.join(errorDir, fileName))
                continue
            frames.append(log_df)
            os.rename(file, os.path.join(successDir, fileName))
    finally:
        # Always release the database handle; no `return` here, so unexpected
        # errors and interrupts still propagate to the caller.
        reader.close()

    # One concat at the end instead of growing a frame inside the loop.
    return pd.concat(frames) if frames else pd.DataFrame()

#### Load all logs into DF

In [None]:
# Locations of the raw logs and of the GeoLite2 city database used for the
# IP lookups.
logsPath = '../data/logs'
geoLiteIPDBPath = '../data/GeoLite2-City_20181009/GeoLite2-City.mmdb'

# Parse every log under logsPath into a single frame. readLogs also moves the
# files out of logsPath, so re-running this cell only sees files added since.
df =  readLogs(logsPath,geoLiteIPDBPath)

#### 2- Automate loading the logs and aggregating the data

In [None]:
#df.to_csv('test.csv')

In [None]:
def countByDate(frame, column):
    """Count log rows per ``date`` and per value of ``column``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing a ``date`` column and ``column``.
    column : str
        Name of the column whose values become the output columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``date`` with one column per distinct value of
        ``column``; date/value combinations that never occur are NaN.
    """
    return (frame.groupby(by=['date', column])[column]
                 .count()
                 .reset_index(level=1, name='Count')
                 .pivot(columns=column, values='Count'))


client_browser_df = countByDate(df, 'client-browser')
client_city_df = countByDate(df, 'client-city')
client_country_df = countByDate(df, 'client-country')
client_device_df = countByDate(df, 'client-device')
client_webPage_df = countByDate(df, 'client-webPage')

# Per-IP counts are displayed as the cell output but not kept in a variable.
countByDate(df, 'client-ip')

In [None]:
# Peek at rows 10-19, selecting columns by position. Assuming df still has the
# column order produced by readLogs, 0/5/6/11 are date, cs-username, client-ip
# and time-taken(ms), and 12-16 are the derived client-* columns - TODO confirm.
df.iloc[10:20,[0,5,6,11,12,13,14,15,16]]

In [None]:
# Column dtypes and non-null counts of the combined log frame.
df.info()

In [None]:
# First five rows of the combined log frame.
df.head()

In [None]:
# NOTE(review): same df.info() call as two cells earlier, and no visible cell
# changes df in between - candidate for removal.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
df.describe()

In [None]:
# Summary statistics of the server-port column only.
df["server-port"].describe()

In [None]:
# NOTE(review): third df.info() call in a row of inspection cells; no visible
# cell changes df since the first one - candidate for removal.
df.info()

## Derive weekday/weekend, calendar week and year after aggregating the data

In [None]:
def _parseLogDate(text):
    """Parse a ``YYYY-MM-DD`` date string into a ``datetime``."""
    return dt.datetime.strptime(text, '%Y-%m-%d')


def _dayKind(text):
    """Return 'Weekday' for Monday-Friday (weekday() 0-4), else 'Weekend'."""
    if _parseLogDate(text).weekday() < 5:
        return 'Weekday'
    return 'Weekend'


def _isoWeek(text):
    """Return the ISO calendar week of the date as ``np.int8``."""
    return np.int8(_parseLogDate(text).isocalendar()[1])


def _year(text):
    """Return the calendar year of the date."""
    return _parseLogDate(text).year


# Adds three columns to df in place, all computed from the 'date' strings.
df['date-weekday'] = df['date'].apply(_dayKind)
df['date-calendar-week'] = df['date'].apply(_isoWeek)
df['date-year'] = df['date'].apply(_year)

In [None]:
# Summarise only the object-dtype columns (boolean mask over the column dtypes).
df.loc[:, df.dtypes == object].describe()

In [None]:
# Sanity check of the '%Y-%m-%d' format used for the log dates: parse a known
# date and display its year. `dt` comes from the import cell at the top.
d = dt.datetime.strptime('2018-12-28', '%Y-%m-%d')
d.year

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [2]:
# Create a log reader from the local myLogReader module (imported as mlr).
myLogReader = mlr.log()

In [3]:
# Read a single log file through the module-based reader and display the
# result; presumably the packaged counterpart of readLog above - TODO confirm.
myLogReader.readLog('../data/logs/u_ex171207.log')

Unnamed: 0,date,time,server-ip,cs-uri-query,server-port,cs-username,client-ip,cs(User-Agent),cs(Referer),sc-status,sc-substatus,time-taken(ms)
0,2017-12-07,23:11:17,192.168.2.210,,443,,144.139.133.243,Mozilla/5.0+(Windows+NT+10.0;+Win64;+x64;+rv:5...,,302,0,75625
1,2017-12-07,23:11:20,192.168.2.210,ReturnUrl=%2f,443,,144.139.133.243,Mozilla/5.0+(Windows+NT+10.0;+Win64;+x64;+rv:5...,,302,0,3031
2,2017-12-07,23:11:40,192.168.2.210,ReturnUrl=%2f,443,,144.139.133.243,Mozilla/5.0+(Windows+NT+10.0;+Win64;+x64;+rv:5...,,200,0,20531
3,2017-12-07,23:11:40,192.168.2.210,v=ZLF68Gqwmuuh2ZvHcQpEuU1xkWptxwOpRzXwwMGaiN01,443,,144.139.133.243,Mozilla/5.0+(Windows+NT+10.0;+Win64;+x64;+rv:5...,https://canberra-prodapp1.inplace.com.au/Secur...,200,0,93
4,2017-12-07,23:11:40,192.168.2.210,v=84kf7GIBdMhYIINVmeCDSHFVxrD5iToJR-sVWXPTNo81,443,,144.139.133.243,Mozilla/5.0+(Windows+NT+10.0;+Win64;+x64;+rv:5...,https://canberra-prodapp1.inplace.com.au/Secur...,200,0,31
5,2017-12-07,23:11:40,192.168.2.210,v=HLAyqmEQO19pUhoRngq_PTn9d4BpoFSmCYWSu40JObo1,443,,144.139.133.243,Mozilla/5.0+(Windows+NT+10.0;+Win64;+x64;+rv:5...,https://canberra-prodapp1.inplace.com.au/Secur...,200,0,171
6,2017-12-07,23:11:40,192.168.2.210,v=nYJXCCaDzTgv1vGOoG3pxVHR1NWshvyF85Ho-L2HI9M1,443,,144.139.133.243,Mozilla/5.0+(Windows+NT+10.0;+Win64;+x64;+rv:5...,https://canberra-prodapp1.inplace.com.au/Secur...,200,0,46
7,2017-12-07,23:11:40,192.168.2.210,v=pIhLJRZ-i9xD3brYZ6I5pZD6973WqBDt6kYnAg5j20E1,443,,144.139.133.243,Mozilla/5.0+(Windows+NT+10.0;+Win64;+x64;+rv:5...,https://canberra-prodapp1.inplace.com.au/Secur...,200,0,218
8,2017-12-07,23:11:40,192.168.2.210,v=K2ZmbS9E15TT5COZ4IfIUI6JZwDWUyRvRjAsNl7AB5I1,443,,144.139.133.243,Mozilla/5.0+(Windows+NT+10.0;+Win64;+x64;+rv:5...,https://canberra-prodapp1.inplace.com.au/Secur...,200,0,93
9,2017-12-07,23:11:40,192.168.2.210,v=PF4-dJ7eKZCJeq2KqIqSW-VJl0R9Y3bOx5C2J1wyGNg1,443,,144.139.133.243,Mozilla/5.0+(Windows+NT+10.0;+Win64;+x64;+rv:5...,https://canberra-prodapp1.inplace.com.au/Secur...,200,0,187
