In [2]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import geoip2.database
import myLogReader as mlr
import re
import os
import sys
import datetime as dt

%matplotlib inline

## Automate - Read and prepare log data into a DataFrame

In [3]:
# Locations of the GeoLite2 City database file and of the log directory.
geoLiteIPDBPath = '../data/GeoLite2-City_20181009/GeoLite2-City.mmdb'
logsPath = '../data/logs'

# Build the log reader, then open it on the GeoLite2 database path.
myLogReader = mlr.log()
myLogReader.openReader(geoLiteIPDBPath)

In [None]:
# Load the logs from logsPath into `df` (used as a pandas DataFrame by the cells below).
# NOTE(review): the meaning of the second argument (112) is not visible here —
# presumably a limit on how many log files/days are read; confirm against
# myLogReader.readLogs and give it a named constant.
df =  myLogReader.readLogs(logsPath,112)

In [None]:
# Close the reader that was opened with myLogReader.openReader(geoLiteIPDBPath).
# NOTE(review): a later cell calls myLogReader.readLog() and
# myLogReader.deriveClientCountry(); confirm those still work after this close
# when the notebook is run top to bottom.
myLogReader.closeReader()

In [None]:
# Preview the first rows of the loaded log data.
df.head()

#### 2 - Automate loading logs and aggregating data

In [None]:
# Save the loaded log data to CSV. DataFrame.to_csv writes the index as the
# first column by default (index=True).
df.to_csv('../data/test.csv')

In [None]:
# Column that every weekly summary below is grouped on.
weekKey = 'calendar-year-week'


def _countByWeekAndCategory(frame, categoryCol, weekCol=weekKey):
    """Count rows per calendar week for each value of ``categoryCol``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing both ``weekCol`` and ``categoryCol``.
    categoryCol : str
        Column whose distinct values become the output columns.
    weekCol : str, optional
        Column holding the week key; defaults to ``'calendar-year-week'``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``weekCol`` with one ``'<value>-count'`` column per distinct
        value of ``categoryCol``. A week in which a value never occurs gets a
        count of 0 (not NaN), so the columns stay integer-typed.
    """
    return (frame.groupby([weekCol, categoryCol])
                 .size()                    # rows per (week, category) pair
                 .unstack(fill_value=0)     # categories -> columns, missing pairs -> 0
                 .add_suffix('-count'))     # e.g. 'Chrome' -> 'Chrome-count'


# Group once and reuse the grouping for the per-week scalar summaries.
weeklyGroups = df.groupby(weekKey)

# Distinct client IPs seen in each week.
df_uniqueIP = (weeklyGroups.agg({'client-ip': 'nunique'})
               .rename(columns={'client-ip': 'client-ip-unique-count'}))

# Distinct user names seen in each week.
df_uniqueUsername = (weeklyGroups.agg({'cs-username': 'nunique'})
                     .rename(columns={'cs-username': 'cs-username-unique-count'}))

# Number of log rows with a client IP, and total time taken, per week.
df_totalconnections_timetaken = (
    weeklyGroups.agg({'client-ip': 'count', 'time-taken(ms)': 'sum'})
    .rename(columns={'client-ip': 'client-connections-count',
                     'time-taken(ms)': 'time-taken(ms)-sum'}))

# Per-week row counts split by browser and by device. Every category value gets
# the '-count' suffix, so the previously hard-coded names (Chrome-count,
# Firefox-count, Other-count, Safari-count, Desktop-count, Mobile-count) are
# unchanged and any additional category is named consistently.
df_browsercount = _countByWeekAndCategory(df, 'client-browser')
df_devicecount = _countByWeekAndCategory(df, 'client-device')

In [None]:
# Put the per-week summary frames side by side (axis=1 aligns them on their index).
weeklySummaries = [
    df_devicecount,
    df_browsercount,
    df_totalconnections_timetaken,
    df_uniqueUsername,
    df_uniqueIP,
]
pd.concat(weeklySummaries, axis=1)

In [None]:
# Print the column dtypes, non-null counts and memory usage of the loaded log data.
df.info()

In [None]:
# Summary statistics for the client-ip column.
df['client-ip'].describe()

In [None]:
# Summarise only the columns whose dtype is object.
isObjectColumn = df.dtypes == object
df.loc[:, isObjectColumn].describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [4]:
# An earlier cell calls myLogReader.closeReader(), so on a top-to-bottom run the
# reader is already closed when this cell executes. Reopen it for the duration
# of this cell and close it again afterwards.
# NOTE(review): presumably deriveClientCountry() needs the GeoLite2 database
# opened by openReader() — confirm against myLogReader.
myLogReader.openReader(geoLiteIPDBPath)
try:
    # Read a single day's log file and derive the client-country column for it.
    test_df = myLogReader.readLog('../data/logs/u_ex180131.log')
    test_df_country = myLogReader.deriveClientCountry(test_df)
finally:
    # Release the reader even if reading or the country lookup fails.
    myLogReader.closeReader()

# Last expression of the cell: display the frame returned by deriveClientCountry().
test_df_country

Unnamed: 0,date,time,server-ip,cs-uri-query,server-port,cs-username,client-ip,cs(User-Agent),cs(Referer),sc-status,sc-substatus,time-taken(ms),client-country
0,2018-01-31,00:00:22,192.168.2.210,,443,,144.139.133.243,,,200,0,62,Australia
1,2018-01-31,00:00:22,192.168.2.210,,443,,144.139.133.243,,,200,0,46,Australia
2,2018-01-31,00:00:22,192.168.2.210,,443,,144.139.133.243,,,200,0,62,Australia
3,2018-01-31,00:00:23,192.168.2.210,,443,,144.139.133.243,,,200,0,78,Australia
4,2018-01-31,00:00:23,192.168.2.210,,443,s430568,137.92.20.72,Mozilla/5.0+(Windows+NT+10.0;+WOW64)+AppleWebK...,https://inplace.canberra.edu.au/Placement/Plac...,200,0,968,Australia
5,2018-01-31,00:00:23,192.168.2.210,,443,s430568,137.92.20.72,Mozilla/5.0+(Windows+NT+10.0;+WOW64)+AppleWebK...,https://inplace.canberra.edu.au/Placement/Plac...,304,0,62,Australia
6,2018-01-31,00:00:24,192.168.2.210,,443,s430568,137.92.20.72,Mozilla/5.0+(Windows+NT+10.0;+WOW64)+AppleWebK...,https://inplace.canberra.edu.au/Content/Styles...,304,0,31,Australia
7,2018-01-31,00:00:24,192.168.2.210,,443,s430568,137.92.20.72,Mozilla/5.0+(Windows+NT+10.0;+WOW64)+AppleWebK...,https://inplace.canberra.edu.au/Placement/Plac...,200,0,15,Australia
8,2018-01-31,00:00:25,192.168.2.210,ReturnUrl=%2FMaintenance%2FStudent.mvc%2FDetai...,443,,124.171.223.118,Mozilla/5.0+(Windows+NT+10.0;+Win64;+x64)+Appl...,https://inplace.canberra.edu.au/Shared/Error.m...,403,0,203,Australia
9,2018-01-31,00:00:27,192.168.2.210,,443,s430568,137.92.20.72,Mozilla/5.0+(Windows+NT+10.0;+WOW64)+AppleWebK...,https://inplace.canberra.edu.au/Placement/Plac...,200,0,2921,Australia


In [None]:
# NOTE(review): df_ip is not assigned anywhere in the visible notebook — the
# only assignment (below) is commented out and groupbyCalendarYearWeek_ClientIp
# is not defined in the cells shown here. Unless df_ip is defined elsewhere,
# this cell raises NameError on a fresh kernel; restore the assignment or
# remove the cell.
#df_ip = groupbyCalendarYearWeek_ClientIp(df)
df_ip