In [1]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import geoip2.database
import myLogReader as mlr
import re
import os
import sys
import datetime as dt

%matplotlib inline

## Automate - Read and Prep log data into DF

In [2]:
logsPath = '../data/logs'
geoLiteIPDBPath = '../data/GeoLite2-City_20181009/GeoLite2-City.mmdb'

#Create a myLogReader object
myLogReader = mlr.log()
#Open Reader
myLogReader.openReader(geoLiteIPDBPath)

In [3]:
df =  myLogReader.readLogs(logsPath,1)

../data/logs\u_ex171211.log


In [4]:
#Close Reader
myLogReader.closeReader()

In [5]:
df.head()

Unnamed: 0,date,cs-username,client-ip,time-taken(ms),client-device,client-browser,client-webPage,client-city,client-country,date-IsWeekday,date-calendar-week,date-year
0,2017-12-11,Unknown,137.92.136.50,109,Desktop,Chrome,Security,Unknown,Australia,1,50,2017
1,2017-12-11,u3118068,137.92.136.50,156,Desktop,Chrome,Security,Unknown,Australia,1,50,2017
2,2017-12-11,u3118068,137.92.136.50,78,Desktop,Chrome,Security,Unknown,Australia,1,50,2017
3,2017-12-11,u3118068,137.92.136.50,2593,Desktop,Chrome,Security,Unknown,Australia,1,50,2017
4,2017-12-11,u3118068,137.92.136.50,46,Desktop,Chrome,Unknown,Unknown,Australia,1,50,2017


#### 2- Automate load logs and aggregate data

In [None]:
#df.to_csv('test.csv')

In [100]:
(df.groupby(by=['calendar-year-week'])
            .agg({'client-ip': pd.Series.nunique})
            .rename(columns = {'client-ip':'client-ip-unique-count'}))

(df.groupby(by=['calendar-year-week'])
            .agg({'cs-username': pd.Series.nunique})
            .rename(columns = {'cs-username':'cs-username-unique-count'}))

(df.groupby(by=['calendar-year-week'])
            .agg({'client-ip': pd.Series.count,'time-taken(ms)' : pd.Series.sum})
            .rename(columns = {'client-ip':'client-ip-unique-count','time-taken(ms)' :'time-taken(ms)-sum' }))


(df.groupby(by=['calendar-year-week','client-browser'])
            .agg({'client-browser': pd.Series.count})
            .rename(columns = {'client-browser':'client-browser-count'})      
            .reset_index(level=1)
            .pivot(columns='client-browser',values='client-browser-count')
            .rename(columns = {'Chrome':'Chrome-count'
                              ,'Firefox':'Firefox-count'
                              ,'Other':'Other-count'
                              ,'Safari':'Safari-count'}))



(df.groupby(by=['calendar-year-week','client-device'])
            .agg({'client-device': pd.Series.count})  
            .rename(columns = {'client-device':'client-device-count'})      
            .reset_index(level=1)
            .pivot(columns='client-device',values='client-device-count')
            .rename(columns = {'Desktop':'Desktop-count'
                              ,'Mobile':'Mobile-count'}))

client-device,Desktop-count,Mobile-count
calendar-year-week,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-50,20677,1661


In [83]:

(df.groupby(by=['calendar-year-week','client-browser'])
            .agg({'client-browser': pd.Series.count})
            .rename(columns = {'client-browser':'client-browser-count'})      
            .reset_index(level=1)
            .pivot(columns='client-browser',values='client-browser-count')
)
#df.head()

client-browser,Chrome,Firefox,Other,Safari
calendar-year-week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-50,9851,2889,6421,3177


In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22338 entries, 0 to 22337
Data columns (total 12 columns):
date                  22338 non-null object
cs-username           22338 non-null object
client-ip             22338 non-null object
time-taken(ms)        22338 non-null int64
client-device         22338 non-null object
client-browser        22338 non-null object
client-webPage        22338 non-null object
client-city           22338 non-null object
client-country        22338 non-null object
date-IsWeekday        22338 non-null int64
date-calendar-week    22338 non-null int64
date-year             22338 non-null int64
dtypes: int64(4), object(8)
memory usage: 2.0+ MB


In [14]:
df['client-ip'].describe()

count            22338
unique             265
top       137.92.20.63
freq              2144
Name: client-ip, dtype: object

In [7]:
df.loc[:, df.dtypes == object].describe()

Unnamed: 0,date,cs-username,client-ip,client-device,client-browser,client-webPage,client-city,client-country
count,22338,22338,22338,22338,22338,22338,22338,22338
unique,1,145,265,2,4,24,75,11
top,2017-12-11,Unknown,137.92.20.63,Desktop,Chrome,Unknown,Canberra,Australia
freq,22338,6875,2144,20677,9851,6336,9449,19538


In [6]:
df.isna().sum()

date                  0
cs-username           0
client-ip             0
time-taken(ms)        0
client-device         0
client-browser        0
client-webPage        0
client-city           0
client-country        0
date-IsWeekday        0
date-calendar-week    0
date-year             0
dtype: int64