In [1]:
import base64
import subprocess
import pandas as pd
import csv
import os
import re
import sys
import subprocess

from datetime import datetime
from paramiko.client import SSHClient
from paramiko.sftp_client import SFTPClient
from paramiko import AutoAddPolicy
from pprint import pprint
from dateutil.parser import parse
from tqdm import tqdm

# Empty placeholder; the load cell further down rebinds `df` to the parsed access log.
df = pd.DataFrame()

In [2]:
# SSH account and server that hold the access log, plus the log's path on that server.
remote_user = 'django'
host = 'ambition-test.bhp.org.bw'
path = '/home/django/source/ambition/logs/access.log'
# Timestamp of this run (YYYYmmddHHMMSS); used in the name of the exported CSV.
timestamp = datetime.now().strftime('%Y%m%d%H%M%S')

# Previously exported CSV. While this is truthy the log is read from this file
# instead of being fetched over SSH.
logfile = '~/ambition-test-bhp-org-bw-access-log-raw-20171027090111.csv'

In [3]:
# Regular expression for one line of the access log. It captures seven groups:
# ip, datetime, requested URL, status, bandwidth, referrer and user agent.
# Every piece is a raw string so the backslash escapes reach `re` untouched,
# and the dots of the IP address are escaped so they only match a literal ".".
pat = (
    r'(\d+\.\d+\.\d+\.\d+)\s-\s-\s'  # IP address
    r'\[(.+)\]\s'  # datetime
    r'"GET\s(.+)\s\w+/.+"\s'  # requested file
    r'(\d+)\s'  # status
    r'(\d+)\s'  # bandwidth
    r'"(.+)"\s'  # referrer
    r'"(.+)"'  # user agent
)

def blocks(files, size=65536):
    """Yield successive chunks read from a file-like object.

    Calls ``files.read(size)`` repeatedly and yields each result until a
    read returns something falsy (an empty chunk).
    """
    chunk = files.read(size)
    while chunk:
        yield chunk
        chunk = files.read(size)

def connect(ssh=None):
    """Open an SSH connection to ``host`` as ``remote_user`` and return the client.

    ssh: an ``SSHClient`` to configure and connect. When omitted, a new
        client is created (previously the default ``None`` raised
        ``AttributeError`` on the first method call).

    Returns the connected client, which is the same object that was passed in.
    """
    if ssh is None:
        ssh = SSHClient()
    ssh.load_system_host_keys()
    # NOTE(review): AutoAddPolicy accepts host keys that are not already known,
    # so the server's identity is not verified on first contact. Confirm this
    # is acceptable for this host.
    ssh.set_missing_host_key_policy(AutoAddPolicy())
    ssh.connect(
        host,
        username=remote_user,
        timeout=5,
        compress=True,
    )
    return ssh

In [4]:
if logfile:
    # load from CSV
    df = pd.read_csv(os.path.expanduser(logfile), low_memory=True)
else:
    # Fetch the raw log over SFTP and parse it line by line.
    columns = ['ip', 'datestring', 'url', 'status', 't', 'host', 'client']
    records = []
    with SSHClient() as ssh:
        ssh = connect(ssh=ssh)
        with SFTPClient.from_transport(ssh.get_transport()) as sftp_client:
            with sftp_client.open(path, mode='r') as f:
                # Count newlines on the raw bytes. Decoding each block first
                # can fail when a multi-byte character is split across two blocks.
                total = sum(bl.count(b'\n') for bl in blocks(f))
                f.seek(0, 0)
                for line in tqdm(f, total=total):
                    # findall returns one 7-tuple per match; lines that do not
                    # match the pattern contribute nothing.
                    records.extend(re.findall(pat, line))
    # Build the frame once instead of concatenating a new frame per line,
    # which copied the whole frame on every iteration.
    df = pd.DataFrame(records, columns=columns)

In [5]:
if not logfile:
    # The log was fetched from the server in this run: export it to a
    # timestamped CSV in the home directory and show the file name.
    csv_name = f'~/{host.replace(".", "-")}-access-log-raw-{timestamp}.csv'
    df.to_csv(os.path.expanduser(csv_name))
    print(csv_name)

In [6]:
# Print the column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44635 entries, 0 to 44634
Data columns (total 8 columns):
Unnamed: 0    44635 non-null int64
ip            44635 non-null object
datestring    44635 non-null object
url           44635 non-null object
status        44635 non-null int64
t             44635 non-null int64
host          44635 non-null object
client        44635 non-null object
dtypes: int64(3), object(5)
memory usage: 2.7+ MB


In [7]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,ip,datestring,url,status,t,host,client
0,0,10.113.201.182,01/Jun/2017:11:00:34 +0200,/,200,3224,-,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4...
1,1,10.113.201.182,01/Jun/2017:11:00:34 +0200,/static/edc_base/js/ie10-viewport-bug-workarou...,200,694,http://ambition-test.bhp.org.bw/,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4...
2,2,10.113.201.182,01/Jun/2017:11:00:34 +0200,/static/edc_base/js/edc-base.js,200,149,http://ambition-test.bhp.org.bw/,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4...
3,3,10.113.201.182,01/Jun/2017:11:00:34 +0200,/static/django_js_reverse/js/reverse.js,404,209,http://ambition-test.bhp.org.bw/,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4...
4,4,10.113.201.182,01/Jun/2017:11:00:35 +0200,/static/django_js_reverse/js/reverse.js,404,209,http://ambition-test.bhp.org.bw/,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4...


In [8]:
# convert datestring to date
def to_datetime(d):
    """Convert a log timestamp such as ``01/Jun/2017:11:00:34 +0200`` to a datetime.

    The UTC offset is dropped, so the result is a naive datetime in the
    server's local time. Any offset is accepted; the previous version only
    stripped the literal ``+0200``.

    Raises ValueError when the text before the first space is not in
    ``dd/Mon/yyyy:HH:MM:SS`` form.
    """
    # Keep only the part before the first space; what follows is the offset.
    stamp = d.strip().split(' ', 1)[0]
    return datetime.strptime(stamp, '%d/%b/%Y:%H:%M:%S')

# Parse the timestamp column directly; a row-wise apply over the whole frame
# is slower and returns a frame rather than a column when `df` is empty.
df['date'] = df['datestring'].map(to_datetime)
df['date'] = df['date'].astype('datetime64[ns]')

# chop off querystring
df['simple_url'] = df['url'].str.split('?', n=1).str[0]

# remove junk urls (`~` is the boolean-mask negation operator)
df = df[
    ~df['simple_url'].isin(['/', '/admin/jsi18n/'])
    & ~df['simple_url'].str.contains('static')
    & ~df['simple_url'].str.contains('login')]

# show df info
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12371 entries, 7 to 44613
Data columns (total 10 columns):
Unnamed: 0    12371 non-null int64
ip            12371 non-null object
datestring    12371 non-null object
url           12371 non-null object
status        12371 non-null int64
t             12371 non-null int64
host          12371 non-null object
client        12371 non-null object
date          12371 non-null datetime64[ns]
simple_url    12371 non-null object
dtypes: datetime64[ns](1), int64(3), object(6)
memory usage: 1.0+ MB


In [9]:
# Review the date range covered by the log: `first` and `last` in the summary
# below are the earliest and latest request times.
df['date'].describe()

count                   12371
unique                  11941
top       2017-09-08 09:51:25
freq                        5
first     2017-06-01 11:00:38
last      2017-10-27 10:42:53
Name: date, dtype: object

In [10]:
# rank users effort
from datetime import datetime, date
now = date.today()
# Midnight at the start of the current day.
today = datetime(now.year, now.month, now.day, 0, 0, 0)
# Hits per IP since midnight. This is not the last expression in the cell,
# so the result is computed but not displayed.
df[df['date'] >= today].groupby('ip').size()

# remove logins, static, etc
# `.copy()` makes df1 an independent frame, so later column assignments on it
# do not raise SettingWithCopyWarning or depend on `df`.
df1 = df[
    ~df['simple_url'].isin(['/', '/admin/jsi18n/'])
    & ~df['simple_url'].str.contains('static')
    & ~df['simple_url'].str.contains('login')].copy()

In [11]:
# Keep hits from 10 Oct 2017 onwards, with timestamps truncated to midnight so
# each row carries only its day.
startdate = datetime(2017, 10, 10)
# `assign` returns a new frame rather than writing into df1, which may be a
# slice of `df` (that triggers SettingWithCopyWarning).
df1 = df1.assign(date=df1['date'].dt.normalize())
df1 = df1[df1['date'] >= startdate]

In [12]:
# timestamp = datetime.today().strftime('%Y%m%d%H%M%S')
# df1.to_csv(os.path.expanduser(f'~/{host.replace(".", "-")}-access-log-{timestamp}.csv'))

### Activity in hits per IP

In [13]:
from dateutil.relativedelta import relativedelta

# df1['date'] was normalized to midnight in an earlier cell, so each group
# already holds exactly one day's hits; the full frame need not be re-filtered
# per day. The loop variable is named `day` so it does not rebind the `date`
# class imported from `datetime`.
grouped = df1.groupby('date')
for day, day_hits in grouped:
    print(day)
    print(day_hits.groupby('ip').size())
# print(today - relativedelta(days=2))
# print(df1[
#     (df1['date'] >= today - relativedelta(days=2))
#     & (df1['date'] < today - relativedelta(days=1))].groupby('ip').size())
# print(today - relativedelta(days=1))
# print(df1[
#     (df1['date'] >= today - relativedelta(days=1))
#     & (df1['date'] < today)].groupby('ip').size())
# print(today)
# print(df1[df1['date'] >= today].groupby('ip').size())

2017-10-10 00:00:00
ip
10.113.201.72    3
172.31.6.51      1
dtype: int64
2017-10-11 00:00:00
ip
10.113.201.114    11
10.113.201.204    10
10.136.211.206    97
172.31.6.51       69
dtype: int64
2017-10-12 00:00:00
ip
10.113.201.159     3
10.113.201.186     5
10.113.201.2      17
10.113.201.233     4
10.136.211.206     1
10.136.211.70      8
dtype: int64
2017-10-13 00:00:00
ip
10.113.201.136      2
10.113.201.189     23
10.113.201.196      9
10.136.211.206    123
dtype: int64
2017-10-14 00:00:00
ip
10.113.200.115    1
dtype: int64
2017-10-16 00:00:00
ip
10.113.201.102     2
10.136.211.74      6
10.136.211.78     71
dtype: int64
2017-10-17 00:00:00
ip
10.113.201.118    4
10.113.201.124    5
dtype: int64
2017-10-18 00:00:00
ip
10.113.201.107     13
10.113.201.180     20
10.113.201.209     48
10.113.201.249      6
10.136.211.78     140
172.31.6.51         1
dtype: int64
2017-10-19 00:00:00
ip
10.113.201.185      5
10.113.201.228      9
10.136.211.18      13
10.136.211.74       1
154.70.150

In [14]:
# inspect top user
# selected_ips = ['10.136.211.78']
# df1 = df1[df1['ip'].isin(selected_ips)]

### Type of activity

#### Add

In [15]:
# add models: URLs containing "add", with appointment URLs left out
add_group = df1.loc[
    lambda frame: frame['simple_url'].str.contains('add')
    & ~frame['simple_url'].str.contains('appointment')
].groupby(['simple_url'])
add_group.size()

simple_url
/admin/ambition_subject/adverseevent/add/                                            11
/admin/ambition_subject/adverseeventfollowup/add/                                     5
/admin/ambition_subject/bloodresult/add/                                             37
/admin/ambition_subject/clinicnote/add/                                              35
/admin/ambition_subject/healtheconomicsquestionnaire/add/                            18
/admin/ambition_subject/lumbarpuncturecsf/add/                                       18
/admin/ambition_subject/microbiology/add/                                             6
/admin/ambition_subject/patienthistory/add/                                          23
/admin/ambition_subject/prnmodel/add/                                                29
/admin/ambition_subject/protocoldeviationviolation/add/                               2
/admin/ambition_subject/radiology/add/                                                2
/admin/ambition_subje

#### Edit / Change

In [16]:
# change models: URLs containing "change", with appointment URLs left out
df1.loc[
    lambda frame: frame['simple_url'].str.contains('change')
    & ~frame['simple_url'].str.contains('appointment')
].groupby(['simple_url']).size()

simple_url
/admin/ambition_subject/adverseevent/2b8c0747-0df3-4e11-91f7-600b2093388d/change/                    1
/admin/ambition_subject/adverseevent/aa54fddb-3716-4927-934f-23af141650d7/change/                    4
/admin/ambition_subject/adverseevent/d178a5ed-5206-4113-be0d-afd96f273272/change/                    1
/admin/ambition_subject/adverseevent/d1ed3265-aee0-4d28-a9e7-c348b1f44279/change/                    1
/admin/ambition_subject/adverseevent/f7f5caad-7240-41e3-903b-c86a77c90b15/change/                    3
/admin/ambition_subject/bloodresult/0b33b1c3-71fc-4550-b24d-a65e92306bf1/change/                     1
/admin/ambition_subject/bloodresult/587b55df-b24f-4386-8234-8dffd90f7833/change/                     1
/admin/ambition_subject/bloodresult/90200c10-ea95-4855-8a63-3e5b1647704b/change/                     1
/admin/ambition_subject/bloodresult/9c5e3d2c-aaab-427a-a74e-be5dc4124102/change/                     2
/admin/ambition_subject/bloodresult/ed221d82-b5a0-4195-a92c-50

#### Delete

In [17]:
# delete models: URLs containing "delete", with appointment URLs left out
df1.loc[
    lambda frame: frame['simple_url'].str.contains('delete')
    & ~frame['simple_url'].str.contains('appointment')
].groupby(['simple_url']).size()

Series([], dtype: int64)

####  Appointments

In [18]:
# accessing appointments: hit count per URL containing "appointment"
df1.loc[
    lambda frame: frame['simple_url'].str.contains('appointment')
].groupby(['simple_url']).size()

simple_url
/admin/ambition_subject/appointment/                                                 1
/admin/ambition_subject/appointment/0275d7dc-35fe-4695-9253-be7f500eb2d5/change/     2
/admin/ambition_subject/appointment/10daba8a-4bbd-4d43-9ff3-588a9136342a/change/     9
/admin/ambition_subject/appointment/15a09c35-ae27-4776-9e93-dec9255256c4/change/     2
/admin/ambition_subject/appointment/17bc94bb-a2d1-4ed0-96e9-a18d26c25803/change/     4
/admin/ambition_subject/appointment/19bf1546-306c-43cc-81f5-32e9a02a2b04/change/     2
/admin/ambition_subject/appointment/1a1ebf6d-de94-4cf0-bfed-4daeb3779802/change/     3
/admin/ambition_subject/appointment/1b342b0e-1e8e-4d40-a9f9-7869afeb252a/change/     2
/admin/ambition_subject/appointment/1caabaeb-d636-49b2-ac9f-d46050a2a0b1/change/     1
/admin/ambition_subject/appointment/207fb144-0e89-416a-91ef-4065f997a1bd/change/     2
/admin/ambition_subject/appointment/21f42dca-4e15-4509-bc0d-515aec543012/change/     2
/admin/ambition_subject/appointm

#### Dashboard

In [19]:
# accessing dashboards: URLs containing "dashboard" but not "_dashboard"
df1.loc[
    lambda frame: frame['simple_url'].str.contains('dashboard')
    & ~frame['simple_url'].str.contains('_dashboard')
].groupby(['simple_url']).size()

simple_url
/subject/dashboard/092-40990001-3/                                          1
/subject/dashboard/092-40990002-1/                                          3
/subject/dashboard/092-40990002-1/a6b92497-0954-4872-9561-bdd561055dfe/     1
/subject/dashboard/092-40990004-7/                                          2
/subject/dashboard/092-40990004-7/86962bc1-7d04-4290-a4ff-9e9929877c3e/     2
/subject/dashboard/092-40990008-8/                                         62
/subject/dashboard/092-40990008-8/10daba8a-4bbd-4d43-9ff3-588a9136342a/    15
/subject/dashboard/092-40990008-8/56d119c6-a967-4bee-946b-2e56c905e905/     4
/subject/dashboard/092-40990008-8/5d612563-ffad-47b7-a378-557e566e9550/    11
/subject/dashboard/092-40990008-8/a22eb543-9ca1-4bce-9a3f-b568764803c0/     6
/subject/dashboard/092-40990009-6/                                          1
/subject/dashboard/092-40990009-6/049c72f7-b106-4714-ae57-484cd53b5dad/     1
/subject/dashboard/092-40990010-4/                   

#### Screening Listboards

In [20]:
# accessing listboards: hit count per URL containing "screening_listboard"
df1.loc[
    lambda frame: frame['simple_url'].str.contains('screening_listboard')
].groupby(['simple_url']).size()

simple_url
/subject/screening_listboard/             97
/subject/screening_listboard/1/            1
/subject/screening_listboard/2/            4
/subject/screening_listboard/3/            2
/subject/screening_listboard/S993FX87/     3
/subject/screening_listboard/S9943ER9/     1
/subject/screening_listboard/S99A9AKP/     1
/subject/screening_listboard/S99EEVZK/     2
/subject/screening_listboard/S99EXAUZ/     4
/subject/screening_listboard/S99WMDW9/     3
dtype: int64

#### Pharmacy

In [21]:
# hit count per URL containing "edc_pharma"
df1.loc[
    lambda frame: frame['simple_url'].str.contains('edc_pharma')
].groupby(['simple_url']).size()

simple_url
/edc_pharma_dashboard/                                                6
/edc_pharma_dashboard/listboard/dispense/                            10
/edc_pharma_dashboard/listboard/dispensetimepoint/                    1
/edc_pharma_dashboard/listboard/dispensetimepoint/092-40990007-0/     2
/edc_pharma_dashboard/listboard/dispensetimepoint/092-40990010-4/     2
/edc_pharma_dashboard/listboard/dispensetimepoint/092-40990011-2/     2
dtype: int64

#### Lab

In [22]:
# hit count per URL containing "edc_lab"
df1.loc[
    lambda frame: frame['simple_url'].str.contains('edc_lab')
].groupby(['simple_url']).size()

simple_url
/edc_lab_dashboard//                         9
/edc_lab_dashboard/listboard/aliquot/        1
/edc_lab_dashboard/listboard/manifest/       1
/edc_lab_dashboard/listboard/pack/           1
/edc_lab_dashboard/listboard/process/        2
/edc_lab_dashboard/listboard/receive/        7
/edc_lab_dashboard/listboard/requisition/    1
dtype: int64

#### Edc

In [23]:
# accessing edc modules: hit count per URL containing "edc"
df1.loc[
    lambda frame: frame['simple_url'].str.contains('edc')
].groupby(['simple_url']).size()

simple_url
/edc/settings/                                                        1
/edc_consent/                                                         5
/edc_consent/admin/                                                   3
/edc_device/                                                          2
/edc_identifier/admin/                                                1
/edc_lab_dashboard//                                                  9
/edc_lab_dashboard/listboard/aliquot/                                 1
/edc_lab_dashboard/listboard/manifest/                                1
/edc_lab_dashboard/listboard/pack/                                    1
/edc_lab_dashboard/listboard/process/                                 2
/edc_lab_dashboard/listboard/receive/                                 7
/edc_lab_dashboard/listboard/requisition/                             1
/edc_pharma_dashboard/                                                6
/edc_pharma_dashboard/listboard/dispense/            