In [None]:
import base64
import subprocess
import pandas as pd
import csv
import os
import re
import sys
import subprocess

from paramiko.client import SSHClient
from paramiko.sftp_client import SFTPClient
from paramiko import AutoAddPolicy
from dateutil.parser import parse
from tqdm import tqdm

# Placeholder frame; the loading cell further down rebinds `df` to the parsed access-log rows.
df = pd.DataFrame()

In [None]:
# Connection settings for the server whose access log is analysed.
remote_user = 'django'
host = 'ambition-test.bhp.org.bw'
# Absolute path of the log on the *remote* host. It is opened over SFTP, so
# the local os.path.expanduser() does not apply to it (and was a no-op here
# anyway, because the path does not start with '~').
path = '/home/django/source/ambition/logs/access.log'

In [None]:
# Regular expression for one access-log line of a GET request.
# Every fragment carries its own r-prefix: adjacent string literals are
# concatenated, but a prefix only applies to the literal it is attached to,
# so without it `\d`, `\s` and `\[` are invalid string escapes.
pat = (
    r'(\d+\.\d+\.\d+\.\d+)\s-\s-\s'  # IP address (dots escaped so they match literally)
    r'\[(.+)\]\s'  # datetime
    r'"GET\s(.+)\s\w+/.+"\s'  # requested file
    r'(\d+)\s'  # status
    r'(\d+)\s'  # bandwidth
    r'"(.+)"\s'  # referrer
    r'"(.+)"'  # user agent
)

def blocks(files, size=65536):
    """Yield successive chunks of up to ``size`` from ``files.read`` until it returns a falsy chunk."""
    chunk = files.read(size)
    while chunk:
        yield chunk
        chunk = files.read(size)

def connect(ssh=None):
    """Open an SSH connection to ``host`` as ``remote_user`` and return the client.

    Args:
        ssh: an unconnected paramiko ``SSHClient``. A new one is created when
            omitted; previously the ``None`` default raised ``AttributeError``.

    Returns:
        The connected client (the same object that was passed in, if any).
    """
    if ssh is None:
        ssh = SSHClient()
    ssh.load_system_host_keys()
    # NOTE(review): AutoAddPolicy silently trusts unknown host keys, which
    # leaves the connection open to man-in-the-middle attacks; consider
    # RejectPolicy once the server key is present in known_hosts.
    ssh.set_missing_host_key_policy(AutoAddPolicy())
    ssh.connect(
        host,
        username=remote_user,
        timeout=5,
        compress=True,
    )
    return ssh

In [None]:
from pprint import pprint

log_columns = ['ip', 'datestring', 'url', 'status', 't', 'host', 'client']
log_pattern = re.compile(pat)
# Collect the matches in a plain list and build the frame once at the end;
# concatenating a DataFrame per log line is quadratic in the number of lines.
records = []
with SSHClient() as ssh:
    ssh = connect(ssh=ssh)
    with SFTPClient.from_transport(ssh.get_transport()) as sftp_client:
        with sftp_client.open(path, mode='r') as f:
            # First pass: count newlines so tqdm can show a total. Counting on
            # the raw bytes avoids decoding fixed-size chunks, which can fail
            # when a multi-byte UTF-8 character straddles a chunk boundary.
            total = sum(bl.count(b'\n') for bl in blocks(f))
            f.seek(0, 0)
            # Second pass: keep the captured groups of every matching line.
            for line in tqdm(f, total=total):
                records.extend(log_pattern.findall(line))
df = pd.DataFrame(records, columns=log_columns)
df.head()

In [None]:
# convert datestring to date
def to_datetime(d):
    """Parse an access-log timestamp such as ``'12/Mar/2018:10:15:32 +0200'``.

    Returns a naive ``datetime`` holding the wall-clock time exactly as logged.
    A trailing UTC offset, whatever its value, is dropped; the previous
    implementation only stripped ``+0200``, so other offsets produced
    timezone-aware results that do not mix with naive datetimes.

    Raises:
        ValueError: if the date/time part does not match ``dd/Mon/yyyy:HH:MM:SS``.
        IndexError: if ``d`` is empty or whitespace only.
    """
    # Imported locally so the function does not rely on an import made in a later cell.
    from datetime import datetime

    # Everything after the first whitespace is the UTC offset; keep only the stamp.
    stamp = d.split()[0]
    # NOTE: %b follows the active LC_TIME locale; the log's month abbreviations
    # are English, which is what the default "C" locale provides.
    return datetime.strptime(stamp, '%d/%b/%Y:%H:%M:%S')

# Vectorised column access instead of row-wise DataFrame.apply(axis=1): it is
# faster and always yields a Series, including when no log line matched and
# the frame is empty.
df['date'] = pd.to_datetime(df['datestring'].map(to_datetime))

# chop off querystring
df['simple_url'] = df['url'].str.split('?').str[0]

# remove junk urls (`~` is the boolean NOT for a pandas mask)
is_junk = (
    df['simple_url'].isin(['/', '/admin/jsi18n/'])
    | df['simple_url'].str.contains('static')
    | df['simple_url'].str.contains('login')
)
df = df[~is_junk]

# show df info
df.info()

In [None]:
# Summary statistics for the parsed timestamps, to check the date range the log covers.
df['date'].describe()

In [None]:
# rank users effort
from datetime import datetime, date
now = date.today()
today = datetime(now.year, now.month, now.day, 0, 0, 0)

# remove logins, static, etc (`~` is the boolean NOT for a pandas mask)
df1 = df[
    ~df['simple_url'].isin(['/', '/admin/jsi18n/'])
    & ~df['simple_url'].str.contains('static')
    & ~df['simple_url'].str.contains('login')
]

# Rows per IP since midnight today. This is kept as the last expression of the
# cell: only the last expression is rendered, so when it sat above the df1
# assignment the result was computed and silently discarded.
df[df['date'] >= today].groupby('ip').size()

In [None]:
# Write the filtered frame to a timestamped CSV in the user's home directory.
timestamp = datetime.today().strftime('%Y%m%d%H%M%S')
csv_path = os.path.expanduser(f'~/{host.replace(".", "-")}-access-log-{timestamp}.csv')
df1.to_csv(csv_path)

In [None]:
# number of filtered log rows per IP address
df1.groupby('ip').size()

In [None]:
# inspect top user
selected_ips = ['10.136.211.206']
# Filter from `df` rather than from `df1` itself. `df1` starts out as `df` run
# through the same junk-url filter `df` has already been through, so the rows
# are identical; narrowing `df1` from itself meant that re-running this cell
# with different IPs operated on data that had already been discarded.
df1 = df[df['ip'].isin(selected_ips)]

In [None]:
# add models
# `~` is the boolean NOT for a pandas mask; unary `-` only works on a boolean
# Series through a pandas special case.
add_group = df1[
    df1['simple_url'].str.contains('add')
    & ~df1['simple_url'].str.contains('appointment')
].groupby(['simple_url'])
add_group.size()

In [None]:
# change models
# `~` is the boolean NOT for a pandas mask; unary `-` only works on a boolean
# Series through a pandas special case.
df1[
    df1['simple_url'].str.contains('change')
    & ~df1['simple_url'].str.contains('appointment')
].groupby(['simple_url']).size()

In [None]:
# delete models
# `~` is the boolean NOT for a pandas mask; unary `-` only works on a boolean
# Series through a pandas special case.
df1[
    df1['simple_url'].str.contains('delete')
    & ~df1['simple_url'].str.contains('appointment')
].groupby(['simple_url']).size()

In [None]:
# accessing appointments: row count per url whose path contains 'appointment'
appointment_hits = df1.loc[df1['simple_url'].str.contains('appointment')]
appointment_hits.groupby(['simple_url']).size()

In [None]:
# accessing dashboards: row count per url whose path contains 'dashboard'
dashboard_hits = df1.loc[df1['simple_url'].str.contains('dashboard')]
dashboard_hits.groupby(['simple_url']).size()

In [None]:
# accessing listboards: row count per url whose path contains 'screening_listboard'
listboard_hits = df1.loc[df1['simple_url'].str.contains('screening_listboard')]
listboard_hits.groupby(['simple_url']).size()

In [None]:
# accessing edc modules: row count per url whose path contains 'edc'
edc_hits = df1.loc[df1['simple_url'].str.contains('edc')]
edc_hits.groupby(['simple_url']).size()