# analysis of /var/log/auth.log 

data collected April 6, 2020 to April 25, 2020 on a DigitalOcean droplet running Ubuntu 18.04



In [162]:
import pandas
import re # https://docs.python.org/3/library/re.html
from collections import Counter
import ipaddress # https://docs.python.org/3/library/ipaddress.html
import time
import sys

In [77]:
with open('auth.log') as f:
    lines = f.readlines()

In [78]:
lines[0]

'Apr  6 18:58:00 django-s-1vcpu-1gb-sfo2-01 systemd-logind[854]: Watching system buttons on /dev/input/event0 (Power Button)\n'

## convert list of lines into a list of dicts suitable for a Pandas dataframe

In [79]:
list_of_dicts = []
for indx, line in enumerate(lines):
    line_as_dict = {}

    # https://security.stackexchange.com/questions/18207/security-of-log-files-injecting-malicious-code-in-log-files/18209
    line = line.replace('\x00','') # there was a bad character on `lines[6615]`

#    line_as_dict['month'] = line[0:4].strip()
#    line_as_dict['day'] = line[4:6].strip()
#    line_as_dict['hour'] = line[6:9].strip()
#    line_as_dict['minute'] = line[10:12].strip()
#    line_as_dict['second'] = line[13:15].strip()
    line_as_dict['date time'] = line[0:15]
    line_as_list = line[16:].strip().split(' ')

#    if 'Apr' in line_as_list[0]:
#        print(indx)
#        print(line_as_dict)
#        print(line)
    line_as_dict['server name'] = line_as_list[0].strip()
    if '[' in line_as_list[1]: # https://serverfault.com/a/526151
        line_as_dict['service name'] = line_as_list[1].split('[')[0].strip()
        line_as_dict['pid'] = line_as_list[1].split('[')[1].replace(']:','').strip()
    else:
        line_as_dict['service name'] = line_as_list[1].strip()
    line_as_dict['message'] = ' '.join(line_as_list[2:])
    list_of_dicts.append(line_as_dict)

In [81]:
df = pandas.DataFrame(list_of_dicts)
df.shape

(9440, 5)

convert `date time` string to Python datetime

In [94]:
# https://strftime.org/
df['datetime'] = pandas.to_datetime('2020 ' + df['date time'], format='%Y %b %d %H:%M:%S')
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html
df.drop('date time', inplace=True, axis=1)

In [95]:
df.head()

Unnamed: 0,message,pid,server name,service name,datetime
0,Watching system buttons on /dev/input/event0 (...,854,django-s-1vcpu-1gb-sfo2-01,systemd-logind,2020-04-06 18:58:00
1,Watching system buttons on /dev/input/event1 (...,854,django-s-1vcpu-1gb-sfo2-01,systemd-logind,2020-04-06 18:58:00
2,New seat seat0.,854,django-s-1vcpu-1gb-sfo2-01,systemd-logind,2020-04-06 18:58:00
3,Server listening on 0.0.0.0 port 22.,1062,django-s-1vcpu-1gb-sfo2-01,sshd,2020-04-06 18:58:01
4,Server listening on :: port 22.,1062,django-s-1vcpu-1gb-sfo2-01,sshd,2020-04-06 18:58:01


check that the columns seem reasonable

In [96]:
df['server name'].value_counts()

django-s-1vcpu-1gb-sfo2-01                           6615
pythonflask-quickstart-ubuntu-s-1vcpu-1gb-nyc3-01    2825
Name: server name, dtype: int64

In [140]:
df['service name'].value_counts()

sshd                         8565
CRON                          714
sudo:                          83
systemd-logind                 25
polkitd(authority=local):      18
su                             10
systemd:                        7
groupadd                        6
usermod                         4
useradd                         3
polkit-agent-helper-1           2
passwd                          1
chpasswd                        1
chfn                            1
Name: service name, dtype: int64

## pid = process ID

one pid per login attempt; recycling of pids occurs

In [97]:
df['pid'].nunique()

2717

In [99]:
df['pid'].value_counts().head()

886     12
870     10
3730    10
1717     8
1709     8
Name: pid, dtype: int64

In [100]:
df['pid'].value_counts().tail()

18171    1
20819    1
21733    1
20135    1
21098    1
Name: pid, dtype: int64

In [98]:
# https://realpython.com/pandas-groupby/
# https://stackoverflow.com/questions/14734533/how-to-access-pandas-groupby-dataframe-by-key

df.groupby('pid').get_group('854').sort_values(["datetime"])

Unnamed: 0,message,pid,server name,service name,datetime
0,Watching system buttons on /dev/input/event0 (...,854,django-s-1vcpu-1gb-sfo2-01,systemd-logind,2020-04-06 18:58:00
1,Watching system buttons on /dev/input/event1 (...,854,django-s-1vcpu-1gb-sfo2-01,systemd-logind,2020-04-06 18:58:00
2,New seat seat0.,854,django-s-1vcpu-1gb-sfo2-01,systemd-logind,2020-04-06 18:58:00
24,New session 1 of user root.,854,django-s-1vcpu-1gb-sfo2-01,systemd-logind,2020-04-06 18:59:15
71,Watching system buttons on /dev/input/event0 (...,854,django-s-1vcpu-1gb-sfo2-01,systemd-logind,2020-04-06 19:01:56
72,Watching system buttons on /dev/input/event1 (...,854,django-s-1vcpu-1gb-sfo2-01,systemd-logind,2020-04-06 19:01:56
266,New session 5 of user root.,854,django-s-1vcpu-1gb-sfo2-01,systemd-logind,2020-04-06 19:17:56


In [101]:
df.groupby('pid').get_group('18171').sort_values(["datetime"])

Unnamed: 0,message,pid,server name,service name,datetime
8314,Connection closed by 218.89.250.77 port 46748 ...,18171,pythonflask-quickstart-ubuntu-s-1vcpu-1gb-nyc3-01,sshd,2020-04-25 03:49:50


In [103]:
df.loc[8314]['message']

'Connection closed by 218.89.250.77 port 46748 [preauth]'

## look for IP addresses in the messages

In [127]:
list_of_ips = []
for indx, msg in df['message'].items():
    # https://www.regular-expressions.info/ip.html
    res = re.findall('(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\.'+
                     '(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\.'+
                     '(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\.'+
                     '(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])',msg)
    if res:
        if len(res)>1:
            print(res)
        else:
            list_of_ips.append('.'.join(res[0]))
    

## sort by number of times the IP address appears in the logs

In [131]:
# https://stackoverflow.com/a/5829377/1164295
d = Counter(list_of_ips)

In [132]:
# https://stackoverflow.com/a/55292813/1164295
{k: d[k] for k in sorted(d, key=d.get, reverse=True)}

{'218.89.250.77': 246,
 '165.227.30.198': 187,
 '34.74.196.104': 148,
 '139.59.58.115': 148,
 '61.187.53.119': 147,
 '129.226.161.114': 146,
 '121.122.40.109': 146,
 '183.129.141.30': 146,
 '201.57.40.70': 146,
 '152.136.36.250': 146,
 '103.16.223.243': 145,
 '106.12.155.45': 145,
 '132.145.242.238': 145,
 '124.88.37.161': 145,
 '110.49.142.46': 144,
 '115.42.127.133': 144,
 '198.199.122.234': 142,
 '157.230.208.92': 141,
 '189.4.28.99': 140,
 '58.150.46.6': 139,
 '106.13.4.250': 138,
 '118.24.9.152': 136,
 '37.49.226.19': 136,
 '106.124.131.194': 135,
 '209.97.133.196': 132,
 '118.24.2.218': 130,
 '177.152.16.45': 124,
 '103.26.40.145': 122,
 '138.68.106.62': 122,
 '207.180.244.29': 122,
 '148.70.32.179': 115,
 '45.120.69.97': 113,
 '111.229.167.10': 101,
 '141.98.81.38': 100,
 '193.70.37.133': 100,
 '51.15.99.106': 97,
 '198.199.112.219': 96,
 '40.83.125.50': 85,
 '36.155.113.40': 84,
 '182.75.216.74': 82,
 '49.231.182.35': 77,
 '125.227.255.79': 73,
 '222.186.31.166': 71,
 '195.70.5

## sort by IP address

In [139]:
# https://www.python4networkengineers.com/posts/how_to_sort_ip_addresses_with_python/
for ip in sorted(list(d.keys()), key = lambda ip: ( int(ip.split(".")[0]), 
                                    int(ip.split(".")[1]), 
                                    int(ip.split(".")[2]), 
                                    int(ip.split(".")[3]))):
    print('"' + ip + '": ' + str(d[ip]) + ',')

"0.0.0.0": 6,
"1.10.141.248": 4,
"1.10.141.254": 10,
"1.251.0.135": 4,
"8.36.123.218": 1,
"14.98.4.82": 59,
"14.186.46.148": 2,
"23.95.89.71": 11,
"27.3.9.57": 1,
"31.13.131.138": 1,
"31.223.251.10": 1,
"34.74.196.104": 148,
"36.90.212.84": 1,
"36.155.113.40": 84,
"37.49.226.19": 136,
"37.252.189.70": 40,
"40.83.125.50": 85,
"41.38.206.180": 2,
"41.92.108.110": 1,
"42.159.201.45": 42,
"45.67.14.21": 2,
"45.83.66.227": 1,
"45.95.168.247": 18,
"45.95.168.251": 18,
"45.120.69.97": 113,
"49.88.112.69": 2,
"49.88.112.72": 2,
"49.231.182.35": 77,
"51.15.99.106": 97,
"51.77.212.235": 32,
"51.91.102.49": 1,
"51.91.140.218": 45,
"52.186.168.121": 18,
"58.11.27.198": 1,
"58.150.46.6": 139,
"61.19.146.226": 25,
"61.78.107.61": 5,
"61.187.53.119": 147,
"65.49.20.67": 1,
"65.49.20.68": 1,
"71.6.232.6": 1,
"71.244.214.232": 10,
"79.137.72.121": 36,
"80.82.64.124": 8,
"83.97.20.31": 1,
"88.146.200.8": 1,
"88.251.163.3": 1,
"89.248.174.3": 1,
"89.248.174.151": 2,
"92.118.161.41": 1,
"92.222.36.74": 5,

# IP to geo
data is from https://github.com/datasets/geoip2-ipv4/blob/master/data/geoip2-ipv4.csv

column definitions are at https://dev.maxmind.com/geoip/geoip2/geoip2-city-country-csv-databases/

In [142]:
geoip_df = pandas.read_csv('geoip2-ipv4.txt')

In [145]:
geoip_df.shape

(172754, 8)

In [143]:
geoip_df.head()

Unnamed: 0,network,geoname_id,continent_code,continent_name,country_iso_code,country_name,is_anonymous_proxy,is_satellite_provider
0,41.74.160.0/20,49518.0,AF,Africa,RW,Rwanda,0,0
1,41.77.160.0/22,49518.0,AF,Africa,RW,Rwanda,0,0
2,41.138.80.0/21,49518.0,AF,Africa,RW,Rwanda,0,0
3,41.186.0.0/16,49518.0,AF,Africa,RW,Rwanda,0,0
4,41.197.0.0/16,49518.0,AF,Africa,RW,Rwanda,0,0


to convert geoname to location, see

https://download.geonames.org/export/dump/
    
as per https://github.com/maxmind/GeoIP2-java/issues/103

In [147]:
ipaddress.IPv4Address('192.0.2.6') in ipaddress.IPv4Network('192.0.2.0/28')


True

In [157]:
start_time = time.time()
ip_as_int_and_geoname_id = {}
for index, row in geoip_df.iterrows():
    if index>4:
        #print(row['network'],row['geoname_id'])
        for addr in ipaddress.IPv4Network(row['network']):
            ip_as_int_and_geoname_id[int(addr)] = row['country_name']#row['geoname_id']
print('elapsed',round(time.time() - start_time,2),'seconds')

KeyboardInterrupt: 

In [160]:
len(ip_as_int_and_geoname_id)

112026

In [165]:
# https://docs.python.org/3/library/sys.html#sys.getsizeof
# Returns the size of an object in bytes
sys.getsizeof(ip_as_int_and_geoname_id)/(1024*1024)

5.000091552734375

In [166]:
sys.getsizeof(geoip_df)/(1024*1024)

55.355018615722656