This analysis uses the SQLite database file created in step01 and step02. The file checked in with this project is named aydevmo_net_log_01.db.

As of 1/31/2023, SQLAlchemy 2.0 has some compatibility issues with pandas, so we downgrade SQLAlchemy to version 1.4.46.

pip uninstall sqlalchemy
pip install --force-reinstall -v "sqlalchemy==1.4.46"

In [1]:
import pandas as pd
import numpy as np

import sqlalchemy as sqla
import os

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [11]:
# Connect to the SQLite database file produced by step01 and step02.
sql_engine = sqla.create_engine("sqlite:///aydevmo_net_log_01.db")

# Pull a 100-row sample of the hosts table to inspect its columns.
df_host = pd.read_sql(sql="select * from hosts limit 100", con=sql_engine)

df_host.head()

Unnamed: 0,ipv4_id,ipv4_string,hostname,latitude,longitude,country_code,country,city
0,19988767,1.49.1.31,,31.242,121.476,CN,China,Huangpu
1,20468882,1.56.84.146,,45.75,126.65,CN,China,Harbin
2,37239577,2.56.59.25,,52.3824,4.8995,NL,Netherlands,
3,40030346,2.98.208.138,,51.5538,-0.3102,GB,United Kingdom,Wembley
4,85797893,5.29.44.5,,32.0236,34.7522,IL,Israel,Bat Yam


In [13]:
# Pull a 200-row sample of the log_records table to inspect its columns.
df_rec = pd.read_sql(sql="select * from log_records limit 200", con=sql_engine)

df_rec.head()

Unnamed: 0,log_id,created_on,log_number,protocol,source,destination,destination_port,occurrence,sum_minutes
0,1,2021-06-21 20:55:54.000000,106023,tcp,1547683146,3232242186,12489,1,0
1,2,2021-06-21 20:55:58.000000,106023,tcp,1509467513,3232242231,7742,1,0
2,3,2021-06-21 20:56:05.000000,106023,tcp,2818082075,3232241677,40000,1,0
3,4,2021-06-21 20:56:07.000000,106023,tcp,1509467513,3232241714,7007,1,0
4,5,2021-06-21 20:56:10.000000,106023,tcp,1509467462,3232241674,10040,1,0


In [51]:
# Attach host details to every log record by matching the record's integer
# source address against hosts.ipv4_id.
df_probes = pd.read_sql(
    sql="select * from log_records join hosts on log_records.source = hosts.ipv4_id",
    con=sql_engine,
)

df_probes.head()

Unnamed: 0,log_id,created_on,log_number,protocol,source,destination,destination_port,occurrence,sum_minutes,ipv4_id,ipv4_string,hostname,latitude,longitude,country_code,country,city
0,1,2021-06-21 20:55:54.000000,106023,tcp,1547683146,3232242186,12489,1,0,1547683146,92.63.197.74,,55.7386,37.6068,RU,Russia,
1,2,2021-06-21 20:55:58.000000,106023,tcp,1509467513,3232242231,7742,1,0,1509467513,89.248.165.121,,52.3759,4.8975,NL,Netherlands,Amsterdam
2,3,2021-06-21 20:56:05.000000,106023,tcp,2818082075,3232241677,40000,1,0,2818082075,167.248.133.27,,37.751,-97.822,US,United States,
3,4,2021-06-21 20:56:07.000000,106023,tcp,1509467513,3232241714,7007,1,0,1509467513,89.248.165.121,,52.3759,4.8975,NL,Netherlands,Amsterdam
4,5,2021-06-21 20:56:10.000000,106023,tcp,1509467462,3232241674,10040,1,0,1509467462,89.248.165.70,,52.3759,4.8975,NL,Netherlands,Amsterdam


In [52]:
# Number of non-null values in each column of the joined frame.
df_probes.count()

log_id              6370
created_on          6370
log_number          6370
protocol            6370
source              6370
destination         6370
destination_port    6370
occurrence          6370
sum_minutes         6370
ipv4_id             6370
ipv4_string         6370
hostname            6370
latitude            6370
longitude           6370
country_code        6370
country             6370
city                3520
dtype: int64

In [53]:
def ipv4_int_to_string(num):
    '''Render an integer IPv4 address as a dotted-quad string.

    The four octets are extracted from the most significant byte of the
    low 32 bits down to the least significant one; any higher bits are
    ignored.
    '''
    octets = [str((num >> shift) & 0xff) for shift in (24, 16, 8, 0)]
    return '.'.join(octets)

# Human-readable form of the integer destination address.
df_probes['dest_ipv4_string'] = df_probes['destination'].map(ipv4_int_to_string)


In [54]:
def ipv4_int_to_string_subnet_24(num):
    '''Return the /24 network address of an integer IPv4 address.

    The first three octets are kept and the last octet is always written
    as '0', e.g. 1547683146 (92.63.197.74) becomes '92.63.197.0'.
    '''
    network_octets = [str((num >> shift) & 0xff) for shift in (24, 16, 8)]
    return '.'.join(network_octets + ['0'])

# /24 network of each source address, for grouping probes by subnet.
df_probes['src_subnet_24'] = df_probes['source'].map(ipv4_int_to_string_subnet_24)

Drop unnecessary columns.

In [55]:
# Discard the columns this notebook treats as unnecessary.
df_probes = df_probes.drop(
    ['log_number', 'sum_minutes', 'ipv4_id', 'hostname'],
    axis='columns',
)

In [57]:
# Label the joined host address as the source address, matching the naming
# of 'dest_ipv4_string'. rename() returns a new frame, so the name is rebound
# instead of mutating the existing frame with inplace=True.
df_probes = df_probes.rename(columns={'ipv4_string': 'src_ipv4_string'})

# Current column order as a plain list.
cols = df_probes.columns.tolist()

cols

['log_id',
 'created_on',
 'protocol',
 'source',
 'destination',
 'destination_port',
 'occurrence',
 'src_ipv4_string',
 'latitude',
 'longitude',
 'country_code',
 'country',
 'city',
 'dest_ipv4_string',
 'src_subnet_24']

In [50]:
# Column order for display: identifiers and timestamp first, then the
# address columns, then port/occurrence, then the location fields.
cols = [
    'log_id',
    'created_on',
    'protocol',
    'source',
    'destination',
    'src_ipv4_string',
    'src_subnet_24',
    'dest_ipv4_string',
    'destination_port',
    'occurrence',
    'latitude',
    'longitude',
    'country_code',
    'country',
    'city',
]

# Select the columns in the order listed above.
df_probes = df_probes.loc[:, cols]

df_probes.head()

Unnamed: 0,log_id,created_on,protocol,source,destination,src_ipv4_string,src_subnet_24,dest_ipv4_string,destination_port,latitude,longitude,country_code,country,city
0,1,2021-06-21 20:55:54.000000,tcp,1547683146,3232242186,92.63.197.74,92.63.197.0,192.168.26.10,12489,55.7386,37.6068,RU,Russia,
1,2,2021-06-21 20:55:58.000000,tcp,1509467513,3232242231,89.248.165.121,89.248.165.0,192.168.26.55,7742,52.3759,4.8975,NL,Netherlands,Amsterdam
2,3,2021-06-21 20:56:05.000000,tcp,2818082075,3232241677,167.248.133.27,167.248.133.0,192.168.24.13,40000,37.751,-97.822,US,United States,
3,4,2021-06-21 20:56:07.000000,tcp,1509467513,3232241714,89.248.165.121,89.248.165.0,192.168.24.50,7007,52.3759,4.8975,NL,Netherlands,Amsterdam
4,5,2021-06-21 20:56:10.000000,tcp,1509467462,3232241674,89.248.165.70,89.248.165.0,192.168.24.10,10040,52.3759,4.8975,NL,Netherlands,Amsterdam
