In [None]:
import pandas as pd
from pathlib import Path

In [None]:
from usage_metrics.ops.datasette import geocode_ips

In [None]:
# Directory holding the downloaded S3 access log files, given relative to the
# notebook's working directory and resolved to an absolute path.
local_dir = Path("../data/pudl_s3_logs/").resolve()

In [None]:
from tqdm import tqdm

# Gather the log files up front: sorting makes the row order of the combined
# frame reproducible (glob yields paths in arbitrary order), skipping anything
# that is not a regular file avoids handing a directory to read_csv, and a
# concrete list lets tqdm display a total.
log_paths = sorted(path for path in local_dir.glob("*") if path.is_file())

# Each log is parsed as space-delimited text with no header row, so the
# resulting columns are labelled by integer position.
dfs = [
    pd.read_csv(log_path, delimiter=" ", header=None)
    for log_path in tqdm(log_paths)
]

In [None]:
# Every per-file frame is indexed from 0, so a plain concat would repeat index
# labels across files; ignore_index=True gives each log record a unique label.
df = pd.concat(dfs, ignore_index=True)

In [None]:
# Show every column when a frame is displayed instead of truncating wide frames.
pd.options.display.max_columns = None

In [None]:
# Splitting on spaces broke one field across columns 2 and 3; join the two
# halves back together with the space that separated them, keep the result in
# column 2 and discard column 3.
rejoined_time = df[2] + " " + df[3]
df = df.drop(columns=[3])
df[2] = rejoined_time

In [None]:
# Column names for the parsed log records, in positional order.
field_names = [
    # who / where / when
    "bucket_owner",
    "bucket",
    "time",
    "remote_ip",
    "requester",
    # what was requested
    "request_id",
    "operation",
    "key",
    "request_uri",
    # outcome and size
    "http_status",
    "error_code",
    "bytes_sent",
    "object_size",
    # timing
    "total_time",
    "turn_around_time",
    # client details
    "referer",
    "user_agent",
    "version_id",
    # connection and auth details
    "host_id",
    "signature_version",
    "cipher_suite",
    "authentication_type",
    "host_header",
    "tls_version",
    "access_point_arn",
    "acl_required",
]

# Replace the positional column labels with the descriptive field names
# (raises if the number of names does not match the number of columns).
df = df.set_axis(field_names, axis="columns")
df.head()

In [None]:
# How many records carry the "-" placeholder instead of a remote IP?
(df["remote_ip"] == "-").value_counts()

In [None]:
# Number of distinct remote IP values in the logs.
df["remote_ip"].nunique()

In [None]:
from dagster import build_op_context

# Turn the "-" placeholder into a proper missing value before geocoding.
has_ip = df["remote_ip"].ne("-")
df["remote_ip"] = df["remote_ip"].where(has_ip, pd.NA)

# geocode_ips is written as a dagster op, so it is called with a context built
# by hand via build_op_context.
context = build_op_context()
geocoded_df = geocode_ips(context, df)

In [None]:
# Count records per organisation for organisations whose name mentions
# University, College or Institute; rows without an organisation are excluded.
is_academic = geocoded_df["remote_ip_org"].str.contains(
    "University|College|Institute", na=False
)
geocoded_df.loc[is_academic, "remote_ip_org"].value_counts()

In [None]:
# Print the column summary for the geocoded log records.
geocoded_df.info()

In [None]:
# Layout of the bracketed time field: day/month-abbreviation/year, then the
# clock time and a UTC offset.
format_string = '[%d/%b/%Y:%H:%M:%S %z]'

# Parse the raw text into a datetime column named "timestamp".
geocoded_df["timestamp"] = pd.to_datetime(
    geocoded_df["time"],
    format=format_string,
)

In [None]:
# Bar chart of the number of distinct remote IPs seen in each calendar month.
(
    geocoded_df
    .groupby(pd.Grouper(key="timestamp", freq="M"))["remote_ip"]
    .nunique()
    .plot.bar()
)

In [None]:
# Preview the first rows of the geocoded frame.
geocoded_df.head()

In [None]:
# Frequency of each operation value in the logs.
geocoded_df["operation"].value_counts()

In [None]:
# How many records are REST GET operations? regex=False matches the literal
# text "REST.GET"; treated as a regex, the "." would match any character.
geocoded_df["operation"].str.contains("REST.GET", regex=False).value_counts()

In [None]:
# Keep only the REST GET records. regex=False matches the literal text
# "REST.GET" (as a regex the "." would match any character), and .copy() makes
# `gets` an independent frame so that later column assignments on it do not
# trigger pandas' SettingWithCopyWarning.
gets = geocoded_df[
    geocoded_df["operation"].str.contains("REST.GET", regex=False)
].copy()

In [None]:
# How many GET records have the "-" placeholder instead of a byte count?
(gets["bytes_sent"] == "-").value_counts()

In [None]:
# Records with the "-" placeholder are counted as 0 bytes. The column is then
# converted to real numbers: it holds text wherever "-" appeared, and text
# cannot be summed and divided when aggregating bytes per month. Any other
# non-numeric value becomes missing (errors="coerce").
bytes_sent_numeric = pd.to_numeric(
    gets["bytes_sent"].mask(gets["bytes_sent"].eq("-"), 0),
    errors="coerce",
)

# assign() returns a new frame instead of writing into `gets` in place, which
# avoids SettingWithCopyWarning when `gets` was produced by filtering another
# frame. convert_dtypes() then switches columns to pandas' nullable dtypes.
gets = gets.assign(bytes_sent=bytes_sent_numeric).convert_dtypes()

In [None]:
# Print the column summary for the GET records after dtype conversion.
gets.info()

In [None]:
# Bar chart of bytes sent per calendar month, scaled by 1024**3 (bytes -> GiB).
(
    gets
    .groupby(pd.Grouper(key="timestamp", freq="M"))["bytes_sent"]
    .sum()
    .div(1024 ** 3)
    .plot.bar()
)

In [None]:
# The raw "time" text column is dropped from the GET records.
gets = gets.drop("time", axis="columns")

In [None]:
# Columns that should hold numbers; values that cannot be parsed as a number
# become missing (errors='coerce').
numeric_fields = [
    "bytes_sent",
    "http_status",
    "object_size",
    "total_time",
    "turn_around_time",
]
gets = gets.assign(
    **{
        column: pd.to_numeric(gets[column], errors='coerce')
        for column in numeric_fields
    }
)

gets.info()

In [None]:
# Distinct remote IPs issuing GET requests in each calendar month.
by_month = gets.groupby(pd.Grouper(key="timestamp", freq="M"))
monthly_ips = by_month["remote_ip"].nunique()

monthly_ips.plot.bar()

In [None]:
# Summary statistics of the monthly distinct-IP counts.
monthly_ips.describe()

In [None]:
# Bytes sent for GET requests in each calendar month, scaled by 1024**3
# (bytes -> GiB).
monthly_gbs = (
    gets.groupby(pd.Grouper(key='timestamp', freq='M'))
    .bytes_sent.sum()
    .div(1024 ** 3)
)

# Plot the series computed in this cell; the previous version re-plotted
# monthly_ips here by mistake.
monthly_gbs.plot.bar()

In [None]:
# Summary statistics of the monthly data volume (bytes / 1024**3).
monthly_gbs.describe()