# Notebook Purpose
This notebook contains a handful of basic visualizations of our Datasette request data.

In [None]:
import pandas as pd

from usage_metrics.resources.sqlite import SQLiteManager

# Ask the inline plotting backend to emit figures in the 'retina' format
# (higher-resolution output for notebook display).
%config InlineBackend.figure_format = 'retina'

In [None]:
# Build the SQLAlchemy-style engine from the project's SQLite resource and echo
# it so the reader can see which database is being queried.
engine = SQLiteManager().get_engine()
print(engine)

# Pull the whole request-log table into memory; every cell below reads `logs`.
with engine.connect() as connection:
    logs = pd.read_sql_table(table_name="datasette_request_logs", con=connection)

# Quick overview of columns, dtypes and non-null counts.
logs.info()

## Internet Egress

### GB per month
This is pretty consistent with the "Cloud Run Network Internet Egress North America to North America" Google billing metric.

In [None]:
# Sum response sizes per calendar month and scale by 10**9
# (presumably bytes -> GB; confirm the unit of `response_size`).
# NOTE(review): pandas 2.2+ deprecates the "M" alias in favour of "ME"; switch
# once older pandas versions no longer need to be supported.
(
    logs.set_index("timestamp")
    .resample("1M")["response_size"]
    .sum()
    .div(10**9)
)

### MBs requested per week

In [None]:
# Weekly total of response sizes scaled by 10**6
# (presumably bytes -> MB; confirm the unit of `response_size`).
weekly_mb_sent = (
    logs.set_index("timestamp").resample("1W")["response_size"].sum().div(10**6)
)

In [None]:
# One bar per week; the y-axis is logarithmic (logy=True).
weekly_mb_sent.plot(
    kind="bar",
    xlabel="Week",
    ylabel="MB of Data Requested",
    logy=True,
)

## Request Volume
Number of requests for ferc1 and pudl tables and downloads per week.

In [None]:
# Count non-null `insert_id` values per week and draw them as bars on a
# logarithmic y-axis.
(
    logs.set_index("timestamp")
    .resample("1W")["insert_id"]
    .count()
    .plot(
        kind="bar",
        xlabel="Week",
        ylabel="Number of Data Requests",
        logy=True,
    )
)

## Top Data Requests
The top datasette path requests. This includes json, csv and db downloads.

### Number of top data requests

In [None]:
# How many of the most-requested paths to show.
top_n = 20

# value_counts() is already sorted descending, so head() keeps the top_n paths;
# re-sorting ascending puts the largest bar at the top of the horizontal chart.
(
    logs["request_url_path"]
    .value_counts()
    .head(top_n)
    .sort_values()
    .plot(
        kind="barh",
        figsize=(20, 10),
        fontsize=20,
        ylabel="Number of Requests",
        title="Requests by Data Path",
    )
)

### Percent of all requests

In [None]:
# How many of the most-requested paths to show.
top_n = 20

# Share of all requests per path as a fraction in [0, 1]. `top_paths` is kept
# as fractions so anything else that reads it is unaffected.
top_paths = logs.request_url_path.value_counts(normalize=True).head(top_n)

# Scale to 0-100 for plotting so the plotted values actually are percentages,
# matching the "Percent of Requests" label.
(top_paths.sort_values() * 100).plot.barh(
    figsize=(20, 10),
    fontsize=20,
    ylabel="Percent of Requests",
    title="Requests by Data Path",
)

# Report coverage of the chart with one decimal place instead of the raw float.
print(f"This chart shows {top_paths.sum() * 100:.1f} % of all data requests.")

## Top Organizations
Most organizations are generic internet providers; however, universities often have their own networks set up. Unfortunately, ipinfo charges for organization type, so we have to do some simple string filtering for now.

### All orgs

In [None]:
# Ten most frequent organizations behind the requesting IPs.
logs["remote_ip_org"].value_counts().head(10)

### Academic Institutions

In [None]:
# Substrings used as a rough heuristic for spotting academic organizations.
university_substrings = ["Universitaet", "University", "College", "Institute"]

# Case-sensitive regex alternation over the substrings. na=False treats rows
# with a missing organization as non-matches; without it the mask contains NaN
# and boolean indexing with it raises a ValueError.
is_uni = logs.remote_ip_org.str.contains("|".join(university_substrings), na=False)

# Request counts per matching organization, sorted ascending so the largest bar
# ends up at the top of the horizontal chart.
logs[is_uni].remote_ip_org.value_counts().sort_values().plot.barh(
    figsize=(20, 10),
    fontsize=20,
    ylabel="Number of Requests",
    title="Requests by Academic Organization",
)

## Top Cities

In [None]:
# How many of the most frequent locations to show.
top_n = 20

# "City, Region, Country" label per request. A missing value in any of the
# three columns makes the whole label missing, and value_counts() skips it.
full_location = (
    logs["remote_ip_city"]
    + ", "
    + logs["remote_ip_region"]
    + ", "
    + logs["remote_ip_country"]
)

# Top locations on a logarithmic x-axis, largest bar at the top.
(
    full_location.value_counts()
    .head(top_n)
    .sort_values()
    .plot(
        kind="barh",
        figsize=(20, 10),
        fontsize=20,
        ylabel="Number of Requests",
        title="Requests by City",
        logx=True,
    )
)

## Top referers

In [None]:
# How many requests have a missing referer (True) versus a recorded one (False).
logs["referer"].isna().value_counts()

Most referers are null.

In [None]:
# How many of the most frequent referers to show.
top_n = 5

# value_counts() ignores missing referers; keep the top_n and sort ascending so
# the largest bar is drawn at the top.
(
    logs["referer"]
    .value_counts()
    .head(top_n)
    .sort_values()
    .plot(
        kind="barh",
        figsize=(20, 10),
        fontsize=20,
        ylabel="Number of Referals",
        title="Top Referers",
    )
)