# Web Stats files.adrianistan.eu

## Download data

In [3]:
# Location of the parquet export of the access logs in Azure Blob Storage.
# The query string is a shared-access-signature; per its own fields it is
# scoped to HTTPS (spr=https) and carries an end time of 2025-10-20 (se=...).
# NOTE(review): sp=r looks like a read-only grant — confirm before sharing the notebook.
URL = (
    "https://adrianistanlogs.blob.core.windows.net"
    "/files-adrianistan-eu/logs.parquet"
    "?sp=r"
    "&st=2024-10-20T15:42:14Z"
    "&se=2025-10-20T23:42:14Z"
    "&spr=https"
    "&sv=2022-11-02"
    "&sr=c"
    "&sig=ov2oGR6A1bXAbjpOOhUUfPZNVv1aShUeJ5U46%2FuZU40%3D"
)

In [4]:
from urllib.request import urlretrieve

# Fetch the remote parquet file into the working directory; the returned
# (filename, headers) tuple is left as the cell's displayed value.
urlretrieve(url=URL, filename="logs-files.parquet")

('logs-files.parquet', <http.client.HTTPMessage at 0x76405ff72cc0>)

# Analysis

1. Most popular pages
2. Data sent
3. Browsers / Operating systems
4. Referrals
5. Protocols
6. Broken pages

In [5]:
import duckdb
import pandas as pd

# Load the `sql` IPython extension so the %sql / %%sql magics used below exist.
%load_ext sql
# No database path is passed, so DuckDB opens its default in-memory database.
conn = duckdb.connect()
# Hand the DuckDB connection to the SQL magic and register it under the alias "duckdb".
%sql conn --alias duckdb
# NOTE(review): displaylimit = None presumably turns off row truncation in
# rendered result tables — confirm against the installed SQL magic version.
%config SqlMagic.displaylimit = None

In [6]:
%%sql
WITH logs AS (
    SELECT *
    FROM read_parquet('logs-files.parquet')
)
-- Peek at up to 50 raw rows to see which columns the access log provides.
SELECT *
FROM logs
LIMIT 50

remote_addr,remote_user,time,status,body_bytes_sent,http_referer,http_user_agent,http_x_forwarded_for,method,path,protocol
35.204.148.234,-,2024-10-11 14:03:27+02:00,403,153,-,Scrapy/2.11.2 (+https://scrapy.org),-,GET,/,HTTP/1.1
187.189.57.19,-,2024-10-11 14:04:49+02:00,200,304396,https://blog.adrianistan.eu/,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",-,GET,/hillbert6.svg,HTTP/2.0
187.189.57.19,-,2024-10-11 14:04:49+02:00,200,69156,https://blog.adrianistan.eu/,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",-,GET,/hillbert_all.png,HTTP/2.0
66.249.75.32,-,2024-10-11 14:15:39+02:00,200,142303,-,Googlebot-Image/1.0,-,GET,/AWS.png,HTTP/1.1
139.214.152.210,-,2024-10-11 14:47:47+02:00,403,555,-,"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.0 Safari/537.36",-,GET,/,HTTP/1.1
93.190.141.60,-,2024-10-11 14:55:55+02:00,200,64622,https://blog.adrianistan.eu/,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0,-,GET,/ProyectosBOINC.png,HTTP/2.0
93.190.141.60,-,2024-10-11 14:55:55+02:00,200,61015,https://blog.adrianistan.eu/,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0,-,GET,/MainBOINC.png,HTTP/2.0
93.190.141.60,-,2024-10-11 14:55:55+02:00,200,100279,https://blog.adrianistan.eu/,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0,-,GET,/MareNostrum4.jpg,HTTP/2.0
93.190.141.60,-,2024-10-11 14:55:55+02:00,200,58203,https://blog.adrianistan.eu/,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0,-,GET,/BOINC-1024x428.png,HTTP/2.0
93.190.141.60,-,2024-10-11 14:55:55+02:00,200,27998,https://blog.adrianistan.eu/,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0,-,GET,/LoginBOINC.png,HTTP/2.0


## Most popular pages


In [7]:
%%sql
SELECT path, COUNT(*) AS count
FROM read_parquet('logs-files.parquet')
WHERE
    -- October 2024 as a half-open interval. Comparing against
    -- DATE 2024-10-31 with an inclusive bound stops at midnight at the
    -- start of that day and silently drops every request made on the 31st.
    time >= DATE '2024-10-01' AND
    time <  DATE '2024-11-01'
GROUP BY path
ORDER BY count DESC
LIMIT 50

path,count
/robots.txt,125
/pandas_logo.png,81
/RiscLogical.png,58
/Backtracking.png,51
/PrologFlow.png,50
/,46
/RiscLoadStore.png,45
/SWIPrologLogo.png,45
/RiscPseudo.png,41
/PrologHelloWorld.png,40


## Most popular pages (filtering out probable robots)

In [8]:
%%sql
SELECT path, COUNT(*) AS count
FROM read_parquet('logs-files.parquet')
WHERE
    -- October 2024 as a half-open interval. An inclusive bound on
    -- DATE 2024-10-31 stops at midnight at the start of that day and
    -- would drop every request made on the 31st.
    time >= DATE '2024-10-01' AND
    time <  DATE '2024-11-01' AND
    -- Keep only agents that identify as a Mozilla-compatible browser.
    http_user_agent LIKE 'Mozilla%' AND
    -- Treat any address that ever asked for robots.txt or the RSS feed
    -- as an automated client and exclude all of its requests.
    remote_addr NOT IN (
      SELECT DISTINCT remote_addr
      FROM read_parquet('logs-files.parquet')
      WHERE path = '/robots.txt' OR path = '/rss.xml'
    )
GROUP BY path
ORDER BY count DESC
LIMIT 50

path,count
/pandas_logo.png,81
/RiscLogical.png,58
/Backtracking.png,50
/PrologFlow.png,49
/SWIPrologLogo.png,45
/RiscLoadStore.png,45
/RiscPseudo.png,41
/PrologHelloWorld.png,40
/RiscArithmetic.png,36
/RiscBranching.png,36


## Data sent (in MB)

In [9]:
%%sql
SELECT SUM(body_bytes_sent)/1000000 FROM read_parquet('logs-files.parquet')

(sum(body_bytes_sent) / 1000000)
589.041157


In [10]:
%%sql
SELECT path, SUM(body_bytes_sent)/1000000 AS data_size
FROM read_parquet('logs-files.parquet')
GROUP BY path
ORDER BY data_size DESC
LIMIT 20

path,data_size
/RLBasket.png,29.17965
/scryer-prolog-meetup.jpeg,17.52412
/IMG_20231109_095022.jpg,14.369999
/wc.pdf,14.34708
/stones-cusco-2608832_1920.jpg,13.28624
/The%20Outside%20World-1.pdf,12.937224
/Rust.jpg,12.6518
/IMG_20231110_111317.jpg,11.76159
/IMG_20181204_211011.jpg,11.22734
/IMG_20231109_140816.jpg,10.116536


## Browsers and Operating systems

In [11]:
%%sql
SELECT http_user_agent, COUNT(*) AS count
FROM read_parquet('logs-files.parquet')
GROUP BY http_user_agent
ORDER BY count DESC
LIMIT 20

http_user_agent,count
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",955
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0",358
"Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Mobile Safari/537.36",310
Googlebot-Image/1.0,227
Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0,178
Mozilla/5.0 (compatible; AhrefsBot/7.0; +http://ahrefs.com/robot/),142
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",128
Mozilla/5.0 (X11; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0,120
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 OPR/113.0.0.0",102
"Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; Bytespider; spider-feedback@bytedance.com)",88


In [12]:
%%sql
SELECT 'Firefox' AS browser, COUNT(*) AS count
FROM read_parquet('logs-files.parquet')
WHERE http_user_agent LIKE '%Firefox%' AND http_user_agent NOT LIKE '%bot%'
UNION
SELECT 'Chrome' AS browser, COUNT(*) AS count
FROM read_parquet('logs-files.parquet')
WHERE http_user_agent LIKE '%Chrome%' AND http_user_agent NOT LIKE '%bot%'
UNION
SELECT 'Safari' AS browser, COUNT(*) AS count
FROM read_parquet('logs-files.parquet')
WHERE http_user_agent LIKE '%Safari%' AND http_user_agent NOT LIKE '%bot%'
UNION
SELECT 'Edge' AS browser, COUNT(*) AS count
FROM read_parquet('logs-files.parquet')
WHERE http_user_agent LIKE '%Edge%' AND http_user_agent NOT LIKE '%bot%'

browser,count
Edge,0
Safari,2806
Chrome,2512
Firefox,464


In [13]:
%%sql
SELECT 'Linux' AS os, COUNT(*) AS count
FROM read_parquet('logs-files.parquet')
WHERE http_user_agent LIKE '%Linux%' AND http_user_agent NOT LIKE '%bot%'
UNION
SELECT 'Windows' AS os, COUNT(*) AS count
FROM read_parquet('logs-files.parquet')
WHERE http_user_agent LIKE '%Windows%' AND http_user_agent NOT LIKE '%bot%'
UNION
SELECT 'macOS' AS os, COUNT(*) AS count
FROM read_parquet('logs-files.parquet')
WHERE http_user_agent LIKE '%Mac%' AND http_user_agent NOT LIKE '%bot%'
UNION
SELECT 'Android' AS os, COUNT(*) AS count
FROM read_parquet('logs-files.parquet')
WHERE http_user_agent LIKE '%Android%' AND http_user_agent NOT LIKE '%bot%'
UNION
SELECT 'iOS' AS os, COUNT(*) AS count
FROM read_parquet('logs-files.parquet')
WHERE http_user_agent LIKE '%iOS%' AND http_user_agent NOT LIKE '%bot%'

os,count
Windows,2035
iOS,11
Linux,904
macOS,322
Android,502


In [14]:
%%sql
SELECT http_user_agent, COUNT(*) AS count
FROM read_parquet('logs-files.parquet')
WHERE http_user_agent NOT LIKE 'Mozilla%'
GROUP BY http_user_agent
ORDER BY count DESC
LIMIT 20

http_user_agent,count
Googlebot-Image/1.0,227
facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php),54
'DuckDuckBot-Https/1.1; (+https://duckduckgo.com/duckduckbot)',42
okhttp/4.12.0,11
python-requests/2.31.0,9
Safari/17618.3.11.11.7 CFNetwork/1335.5 Darwin/21.6.0,8
com.apple.WebKit.Networking/20619.1.26.31.7 Network/4277.1.7 macOS/15.0.1,6
-,5
com.google.android.apps.searchlite/979875 (Linux; U; Android 12; es_US; SM-A032M; Build/SP1A.210812.016; Cronet/129.0.6614.4),3
"Expanse, a Palo Alto Networks company, searches across the global IPv4 space multiple times per day to identify customers' presences on the Internet. If you would like to be excluded from our scans, please send IP addresses/domains to: scaninfo@paloaltonetworks.com",3


## Referrals

In [15]:
%%sql
SELECT http_referer, COUNT(*) count
FROM read_parquet('logs-files.parquet')
WHERE http_referer != '-'
GROUP BY http_referer
ORDER BY count DESC
LIMIT 20

http_referer,count
https://blog.adrianistan.eu/,2647
https://www.google.com/,212
https://www.bing.com/,13
https://beta.adrianistan.eu/,13
https://blog.adrianistan.eu/reinforcement-learning-aprendizaje-refuerzo-que-es-parte-1/,10
https://blog.adrianistan.eu/ipfs-el-futuro-de-la-web-descentralizada/,10
https://blog-adrianistan-eu.translate.goog/,8
https://blog.adrianistan.eu/htmx-html-hipertexto-en-serio/,8
https://blog.adrianistan.eu/tutorial-piston-programa-juegos-rust/,6
https://blog.adrianistan.eu/mapas-interactivos-html5-snapsvg/,6


In [20]:
%%sql
SELECT http_referer, COUNT(*) count
FROM read_parquet('logs-files.parquet')
WHERE 
    http_referer != '-' AND
    http_referer NOT LIKE 'https://blog.adrianistan.eu%' AND
    http_referer NOT LIKE 'http://blog.adrianistan.eu%' AND
    http_referer NOT LIKE 'http://90.170.32.205%'
GROUP BY http_referer
ORDER BY count DESC
LIMIT 20

http_referer,count
https://www.google.com/,212
https://beta.adrianistan.eu/,13
https://www.bing.com/,13
https://blog-adrianistan-eu.translate.goog/,8
https://myactivity.google.com/,6
https://explorer.globe.engineer/,5
https://files.adrianistan.eu/RISC-V-cheatsheet.pdf,4
https://www.google.com.ar/,3
https://www.google.es/,3
https://files.adrianistan.eu/,2


## Protocols

In [17]:
%%sql
SELECT protocol, COUNT(*) AS count
FROM read_parquet('logs-files.parquet')
GROUP BY protocol
ORDER BY count DESC
LIMIT 5

protocol,count
HTTP/2.0,3419
HTTP/1.1,591


In [18]:
%%sql
SELECT method, COUNT(*) AS count
FROM read_parquet('logs-files.parquet')
GROUP BY method
ORDER BY count DESC
LIMIT 10

method,count
GET,3999
POST,9
HEAD,2


## Broken pages

In [19]:
%%sql
SELECT path, COUNT(*) AS count
FROM read_parquet('logs-files.parquet')
WHERE
  status = 404
GROUP BY path
ORDER BY count DESC
LIMIT 20

path,count
/robots.txt,122
/favicon.ico,24
/wp-login.php,2
/.git/config,1
