# Web Stats blog.adrianistan.eu

## Download data

In [1]:
URL = "https://adrianistanlogs.blob.core.windows.net/blog-adrianistan-eu/logs.parquet?sp=r&st=2024-10-20T14:41:26Z&se=2025-10-20T22:41:26Z&spr=https&sv=2022-11-02&sr=c&sig=Nl50RZu4RhprAx%2F0RD%2F1IakEh89jtAiJhNsRBAod7yE%3D"

In [2]:
from urllib.request import urlretrieve

urlretrieve(URL, "logs-blog.parquet")

('logs-blog.parquet', <http.client.HTTPMessage at 0x72c71538c1d0>)

# Analysis

1. Most popular pages
2. Data sent
3. Browsers / Operating systems
4. Referrals
5. Protocols

In [5]:
import duckdb
import pandas as pd

%load_ext sql
conn = duckdb.connect()
%sql conn --alias duckdb
%config SqlMagic.displaylimit = None

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [6]:
%%sql
SELECT * FROM read_parquet('logs-blog.parquet')
LIMIT 50

remote_addr,remote_user,time,status,body_bytes_sent,http_referer,http_user_agent,http_x_forwarded_for,method,path,protocol
187.189.57.19,-,2024-10-11 14:04:48+02:00,200,426,https://blog.adrianistan.eu/curva-hilbert-prolog/,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",-,GET,/static/fonts.css,HTTP/2.0
187.189.57.19,-,2024-10-11 14:04:49+02:00,200,38740,https://blog.adrianistan.eu/static/fonts.css,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",-,GET,/static/IBMPlexMono-Regular.woff2,HTTP/2.0
187.189.57.19,-,2024-10-11 14:04:50+02:00,404,555,https://blog.adrianistan.eu/curva-hilbert-prolog/,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",-,GET,/favicon.ico,HTTP/2.0
73.138.36.34,-,2024-10-11 14:05:34+02:00,304,0,https://blog.adrianistan.eu/rss.xml,FreshRSS/1.23.1 (Linux; https://freshrss.org),-,GET,/rss.xml,HTTP/2.0
94.130.16.71,-,2024-10-11 14:06:21+02:00,200,35951,-,FeedsFun/1.11.1 (prod localhost),-,GET,/rss.xml,HTTP/1.1
216.244.66.237,-,2024-10-11 14:09:35+02:00,200,1892,-,Mozilla/5.0 (compatible; DotBot/1.2; +https://opensiteexplorer.org/dotbot; help@moz.com),-,GET,/tag/gamedevhispano/?date=1481587200,HTTP/1.1
54.36.148.74,-,2024-10-11 14:11:49+02:00,301,169,-,Mozilla/5.0 (compatible; AhrefsBot/7.0; +http://ahrefs.com/robot/),-,GET,/tag/uphold,HTTP/2.0
66.39.4.53,-,2024-10-11 14:13:37+02:00,404,153,https://blog.adrianistan.eu/feed/,FreshRSS/1.22.1 (Linux; https://freshrss.org),-,GET,/feed/,HTTP/2.0
5.9.7.215,-,2024-10-11 14:13:40+02:00,304,0,-,Bifolia:0.1.1,-,GET,/rss.xml,HTTP/2.0
167.60.43.90,-,2024-10-11 14:15:13+02:00,304,0,https://blog.adrianistan.eu/rss.xml,FreshRSS/1.24.3 (Linux; https://freshrss.org),-,GET,/rss.xml,HTTP/2.0


## Most popular pages


In [7]:
%%sql
SELECT path, COUNT(*) AS count
FROM read_parquet('logs-blog.parquet')
WHERE
    time >= DATE '2024-10-01' AND
    time <= DATE '2024-10-31'
GROUP BY path
ORDER BY count DESC
LIMIT 50

path,count
/rss.xml,14414
/,2564
/favicon.ico,1557
/robots.txt,1043
/static/fonts.css,880
/static/main.css,873
/static/map.webp,793
/static/IBMPlexSerif-Regular.woff2,722
/static/IBMPlexMono-Regular.woff2,715
/static/favicon.ico,388


## Most popular pages (filtering probably robots)

In [8]:
%%sql
SELECT path, COUNT(*) AS count
FROM read_parquet('logs-blog.parquet')
WHERE
    time >= DATE '2024-10-01' AND
    time <= DATE '2024-10-31' AND
    http_user_agent LIKE 'Mozilla%' AND
    remote_addr NOT IN (
      SELECT DISTINCT remote_addr
      FROM read_parquet('logs-blog.parquet')
      WHERE path = '/robots.txt' OR path = '/rss.xml'
    )
GROUP BY path
ORDER BY count DESC
LIMIT 50

path,count
/favicon.ico,851
/static/fonts.css,815
/static/main.css,811
/static/map.webp,738
/,705
/static/IBMPlexSerif-Regular.woff2,704
/static/IBMPlexMono-Regular.woff2,698
/static/favicon.ico,382
/haz-scripts/,255
/estadistica-python-media-mediana-varianza-percentiles-parte-iii/,205


## Data sent (in MB)

In [9]:
%%sql
SELECT SUM(body_bytes_sent)/1000000 FROM read_parquet('logs-blog.parquet')

(sum(body_bytes_sent) / 1000000)
826.588415


In [10]:
%%sql
SELECT path, SUM(body_bytes_sent)/1000000 AS data_size
FROM read_parquet('logs-blog.parquet')
GROUP BY path
ORDER BY data_size DESC
LIMIT 20

path,data_size
/static/map.webp,537.147901
/rss.xml,114.935516
/static/IBMPlexSerif-Regular.woff2,35.496348
/static/IBMPlexMono-Regular.woff2,26.952922
/,12.350663
/static/IBMPlexSerif-Regular.ttf,3.477964
/static/IBMPlexMono-Regular.ttf,2.870228
/archivo/,2.756689
/tutorial-de-cmake/,2.671659
/supertutorial-prolog/,2.409678


## Browsers and Operating systems

In [33]:
%%sql
SELECT http_user_agent, COUNT(*) AS count
FROM read_parquet('logs-blog.parquet')
GROUP BY http_user_agent
ORDER BY count DESC
LIMIT 20

http_user_agent,count
"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.6668.89 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",4871
Mozilla/5.0 (compatible; AhrefsBot/7.0; +http://ahrefs.com/robot/),2483
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",2282
Bifolia:0.1.1,2195
Mozilla/5.0 (compatible; AwarioBot/1.0; +https://awario.com/bots.html),1759
"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)",1729
Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot.html),1324
-,1308
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 OPR/98.0.0.0",1182
FreshRSS/1.25.0-dev (Linux; https://freshrss.org),1097


In [14]:
%%sql
SELECT 'Firefox' AS browser, COUNT(*) AS count
FROM read_parquet('logs-blog.parquet')
WHERE http_user_agent LIKE '%Firefox%' AND http_user_agent NOT LIKE '%bot%'
UNION
SELECT 'Chrome' AS browser, COUNT(*) AS count
FROM read_parquet('logs-blog.parquet')
WHERE http_user_agent LIKE '%Chrome%' AND http_user_agent NOT LIKE '%bot%'
UNION
SELECT 'Safari' AS browser, COUNT(*) AS count
FROM read_parquet('logs-blog.parquet')
WHERE http_user_agent LIKE '%Safari%' AND http_user_agent NOT LIKE '%bot%'
UNION
SELECT 'Edge' AS browser, COUNT(*) AS count
FROM read_parquet('logs-blog.parquet')
WHERE http_user_agent LIKE '%Edge%' AND http_user_agent NOT LIKE '%bot%'

browser,count
Chrome,9083
Edge,56
Safari,9647
Firefox,2135


In [19]:
%%sql
SELECT 'Linux' AS os, COUNT(*) AS count
FROM read_parquet('logs-blog.parquet')
WHERE http_user_agent LIKE '%Linux%' AND http_user_agent NOT LIKE '%bot%'
UNION
SELECT 'Windows' AS os, COUNT(*) AS count
FROM read_parquet('logs-blog.parquet')
WHERE http_user_agent LIKE '%Windows%' AND http_user_agent NOT LIKE '%bot%'
UNION
SELECT 'macOS' AS os, COUNT(*) AS count
FROM read_parquet('logs-blog.parquet')
WHERE http_user_agent LIKE '%Mac%' AND http_user_agent NOT LIKE '%bot%'
UNION
SELECT 'Android' AS os, COUNT(*) AS count
FROM read_parquet('logs-blog.parquet')
WHERE http_user_agent LIKE '%Android%' AND http_user_agent NOT LIKE '%bot%'
UNION
SELECT 'iOS' AS os, COUNT(*) AS count
FROM read_parquet('logs-blog.parquet')
WHERE http_user_agent LIKE '%iOS%' AND http_user_agent NOT LIKE '%bot%'

os,count
Windows,6346
iOS,31
Android,1954
macOS,2815
Linux,7094


In [34]:
%%sql
SELECT http_user_agent, COUNT(*) AS count
FROM read_parquet('logs-blog.parquet')
WHERE http_user_agent NOT LIKE 'Mozilla%'
GROUP BY http_user_agent
ORDER BY count DESC
LIMIT 20

http_user_agent,count
Bifolia:0.1.1,2195
-,1308
FreshRSS/1.25.0-dev (Linux; https://freshrss.org),1097
Feedbin feed-id:1818775 - 13 subscribers,1058
FreshRSS/1.24.3 (Linux; https://freshrss.org),994
Unread RSS Reader - https://www.goldenhillsoftware.com/unread/,945
FreshRSS/1.23.1 (Linux; https://freshrss.org),759
Gofeed/1.0,643
FreshRSS/1.21.0 (Linux; https://freshrss.org),608
Tiny Tiny RSS/UNKNOWN (Unsupported) (https://tt-rss.org/),548


## Referrals

In [35]:
%%sql
SELECT http_referer, COUNT(*) count
FROM read_parquet('logs-blog.parquet')
WHERE http_referer != '-'
GROUP BY http_referer
ORDER BY count DESC
LIMIT 20

http_referer,count
https://blog.adrianistan.eu/rss.xml,3186
https://blog.adrianistan.eu/static/fonts.css,1342
https://www.google.com/,1210
https://blog.adrianistan.eu/,1160
https://blog.adrianistan.eu/sitemap.xml,1160
https://blog.adrianistan.eu/static/main.css,711
https://blog.adrianistan.eu/feed/,285
https://blog.adrianistan.eu/estadistica-python-media-mediana-varianza-percentiles-parte-iii/,252
https://blog.adrianistan.eu/haz-scripts/,251
www.google.com,244


In [36]:
%%sql
SELECT http_referer, COUNT(*) count
FROM read_parquet('logs-blog.parquet')
WHERE 
    http_referer != '-' AND
    http_referer NOT LIKE 'https://blog.adrianistan.eu%' AND
    http_referer NOT LIKE 'http://blog.adrianistan.eu%' AND
    http_referer NOT LIKE 'http://90.170.32.205%'
GROUP BY http_referer
ORDER BY count DESC
LIMIT 20

http_referer,count
https://www.google.com/,1210
www.google.com,244
https://www.google.com,115
https://www.bing.com/,45
https://iask.ai/,40
https://duckduckgo.com/,20
https://www.ecosia.org/,18
binance.com,14
https://www.google.com.mx/,13
https://search.brave.com/,10


## Protocols

In [29]:
%%sql
SELECT protocol, COUNT(*) AS count
FROM read_parquet('logs-blog.parquet')
GROUP BY protocol
ORDER BY count DESC
LIMIT 5

protocol,count
HTTP/1.1,24893
HTTP/2.0,22541
HTTP/1.0,757
,158
,17


In [31]:
%%sql
SELECT method, COUNT(*) AS count
FROM read_parquet('logs-blog.parquet')
GROUP BY method
ORDER BY count DESC
LIMIT 10

method,count
GET,47862
HEAD,172
,158
POST,99
CONNECT,23
GET /shell?cd+/tmp;rm+-rf+*;wget+,21
,17
PRI,14
\x00\x00\x001\xFFSMBr\x00\x00\x00\x00\x18Eh\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xB5}\x00\x00\x01\x00\x00\x0E\x00\x02NT,2
\x16\x03\x01\x00\xEA\x01\x00\x00\xE6\x03\x03\xBEj@\xCA\x08\x049\x14q\x94\xCF>\xD7\x9F\xAB|\xDB@~\xFDN\x10t\xCA\xAC\xF7\x16\xC1\x1B\xD8\x02\xA1,1


## Broken pages

In [38]:
%%sql
SELECT path, COUNT(*) AS count
FROM read_parquet('logs-blog.parquet')
WHERE
  status = 404
GROUP BY path
ORDER BY count DESC
LIMIT 20

path,count
/favicon.ico,1523
/robots.txt,806
/static/favicon.ico,388
/feed/,323
/.well-known/traffic-advice,159
/2017/01/17/tutorial-piston-programa-juegos-rust/,62
/apple-touch-icon-precomposed.png,36
/apple-touch-icon.png,36
/wp-login.php,32
/category/programacion/feed/,28
