# Web Stats social.adrianistan.eu

## Download data

In [1]:
# Location of the parquet export of the access logs that the next cell downloads.
# NOTE(review): the query string embeds a shared-access signature (the `sig=` parameter),
# i.e. a credential committed with the notebook. It appears to be read-only (`sp=r`) and
# to expire at the `se=` timestamp — confirm, and consider loading it from the environment.
URL = "https://adrianistanlogs.blob.core.windows.net/social-adrianistan-eu/logs.parquet?sp=r&st=2024-10-20T15:52:48Z&se=2025-10-20T23:52:48Z&spr=https&sv=2022-11-02&sr=c&sig=sBCORc%2FvSvSOHGOI%2BFynAUzgxlxGNqsvWDr7%2B7LfWmA%3D"

In [2]:
from urllib.request import urlretrieve

# Download the parquet log export next to the notebook; the (path, headers) result is the cell output.
urlretrieve(url=URL, filename="logs-social.parquet")

('logs-social.parquet', <http.client.HTTPMessage at 0x72129c00c890>)

# Analysis

1. Most popular pages
2. Data sent
3. Browsers / Operating systems
4. Referrals
5. Protocols
6. Broken pages

In [3]:
import duckdb
import pandas as pd

# Load the SQL magic extension so that the %sql / %%sql magics used below are available.
%load_ext sql
# Open a DuckDB connection; with no arguments this is presumably an in-memory database — confirm.
conn = duckdb.connect()
# Register that connection with the SQL magic under the alias "duckdb".
%sql conn --alias duckdb
# Set SqlMagic.displaylimit to None (presumably removes the cap on displayed rows — confirm).
%config SqlMagic.displaylimit = None

In [4]:
%%sql
SELECT logs.*
-- Peek at the first 50 raw log rows to see which columns the export provides
FROM read_parquet('logs-social.parquet') AS logs
LIMIT 50

remote_addr,remote_user,time,status,body_bytes_sent,http_referer,http_user_agent,http_x_forwarded_for,method,path,protocol
3.84.57.0,-,2024-10-19 19:44:38+02:00,200,4,-,http.rb/5.2.0 (Mastodon/3.5.19-qoto; +https://qoto.org/),-,POST,/inbox,HTTP/1.1
66.148.120.148,-,2024-10-19 19:44:53+02:00,200,4,-,http.rb/5.1.1 (Mastodon/4.1.19; +https://mastodon.sdf.org/),-,POST,/inbox,HTTP/1.1
46.226.110.18,-,2024-10-19 19:45:05+02:00,200,4,-,http.rb/5.1.1 (Mastodon/4.2.13; +https://hostux.social/),-,POST,/inbox,HTTP/1.1
134.122.91.198,-,2024-10-19 19:46:16+02:00,200,4,-,http.rb/5.1.1 (Mastodon/4.2.10+hometown-1.1.1; +https://merveilles.town/),-,POST,/inbox,HTTP/1.1
205.166.94.38,-,2024-10-19 19:47:33+02:00,200,4,-,http.rb/5.1.1 (Mastodon/4.1.19; +https://mastodon.sdf.org/),-,POST,/inbox,HTTP/1.1
168.119.6.104,-,2024-10-19 19:48:01+02:00,200,219473,-,Go-http-client/2.0,-,GET,/api/v1/timelines/public?limit=100&since_id=AnAGGd26JEU9Em7fTE,HTTP/2.0
3.86.9.227,-,2024-10-19 19:48:22+02:00,200,4,-,http.rb/5.2.0 (Mastodon/3.5.19-qoto; +https://qoto.org/),-,POST,/inbox,HTTP/1.1
142.132.159.116,-,2024-10-19 19:48:50+02:00,200,4,-,Mastodon/4.3.0 (http.rb/5.2.0; +https://mas.to/),-,POST,/inbox,HTTP/1.1
134.122.91.198,-,2024-10-19 19:49:24+02:00,200,4,-,http.rb/5.1.1 (Mastodon/4.2.10+hometown-1.1.1; +https://merveilles.town/),-,POST,/inbox,HTTP/1.1
66.148.120.148,-,2024-10-19 19:49:31+02:00,200,4,-,http.rb/5.1.1 (Mastodon/4.1.19; +https://mastodon.sdf.org/),-,POST,/inbox,HTTP/1.1


## Most popular pages


In [5]:
%%sql
SELECT path, COUNT(*) AS count
FROM read_parquet('logs-social.parquet')
WHERE
    -- Half-open interval covering all of October 2024. An inclusive upper bound of
    -- DATE 2024-10-31 would stop at midnight and lose every request of the last day.
    time >= DATE '2024-10-01' AND
    time < DATE '2024-11-01'
GROUP BY path
-- The secondary key keeps the order of equally popular paths stable between runs
ORDER BY count DESC, path
LIMIT 50

path,count
/inbox,14
/api/v1/timelines/public?limit=100&since_id=AnAGGd26JEU9Em7fTE,1


## Most popular pages (filtering out probable robots)

In [6]:
%%sql
SELECT path, COUNT(*) AS count
FROM read_parquet('logs-social.parquet')
WHERE
    -- Half-open interval covering all of October 2024. An inclusive upper bound of
    -- DATE 2024-10-31 would stop at midnight and lose every request of the last day.
    time >= DATE '2024-10-01' AND
    time < DATE '2024-11-01' AND
    -- Keep only user agents that start with Mozilla, as browsers send
    http_user_agent LIKE 'Mozilla%' AND
    -- Any address that ever fetched robots.txt or the RSS feed is treated as a crawler
    remote_addr NOT IN (
      SELECT remote_addr
      FROM read_parquet('logs-social.parquet')
      WHERE
        path IN ('/robots.txt', '/rss.xml') AND
        -- A single NULL in this list would make NOT IN reject every outer row
        remote_addr IS NOT NULL
    )
GROUP BY path
-- The secondary key keeps the order of equally popular paths stable between runs
ORDER BY count DESC, path
LIMIT 50

path,count


## Data sent (in MB)

In [7]:
%%sql
SELECT
    -- Sum of all response body sizes in bytes, divided by one million
    SUM(body_bytes_sent) / 1000000
FROM read_parquet('logs-social.parquet')

(sum(body_bytes_sent) / 1000000)
0.219529


In [8]:
%%sql
SELECT
    logs.path,
    -- Bytes sent for this path, divided by one million
    SUM(logs.body_bytes_sent) / 1000000 AS data_size
FROM read_parquet('logs-social.parquet') AS logs
GROUP BY logs.path
ORDER BY data_size DESC
LIMIT 20

path,data_size
/api/v1/timelines/public?limit=100&since_id=AnAGGd26JEU9Em7fTE,0.219473
/inbox,5.6e-05


## Browsers and Operating systems

In [9]:
%%sql
SELECT
    logs.http_user_agent,
    COUNT(*) AS count
-- Twenty most frequent raw user agent strings
FROM read_parquet('logs-social.parquet') AS logs
GROUP BY logs.http_user_agent
ORDER BY count DESC
LIMIT 20

http_user_agent,count
http.rb/5.1.1 (Mastodon/4.1.19; +https://mastodon.sdf.org/),4
http.rb/5.1.1 (Mastodon/4.2.10+hometown-1.1.1; +https://merveilles.town/),2
http.rb/5.2.0 (Mastodon/3.5.19-qoto; +https://qoto.org/),2
Mastodon/4.3.0 (http.rb/5.2.0; +https://mas.to/),2
Mastodon/4.3.0+pr-32577-ba659d5 (http.rb/5.2.0; +https://mastodon.social/),2
http.rb/5.1.1 (Mastodon/4.2.13; +https://hostux.social/),1
Go-http-client/2.0,1
Mastodon/4.3.0+glitch (http.rb/5.2.0; +https://mstdn.ca/),1


In [10]:
%%sql
WITH human_hits AS (
    -- Scan the log once. ILIKE makes the bot filter case-insensitive, so agents
    -- spelled Bot or BOT are dropped as well as the lowercase ones.
    SELECT http_user_agent AS ua
    FROM read_parquet('logs-social.parquet')
    WHERE http_user_agent NOT ILIKE '%bot%'
)
SELECT 'Firefox' AS browser, COUNT(*) AS count
FROM human_hits
WHERE ua LIKE '%Firefox%'
UNION ALL
-- Edge announces itself with an Edg, EdgA, EdgiOS or legacy Edge token
SELECT 'Edge' AS browser, COUNT(*) AS count
FROM human_hits
WHERE ua LIKE '%Edg%'
UNION ALL
-- Edge user agents also carry a Chrome token, so they are excluded here
SELECT 'Chrome' AS browser, COUNT(*) AS count
FROM human_hits
WHERE ua LIKE '%Chrome%' AND ua NOT LIKE '%Edg%'
UNION ALL
-- Chrome and Edge user agents also carry a Safari token, so they are excluded here
SELECT 'Safari' AS browser, COUNT(*) AS count
FROM human_hits
WHERE ua LIKE '%Safari%' AND ua NOT LIKE '%Chrome%' AND ua NOT LIKE '%Edg%'
-- Every browser keeps its row even at zero hits, and the explicit order is stable
ORDER BY count DESC, browser

browser,count
Safari,0
Edge,0
Firefox,0
Chrome,0


In [11]:
%%sql
WITH human_hits AS (
    -- Scan the log once. ILIKE makes the bot filter case-insensitive, so agents
    -- spelled Bot or BOT are dropped as well as the lowercase ones.
    SELECT http_user_agent AS ua
    FROM read_parquet('logs-social.parquet')
    WHERE http_user_agent NOT ILIKE '%bot%'
)
-- Android user agents also contain Linux, so they are counted only as Android
SELECT 'Linux' AS os, COUNT(*) AS count
FROM human_hits
WHERE ua LIKE '%Linux%' AND ua NOT LIKE '%Android%'
UNION ALL
SELECT 'Windows' AS os, COUNT(*) AS count
FROM human_hits
WHERE ua LIKE '%Windows%'
UNION ALL
-- iPhone, iPad and iPod user agents say like Mac OS X, so keep them out of macOS
SELECT 'macOS' AS os, COUNT(*) AS count
FROM human_hits
WHERE
    ua LIKE '%Mac%' AND
    ua NOT LIKE '%iPhone%' AND
    ua NOT LIKE '%iPad%' AND
    ua NOT LIKE '%iPod%'
UNION ALL
SELECT 'Android' AS os, COUNT(*) AS count
FROM human_hits
WHERE ua LIKE '%Android%'
UNION ALL
-- iOS devices do not send a literal iOS token, so match on the device name instead
SELECT 'iOS' AS os, COUNT(*) AS count
FROM human_hits
WHERE ua LIKE '%iPhone%' OR ua LIKE '%iPad%' OR ua LIKE '%iPod%'
-- Every system keeps its row even at zero hits, and the explicit order is stable
ORDER BY count DESC, os

os,count
Linux,0
macOS,0
Windows,0
iOS,0
Android,0


In [12]:
%%sql
SELECT
    logs.http_user_agent,
    COUNT(*) AS count
FROM read_parquet('logs-social.parquet') AS logs
-- Only agents that do not start with Mozilla
WHERE NOT (logs.http_user_agent LIKE 'Mozilla%')
GROUP BY logs.http_user_agent
ORDER BY count DESC
LIMIT 20

http_user_agent,count
http.rb/5.1.1 (Mastodon/4.1.19; +https://mastodon.sdf.org/),4
http.rb/5.2.0 (Mastodon/3.5.19-qoto; +https://qoto.org/),2
Mastodon/4.3.0 (http.rb/5.2.0; +https://mas.to/),2
Mastodon/4.3.0+pr-32577-ba659d5 (http.rb/5.2.0; +https://mastodon.social/),2
http.rb/5.1.1 (Mastodon/4.2.10+hometown-1.1.1; +https://merveilles.town/),2
Go-http-client/2.0,1
Mastodon/4.3.0+glitch (http.rb/5.2.0; +https://mstdn.ca/),1
http.rb/5.1.1 (Mastodon/4.2.13; +https://hostux.social/),1


## Referrals

In [13]:
%%sql
SELECT
    logs.http_referer,
    COUNT(*) AS count
FROM read_parquet('logs-social.parquet') AS logs
-- Skip rows whose referer column is just a dash
WHERE logs.http_referer <> '-'
GROUP BY logs.http_referer
ORDER BY count DESC
LIMIT 20

http_referer,count


In [14]:
%%sql
SELECT http_referer, COUNT(*) AS count
FROM read_parquet('logs-social.parquet')
WHERE
    -- Rows without a referer show up as a dash in this log
    http_referer != '-' AND
    -- Drop self-referrals. The site analysed here is social.adrianistan.eu,
    -- so that is the host to exclude
    http_referer NOT LIKE 'https://social.adrianistan.eu%' AND
    http_referer NOT LIKE 'http://social.adrianistan.eu%' AND
    -- Presumably the address of the server itself, to be confirmed
    http_referer NOT LIKE 'http://90.170.32.205%'
GROUP BY http_referer
-- The secondary key keeps the order of tied referers stable between runs
ORDER BY count DESC, http_referer
LIMIT 20

http_referer,count


## Protocols

In [15]:
%%sql
SELECT
    logs.protocol,
    COUNT(*) AS count
-- Requests per HTTP protocol version, top five
FROM read_parquet('logs-social.parquet') AS logs
GROUP BY logs.protocol
ORDER BY count DESC
LIMIT 5

protocol,count
HTTP/1.1,14
HTTP/2.0,1


In [16]:
%%sql
SELECT
    logs.method,
    COUNT(*) AS count
-- Requests per HTTP method, top ten
FROM read_parquet('logs-social.parquet') AS logs
GROUP BY logs.method
ORDER BY count DESC
LIMIT 10

method,count
POST,14
GET,1


## Broken pages

In [17]:
%%sql
SELECT
    logs.path,
    COUNT(*) AS count
FROM read_parquet('logs-social.parquet') AS logs
-- Only requests answered with status 404
WHERE logs.status = 404
GROUP BY logs.path
ORDER BY count DESC
LIMIT 20

path,count
