In [2]:
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import hvplot.polars
import altair as alt
import polars.selectors as cs

In [3]:
print(pl.__version__)

1.22.0


In [None]:
df_path = r"F:\Datasets\txt_files\access.log"

In [4]:
df_lazy = pl.read_csv(
    df_path,
    separator="\0",
    has_header=False,
    ignore_errors=False,
    infer_schema_length=None,
    new_columns=["LogData"]
).lazy()


In [5]:
df_lazy = df_lazy.with_columns(
    pl.col('LogData').str.extract(r'((\d{1,3}\.){3}\d{1,3})').alias('IP'),
    pl.col("LogData").str.extract(r"(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} [+-]\d{4})").alias("Date"),
    pl.col('LogData').str.extract(r"(GET|POST|PUT|DELETE|HEAD|OPTIONS|PATCH)\s+([^\s]+)\s+([^\s]+)").alias('Request'),
    pl.col("LogData").str.extract(r'HTTP/1.1"\s(\d+)').alias("Status_Code"),
    pl.col("LogData").str.extract(r'HTTP/1.1"\s\d+\s(\d+)').alias("Response_Size"),
    pl.col("LogData").str.extract(r'"([^"]*)"\s*$').alias("Referer"),
    pl.col("LogData").str.extract(r'"([^"]*)"\s+"[^"]*"\s*$').alias("User_Agent"),
)

In [6]:
df_lazy = df_lazy.with_columns(
    pl.col("Date").str.strptime(pl.Datetime, "%d/%b/%Y:%H:%M:%S %z"),
    pl.col("Status_Code").cast(pl.UInt16),
    pl.col("Response_Size").cast(pl.UInt32)
)

In [None]:
pl.scan_csv(df_path)

In [8]:
df = df_lazy.collect()

In [11]:
df.estimated_size('gb')

1.3641958562657237

In [12]:
df

IP,Date,Request,Status_Code,Response_Size,Referer,User_Agent
str,"datetime[μs, UTC]",str,u16,u32,str,str
"""54.36.149.41""",2019-01-22 00:26:14 UTC,"""GET""",200,30577,"""-""","""Mozilla/5.0 (compatible; Ahref…"
"""31.56.96.51""",2019-01-22 00:26:16 UTC,"""GET""",200,5667,"""-""","""Mozilla/5.0 (Linux; Android 6.…"
"""31.56.96.51""",2019-01-22 00:26:16 UTC,"""GET""",200,5379,"""-""","""Mozilla/5.0 (Linux; Android 6.…"
"""40.77.167.129""",2019-01-22 00:26:17 UTC,"""GET""",200,1696,"""-""","""Mozilla/5.0 (compatible; bingb…"
"""91.99.72.15""",2019-01-22 00:26:17 UTC,"""GET""",200,41483,"""-""","""Mozilla/5.0 (Windows NT 6.2; W…"
…,…,…,…,…,…,…
"""188.229.21.56""",2019-01-26 16:59:13 UTC,"""GET""",302,0,"""-""","""Mozilla/5.0 (Linux; Android 7.…"
"""5.127.220.71""",2019-01-26 16:59:13 UTC,"""GET""",404,32420,"""-""","""MobileSafari/604.1 CFNetwork/9…"
"""5.213.7.50""",2019-01-26 16:59:13 UTC,"""GET""",200,20959,"""-""","""Mozilla/5.0 (iPhone; CPU iPhon…"
"""109.125.169.52""",2019-01-26 16:59:13 UTC,"""GET""",200,5,"""-""","""Mozilla/5.0 (Windows NT 6.1; r…"


In [22]:
df.with_columns(
    pl.col('Date').dt.weekday().alias('Weekday')
)

IP,Date,Request,Status_Code,Response_Size,Referer,User_Agent,Weekday
str,"datetime[μs, UTC]",str,u16,u32,str,str,i8
"""54.36.149.41""",2019-01-22 00:26:14 UTC,"""GET""",200,30577,"""-""","""Mozilla/5.0 (compatible; Ahref…",2
"""31.56.96.51""",2019-01-22 00:26:16 UTC,"""GET""",200,5667,"""-""","""Mozilla/5.0 (Linux; Android 6.…",2
"""31.56.96.51""",2019-01-22 00:26:16 UTC,"""GET""",200,5379,"""-""","""Mozilla/5.0 (Linux; Android 6.…",2
"""40.77.167.129""",2019-01-22 00:26:17 UTC,"""GET""",200,1696,"""-""","""Mozilla/5.0 (compatible; bingb…",2
"""91.99.72.15""",2019-01-22 00:26:17 UTC,"""GET""",200,41483,"""-""","""Mozilla/5.0 (Windows NT 6.2; W…",2
…,…,…,…,…,…,…,…
"""188.229.21.56""",2019-01-26 16:59:13 UTC,"""GET""",302,0,"""-""","""Mozilla/5.0 (Linux; Android 7.…",6
"""5.127.220.71""",2019-01-26 16:59:13 UTC,"""GET""",404,32420,"""-""","""MobileSafari/604.1 CFNetwork/9…",6
"""5.213.7.50""",2019-01-26 16:59:13 UTC,"""GET""",200,20959,"""-""","""Mozilla/5.0 (iPhone; CPU iPhon…",6
"""109.125.169.52""",2019-01-26 16:59:13 UTC,"""GET""",200,5,"""-""","""Mozilla/5.0 (Windows NT 6.1; r…",6


In [24]:
pl.date_range(
    start="2000-01-01",
    end="2021-12-31",
    interval="1d"
)

In [33]:
df.group_by_dynamic(
    pl.col('Date').sort(), every='10m'
).agg(
    pl.sum('Response_Size').alias('Total_Response_Size')
)

Date,Total_Response_Size
"datetime[μs, UTC]",u32
2019-01-22 00:20:00 UTC,23748087
2019-01-22 00:30:00 UTC,49627795
2019-01-22 00:40:00 UTC,52008606
2019-01-22 00:50:00 UTC,45376413
2019-01-22 01:00:00 UTC,54409278
…,…
2019-01-26 16:10:00 UTC,325179136
2019-01-26 16:20:00 UTC,284297733
2019-01-26 16:30:00 UTC,261076904
2019-01-26 16:40:00 UTC,260260542


In [51]:
df.with_columns(
    rolling_sum=pl.col('Response_Size')
    .rolling_sum(
        window_size=1,
    ).alias('Rolling_Sum')
)

IP,Date,Request,Status_Code,Response_Size,Referer,User_Agent,rolling_sum
str,"datetime[μs, UTC]",str,u16,u32,str,str,u32
"""54.36.149.41""",2019-01-22 00:26:14 UTC,"""GET""",200,30577,"""-""","""Mozilla/5.0 (compatible; Ahref…",30577
"""31.56.96.51""",2019-01-22 00:26:16 UTC,"""GET""",200,5667,"""-""","""Mozilla/5.0 (Linux; Android 6.…",5667
"""31.56.96.51""",2019-01-22 00:26:16 UTC,"""GET""",200,5379,"""-""","""Mozilla/5.0 (Linux; Android 6.…",5379
"""40.77.167.129""",2019-01-22 00:26:17 UTC,"""GET""",200,1696,"""-""","""Mozilla/5.0 (compatible; bingb…",1696
"""91.99.72.15""",2019-01-22 00:26:17 UTC,"""GET""",200,41483,"""-""","""Mozilla/5.0 (Windows NT 6.2; W…",41483
…,…,…,…,…,…,…,…
"""188.229.21.56""",2019-01-26 16:59:13 UTC,"""GET""",302,0,"""-""","""Mozilla/5.0 (Linux; Android 7.…",0
"""5.127.220.71""",2019-01-26 16:59:13 UTC,"""GET""",404,32420,"""-""","""MobileSafari/604.1 CFNetwork/9…",32420
"""5.213.7.50""",2019-01-26 16:59:13 UTC,"""GET""",200,20959,"""-""","""Mozilla/5.0 (iPhone; CPU iPhon…",20959
"""109.125.169.52""",2019-01-26 16:59:13 UTC,"""GET""",200,5,"""-""","""Mozilla/5.0 (Windows NT 6.1; r…",5


In [68]:
df['Status_Code'].dtype

UInt16

In [74]:
df_sorted = df.sort('Status_Code')

df_sorted.select(
    pl.col('Request').search_sorted("POST").alias('GET'),
    pl.col('Status_Code').search_sorted(404).alias('Status_404'),
    pl.col('Status_Code').search_sorted(200).alias('Status_200')
)

GET,Status_404,Status_200
u32,u32,u32
10364993,10194011,1012


In [84]:
df_sorted.row(10194010)

('206.189.101.79',
 datetime.datetime(2019, 1, 26, 16, 58, 18, tzinfo=zoneinfo.ZoneInfo(key='UTC')),
 'GET',
 403,
 189,
 '-',
 'Mozilla/5.0 (Linux; Android 7.1.1; SAMSUNG SM-J250F Build/NMF26X) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/8.2 Chrome/63.0.3239.111 Mobile Safari/537.36')

In [82]:
df_sorted

IP,Date,Request,Status_Code,Response_Size,Referer,User_Agent
str,"datetime[μs, UTC]",str,u16,u32,str,str
"""188.25.76.139""",2019-01-22 00:29:19 UTC,"""GET""",,,"""-""","""Mozilla/5.0 (compatible; MSIE …"
"""93.113.124.199""",2019-01-22 00:30:06 UTC,"""GET""",,,"""-""","""\x22nlpproject.info research\x…"
"""208.80.194.31""",2019-01-22 00:42:55 UTC,"""GET""",,,"""-""","""Mozilla/5.0 (Windows NT 5.1) G…"
"""208.80.194.31""",2019-01-22 00:42:57 UTC,"""GET""",,,"""-""","""Mozilla/5.0 (Windows NT 5.1) G…"
"""207.241.231.165""",2019-01-22 00:55:10 UTC,"""GET""",,,"""-""","""Mozilla/5.0 (compatible; archi…"
…,…,…,…,…,…,…
"""66.249.66.91""",2019-01-26 13:41:36 UTC,"""GET""",504,570,"""-""","""Mozilla/5.0 (compatible; Googl…"
"""66.249.66.91""",2019-01-26 14:07:41 UTC,"""GET""",504,570,"""-""","""Mozilla/5.0 (compatible; Googl…"
"""66.249.66.91""",2019-01-26 14:09:13 UTC,"""GET""",504,570,"""-""","""Mozilla/5.0 (compatible; Googl…"
"""66.249.66.93""",2019-01-26 14:31:14 UTC,"""GET""",504,570,"""-""","""Mozilla/5.0 (compatible; Googl…"


In [91]:
df.select(
    pl.col('Status_Code').value_counts().struct.rename_fields(
        ['Status_Code', 'Count']
    ).alias('Split')
).unnest('Split')

Status_Code,Count
u16,u32
304,340228
499,50852
504,103
401,323
400,525
…,…
404,105004
301,67121
500,14266
408,112


In [102]:
df.select(
    pl.col('Status_Code').unique_counts()
)

Status_Code
u32
9579390
105004
199784
67121
340228
…
103
112
323
6


In [107]:
df.with_columns(
    all=pl.all_horizontal([pl.col('Status_Code').is_null(), pl.col('Response_Size').is_null()]).alias('All_Null'),
)

IP,Date,Request,Status_Code,Response_Size,Referer,User_Agent,all
str,"datetime[μs, UTC]",str,u16,u32,str,str,bool
"""54.36.149.41""",2019-01-22 00:26:14 UTC,"""GET""",200,30577,"""-""","""Mozilla/5.0 (compatible; Ahref…",false
"""31.56.96.51""",2019-01-22 00:26:16 UTC,"""GET""",200,5667,"""-""","""Mozilla/5.0 (Linux; Android 6.…",false
"""31.56.96.51""",2019-01-22 00:26:16 UTC,"""GET""",200,5379,"""-""","""Mozilla/5.0 (Linux; Android 6.…",false
"""40.77.167.129""",2019-01-22 00:26:17 UTC,"""GET""",200,1696,"""-""","""Mozilla/5.0 (compatible; bingb…",false
"""91.99.72.15""",2019-01-22 00:26:17 UTC,"""GET""",200,41483,"""-""","""Mozilla/5.0 (Windows NT 6.2; W…",false
…,…,…,…,…,…,…,…
"""188.229.21.56""",2019-01-26 16:59:13 UTC,"""GET""",302,0,"""-""","""Mozilla/5.0 (Linux; Android 7.…",false
"""5.127.220.71""",2019-01-26 16:59:13 UTC,"""GET""",404,32420,"""-""","""MobileSafari/604.1 CFNetwork/9…",false
"""5.213.7.50""",2019-01-26 16:59:13 UTC,"""GET""",200,20959,"""-""","""Mozilla/5.0 (iPhone; CPU iPhon…",false
"""109.125.169.52""",2019-01-26 16:59:13 UTC,"""GET""",200,5,"""-""","""Mozilla/5.0 (Windows NT 6.1; r…",false


In [117]:
df.with_columns(
    pl.business_day_count(
        pl.col('Date').min().dt.date(),
        pl.col('Date').max().dt.date()
    ).alias('Business_Day_Count'),
)

IP,Date,Request,Status_Code,Response_Size,Referer,User_Agent,Business_Day_Count
str,"datetime[μs, UTC]",str,u16,u32,str,str,i32
"""54.36.149.41""",2019-01-22 00:26:14 UTC,"""GET""",200,30577,"""-""","""Mozilla/5.0 (compatible; Ahref…",4
"""31.56.96.51""",2019-01-22 00:26:16 UTC,"""GET""",200,5667,"""-""","""Mozilla/5.0 (Linux; Android 6.…",4
"""31.56.96.51""",2019-01-22 00:26:16 UTC,"""GET""",200,5379,"""-""","""Mozilla/5.0 (Linux; Android 6.…",4
"""40.77.167.129""",2019-01-22 00:26:17 UTC,"""GET""",200,1696,"""-""","""Mozilla/5.0 (compatible; bingb…",4
"""91.99.72.15""",2019-01-22 00:26:17 UTC,"""GET""",200,41483,"""-""","""Mozilla/5.0 (Windows NT 6.2; W…",4
…,…,…,…,…,…,…,…
"""188.229.21.56""",2019-01-26 16:59:13 UTC,"""GET""",302,0,"""-""","""Mozilla/5.0 (Linux; Android 7.…",4
"""5.127.220.71""",2019-01-26 16:59:13 UTC,"""GET""",404,32420,"""-""","""MobileSafari/604.1 CFNetwork/9…",4
"""5.213.7.50""",2019-01-26 16:59:13 UTC,"""GET""",200,20959,"""-""","""Mozilla/5.0 (iPhone; CPU iPhon…",4
"""109.125.169.52""",2019-01-26 16:59:13 UTC,"""GET""",200,5,"""-""","""Mozilla/5.0 (Windows NT 6.1; r…",4


In [120]:
df.select(
    pl.implode('Request', 'Status_Code')
)

Request,Status_Code
list[str],list[u16]
"[""GET"", ""GET"", … ""GET""]","[200, 200, … 200]"


In [123]:
df.select(
    pl.int_ranges(0, 10).alias('Int_Ranges')
)

Int_Ranges
list[i64]
"[0, 1, … 9]"


In [127]:
df.select(
    pl.nth(3)
)

Status_Code
u16
200
200
200
200
200
…
302
404
200
200
