In [1]:
import os

# The notebook imports from the `src` package, so it has to run with the
# directory that contains `src` as the working directory.
# Guard the move: an unconditional `os.chdir("..")` climbs one more level every
# time the cell is re-run (or when the kernel was already started from the
# project root), which silently breaks the `src.*` imports and relative paths.
if not os.path.isdir("src"):
    os.chdir("..")

In [2]:
import polars as pl

from src.constants import TRAIN_SAMPLE_SIZE
from src.datatypes import BaseSchema, TrainSchema, filepaths

dependencies. Please install pandas as part of your environment's
dependencies or install the pandas extra with:

```bash
pip install pandas pandera

# or
pip install 'pandera[pandas]'
```



In [3]:
# Short alias so schema fields can be referenced as `S.<field>` in later cells.
S = TrainSchema

# Merge the annotations of both schema classes into one mapping; when a field
# is declared in both, the TrainSchema entry overrides the BaseSchema one.
train_schema = {**BaseSchema.__annotations__, **TrainSchema.__annotations__}

# Only the first TRAIN_SAMPLE_SIZE rows (10 million) are loaded below.

In [4]:
# Lazily scan the parquet file with the explicit schema and keep only the
# leading TRAIN_SAMPLE_SIZE rows.
_sample_lf = pl.scan_parquet(filepaths.train_unique, schema=train_schema).head(
    TRAIN_SAMPLE_SIZE
)

# Validate against TrainSchema while the frame is still lazy, then materialise it once.
data = TrainSchema.validate(_sample_lf).collect()

# Lazy view over the in-memory frame, used by the queries in later cells.
data_lf = data.lazy()

In [5]:
# Per-column summary statistics (count, null_count, mean, std, min, quartiles, max) of the loaded sample.
data.describe()

statistic,ip,app,device,os,channel,click_time,attributed_time,is_attributed
str,f64,f64,f64,f64,f64,str,str,f64
"""count""",10000000.0,10000000.0,10000000.0,10000000.0,10000000.0,"""10000000""","""25201""",10000000.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,"""0""","""9974799""",0.0
"""mean""",90824.024609,12.020038,22.144052,22.782861,269.257614,"""2017-11-08 06:11:25.904391""","""2017-11-08 07:31:28.115432""",0.0025201
"""std""",69385.4866,14.908403,261.816785,55.71075,130.170144,,,
"""min""",1.0,0.0,0.0,0.0,0.0,"""2017-11-06 16:00:01""","""2017-11-06 16:01:13""",0.0
"""25%""",40216.0,3.0,1.0,13.0,140.0,"""2017-11-07 11:07:03""","""2017-11-07 11:51:31""",
"""50%""",79665.0,12.0,1.0,18.0,258.0,"""2017-11-08 06:48:36""","""2017-11-08 08:02:57""",
"""75%""",118252.0,15.0,1.0,19.0,386.0,"""2017-11-09 01:53:50""","""2017-11-09 04:14:50""",
"""max""",364776.0,768.0,4227.0,914.0,498.0,"""2017-11-09 16:00:00""","""2017-11-09 15:59:51""",1.0


In [6]:
# Number of rows with a non-null `attributed_time`, and their share of all rows.
# The mean of the boolean mask is exactly "non-null count / row count", so both
# figures come out of one lazy query: there is no nested `.collect()` to fetch
# the row count and no separate `with_columns` step.
_has_attributed_time = pl.col(S.attributed_time).is_not_null()

data_lf.select(
    _has_attributed_time.sum().alias(S.attributed_time),
    _has_attributed_time.mean().alias(f"{S.attributed_time}_relative"),
).collect()

attributed_time,attributed_time_relative
u32,f64
25201,0.0025201


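Column `attributed_time` acts as a meta-label: its presence is perfectly correlated with the label (it is non-null for 25,201 rows, the same number of rows where `is_attributed` is true), and it additionally tells when the attribution happened. It will be dropped before training, since keeping it would leak the target, but it can be useful in EDA.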

In [7]:
# For every schema column, show its distinct values with absolute counts next
# to the same counts normalised to shares, most frequent values first.
for _col in train_schema:
    _values = pl.col(_col)
    _absolute = _values.value_counts(parallel=True, sort=True).alias(_col)
    _relative = _values.value_counts(
        parallel=True, sort=True, normalize=True
    ).alias(f"{_col}_relative")

    display(data_lf.select(_absolute, _relative).collect())
    # Blank line between consecutive tables.
    print()

ip,ip_relative
struct[2],struct[2]
"{5348,67027}","{5348,0.0067027}"
"{5314,62620}","{5314,0.006262}"
"{73516,41777}","{73516,0.0041777}"
"{73487,41591}","{73487,0.0041591}"
"{53454,27054}","{53454,0.0027054}"
…,…
"{146547,1}","{146547,0.0000001}"
"{131889,1}","{131889,0.0000001}"
"{152610,1}","{152610,0.0000001}"
"{203274,1}","{203274,0.0000001}"





app,app_relative
struct[2],struct[2]
"{3,1822553}","{3,0.1822553}"
"{12,1290593}","{12,0.1290593}"
"{2,1183491}","{2,0.1183491}"
"{9,896596}","{9,0.0896596}"
"{15,878264}","{15,0.0878264}"
…,…
"{423,1}","{423,0.0000001}"
"{627,1}","{627,0.0000001}"
"{371,1}","{371,0.0000001}"
"{367,1}","{367,0.0000001}"





device,device_relative
struct[2],struct[2]
"{1,9428456}","{1,0.9428456}"
"{2,436271}","{2,0.0436271}"
"{0,56370}","{0,0.005637}"
"{3032,38610}","{3032,0.003861}"
"{3543,14598}","{3543,0.0014598}"
…,…
"{2326,1}","{2326,0.0000001}"
"{1726,1}","{1726,0.0000001}"
"{3161,1}","{3161,0.0000001}"
"{1255,1}","{1255,0.0000001}"





os,os_relative
struct[2],struct[2]
"{19,2395400}","{19,0.23954}"
"{13,2143071}","{13,0.2143071}"
"{17,520010}","{17,0.052001}"
"{18,485176}","{18,0.0485176}"
"{22,401811}","{22,0.0401811}"
…,…
"{261,1}","{261,0.0000001}"
"{819,1}","{819,0.0000001}"
"{248,1}","{248,0.0000001}"
"{893,1}","{893,0.0000001}"





channel,channel_relative
struct[2],struct[2]
"{280,786568}","{280,0.0786568}"
"{245,485363}","{245,0.0485363}"
"{107,460339}","{107,0.0460339}"
"{477,393183}","{477,0.0393183}"
"{134,322618}","{134,0.0322618}"
…,…
"{281,4}","{281,0.0000004}"
"{233,3}","{233,0.0000003}"
"{165,1}","{165,0.0000001}"
"{14,1}","{14,0.0000001}"





click_time,click_time_relative
struct[2],struct[2]
"{2017-11-09 04:52:25,108}","{2017-11-09 04:52:25,0.0000108}"
"{2017-11-08 13:00:30,106}","{2017-11-08 13:00:30,0.0000106}"
"{2017-11-09 11:00:12,106}","{2017-11-09 11:00:12,0.0000106}"
"{2017-11-09 10:00:10,106}","{2017-11-09 10:00:10,0.0000106}"
"{2017-11-08 02:36:15,105}","{2017-11-08 02:36:15,0.0000105}"
…,…
"{2017-11-07 20:12:16,1}","{2017-11-07 20:12:16,0.0000001}"
"{2017-11-07 20:12:36,1}","{2017-11-07 20:12:36,0.0000001}"
"{2017-11-08 08:18:57,1}","{2017-11-08 08:18:57,0.0000001}"
"{2017-11-09 11:59:18,1}","{2017-11-09 11:59:18,0.0000001}"





attributed_time,attributed_time_relative
struct[2],struct[2]
"{null,9974799}","{null,0.9974799}"
"{2017-11-07 12:21:06,4}","{2017-11-07 12:21:06,0.0000004}"
"{2017-11-08 07:46:32,4}","{2017-11-08 07:46:32,0.0000004}"
"{2017-11-09 10:19:53,4}","{2017-11-09 10:19:53,0.0000004}"
"{2017-11-07 10:32:42,4}","{2017-11-07 10:32:42,0.0000004}"
…,…
"{2017-11-07 07:30:15,1}","{2017-11-07 07:30:15,0.0000001}"
"{2017-11-07 06:11:52,1}","{2017-11-07 06:11:52,0.0000001}"
"{2017-11-07 10:05:56,1}","{2017-11-07 10:05:56,0.0000001}"
"{2017-11-07 10:37:05,1}","{2017-11-07 10:37:05,0.0000001}"





is_attributed,is_attributed_relative
struct[2],struct[2]
"{false,9974799}","{false,0.9974799}"
"{true,25201}","{true,0.0025201}"





The dataset is highly imbalanced: the positive class (`is_attributed` = true) accounts for only 0.25201% of the sample (25,201 of 10,000,000 rows).