In [1]:
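# NOTE: disabled scratch line kept for reference; `train_fp_out` is not defined in any visible cell.
# data.sink_parquet(train_fp_out, compression='uncompressed')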

In [2]:
import os

def ensure_project_root(marker: str = "src") -> None:
    """Move one directory up unless the working directory already holds ``marker``.

    The notebook imports from the ``src`` package, so the working directory must
    be the one that contains it. An unconditional ``os.chdir("..")`` climbs one
    more level every time the cell is re-run (or when the kernel was already
    started at the project root); guarding on the marker directory makes the
    cell safe to execute repeatedly.

    Args:
        marker: Name of a directory expected in the project root.
    """
    if not os.path.isdir(marker):
        os.chdir("..")


ensure_project_root()

In [3]:
from pathlib import Path

import polars as pl

from src.datatypes import BaseSchema, Filepaths, TestSchema, TrainSchema

In [4]:
# Project file locations; the cells below read `.train`, `.test` and
# `.train_unique` from this object.
filepaths = Filepaths()

# Short alias used when referring to column names (e.g. `TS.click_time`).
TS = TrainSchema

# Merge the shared base annotations with the split-specific ones. On a name
# clash the split-specific entry wins, because later entries override earlier ones.
train_schema = {**BaseSchema.__annotations__, **TrainSchema.__annotations__}
test_schema = {**BaseSchema.__annotations__, **TestSchema.__annotations__}

In [5]:
# Convert the raw test CSV into an uncompressed parquet file.
test_fp_in = Path("~/Projects/AdTracking_Fraud_Detection/data/test.csv")
test_fp_out = Path("~/Projects/AdTracking_Fraud_Detection/data/test.parquet")

# Read the CSV, parse the click timestamp strings into datetimes, then cast
# every column to the dtype listed in `test_schema`.
# NOTE(review): the column name comes from the train-schema alias `TS`;
# presumably `click_time` is shared with the test schema - confirm.
test_csv = (
    pl.read_csv(test_fp_in)
    .with_columns(pl.col(TS.click_time).str.to_datetime())
    .cast(test_schema)
)
test_csv.write_parquet(test_fp_out, compression="uncompressed")

In [6]:
# Lazily scan the train/test parquet files and pass them through the schema
# validators. The train scan is given the merged schema and may upcast
# integer columns to match it.
train_scan = pl.scan_parquet(
    filepaths.train,
    schema=train_schema,
    cast_options=pl.ScanCastOptions(integer_cast="upcast"),
)
data = TrainSchema.validate(train_scan)
test = TestSchema.validate(pl.scan_parquet(filepaths.test))

# Row counts of both sets.
n_rows: int = data.select(pl.len()).collect().item()
print(f"{n_rows = }")

n_rows_test: int = test.select(pl.len()).collect().item()
print(f"{n_rows_test = }")
print()

# Number of nulls per column in the train set.
n_null_values = data.null_count().collect()
print(f"{n_null_values = }")
print()

# Fully duplicated rows = total rows minus distinct rows. The distinct count
# is collected with the streaming engine.
n_unique_rows: int = (
    data.unique().select(pl.len()).collect(engine="streaming").item()
)
n_duplicates: int = n_rows - n_unique_rows

print(f"{n_duplicates = }")

n_rows = 184903890
n_rows_test = 18790469

n_null_values = shape: (1, 8)
┌─────┬─────┬────────┬─────┬─────────┬────────────┬─────────────────┬───────────────┐
│ ip  ┆ app ┆ device ┆ os  ┆ channel ┆ click_time ┆ attributed_time ┆ is_attributed │
│ --- ┆ --- ┆ ---    ┆ --- ┆ ---     ┆ ---        ┆ ---             ┆ ---           │
│ u32 ┆ u32 ┆ u32    ┆ u32 ┆ u32     ┆ u32        ┆ u32             ┆ u32           │
╞═════╪═════╪════════╪═════╪═════════╪════════════╪═════════════════╪═══════════════╡
│ 0   ┆ 0   ┆ 0      ┆ 0   ┆ 0       ┆ 0          ┆ 184447044       ┆ 0             │
└─────┴─────┴────────┴─────┴─────────┴────────────┴─────────────────┴───────────────┘

n_duplicates = 3619254


Drop duplicate rows

In [None]:
# Keep a single copy of every fully duplicated row, then write the
# de-duplicated train set to its own uncompressed parquet file.
data = data.unique()
data.sink_parquet(
    filepaths.train_unique,
    compression="uncompressed",
)

# Is the test set in the future?

In [None]:
# `data` and `test` are lazy frames, so each aggregate has to be collected
# before it can be printed as a table or compared; comparing or calling
# `.row()` on uncollected lazy frames fails.
max_click_time_train = data.select(pl.col(TS.click_time).max()).collect()
print(f"{max_click_time_train = }")
print()

# The test set lies entirely in the future only if its *earliest* click is
# later than the *latest* train click, hence min() on the test side.
min_click_time_test = test.select(pl.col(TS.click_time).min()).collect()
print(f"{min_click_time_test = }")
print()

# Backward-compatible alias: this value used to be stored under a misleading
# "max" name; kept so that any later cell using the old name keeps working.
max_click_time_test = min_click_time_test

# Both operands are 1x1 frames; the element-wise comparison yields a 1x1
# boolean frame from which the single value is extracted.
is_test_set_in_future = (min_click_time_test > max_click_time_train).row(0)[0]
print(f"{is_test_set_in_future = }")

max_click_time_train = shape: (1, 1)
┌─────────────────────┐
│ click_time          │
│ ---                 │
│ datetime[μs]        │
╞═════════════════════╡
│ 2017-11-09 16:00:00 │
└─────────────────────┘

max_click_time_test = shape: (1, 1)
┌─────────────────────┐
│ click_time          │
│ ---                 │
│ datetime[μs]        │
╞═════════════════════╡
│ 2017-11-10 04:00:00 │
└─────────────────────┘

is_test_set_in_future = True


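Yes, indeed: the earliest click in the test set (2017-11-10 04:00:00) is later than the latest click in the train set (2017-11-09 16:00:00). The train and evaluation sets will be split based on this information.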