In [1]:
import polars as pl
import polars_ds as pds
import numpy as np

# This notebook illustrates the basic usage of this package

You need to create an environment with this package installed to run this notebook. (usually latest version)

# Num Extensions

In [2]:
size = 10_000
df = pl.DataFrame({
    "f": np.sin(list(range(size)))
    , "time_idx": range(size)
    , "dummy": ["a"] * (size // 2) + ["b"] * (size // 2)
    , "actual": np.round(np.random.random(size=size)).astype(np.int32)
    , "predicted": np.random.random(size=size)
    , "dummy_groups":["a"] * (size//2) + ["b"] * (size//2) 
}).with_columns(
    pds.random(0., 1.).alias("x1")
    , pds.random(0., 1.).alias("x2")
    , pds.random(0., 1.).alias("x3")
    , pds.random(0., 1.).alias("a")
    , pds.random(0., 1.).alias("b")
).with_columns(
    y = pl.col("x1") * 0.15 + pl.col("x2") * 0.3 - pl.col("x3") * 1.5 + pds.random() * 0.0001
)
df.head()

f,time_idx,dummy,actual,predicted,dummy_groups,x1,x2,x3,a,b,y
f64,i64,str,i32,f64,str,f64,f64,f64,f64,f64,f64
0.0,0,"""a""",1,0.892165,"""a""",0.911793,0.695128,0.094218,0.818295,0.815966,0.203999
0.841471,1,"""a""",1,0.948313,"""a""",0.125725,0.956351,0.776309,0.325662,0.589716,-0.858681
0.909297,2,"""a""",1,0.98999,"""a""",0.621217,0.93351,0.002152,0.661189,0.483786,0.370057
0.14112,3,"""a""",1,0.063476,"""a""",0.55396,0.868633,0.533063,0.781415,0.392511,-0.455826
-0.756802,4,"""a""",0,0.337291,"""a""",0.107261,0.288737,0.465019,0.036417,0.10985,-0.594726


In [3]:
# Column-wise Jaccard Similarity. Result should be 0 as they are distinct
df.select(
    pds.query_jaccard_col("x1", pl.col("x2"))
)

x1
f64
0.0


In [4]:
# FFT. First is real part, second is complex part
# By default, this behaves the same as np's rfft, which returns a non-redundant 
# compact representation of fft output.
df.select(
    pds.rfft("f")
).head()

f
"array[f64, 2]"
"[1.939505, 0.0]"
"[1.939506, 0.000209]"
"[1.939508, 0.000418]"
"[1.939512, 0.000627]"
"[1.939518, 0.000835]"


In [5]:
# FFT. But return the full length
df.select(
    pds.rfft("f", return_full=True)
).shape

(10000, 1)

In [6]:
# Multiple Convolutions at once
# Modes: `same`, `left` (left-aligned same), `right` (right-aligned same), `valid` or `full`
# Method: `fft`, `direct`
# Currently slower than SciPy but provides parallelism because of Polars
df.select(
    pds.convolve("f", [-1, 0, 0, 0, 1], mode = "full", method = "fft"), # column f with the kernel given here
    pds.convolve("a", [-1, 0, 0, 0, 1], mode = "full", method = "direct"),
    pds.convolve("b", [-1, 0, 0, 0, 1], mode = "full", method = "direct"),
).head()

f,a,b
f64,f64,f64
-4.2882e-16,-0.818295,-0.815966
-0.841471,-0.325662,-0.589716
-0.909297,-0.661189,-0.483786
-0.14112,-0.781415,-0.392511
0.756802,0.781878,0.706116


In [7]:
# Least Square (Linear Regression)
df.select(
    pds.query_lstsq(
        pl.col("x1"), pl.col("x2"),
        target = pl.col("y"),
        add_bias=False
    )
)

y
list[f64]
"[-0.490017, -0.355404]"


In [8]:
df.select(
    pds.query_lstsq_report(
        # formulaic input is also available for lstsq related queries, 
        # or you can always use polars expressions, e.g. pl.col('x1') + 1, pl.col('x2').exp(), pl.col('x3').sin()
        "ln(x1+1)", "exp(x2)", "sin(x3)",
        target = "y",
        add_bias = True
    ).alias("report")
).unnest("report")

features,beta,std_err,t,p>|t|,0.025,0.975
str,f64,f64,f64,f64,f64,f64
"""ln(x1+1)""",0.219844,0.001645,133.680425,0.0,0.216621,0.223068
"""exp(x2)""",0.175615,0.000671,261.549471,0.0,0.174299,0.176931
"""sin(x3)""",-1.743993,0.001334,-1306.99375,0.0,-1.746608,-1.741377
"""__bias__""",-0.109299,0.001498,-72.974218,0.0,-0.112235,-0.106363


In [9]:
df.lazy().select(
    pds.query_lstsq(
        pl.col("x1"), pl.col("x2"),
        target = "y", # We can either put pl.col("y") here or just the string "y"
        add_bias=False
    )
).collect()

y
list[f64]
"[-0.490017, -0.355404]"


In [10]:
df.select(
    "dummy",
    pds.query_lstsq(
        pl.col("x1"), pl.col("x2"),
        target = pl.col("y"),
        add_bias=False
    ).over(pl.col("dummy"))
)

dummy,coeffs
str,list[f64]
"""a""","[-0.491637, -0.363114]"
"""a""","[-0.491637, -0.363114]"
"""a""","[-0.491637, -0.363114]"
"""a""","[-0.491637, -0.363114]"
"""a""","[-0.491637, -0.363114]"
…,…
"""b""","[-0.488274, -0.347916]"
"""b""","[-0.488274, -0.347916]"
"""b""","[-0.488274, -0.347916]"
"""b""","[-0.488274, -0.347916]"


In [11]:
# If you want prediction and residue instead of coefficients
df.select(
    "x1",
    "x2",
    "y",
    pds.query_lstsq(
        "x1", pl.col("x2"),
        target = "y",
        add_bias=False, 
        return_pred=True
    ).alias("prediction")
).unnest("prediction").head()

x1,x2,y,pred,resid
f64,f64,f64,f64,f64
0.911793,0.695128,0.203999,-0.693846,0.897845
0.125725,0.956351,-0.858681,-0.401499,-0.457183
0.621217,0.93351,0.370057,-0.636181,1.006238
0.55396,0.868633,-0.455826,-0.580166,0.124339
0.107261,0.288737,-0.594726,-0.155178,-0.439548


In [12]:
df.group_by("dummy").agg(
    pds.query_lstsq(
        pl.col("x1"), pl.col("x2"),
        target = pl.col("y"),
        add_bias=False
    )
)

dummy,coeffs
str,list[f64]
"""a""","[-0.491637, -0.363114]"
"""b""","[-0.488274, -0.347916]"


In [13]:
# Lasso
df.group_by("dummy").agg(
    pds.query_lstsq(
        pl.col("x1"), pl.col("x2"),
        target = pl.col("y"),
        l1_reg = 0.1,
        add_bias=False
    )
)

dummy,coeffs
str,list[f64]
"""a""","[-0.327664, -0.184011]"
"""b""","[-0.318486, -0.177007]"


In [14]:
# R2 metric of lasso regressions on each group
df.group_by("dummy").agg(
    pds.query_r2(
        actual = pl.col("y"),
        pred = pds.query_lstsq(
            pl.col("x1"), pl.col("x2"),
            target = pl.col("y"),
            l1_reg = 0.1,
            return_pred = True,
            add_bias=False
        ).struct.field("pred")
    ).alias("lasso_r2")
)

dummy,lasso_r2
str,f64
"""a""",-0.533136
"""b""",-0.548727


In [15]:
# Rolling regression
df.select(
    "y",
    "x1",
    "x2",
    pds.query_rolling_lstsq(
        "x1", "x2",
        target = "y",
        window_size = 5,
        null_policy = "zero"
    ).alias("result")
).unnest("result")

y,x1,x2,coeffs,pred
f64,f64,f64,list[f64],f64
0.203999,0.911793,0.695128,,
-0.858681,0.125725,0.956351,,
0.370057,0.621217,0.93351,,
-0.455826,0.55396,0.868633,,
-0.594726,0.107261,0.288737,"[1.160027, -0.979407]",-0.158365
…,…,…,…,…
0.220408,0.230911,0.658026,"[-1.13021, 0.56658]",0.111846
-0.994365,0.889447,0.924552,"[-0.924553, -0.002686]",-0.824824
-1.455982,0.207659,0.020715,"[-1.495076, 0.657265]",-0.29685
-0.98038,0.939537,0.244478,"[-1.675845, 0.691202]",-1.405534


In [16]:
# Conditional Entropy, should be 0 because x1 is an ID
df.select(
    pds.query_cond_entropy("y", "x1")
)

y
f64
-0.0


In [17]:
# Only want singular values (principal values?)
df.select(
    pds.query_singular_values("a", "b", "x1")
)

a
list[f64]
"[29.313388, 28.98794, 28.657594]"


In [18]:
# Singular values + The principal components
df.select(
    pds.query_pca("a", "b")
).unnest("a")

singular_value,weight_vector
f64,list[f64]
29.122366,"[-0.462112, 0.886822]"
28.657989,"[0.886822, 0.462112]"


In [19]:
# PC1
df.select(
    pds.query_principal_components("a", "b", k =1)
).unnest("principal_components").head()

pc1
f64
0.133943
0.160952
-0.08804
-0.224543
-0.130941


# ML Metrics

In [20]:
df.group_by("dummy_groups").agg(
    pds.query_l2("actual", "predicted").alias("l2"),
    pds.query_log_loss("actual", "predicted").alias("log loss"),
    pds.query_binary_metrics(actual="actual", pred="predicted").alias("combo")
).unnest("combo")


dummy_groups,l2,log loss,precision,recall,f,average_precision,roc_auc
str,f64,f64,f64,f64,f64,f64,f64
"""b""",0.332189,0.99867,0.496745,0.491744,0.494232,0.503049,0.50233
"""a""",0.343686,1.036797,0.491823,0.488317,0.490064,0.491171,0.480539


# Str Extension

In [21]:
size = 100_000
df2 = pl.DataFrame({
    "sen":["Hello, world! I'm going to church."] * size,
    "word":["words", "word"] * (size //2)
})
df2.head()

sen,word
str,str
"""Hello, world! I'm going to chu…","""words"""
"""Hello, world! I'm going to chu…","""word"""
"""Hello, world! I'm going to chu…","""words"""
"""Hello, world! I'm going to chu…","""word"""
"""Hello, world! I'm going to chu…","""words"""


In [22]:
# Tokenize
df2.select(
    pds.str_tokenize(pl.col("sen").str.to_lowercase()).explode().unique()
)

sen
str
"""world"""
"""hello"""
"""going"""
"""to"""
"""church"""


In [23]:
df2.select(
    pds.str_tokenize(pl.col("sen").str.to_lowercase(), stem=True).explode().unique()
)

sen
str
"""go"""
""""""
"""hello"""
"""church"""
"""world"""


In [24]:
df2.select(
    pds.str_leven("word", pl.lit("world"))
).head()

word
u32
2
1
2
1
2


In [25]:
# Damerau-Levenshtein
df2.select(
    pds.str_d_leven("word", pl.lit("world"))
).head()

word
u32
2
1
2
1
2


In [26]:
df2.select( # column "word" vs. the word "world"
    pds.str_leven("word", pl.lit("world"), return_sim = True)
).head()

word
f64
0.6
0.8
0.6
0.8
0.6


In [27]:
df2.filter(
    # This is way faster than computing ditance and then doing a filter
    pds.filter_by_levenshtein(pl.col("word"), pl.lit("world"), 1) # <= 1. 
).head()

sen,word
str,str
"""Hello, world! I'm going to chu…","""word"""
"""Hello, world! I'm going to chu…","""word"""
"""Hello, world! I'm going to chu…","""word"""
"""Hello, world! I'm going to chu…","""word"""
"""Hello, world! I'm going to chu…","""word"""


In [28]:
df = pl.DataFrame({
    "word":["apple", "banana", "pineapple", "asasasas", "sasasass"],
    "other_data": [1,2,3,4,5]
})
gibberish = ["asasasa", "sasaaasss", "asdasadadfa"]

In [29]:
df.filter(
    pds.similar_to_vocab(
        pl.col("word"),
        vocab = gibberish,
        threshold = 0.5,
        metric = "lv", # Levenshtein similarity. Other options: dleven, osa, jw
        strategy = "any" # True if the word is similar to any word in vocab. Other options: "all", "avg"
    )
)

word,other_data
str,i64
"""asasasas""",4
"""sasasass""",5


In [30]:
df.select(
    pds.str_leven("word", pl.lit("asasasa"), return_sim=True).alias("asasasa"),
    pds.str_leven("word", pl.lit("sasaaasss"), return_sim=True).alias("sasaaasss"),
    pds.str_leven("word", pl.lit("asdasadadfa"), return_sim=True).alias("asdasadadfa"),
    pds.str_fuzz("word", pl.lit("apples")).alias("LCS based Fuzz match - apples"),
    pds.str_osa("word", pl.lit("apples"), return_sim=True).alias("Optimal String Alignment - apples"),
    pds.str_jw("word", pl.lit("apples")).alias("Jaro-Winkler - apples"),
)


asasasa,sasaaasss,asdasadadfa,LCS based Fuzz match - apples,Optimal String Alignment - apples,Jaro-Winkler - apples
f64,f64,f64,f64,f64,f64
0.142857,0.111111,0.090909,0.833333,0.833333,0.966667
0.428571,0.333333,0.272727,0.166667,0.0,0.444444
0.111111,0.111111,0.090909,0.555556,0.444444,0.5
0.875,0.666667,0.545455,0.25,0.25,0.527778
0.75,0.777778,0.454545,0.25,0.25,0.527778


# Stats Extension

In [31]:
import numpy as np

df = pl.DataFrame({
    "a": [None, None] + list(np.random.normal(size = 998))
})
df.head()

a
f64
""
""
-0.063853
0.589552
-0.946684


In [32]:
# Genenrate random numbers, respecting null positions in reference column (pl.col("a"))
df.with_columns(
    pds.random_normal(mean = 0.5, std = 1.0).alias("random_normal"),
    pl.when(pl.col("a").is_null()).then(None).otherwise(
        pds.random_normal(mean = 0.5, std = 1.0).alias("random_normal")
    ).alias("random_normal_that_respects_null_of_a")
).head()

a,random_normal,random_normal_that_respects_null_of_a
f64,f64,f64
,2.515477,
,-1.507373,
-0.063853,0.132507,0.784535
0.589552,-1.125048,1.717568
-0.946684,1.346741,0.454957


In [33]:
# Genenrate random string
df.with_columns(
    pds.random_str(min_size = 1, max_size = 5).alias("random_str"),
    pl.when(pl.col("a").is_null()).then(None).otherwise(
        pds.random_str(min_size = 1, max_size = 5)
    ).alias("random_str_that_respects_null_of_a")
).head()

a,random_str,random_str_that_respects_null_of_a
f64,str,str
,"""p9q""",
,"""R1UZ""",
-0.063853,"""q5Vog""","""o"""
0.589552,"""vQMM""","""AH"""
-0.946684,"""Ydh""","""0"""


In [34]:
# Genenrate fixed size random string, while respecting column a's nulls
df.with_columns(
    pl.when(pl.col("a").is_null()).then(None).otherwise(
        pds.random_str(min_size = 5, max_size = 5)
    ).alias("random_str")
).head()

a,random_str
f64,str
,
,
-0.063853,"""qhb5I"""
0.589552,"""Dfmvg"""
-0.946684,"""sL1xx"""


In [35]:
df.with_columns(
    # Sample from a normal distribution, using reference column "a" 's mean and std
    pds.random_normal(pl.col("a").mean(), pl.col("a").std()).alias("test1") 
    # Sample from uniform distribution, with low = 0 and high = "a"'s max, and respect the nulls in "a"
    , pl.when(pl.col("a").is_null()).then(None).otherwise(
        pds.random(lower = 0., upper = pl.col("a").max()).alias("test2")
    )
).with_columns(
    # Add a random pertubation to test1
    pds.perturb("test1", epsilon=0.001).alias("test1_perturbed")
).head()

a,test1,literal,test1_perturbed
f64,f64,f64,f64
,0.278036,,0.277542
,1.817391,,1.817711
-0.063853,-0.498464,0.833706,-0.498104
0.589552,-0.976279,1.275155,-0.976284
-0.946684,-0.750324,2.289387,-0.750434


In [36]:
# New in v0.3.5
# This way, we don't have a reference column, so we cannot respect nulls, but is more convenient to use.
df.with_columns(
    pds.random().alias("[0, 1)"),
    pds.random_normal(pl.col("a").mean(), pl.col("a").std()).alias("Normal"),
    pds.random_int(0, 10).alias("Int from [0, 10)"),
).head()

a,"[0, 1)",Normal,"Int from [0, 10)"
f64,f64,f64,i32
,0.484578,-0.373655,0
,0.096912,-0.271684,5
-0.063853,0.380539,-1.088371,7
0.589552,0.112595,-1.894823,4
-0.946684,0.770406,-1.255929,4


In [37]:
# Genenrate 2 random sample, both normally distributed
# Run Welch's t test on them, p value should be big since they have equal mean
# Run a normality test. Again, p value should be big since they are normally distributed 

df.with_columns(
    pds.random_normal(0.5, 1.0).alias("test1"),
    pds.random_normal(0.5, 2.0).alias("test2"),
).select(
    pds.query_ttest_ind("test1", "test2", equal_var=False).alias("t-test"),
    pds.normal_test("test1").alias("normality_test")
).select(
    pl.col("t-test").struct.field("statistic").alias("t-tests: statistics")
    , pl.col("t-test").struct.field("pvalue").alias("t-tests: pvalue")
    , pl.col("normality_test").struct.field("statistic").alias("normality_test: statistics")
    , pl.col("normality_test").struct.field("pvalue").alias("normality_test: pvalue")
)

t-tests: statistics,t-tests: pvalue,normality_test: statistics,normality_test: pvalue
f64,f64,f64,f64
-0.109282,0.912994,2.311937,0.314753


In [38]:
size = 5_000
df = pl.DataFrame({
    "market_id": range(size),
}).with_columns(
    pl.col("market_id").mod(3),
    var1 = pds.random(),
    var2 = pds.random(),
    category_1 = pds.random_int(0, 5),
    category_2 = pds.random_int(0, 10),
)

df.head(5)

market_id,var1,var2,category_1,category_2
i64,f64,f64,i32,i32
0,0.8498,0.104762,2,7
1,0.835423,0.304521,3,2
2,0.451455,0.915635,2,1
0,0.198961,0.840353,0,7
1,0.066047,0.936805,2,3


In [39]:
# In dataframe statistical tests!
df.select(
    pds.query_ttest_ind("var1", "var2", equal_var=True).alias("t-test"),
    pds.query_chi2("category_1", "category_2").alias("chi2-test"),
    pds.query_f_test("var1", group = "category_1").alias("f-test")
)

t-test,chi2-test,f-test
struct[2],struct[2],struct[2]
"{-0.64793,0.517045}","{22.170505,0.965739}","{0.50483,0.732206}"


In [40]:
# Can also be done in group by context
print(
    df.group_by("market_id").agg(
        pds.query_ttest_ind("var1", "var2", equal_var=False).alias("t-test"),
        pds.query_chi2("category_1", "category_2").alias("chi2-test"),
        pds.query_f_test("var1", group = "category_1").alias("f-test")
    )
)

shape: (3, 4)
┌───────────┬──────────────────────┬──────────────────────┬─────────────────────┐
│ market_id ┆ t-test               ┆ chi2-test            ┆ f-test              │
│ ---       ┆ ---                  ┆ ---                  ┆ ---                 │
│ i64       ┆ struct[2]            ┆ struct[2]            ┆ struct[2]           │
╞═══════════╪══════════════════════╪══════════════════════╪═════════════════════╡
│ 0         ┆ {-1.04167,0.29764}   ┆ {27.019093,0.860273} ┆ {0.542672,0.70442}  │
│ 1         ┆ {-0.146552,0.883494} ┆ {37.653112,0.393468} ┆ {0.377929,0.824525} │
│ 2         ┆ {0.064001,0.948973}  ┆ {26.863301,0.865169} ┆ {0.05765,0.993836}  │
└───────────┴──────────────────────┴──────────────────────┴─────────────────────┘


In [41]:
# Benford's law
df.select(
    first_digit_cnt = pds.query_first_digit_cnt(pl.col("var1")).explode()
).with_columns(
    # This doesn't follow benford's law because it is random data
    first_digit_distribution = pl.col("first_digit_cnt") / pl.col("first_digit_cnt").sum()
)

first_digit_cnt,first_digit_distribution
u32,f64
566,0.1132
585,0.117
591,0.1182
538,0.1076
553,0.1106
556,0.1112
556,0.1112
496,0.0992
559,0.1118


# Nearest Neighbors Related Tasks

These queries can be very slow when data/dimension gets huge, even when processed in parallel.

In [42]:
import polars_ds as pds
size = 2000
df = pl.DataFrame({
    "id": range(size), 
}).with_columns(
    pds.random().alias("var1"),
    pds.random().alias("var2"),
    pds.random().alias("var3"),
    pds.random().alias("r"),
    (pds.random() * 10).alias("rh"),
    pl.col("id").cast(pl.UInt32)
)

In [43]:
# Get neighbor count. The point itself is always considered a neighbor to itself.
df.with_columns(
    pds.query_nb_cnt(
        0.1, # radius 
        pl.col("var1"), "var2", "var3", # Columns used as the coordinates in n-d space, str | pl.Expr 
        dist = "inf", # L Infinity distance 
        parallel = True 
    ).alias("nb_l_inf_cnt")
).head() 

id,var1,var2,var3,r,rh,nb_l_inf_cnt
u32,f64,f64,f64,f64,f64,u32
0,0.690801,0.170121,0.441214,0.941438,6.092433,12
1,0.812247,0.095282,0.305064,0.698363,4.982886,13
2,0.887274,0.765451,0.395854,0.497346,0.99468,19
3,0.31295,0.12156,0.313221,0.850428,0.942336,17
4,0.792379,0.079288,0.809853,0.403986,1.952905,10


In [44]:
df.with_columns(
    pds.query_nb_cnt(
        pl.col("r"), # radius be an expression too
        "var1", "var2", "var3", # Columns used as the coordinates in n-d space, str | pl.Expr 
        dist = "l1", # L 1 distance 
        parallel = True 
    ).alias("nb_l1_r_cnt")
).head()

id,var1,var2,var3,r,rh,nb_l1_r_cnt
u32,f64,f64,f64,f64,f64,u32
0,0.690801,0.170121,0.441214,0.941438,6.092433,1142
1,0.812247,0.095282,0.305064,0.698363,4.982886,437
2,0.887274,0.765451,0.395854,0.497346,0.99468,226
3,0.31295,0.12156,0.313221,0.850428,0.942336,810
4,0.792379,0.079288,0.809853,0.403986,1.952905,112


In [45]:
# Get ids of the k nearest neighbors. 
# The point itself is always considered a neighbor to itself, so k + 1 elements will be returned.
df.with_columns(
    pds.query_knn_ptwise(
        pl.col("var1"), pl.col("var2"), pl.col("var3"), # Columns used as the coordinates in n-d space
        index = "id",  # pl.col("id"), str | pl.Expr
        k = 3, 
        dist = "l2", # squared l2
        parallel = True
    ).alias("best friends")
).head() 

id,var1,var2,var3,r,rh,best friends
u32,f64,f64,f64,f64,f64,list[u32]
0,0.690801,0.170121,0.441214,0.941438,6.092433,"[0, 1302, … 1845]"
1,0.812247,0.095282,0.305064,0.698363,4.982886,"[1, 341, … 1938]"
2,0.887274,0.765451,0.395854,0.497346,0.99468,"[2, 1331, … 259]"
3,0.31295,0.12156,0.313221,0.850428,0.942336,"[3, 1198, … 1277]"
4,0.792379,0.079288,0.809853,0.403986,1.952905,"[4, 957, … 553]"


In [46]:
# Get all neighbors within radius r, call them best friends
print(

df.select(
    pl.col("id"),
    pds.query_radius_ptwise(
        pl.col("var1"), pl.col("var2"), pl.col("var3"), # Columns used as the coordinates in 3d space
        index = pl.col("id"),
        r = 0.1, 
        dist = "l2", # actually this is squared l2
        parallel = True
    ).alias("best friends"),
).with_columns( # -1 to remove the point itself
    (pl.col("best friends").list.len() - 1).alias("best friends count")
).head()

)

shape: (5, 3)
┌─────┬───────────────────┬────────────────────┐
│ id  ┆ best friends      ┆ best friends count │
│ --- ┆ ---               ┆ ---                │
│ u32 ┆ list[u32]         ┆ u32                │
╞═════╪═══════════════════╪════════════════════╡
│ 0   ┆ [0, 1302, … 1655] ┆ 5                  │
│ 1   ┆ [1, 341, … 700]   ┆ 8                  │
│ 2   ┆ [2, 1331, … 946]  ┆ 5                  │
│ 3   ┆ [3, 1198, … 1047] ┆ 7                  │
│ 4   ┆ [4, 957, … 246]   ┆ 4                  │
└─────┴───────────────────┴────────────────────┘


In [47]:
# Get ids of the k nearest neighbors and distances
# The point itself is always considered a neighbor to itself, so k + 1 elements will be returned.
df.with_columns(
    pds.query_knn_ptwise(
        pl.col("var1"), pl.col("var2"), pl.col("var3"), # Columns used as the coordinates in n-d space
        index = pl.col("id"),
        k = 3, 
        dist = "l2", # actually this is squared l2
        parallel = True,
        return_dist = True
    ).alias("best_friends_w_dist")
).unnest("best_friends_w_dist").head()

id,var1,var2,var3,r,rh,idx,dist
u32,f64,f64,f64,f64,f64,list[u32],list[f64]
0,0.690801,0.170121,0.441214,0.941438,6.092433,"[0, 1302, … 1845]","[0.0, 0.0724, … 0.097051]"
1,0.812247,0.095282,0.305064,0.698363,4.982886,"[1, 341, … 1938]","[0.0, 0.040774, … 0.063491]"
2,0.887274,0.765451,0.395854,0.497346,0.99468,"[2, 1331, … 259]","[0.0, 0.053612, … 0.076934]"
3,0.31295,0.12156,0.313221,0.850428,0.942336,"[3, 1198, … 1277]","[0.0, 0.04743, … 0.065846]"
4,0.792379,0.079288,0.809853,0.403986,1.952905,"[4, 957, … 553]","[0.0, 0.062671, … 0.074585]"


In [48]:
# Filter to only points near the given point
df.filter(
    pds.within_dist_from(
        pl.col("var1"), pl.col("var2"), pl.col("var3"), # Columns used as the coordinates in n-d space
        pt = [0.5, 0.5, 0.5],
        r = 0.2,
        dist = "l2" # actually this is squared l2, so this is asking for squared l2 <= 0.2
    )
).head()

id,var1,var2,var3,r,rh
u32,f64,f64,f64,f64,f64
0,0.690801,0.170121,0.441214,0.941438,6.092433
5,0.488917,0.569547,0.719197,0.222572,8.093664
6,0.784901,0.599807,0.662587,0.083576,9.275924
7,0.655748,0.45577,0.552408,0.711763,8.581199
9,0.189803,0.336674,0.521146,0.3185,9.84388


In [49]:
# Haversine distance is available when dimension is 2
df.filter(
    pds.within_dist_from(
        pl.col("var1"), pl.col("var2"), # Columns used as the coordinates in n-d space
        pt = [0.5, 0.5],
        r = 10, # in km
        dist = "h" 
    )
).head()

id,var1,var2,var3,r,rh
u32,f64,f64,f64,f64,f64
5,0.488917,0.569547,0.719197,0.222572,8.093664
25,0.552314,0.537143,0.228336,0.322478,7.359873
48,0.458231,0.538932,0.251232,0.424817,3.99862
54,0.507641,0.516375,0.473393,0.758425,0.310381
70,0.516725,0.547032,0.176468,0.00018,2.904882


In [50]:
df.filter(
    pds.within_dist_from(
        pl.col("var1"), pl.col("var2"), 
        pt = [0.5, 0.5],
        # radius can also be an existing column in the dataframe.
        r = pl.col("rh"), 
        dist = "h" 
    )
).head()

id,var1,var2,var3,r,rh
u32,f64,f64,f64,f64,f64
5,0.488917,0.569547,0.719197,0.222572,8.093664
25,0.552314,0.537143,0.228336,0.322478,7.359873
224,0.496332,0.512851,0.016562,0.910587,9.142325
273,0.472758,0.420041,0.203852,0.26653,9.412762
540,0.484401,0.506627,0.86604,0.890258,6.919762


In [51]:
friends = df.select(
    pl.col("id").cast(pl.UInt64),
    pds.query_radius_ptwise(
        # Columns used as the coordinates in n-d space
        pl.col("var1"), pl.col("var2"), 
        index=pl.col("id"),
        r = 0.02, 
        dist = "l2",
    ).alias("friends")
).with_columns(
    pl.col("friends").list.len().alias("count")
)
friends.head()

id,friends,count
u64,list[u32],u32
0,"[0, 1728]",2
1,"[1, 827]",2
2,"[2, 679]",2
3,"[3, 1618, … 137]",4
4,"[4, 341, … 871]",4


# String Nearest Neighbors

This might be very slow for very large vocab / column.

In [52]:
df = pl.DataFrame({
    "a":["AAAAA", "ABCABC", "AAAADDD", "ADSDSDS", "WORD"],
    "b":["AAAAT", "ABCACD", "ADSSD", "APPLES", "WORLD"] 
})

In [53]:
# Use Levenshtein to find the nearest neighbor in vocab to word in column a
df.select(
    pds.query_similar_words(
        "a",
        vocab = pl.col("b"),
        k = 1, 
        metric = "lv"
    ).alias("similar_words_from_vocab"),
)

similar_words_from_vocab
str
"""AAAAT"""
"""ABCACD"""
"""AAAAT"""
"""ADSSD"""
"""WORLD"""


In [54]:
# Use Levenshtein to find 2 nearest neighbors
df.select(
    pds.query_similar_words(
        "a",
        vocab = pl.col("b"),
        k = 2, 
        metric = "lv"
    ).alias("similar_words_from_vocab"),
)

similar_words_from_vocab
list[str]
"[""AAAAT"", ""ADSSD""]"
"[""ABCACD"", ""AAAAT""]"
"[""AAAAT"", ""ABCACD""]"
"[""ADSSD"", ""APPLES""]"
"[""WORLD"", ""ADSSD""]"


In [55]:
# Currently only Levenshtein and hamming are implemented for this
# Empty means nothing in vocab can be compared in the hamming sense with the corresponding word in a
df.select(
    pds.query_similar_words(
        "a",
        vocab = pl.col("b"),
        k = 2, 
        threshold = 4,
        metric = "hamming"
    ).alias("similar_words_from_vocab"),
)

similar_words_from_vocab
list[str]
"[""AAAAT"", ""ADSSD""]"
"[""ABCACD""]"
[]
[]
[]


In [56]:
# You may provide a vocab like this
df.select(
    pl.col("a"),
    pds.query_similar_words(
        "a",
        vocab = ["WORLD", "AAAAA", "ABCDEFG", "ZIV", "TQQQ"],
        k = 3, 
        metric = "lv"
    ).alias("similar_words_from_vocab"),
)

a,similar_words_from_vocab
str,list[str]
"""AAAAA""","[""AAAAA"", ""ZIV"", ""WORLD""]"
"""ABCABC""","[""ABCDEFG"", ""AAAAA"", ""ZIV""]"
"""AAAADDD""","[""AAAAA"", ""WORLD"", ""ABCDEFG""]"
"""ADSDSDS""","[""ABCDEFG"", ""WORLD"", ""AAAAA""]"
"""WORD""","[""WORLD"", ""ZIV"", ""TQQQ""]"


# Using PDS Expressions On Series / NumPy arrays

In [57]:
df = pds.frame(size=100_000).select(
    pds.random(0.0, 1.0).round().alias("actual"),
    pds.random(0.0, 1.0).alias("predicted"),
    pds.random_int(0, 3).alias("0-2"),
    pds.random_int(0, 10).alias("0-9"),
)
df.head()

actual,predicted,0-2,0-9
f64,f64,i32,i32
1.0,0.184754,0,0
0.0,0.332776,2,3
1.0,0.54619,2,2
1.0,0.41849,0,8
1.0,0.504009,1,0


In [58]:
pds.eval_series(
    df["0-2"], df["0-9"], # use series as args
    expr = "query_jaccard_col" # name of the pds expression
)

jaccard_col
f64
0.3


In [59]:
pds.eval_series(
    df["actual"], df["predicted"], # use series as args
    expr = "query_binary_metrics" # name of the pds expression
).unnest("binary_metrics")

precision,recall,f,average_precision,roc_auc
f64,f64,f64,f64,f64
0.498474,0.498293,0.498384,0.498171,0.500488


In [60]:
pds.eval_series(
    # can also use NumPy
    np.random.random(size = 1000), 
    np.random.random(size = 1000), 
    expr = "query_psi", # name of the pds expression
    n_bins = 5, 
    return_report = True
).unnest("psi")

cnt<=,baseline_pct,actual_pct,psi_bin
f64,f64,f64,f64
0.183927,0.2,0.182,0.001698
0.383886,0.2,0.23,0.004193
0.583203,0.2,0.185,0.001169
0.788215,0.2,0.195,0.000127
inf,0.2,0.208,0.000314


In [61]:
pds.eval_series(
    np.random.random(size = 1000), 
    expr = "query_cid_ce", # name of the pds expression
)

cid_ce
f64
12.759065


In [62]:
pds.eval_series(
    pl.Series(values=np.random.random(size = 1000)), 
    expr = "query_c3_stats", # name of the pds expression
    lag = 3
)

c3_stats
f64
0.123318


# Examples using Ball Tree Features

In [65]:
size = 1_000
df = pl.DataFrame({
    "f": np.sin(list(range(size)))
    , "time_idx": range(size)
    , "dummy": ["a"] * (size // 2) + ["b"] * (size // 2)
    , "actual": np.round(np.random.random(size=size)).astype(np.int32)
    , "predicted": np.random.random(size=size)
    , "dummy_groups":["a"] * (size//2) + ["b"] * (size//2) 
}).with_columns(
    pds.random(0., 1.).alias("x1")
    , pds.random(0., 1.).alias("x2")
    , pds.random(0., 1.).alias("x3")
    , pds.random(0., 1.).alias("a")
    , pds.random(0., 1.).alias("b")
).with_columns(
    y = pl.col("x1") * 0.15 + pl.col("x2") * 0.3 - pl.col("x3") * 1.5 + pds.random() * 0.0001
)
df = df.with_columns(pl.int_range(0, size).cast(dtype=pl.UInt32).alias("index"))
df.head()

f,time_idx,dummy,actual,predicted,dummy_groups,x1,x2,x3,a,b,y,index
f64,i64,str,i32,f64,str,f64,f64,f64,f64,f64,f64,u32
0.0,0,"""a""",0,0.491635,"""a""",0.711689,0.624497,0.03681,0.51618,0.714925,0.238894,0
0.841471,1,"""a""",1,0.01081,"""a""",0.990379,0.273985,0.691863,0.448978,0.258321,-0.806979,1
0.909297,2,"""a""",1,0.016883,"""a""",0.079827,0.349622,0.012541,0.348881,0.60795,0.098134,2
0.14112,3,"""a""",1,0.738627,"""a""",0.983878,0.342483,0.915068,0.722341,0.762508,-1.122252,3
-0.756802,4,"""a""",0,0.596465,"""a""",0.760799,0.309211,0.687014,0.656174,0.942946,-0.82356,4


In [66]:
# Pointwise Nearest Neighbors
df.with_columns(
    pds.query_bt_knn_ptwise(
        pl.col("x2"), 
        pl.col("x3"), 
        index=pl.col("index"),
        r=999.0,
        k =5,
        distance_metric="haversine",
        parallel=True).alias("ball_tree_knn_ptwise")
)

f,time_idx,dummy,actual,predicted,dummy_groups,x1,x2,x3,a,b,y,index,ball_tree_knn_ptwise
f64,i64,str,i32,f64,str,f64,f64,f64,f64,f64,f64,u32,list[u32]
0.0,0,"""a""",0,0.491635,"""a""",0.711689,0.624497,0.03681,0.51618,0.714925,0.238894,0,"[0, 309, … 383]"
0.841471,1,"""a""",1,0.01081,"""a""",0.990379,0.273985,0.691863,0.448978,0.258321,-0.806979,1,"[1, 215, … 4]"
0.909297,2,"""a""",1,0.016883,"""a""",0.079827,0.349622,0.012541,0.348881,0.60795,0.098134,2,"[2, 837, … 213]"
0.14112,3,"""a""",1,0.738627,"""a""",0.983878,0.342483,0.915068,0.722341,0.762508,-1.122252,3,"[3, 366, … 78]"
-0.756802,4,"""a""",0,0.596465,"""a""",0.760799,0.309211,0.687014,0.656174,0.942946,-0.82356,4,"[4, 196, … 942]"
…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.773833,995,"""b""",0,0.81024,"""b""",0.808594,0.798673,0.776158,0.799283,0.96809,-0.80334,995,"[995, 234, … 915]"
-0.114875,996,"""b""",0,0.435576,"""b""",0.182852,0.952257,0.663921,0.68566,0.079458,-0.682739,996,"[996, 490, … 827]"
-0.897967,997,"""b""",1,0.799476,"""b""",0.131082,0.389023,0.65403,0.669834,0.847043,-0.844666,997,"[997, 456, … 116]"
-0.855473,998,"""b""",1,0.12903,"""b""",0.794842,0.794882,0.297178,0.113799,0.990062,-0.088007,998,"[998, 769, … 321]"


In [68]:
# Pointwise Nearest Neighbors with distances
# we get a struct with an ids column and a dist column

df.with_columns(
    pds.query_bt_knn_ptwise(
        pl.col("x2"), 
        pl.col("x3"), 
        index=pl.col("index"),
        r=999.0,
        k =5,
        distance_metric="haversine",
        return_dist=True,
        parallel=True).alias("ball_tree_knn_ptwise")
).unnest("ball_tree_knn_ptwise")

f,time_idx,dummy,actual,predicted,dummy_groups,x1,x2,x3,a,b,y,index,ids,distances
f64,i64,str,i32,f64,str,f64,f64,f64,f64,f64,f64,u32,list[u32],list[f64]
0.0,0,"""a""",0,0.491635,"""a""",0.711689,0.624497,0.03681,0.51618,0.714925,0.238894,0,"[0, 309, … 383]","[0.0, 0.227001, … 4.571094]"
0.841471,1,"""a""",1,0.01081,"""a""",0.990379,0.273985,0.691863,0.448978,0.258321,-0.806979,1,"[1, 215, … 4]","[0.0, 0.472495, … 3.953833]"
0.909297,2,"""a""",1,0.016883,"""a""",0.079827,0.349622,0.012541,0.348881,0.60795,0.098134,2,"[2, 837, … 213]","[0.0, 0.507418, … 2.867486]"
0.14112,3,"""a""",1,0.738627,"""a""",0.983878,0.342483,0.915068,0.722341,0.762508,-1.122252,3,"[3, 366, … 78]","[0.0, 0.094053, … 3.338146]"
-0.756802,4,"""a""",0,0.596465,"""a""",0.760799,0.309211,0.687014,0.656174,0.942946,-0.82356,4,"[4, 196, … 942]","[0.0, 2.545978, … 3.247336]"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.773833,995,"""b""",0,0.81024,"""b""",0.808594,0.798673,0.776158,0.799283,0.96809,-0.80334,995,"[995, 234, … 915]","[0.0, 0.762144, … 3.949634]"
-0.114875,996,"""b""",0,0.435576,"""b""",0.182852,0.952257,0.663921,0.68566,0.079458,-0.682739,996,"[996, 490, … 827]","[0.0, 0.410204, … 3.064209]"
-0.897967,997,"""b""",1,0.799476,"""b""",0.131082,0.389023,0.65403,0.669834,0.847043,-0.844666,997,"[997, 456, … 116]","[0.0, 0.796457, … 4.244226]"
-0.855473,998,"""b""",1,0.12903,"""b""",0.794842,0.794882,0.297178,0.113799,0.990062,-0.088007,998,"[998, 769, … 321]","[0.0, 0.507682, … 2.901211]"


In [69]:
# Frequency Count
df.with_columns(
    pds.query_bt_knn_radius_freq_cnt(
        pl.col("x2"), 
        pl.col("x3"), 
        index=pl.col("index"), 
        r=999.0, 
        k =5, 
        distance_metric="haversine", 
        parallel=True
        ).alias("ball_tree_knn_radius_freq_cnt")
)

f,time_idx,dummy,actual,predicted,dummy_groups,x1,x2,x3,a,b,y,index,ball_tree_knn_radius_freq_cnt
f64,i64,str,i32,f64,str,f64,f64,f64,f64,f64,f64,u32,struct[2]
0.0,0,"""a""",0,0.491635,"""a""",0.711689,0.624497,0.03681,0.51618,0.714925,0.238894,0,"{871,10}"
0.841471,1,"""a""",1,0.01081,"""a""",0.990379,0.273985,0.691863,0.448978,0.258321,-0.806979,1,"{701,10}"
0.909297,2,"""a""",1,0.016883,"""a""",0.079827,0.349622,0.012541,0.348881,0.60795,0.098134,2,"{167,9}"
0.14112,3,"""a""",1,0.738627,"""a""",0.983878,0.342483,0.915068,0.722341,0.762508,-1.122252,3,"{362,9}"
-0.756802,4,"""a""",0,0.596465,"""a""",0.760799,0.309211,0.687014,0.656174,0.942946,-0.82356,4,"{147,9}"
…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.773833,995,"""b""",0,0.81024,"""b""",0.808594,0.798673,0.776158,0.799283,0.96809,-0.80334,995,"{492,1}"
-0.114875,996,"""b""",0,0.435576,"""b""",0.182852,0.952257,0.663921,0.68566,0.079458,-0.682739,996,"{600,1}"
-0.897967,997,"""b""",1,0.799476,"""b""",0.131082,0.389023,0.65403,0.669834,0.847043,-0.844666,997,"{610,1}"
-0.855473,998,"""b""",1,0.12903,"""b""",0.794842,0.794882,0.297178,0.113799,0.990062,-0.088007,998,"{828,1}"


In [74]:
# Average distance of k nearest neighbors
df.with_columns(
    pds.query_bt_knn_avg(
        pl.col("x2"), 
        pl.col("x3"), 
        pl.col("x1"), 
        index=pl.col("index"), 
        r=999.0, 
        k =1,
        distance_metric="euclidean", 
        parallel=True).alias("btree_knn_avg")
)

f,time_idx,dummy,actual,predicted,dummy_groups,x1,x2,x3,a,b,y,index,btree_knn_avg
f64,i64,str,i32,f64,str,f64,f64,f64,f64,f64,f64,u32,f64
0.0,0,"""a""",0,0.491635,"""a""",0.711689,0.624497,0.03681,0.51618,0.714925,0.238894,0,131.048471
0.841471,1,"""a""",1,0.01081,"""a""",0.990379,0.273985,0.691863,0.448978,0.258321,-0.806979,1,428.336943
0.909297,2,"""a""",1,0.016883,"""a""",0.079827,0.349622,0.012541,0.348881,0.60795,0.098134,2,51.713153
0.14112,3,"""a""",1,0.738627,"""a""",0.983878,0.342483,0.915068,0.722341,0.762508,-1.122252,3,280.478864
-0.756802,4,"""a""",0,0.596465,"""a""",0.760799,0.309211,0.687014,0.656174,0.942946,-0.82356,4,108.016655
…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.773833,995,"""b""",0,0.81024,"""b""",0.808594,0.798673,0.776158,0.799283,0.96809,-0.80334,995,837.903962
-0.114875,996,"""b""",0,0.435576,"""b""",0.182852,0.952257,0.663921,0.68566,0.079458,-0.682739,996,610.02722
-0.897967,997,"""b""",1,0.799476,"""b""",0.131082,0.389023,0.65403,0.669834,0.847043,-0.844666,997,723.917782
-0.855473,998,"""b""",1,0.12903,"""b""",0.794842,0.794882,0.297178,0.113799,0.990062,-0.088007,998,589.166151


In [75]:
# Count neighbors within radius
df.with_columns(
    pds.query_bt_nb_cnt(
        pl.col("x2"),
        pl.col("x3"),
        index=pl.col("index"),
        r=18.0, 
        distance_metric="haversine",
        parallel=True).alias("btree_within_radius")
)

f,time_idx,dummy,actual,predicted,dummy_groups,x1,x2,x3,a,b,y,index,btree_within_radius
f64,i64,str,i32,f64,str,f64,f64,f64,f64,f64,f64,u32,u32
0.0,0,"""a""",0,0.491635,"""a""",0.711689,0.624497,0.03681,0.51618,0.714925,0.238894,0,40
0.841471,1,"""a""",1,0.01081,"""a""",0.990379,0.273985,0.691863,0.448978,0.258321,-0.806979,1,89
0.909297,2,"""a""",1,0.016883,"""a""",0.079827,0.349622,0.012541,0.348881,0.60795,0.098134,2,45
0.14112,3,"""a""",1,0.738627,"""a""",0.983878,0.342483,0.915068,0.722341,0.762508,-1.122252,3,59
-0.756802,4,"""a""",0,0.596465,"""a""",0.760799,0.309211,0.687014,0.656174,0.942946,-0.82356,4,90
…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.773833,995,"""b""",0,0.81024,"""b""",0.808594,0.798673,0.776158,0.799283,0.96809,-0.80334,995,81
-0.114875,996,"""b""",0,0.435576,"""b""",0.182852,0.952257,0.663921,0.68566,0.079458,-0.682739,996,63
-0.897967,997,"""b""",1,0.799476,"""b""",0.131082,0.389023,0.65403,0.669834,0.847043,-0.844666,997,90
-0.855473,998,"""b""",1,0.12903,"""b""",0.794842,0.794882,0.297178,0.113799,0.990062,-0.088007,998,106


In [76]:
# returns true if the row is within k nearest neighbors of the given point
df.with_columns(
    pds.bt_within_dist_from(
        pl.col("x2"),
        pl.col("x3"),
        pt=[123,78.99],
        r=999999, 
        distance_metric="haversine",
        parallel=True).alias("btree_within_dist_from")
)

f,time_idx,dummy,actual,predicted,dummy_groups,x1,x2,x3,a,b,y,index,btree_within_dist_from
f64,i64,str,i32,f64,str,f64,f64,f64,f64,f64,f64,u32,bool
0.0,0,"""a""",0,0.491635,"""a""",0.711689,0.624497,0.03681,0.51618,0.714925,0.238894,0,true
0.841471,1,"""a""",1,0.01081,"""a""",0.990379,0.273985,0.691863,0.448978,0.258321,-0.806979,1,true
0.909297,2,"""a""",1,0.016883,"""a""",0.079827,0.349622,0.012541,0.348881,0.60795,0.098134,2,true
0.14112,3,"""a""",1,0.738627,"""a""",0.983878,0.342483,0.915068,0.722341,0.762508,-1.122252,3,true
-0.756802,4,"""a""",0,0.596465,"""a""",0.760799,0.309211,0.687014,0.656174,0.942946,-0.82356,4,true
…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.773833,995,"""b""",0,0.81024,"""b""",0.808594,0.798673,0.776158,0.799283,0.96809,-0.80334,995,true
-0.114875,996,"""b""",0,0.435576,"""b""",0.182852,0.952257,0.663921,0.68566,0.079458,-0.682739,996,true
-0.897967,997,"""b""",1,0.799476,"""b""",0.131082,0.389023,0.65403,0.669834,0.847043,-0.844666,997,true
-0.855473,998,"""b""",1,0.12903,"""b""",0.794842,0.794882,0.297178,0.113799,0.990062,-0.088007,998,true


In [77]:
# check if a point is included in the k nearest neighbors of the given point
# Note: This is an exact match so your point could be quite close but now show up
# We can pass in an EPSILON to account for this. THis is defaulted to the EPSILON of the data type in Rust

# pick a random point
point = df.select(pl.col("x2"), pl.col("x3"))
rr = list(point.row(2))
df.with_columns(
    pds.is_bt_knn_from(
        pl.col("x2"),
        pl.col("x3"),
        pt=rr, 
        k=56, 
        distance_metric="haversine",
        parallel=True, epsilon=0.5).alias("btree_nn_within")
)

f,time_idx,dummy,actual,predicted,dummy_groups,x1,x2,x3,a,b,y,index,btree_nn_within
f64,i64,str,i32,f64,str,f64,f64,f64,f64,f64,f64,u32,bool
0.0,0,"""a""",0,0.491635,"""a""",0.711689,0.624497,0.03681,0.51618,0.714925,0.238894,0,true
0.841471,1,"""a""",1,0.01081,"""a""",0.990379,0.273985,0.691863,0.448978,0.258321,-0.806979,1,false
0.909297,2,"""a""",1,0.016883,"""a""",0.079827,0.349622,0.012541,0.348881,0.60795,0.098134,2,true
0.14112,3,"""a""",1,0.738627,"""a""",0.983878,0.342483,0.915068,0.722341,0.762508,-1.122252,3,false
-0.756802,4,"""a""",0,0.596465,"""a""",0.760799,0.309211,0.687014,0.656174,0.942946,-0.82356,4,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.773833,995,"""b""",0,0.81024,"""b""",0.808594,0.798673,0.776158,0.799283,0.96809,-0.80334,995,false
-0.114875,996,"""b""",0,0.435576,"""b""",0.182852,0.952257,0.663921,0.68566,0.079458,-0.682739,996,false
-0.897967,997,"""b""",1,0.799476,"""b""",0.131082,0.389023,0.65403,0.669834,0.847043,-0.844666,997,false
-0.855473,998,"""b""",1,0.12903,"""b""",0.794842,0.794882,0.297178,0.113799,0.990062,-0.088007,998,true
