In [1]:
from __future__ import annotations
import polars as pl 
import inspect
import numpy as np

In [None]:
import polars as pl
import random 

df = pl.scan_csv("../data/test.csv")
    
# Define polars custom functions to apply
def add_position_column(df:pl.LazyFrame):
    '''
        Append a string column named "position" derived from the skill columns.

        Rows with defensive_skill > 50 are labelled 'CB'; of the remaining rows,
        those with offensive_skill > 50 are labelled 'FW'; everything else is
        labelled 'bench'. The branches are checked in that order, so a row that
        passes both thresholds is labelled 'CB'.

        Parameters
        ----------
        df
            Frame with "defensive_skill" and "offensive_skill" columns.

        Returns
        -------
        The input frame with the extra "position" column.
    '''
    # The labels are wrapped in pl.lit() so that they are always treated as
    # string literals: a bare str handed to then()/otherwise() can be parsed
    # as a column name instead.
    position = (
        pl.when(pl.col('defensive_skill') > 50).then(pl.lit('CB'))
        .when(pl.col('offensive_skill') > 50).then(pl.lit('FW'))
        .otherwise(pl.lit('bench'))
        .alias("position")
    )
    return df.with_columns(position)

def add_squad_number_column(df:pl.LazyFrame):
    '''
        Append a "squad_number" column based on the "position" column.

        One random number in 2..5 is drawn for 'CB' rows and one random number
        in 7..18 for 'FW' rows. Each number is drawn once, when the expression
        is built, so every row of the same position receives the same value.
        All other rows get the placeholder '-'.

        Parameters
        ----------
        df
            Frame with a "position" column holding labels such as 'CB' / 'FW'.

        Returns
        -------
        The input frame with the extra "squad_number" column.
    '''
    # Drawn in the same order as the branches below (defenders, then forwards).
    defender_number = random.choice(range(2, 6))
    forward_number = random.choice(range(7, 19))

    # The defender label is 'CB' (the value written into "position"), not 'CD';
    # with 'CD' the first branch could never match.
    # NOTE(review): the numeric branches are Int8 while the fallback is the
    # string '-'; confirm the resulting column dtype is the one you want, or
    # use a null fallback instead.
    squad_number = (
        pl.when(pl.col('position') == 'CB').then(pl.lit(defender_number, dtype=pl.Int8))
        .when(pl.col('position') == 'FW').then(pl.lit(forward_number, dtype=pl.Int8))
        .otherwise(pl.lit('-'))
        .alias("squad_number")
    )
    return df.with_columns(squad_number)

# Chain operations together using the pipe function


# Apply both column-adding steps in order, then call write_json on the result
# with the target path "pipe.json".
(
    df
    .pipe(add_position_column)
    .pipe(add_squad_number_column)
    .write_json("pipe.json")
)


In [None]:
# Load the JSON text stored in pipe.json. The context manager closes the file
# even when read() raises, which the manual open()/close() pair did not.
with open("pipe.json", "r") as f:
    json_str = f.read()
df3 = pl.read_csv("../data/test.csv")
# NOTE(review): from_json only receives the JSON string, so the result
# presumably does not depend on df3 - confirm against the polars docs.
df2 = df3.lazy().from_json(json_str)
df2.collect()

In [None]:
plan = pl.LazyFrame().from_json(json_str)
plan.write_json("pipe.json")

In [None]:
df2.collect()

In [None]:
test = ["a","b"]

"|".join(test)

In [None]:
def test() -> list[str]:
    '''Return a one-element list holding the string "a" (used to inspect return annotations).'''
    letters = list("a")
    return letters

test.__annotations__.get("return", "") == list[str]

In [None]:
from typing import Callable, Concatenate

tt:Callable[[], list]
tt = test

In [None]:
inspect.signature(tt).return_annotation == "list[str]"

In [None]:
df = pl.DataFrame({
    "g": list(range(1000))
}).lazy()

# test = pl.Series("x",["a", "b"])


In [None]:
join_df = pl.LazyFrame((list(range(1000)), list(i*2 for i in range(1000))), schema=["g", "g_mapped"])
join_df.collect()

In [None]:
%%timeit 
df.with_columns(
    pl.col("g").map_dict({i:2*i for i in range(1000)})
).collect()

In [None]:
%%timeit 
df.join(join_df, on = "g").with_columns(
    pl.col("g_mapped").alias("g")
).drop(columns=["g_mapped"]).collect()

In [None]:
df.join(join_df, on = "g").with_columns(
    pl.col("g_mapped").alias("g")
).drop(columns=["g_mapped"]).write_json("test.json")

In [None]:
df = pl.DataFrame(
    {
        "a": list(range(1000)),
    }
)

In [None]:
from typing import Generator, Tuple, Any

mapping = {i:i for i in range(1000)}
test = mapping.copy()
cname = "a"
def create_map_expr(
        col_name:str
        , gen:Generator[Tuple[str, Any], None, None]
        , default:Any = None
) -> pl.Expr:

    '''
        Build a when/then/otherwise expression that maps the values of a column.

        Suppose you have a dictionary like d = {"a":1, "b":2}. Instead of doing pl.col("column").map_dict(d), you can 
        do pl.when(pl.col("column") == "a").then(1).otherwise(pl.when(pl.col("column") == "b").then(2).otherwise(default))
        instead. This function generates that expression for you from an iterable that yields key-value pairs.

        Parameters
        ----------
        col_name
            Name of the column whose values are compared against the keys.
        gen
            Iterable of (key, value) pairs, e.g. d.items() or zip(keys, values).
            It is consumed completely. Earlier pairs take precedence over later ones.
        default
            Value used for rows that match none of the keys.

        Returns
        -------
        A polars expression equivalent to the nested when/then/otherwise chain above.
        If gen yields nothing, this is just the literal default.
    '''

    def as_literal(value:Any) -> pl.Expr:
        # Mapped values are data, not column references: a bare str passed to
        # then() may be parsed as a column name, so wrap everything that is not
        # already an expression in pl.lit().
        return value if isinstance(value, pl.Expr) else pl.lit(value)

    # The chain is built from the innermost branch outwards with a plain loop.
    # The previous recursive version did not forward `default` to the inner
    # calls and relied on a bare `except`, which also swallowed RecursionError
    # for long mappings and silently cut the mapping short.
    expr = as_literal(default)
    for k, v in reversed(list(gen)):
        expr = pl.when(pl.col(col_name) == k).then(as_literal(v)).otherwise(expr)
    return expr

In [None]:
%%timeit 
df.select(
    pl.col("a").map_dict(mapping)
)

In [None]:
m = list(range(1000))
expr = create_map_expr(cname, zip(m, m))

In [None]:
%%timeit 

df.select(
    expr
)

In [3]:
df = pl.DataFrame({"a":[None, 1,2,3,4,5,6,7], "b":[1,2,1,1,1,1,1,1]})
df.describe()

describe,a,b
str,f64,f64
"""count""",8.0,8.0
"""null_count""",1.0,0.0
"""mean""",4.0,1.125
"""std""",2.160247,0.353553
"""min""",1.0,1.0
"""max""",7.0,2.0
"""median""",4.0,1.0
"""25%""",2.0,1.0
"""75%""",6.0,1.0


In [5]:
import polars.functions as F

In [17]:
%%timeit
"asdbdasbfa" + "abdsbads" + "asdffkilo"

7.19 ns ± 0.475 ns per loop (mean ± std. dev. of 7 runs, 100,000,000 loops each)


In [25]:
%%timeit 
a + list(b)

290 ns ± 15.8 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [26]:
%%timeit 
a.extend(b)

785 ns ± 200 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [29]:
%%timeit 
df_metrics = df.lazy().select(
    pl.all().min().prefix("min:"),
    pl.all().max().prefix("max:")
).collect().row(0) # .row(0)
df_metrics

74.5 µs ± 1.05 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [30]:
%%timeit
mins = df.lazy().select(
    pl.all().min().prefix("min:")
).collect().to_numpy().ravel()
maxs = df.lazy().select(
    pl.all().max().prefix("max:")
).collect().to_numpy().ravel()

154 µs ± 840 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [8]:
a = [1,2,3,4]
a.extend(range(5,10))
a 

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [None]:
# Raises a compute error: each pl.col(c).unique() has a different length (one entry per unique value), so the columns cannot be combined
df.select(
    pl.col(c).unique().sort() for c in df.columns
)

In [None]:
cs = df.lazy().groupby(1).agg(
    pl.col(c).unique().sort() for c in df.columns
).select(
    pl.col(c) for c in df.columns
).collect().get_columns()

for c in cs:
    print(c[0])

In [None]:
for f in df.partition_by("a"):
    print(f.shape)

In [None]:
df.select(
    pl.col(c).n_unique() for c in df.columns
)

In [None]:
import numpy as np

# Round-trip a lazy plan through JSON and compare the values it produces with
# the values produced by the original plan.
d = pl.DataFrame(
    {
        'num': np.random.random(1000),
    }
)
plan = d.lazy().select(
    new_col = (pl.col("num") - 2.55678623)/1.11111111
)
plan.write_json("test.json")
d2 = d.clone()
# The context manager closes the file even if read() raises.
with open("test.json", "r") as f:
    json_str = f.read()
test = d2.lazy().from_json(json_str).rename({"new_col":"new_col2"})

# Collect each plan once and reuse the results below.
plan_result = plan.collect()
test_result = test.collect()

different_values = plan_result["new_col"] != test_result["new_col2"]
print(different_values.sum())
combined = pl.concat([plan_result, test_result], how="horizontal")
data = combined.filter(pl.col("new_col") != pl.col("new_col2"))
# Show one differing pair. Row index 1 only exists when at least two rows
# differ, so skip the printout otherwise instead of raising.
if data.height > 1:
    print(data[1, 0])
    print(data[1, 1])


In [None]:
d.select(pl.col("bools").sum() / 2)

In [None]:
d.lazy().select(pl.col("num").mean()).collect().to_numpy()[0,0]

In [None]:
d.select(
    pl.col("events").list.unique().list.lengths()
)

In [None]:
from sklearn.datasets import make_classification

In [None]:
orig_x, orig_y = make_classification(n_samples = 300_000, n_features = 50, n_informative = 25, n_redundant = 25)
df = pl.from_numpy(orig_x).insert_at_idx(0, pl.Series("target", orig_y))

In [None]:
features = df.columns
features.remove("target")

In [None]:
x = df.select(features)
v = pl.Series([2]*len(features))

print(x.shape)
print(v.shape)

In [None]:
%%timeit
x.select(
    pl.col(c) * v[i] for i, c in enumerate(x.columns)
).fold(lambda s1, s2: s1 + s2)

In [None]:
%%timeit
x.select(
    sum(pl.col(c) * v[i] for i, c in enumerate(x.columns))
)

In [None]:
w = pl.Series([2]*200)


In [None]:
def gradient_descent(df:pl.DataFrame, features:list[str], target:str):
    '''
        Unfinished stub: prepares the inputs for a gradient descent but does not
        run any update step yet, and returns None.

        It selects the feature columns and the target column from df and creates
        the starting values: one weight of 1 per feature and a constant of 0.
    '''
    feature_frame = df.select(features)
    target_frame = df.select(target)

    n_features = len(features)
    weights = pl.Series([1] * n_features)
    const = 0.0

    


In [None]:
import polars as pl
from dsds.prescreen import describe_str
from dsds.transform import ScalingStrategy

In [None]:
df = pl.read_csv("../data/advertising.csv")

In [None]:
describe_str(df, words_to_count=["A"])