In [None]:
import polars as pl
import numpy as np
import dsds.metrics as me
import dsds.prescreen as ps
import dsds.sample as sa
import dsds.fs as fs
import dsds.transform as t
import numpy as np 

In [None]:
# Seeded generator so the benchmark frame is identical on every Restart & Run All.
N_ROWS = 100_000
rng = np.random.default_rng(42)
df = pl.DataFrame({
    "a": rng.random(size=N_ROWS),
    "b": rng.random(size=N_ROWS),
    "test": range(N_ROWS)
})



In [None]:
# Benford's expected first-digit probabilities log10(1 + 1/d) for d = 1..9,
# built eagerly so it can be embedded with pl.lit(...) in the functions below.
benford_dist_series = (1 + 1 / pl.int_range(1, 10, eager=True)).log10()
def benford_correlation2(x: pl.Expr) -> pl.Expr:
    """Correlate the leading-digit counts of `x` with the Benford distribution.

    The leading digit is computed arithmetically: ``|x|`` is divided by
    ``10 ** floor(log10(|x|))`` and the result is truncated to an integer.

    Parameters
    ----------
    x : pl.Expr
        Numeric expression whose leading digits are analysed.

    Returns
    -------
    pl.Expr
        Expression evaluating to the correlation between the per-digit counts
        (digits 1 through 9) and the module-level ``benford_dist_series``.

    Notes
    -----
    Floating-point error in ``log10`` / ``pow`` at a decade boundary can push
    the scaled value to 10 or just below 1. The original code special-cased
    only ``|x| == 1000``; here every such value is clamped into 1..9 so that a
    stray 0 or 10 bucket can never misalign the counts with the 9-element
    Benford series.
    """
    magnitude = x.abs()
    # Values that cannot yield a digit come out as NaN (e.g. zero) or null
    # and are removed before the digit is extracted.
    scaled = (
        (magnitude / (pl.lit(10).pow((magnitude.log10()).floor())))
        .drop_nans()
        .drop_nulls()
    )
    leading_digit = (
        # Exponent under-estimated: the true leading digit is 1.
        pl.when(scaled >= 10).then(pl.lit(1))
        # Exponent over-estimated: the value sits just under the boundary.
        .when(scaled < 1).then(pl.lit(9))
        .otherwise(scaled)
        .cast(pl.UInt8)
    )
    counts = (
        leading_digit
        # One extra occurrence of each digit guarantees all nine buckets
        # exist; it is subtracted again after counting.
        .append(pl.int_range(1, 10, eager=False))
        .sort()
        .value_counts()
        # Positional access, matching benford_correlation, so the code does
        # not depend on the name of the count field.
        .struct[1] - pl.lit(1)
    )
    # no need to divide because correlation is invariant under scaling
    return pl.corr(counts, pl.lit(benford_dist_series))


def benford_correlation(x: pl.Expr) -> pl.Expr:
    """Returns the correlation between the first digit distribution of the input time series and the Newcomb-Benford's Law distribution [1][2].

    Parameters
    ----------
    x : pl.Expr
        Input time-series.

    Returns
    -------
    pl.Expr
        Expression evaluating to the correlation between the per-digit counts
        (digits 1 through 9) and the module-level ``benford_dist_series``.

    Notes
    -----
    The Newcomb-Benford distribution for d that is the leading digit of the number {1, 2, 3, 4, 5, 6, 7, 8, 9} is given by:

    .. math::

        P(d) = \\log_{10}\\left(1 + \\frac{1}{d}\\right)

    Values whose string form has no leading digit once the sign, zeros and
    decimal point are stripped (for example zero, or non-finite values) are
    excluded from the counts instead of raising on the integer cast.

    References
    ----------
    [1] Hill, T. P. (1995). A Statistical Derivation of the Significant-Digit Law. Statistical Science.
    [2] Hill, T. P. (1995). The significant-digit phenomenon. The American Mathematical Monthly.
    [3] Benford, F. (1938). The law of anomalous numbers. Proceedings of the American philosophical society.
    [4] Newcomb, S. (1881). Note on the frequency of use of the different digits in natural numbers. American Journal of
        mathematics.
    """
    # Strip sign, leading zeros and the decimal point so the first remaining
    # character is the most significant digit.
    stripped = x.cast(pl.Utf8).str.strip_chars_start("-0.")
    first_digit = (
        stripped.filter(stripped != "")
        .str.slice(0, 1)
        # Non-strict cast: a non-digit first character becomes null rather
        # than raising, and is dropped on the next line.
        .cast(pl.UInt8, strict=False)
        .drop_nulls()
    )
    digit_counts = (
        first_digit
        # One extra occurrence of each digit guarantees all nine buckets
        # exist; it is subtracted again below.
        .append(pl.int_range(1, 10, eager=False))
        .sort()
        .value_counts()
    )
    counts = digit_counts.struct[1] - 1
    return pl.corr(
        counts, pl.lit(benford_dist_series)
    )

In [None]:
# Side-by-side sanity check: both implementations evaluated on column "a".
old_expr = benford_correlation(pl.col("a")).alias("old")
rewrite_expr = benford_correlation2(pl.col("a")).alias("rewrite")
df.select(old_expr, rewrite_expr)

In [None]:
%%timeit
df.select(
    benford_correlation(pl.col("a")).alias("old"),
)

In [None]:
%%timeit
df.select(
    benford_correlation2(pl.col("a")).alias("rewrite"),
)

In [None]:
import tsfresh.feature_extraction.feature_calculators as fc

fc.energy_ratio_by_chunks(df["a"].to_numpy(), [{"num_segments":5, "segment_focus":0}, {"num_segments":5, "segment_focus":1}])

In [None]:
%%timeit
df.select(
    # NOTE(review): energy_ratio_rewrite is not defined or imported in any
    # cell visible in this notebook — confirm where it comes from, otherwise
    # this cell fails on a fresh kernel.
    energy_ratio_rewrite(pl.col("a"), num_segments=5)
)

In [None]:
%%timeit
energy_ratios = [{"num_segments":5, "segment_focus":i} for i in range(5)]
fc.energy_ratio_by_chunks(df["a"].to_numpy(), energy_ratios)

In [None]:
%%timeit
df.select(
    # NOTE(review): energy_ratio_rewrite is not defined or imported in any
    # cell visible in this notebook — confirm where it comes from, otherwise
    # this cell fails on a fresh kernel.
    energy_ratio_rewrite(pl.col("a"), num_segments=5, focus=[0,1])
)

In [None]:
df.lazy().select(
    pl.col("a").len()
).collect()

In [None]:
# Rolling sum over "a" with a window of one fifth of the row count.
# NOTE(review): window_size receives a pl.count() expression rather than a
# plain int — confirm the installed Polars version accepts that.
df.select(
    pl.col("a").rolling_sum(window_size=(pl.count()//5))
)

In [None]:
# Sums the product of "a" with itself shifted by 3 and by 6 rows, then divides
# by (count of "a" - 6); .item(0, 0) pulls the single resulting scalar out.
# presumably a lag-3 "c3"-style statistic as in tsfresh — TODO confirm
df.select(
    (pl.col("a") * pl.col("a").shift(3) * pl.col("a").shift(6)).sum() / (pl.col("a").count() - pl.lit(6))
).item(0,0)

In [None]:
actual = np.random.random(size=100_000)
predicted = np.random.random(size=100_000)


In [None]:
# NOTE(review): `a` is first assigned in a later cell; the cell just above only
# defines `actual` and `predicted`. On a top-to-bottom run this name is
# undefined here — confirm which array was intended.
np.histogram(a, bins = 10)

In [None]:
import numpy as np
df = pl.DataFrame({
    "a": np.random.random(size=100_000)
})

# np.random.random(size=100_000)

In [None]:
df = pl.read_parquet("../data/dunnhumby.parquet")
df.head()

In [None]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, train_size=0.75)

In [None]:
train, test = train_test_split(df, train_size=0.75)

In [None]:
df = pl.DataFrame({
    "text1":["abc,ggg", "abc,sss", "ccc,abc"],
    "text2":["aaa,bbb", "ccc,aaa", "bbb,ccc"]
})

In [None]:
ps.infer_multicategorical(df, separator=",")

In [None]:
df = pl.DataFrame({
    "time": ["2021-01-01", "2021-01-03", "2021-02-01","2021-02-11","2021-03-01","2021-03-02"],
    "a1": [None, 1,2,3,4, None,],
    "a2": [1,2, None,3,4, None,],
})
df = df.with_columns(pl.col("time").str.to_date())
ps.over_time_report(df, cols=["a1", "a2"], time_col="time", metrics=["null", "invalid", "min", "max"])

In [None]:
# Branch outputs are wrapped in pl.lit(...) so they are always string literals;
# a bare string passed to then()/otherwise() is parsed as a column name by
# current Polars versions, and this frame has no columns "a", "b" or "c".
df.select(
    pl.when(pl.col("a1") == 1).then(pl.lit("a"))
    .when(pl.col("a2") == 2).then(pl.lit("b"))
    .otherwise(pl.lit("c"))
)

In [None]:
# Stack 10,000 copies of the small frame to get a larger input for timing.
df2 = pl.concat([df] * 10000)
df2.shape

In [None]:
%%timeit
for frames in sa.time_window_slide(df2, "time", interval="monthly",length=3):
    first, second, third = frames

In [None]:
int(3.5)

In [None]:
df = pl.concat([pl.read_csv("../data/advertising.csv") for _ in range(10)])

In [None]:
from dsds.sklearn_compat_transfom import PolarsExprTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn import set_config
set_config(transform_output = "pandas")

In [None]:
df = df.select("Daily Time Spent on Site", "Age", "Area Income", "Male", "Clicked on Ad")

In [None]:
df

In [None]:
exprs = [pl.col("Age").log(), (pl.col("Clicked on Ad") + pl.lit(2)).alias("test_pl")]
pipe = Pipeline([
    ("StandardScaler", StandardScaler()),
    ("exprs", PolarsExprTransformer(exprs))
])

In [None]:
pipe.fit(df.to_pandas())

In [None]:
pipe.transform(df.to_pandas())

In [None]:
# NOTE(review): an earlier cell narrows `df` with select(...) to five columns,
# none of which is "One_Hot_Test" — confirm this column still exists when the
# notebook is run top to bottom.
df.group_by("One_Hot_Test").count()

In [None]:
783 * 0.5

In [None]:
pl.show_versions()

In [None]:
# NOTE(review): columns "a" and "b" belong to the random frame built at the top
# of the notebook; by this point `df` has been reassigned from the advertising
# CSV — confirm which frame this call is meant to use.
t.scale(df, cols=["a","b"], strategy="robust")

In [None]:
def test(value: lambda x: x+1):
    return value 

In [None]:
test(1)

In [None]:
# Rebinds `test` to a DataFrame, replacing the function of the same name
# defined in an earlier cell.
test = pl.DataFrame(
    {
        "a":[["a", None], ["b", None]]
    }
)

# Set difference of each list in "a" against a one-element Series holding None.
test.select(
    pl.col("a").list.set_difference(pl.Series("test", [None]))
)

In [None]:
df = pl.read_csv("../data/advertising.csv")
df.head()

In [None]:
a = np.random.random(size=(5000,))
b = np.random.random(size=(5000,))

In [None]:
me.psi(a,b)

In [None]:
predicted = np.random.random(size=(5000,3))
actual = np.round(np.random.random(size=(5000,3))).astype(np.int8)

In [None]:
me.precision_recall(actual, predicted)

In [None]:
df = pl.DataFrame({
    "predicted": predicted[:, 1],
    "actual": actual[:, 1]
})

In [None]:
from sklearn.metrics import roc_auc_score

print(roc_auc_score(actual, predicted, average="weighted"))
print(me.roc_auc(actual, predicted, strategy="balanced"))

In [None]:
%%timeit
roc_auc_score(actual, predicted)

In [None]:
%%timeit
me.roc_auc(actual, predicted, strategy="none")

In [None]:
%%timeit
roc_auc_score(actual, predicted)

In [None]:
%%timeit
me.roc_auc(actual, predicted)

In [None]:
import dsds.prescreen as ps
# NOTE(review): on a top-to-bottom run `df` was last assigned to the
# two-column predicted/actual frame, which has none of the columns named
# below — confirm which dataset this call expects.
print(ps.corr_table(df, cols=["Age", "Daily Internet Usage"], corr_with=["Clicked on Ad", "Age Band"]))

In [None]:
df2 = (
    df.lazy().select(["Area Income", "Ad Topic Line", "City", "Clicked on Ad"])
    .drop(["Ad Topic Line"])
).collect()

In [None]:
# NOTE(review): df2 is the result of .collect() in the previous cell, i.e. an
# eager DataFrame; show_graph is presumably a LazyFrame method — confirm, and
# call it on the lazy query before collecting if so.
df2.show_graph()

In [None]:
from scipy.fft import fft
a = np.array([1,2,3,1,2,3,1,2,3])
fft(a)

In [None]:
df = pl.read_csv("../data/train.csv").with_columns(
    pl.lit(1).alias("feature_1"),
    pl.lit(2).alias("feature_2")
)

In [None]:
from tsfresh.examples.robot_execution_failures import download_robot_execution_failures, load_robot_execution_failures

In [None]:
download_robot_execution_failures()
timeseries, y = load_robot_execution_failures()

In [None]:
timeseries

In [None]:
from tsfresh import extract_features
extracted_features = extract_features(timeseries, column_id="id", column_sort="time")

In [None]:
extracted_features

In [None]:
test = ["a", "b", "c"]

"a" not in test 