In [1]:
from __future__ import annotations
import polars as pl
import inspect
import numpy as np
from typing import Tuple
import dsds.metrics as me
import dsds.prescreen as ps
import dsds.sample as sa
import dsds.fs as fs
import dsds.transform as t

In [2]:
# Load the advertising dataset from the repo-relative CSV into a Polars DataFrame bound to `df`.
df = pl.read_csv("../data/advertising.csv")
# Preview the first three rows as the cell output.
df.head(3)

id,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad,One_Hot_Test,Age Band,Daily Internet Usage Band,Area Income Band,Test_Constant,Test_Str_Constant,Test_BadColumn,Test_Binary
i64,f64,i64,f64,f64,str,str,i64,str,str,i64,str,i64,i64,i64,i64,str,i64,str
1,68.95,35,61833.9,256.09,"""Cloned 5thgene…","""Wrightburgh""",0,"""Tunisia""","""3/27/2016 0:53…",0,"""A""",30,12,12,1,"""SSS""",0.0,"""A"""
2,80.23,31,68441.85,193.77,"""Monitored nati…","""West Jodi""",1,"""Nauru""","""4/4/2016 1:39""",0,"""B""",30,9,13,1,"""SSS""",,"""B"""
3,69.47,26,59785.94,236.5,"""Organic bottom…","""Davidton""",0,"""San Marino""","""3/13/2016 20:3…",0,"""A""",20,11,11,1,"""SSS""",0.0,"""A"""


In [3]:
from dsds._rust import rs_cnt_vectorizer, rs_get_stem_table, rs_snowball_stem
import dsds.test_text as tt
# Name of the free-text column that the vectorizer / stemming calls in this notebook are pointed at.
c = "Ad Topic Line"

In [9]:
%%timeit
# Time `rs_cnt_vectorizer` (imported from dsds._rust) on column `c` of `df`.
# The positional 0.02 / 0.95 / 2000 presumably mirror min_dfreq / max_dfreq / max_features
# of the tt.count_vectorizer call timed in this notebook -- TODO confirm.
# NOTE: names assigned inside a %%timeit body (here `df2`) are not kept in the notebook namespace.
df2 = rs_cnt_vectorizer(df, c, r'[^\s\w\d%]', 0.02, 0.95, 2000)


5.22 ms ± 233 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [10]:
%%timeit
# Time `tt.count_vectorizer` on the same column, splitting on a single space and using
# document-frequency bounds of 0.02 / 0.95 with a cap of 2000 features.
# NOTE: names assigned inside a %%timeit body (here `df3`) are not kept in the notebook namespace.
df3 = tt.count_vectorizer(df, c, tokenizer=" ", min_dfreq=0.02, max_dfreq=0.95, max_features=2000)


57.4 ms ± 733 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Pull the text column `c` out of `df` as a plain Python list so it can be fed to scikit-learn.
corpus = df[c].to_list()

In [None]:
# scikit-learn reference run: keep terms whose document frequency lies between 2% and 95%.
vectorizer = CountVectorizer(min_df=0.02, max_df=0.95)
# Fit on the corpus and densify the sparse document-term matrix into a plain ndarray.
X = np.array(
    vectorizer.fit_transform(corpus).todense()
)
# One column name per retained vocabulary term.
cols = vectorizer.get_feature_names_out().tolist()
df_words = pl.from_numpy(X, schema=cols)
# Put the per-term count columns next to the original advertising columns.
df_combined = pl.concat(
    [df, df_words],
    how="horizontal",
)
df_combined

In [None]:
# Show the list of feature (term) names that the fitted CountVectorizer retained.
cols

In [None]:
# Sum of the "system" column of `df_combined`, i.e. the CountVectorizer count for that term
# added up over all rows -- a reference number for comparing against the dsds vectorizer.
df_combined["system"].sum()

In [8]:
# Build the stem table for column `c` of `df` with `rs_get_stem_table` (from dsds._rust).
# The pattern is written as a raw string so the regex backslashes stay literal: in a plain
# string, "\s", "\w" and "\d" are invalid escape sequences that Python warns about.
# The positional 0.02 / 0.95 / 2000 are presumably the min/max document frequency and the
# feature cap, as in the count-vectorizer calls -- TODO confirm.
rs_get_stem_table(df, c, r'[^\s\w\d%]', 0.02, 0.95, 2000)

stemmed,Ad Topic Line,doc_freq
str,list[str],f32
"""interfac""","[""interface""]",0.033
"""secur""","[""secured""]",0.023
"""analyz""","[""analyzer"", ""analyzing""]",0.02
"""system""","[""system"", ""systemic""]",0.036
"""implement""","[""implemented"", ""implementation""]",0.021
"""user""","[""user""]",0.024
"""solut""","[""solution""]",0.029
"""local""","[""local""]",0.023
"""network""","[""network"", ""networked""]",0.022
"""optim""","[""optimized"", ""optimizing"", ""optimal""]",0.025


In [None]:
# `df2` is otherwise only assigned inside a %%timeit cell, and %%timeit does not keep the
# names it assigns, so build it here to make this cell work on a fresh kernel.
df2 = rs_cnt_vectorizer(df, c, r'[^\s\w\d%]', 0.02, 0.95, 2000)
# Sum of the "Ad Topic Line::cnt_system" column (presumably the per-row count for the stem "system").
df2["Ad Topic Line::cnt_system"].sum()

In [None]:
# Sum of the "Ad Topic Line::cnt_analyz" column of `df2` (presumably the count for the stem "analyz").
# Relies on `df2` already existing in the kernel namespace.
df2["Ad Topic Line::cnt_analyz"].sum()

In [None]:
# Rebuild the word-count frame from the dense matrix `X`, naming its columns with `cols`,
# then display it as the cell output.
df_words = pl.from_numpy(X, schema=cols)
df_words

In [None]:
# Re-read the advertising CSV, rebinding `df` to a fresh copy of the raw data.
df = pl.read_csv("../data/advertising.csv")
# Preview the first rows as the cell output.
df.head()

In [None]:
import dsds.metrics as me
from sklearn.metrics import log_loss

def pure_numpy_logloss(y_actual:np.ndarray, y_predict:np.ndarray, eps:float=1e-15) -> float:
    """Binary log loss (cross-entropy) computed with plain NumPy.

    Parameters
    ----------
    y_actual
        Array of true labels, expected to be 0/1.
    y_predict
        Array of predicted probabilities for the positive class, same shape as `y_actual`.
    eps
        Predictions are clipped to [eps, 1 - eps] before taking logs. Without this, a
        prediction of exactly 0 or 1 produces `0 * log(0)` = NaN and poisons the mean.

    Returns
    -------
    The mean negative log-likelihood over all elements.
    """
    # Keep probabilities strictly inside (0, 1) so both log terms stay finite.
    clipped = np.clip(y_predict, eps, 1 - eps)
    return -np.mean(y_actual * np.log(clipped) + (1 - y_actual) * np.log(1 - clipped))

# Seeded generator so the fake data -- and therefore the checks and timings that use it --
# are the same on every Restart & Run All.
rng = np.random.default_rng(42)
pred = rng.random(size=500_000) # Some random fake predictions
actual = np.round(rng.random(size=500_000)).astype(np.int8) # Some random fake actual labels
# Yielding the same result up to 12 digits
print(round(me.logloss(actual, pred), 12) == round(log_loss(actual, pred), 12))
print(round(me.logloss(actual, pred), 12) == round(pure_numpy_logloss(actual, pred), 12))

In [None]:
%%timeit
# Benchmark the dsds log-loss on the `actual` / `pred` arrays from the kernel namespace.
me.logloss(actual, pred) # dsds

In [None]:
%%timeit
# Benchmark scikit-learn's log_loss on the same `actual` / `pred` arrays.
log_loss(actual, pred) # sklearn

In [None]:
%%timeit
# Benchmark the hand-written NumPy log-loss on the same `actual` / `pred` arrays.
pure_numpy_logloss(actual, pred) # pure numpy

In [None]:
# from sklearn.datasets import make_classification
# orig_x, orig_y = make_classification(n_samples = 100_000, n_features = 10, n_informative = 5, n_redundant = 5)
# # This is a Polars dataframe, the dataframe format dsds is built around; dsds relies heavily on Polars.
# # Other dataframe formats must be converted to Polars before dsds can work with them.
# df = pl.from_numpy(orig_x).insert_at_idx(0, pl.Series("target", orig_y)) 
# # Turn it into Pandas.
# df_pd = df.to_pandas()
# target = "target"
# features = df.columns
# features.remove(target)

In [None]:
# Tiny toy frame for the prescreen helpers: column "a" mixes a regular value, a NaN and a
# null; column "b" is fully populated.
# NOTE: this rebinds `df`, replacing the advertising frame loaded earlier in the notebook.
df = pl.DataFrame(
    {
        "a": [1, np.nan, None],
        "b": [1, 2, 3],
    }
)
df

In [None]:
# Run dsds.prescreen's invalid_inferral on `df` with a 0.5 threshold, nulls included.
# NOTE(review): presumably reports columns whose share of invalid (NaN/null) values
# reaches the threshold -- confirm against the dsds.prescreen documentation.
ps.invalid_inferral(df, threshold=0.5, include_null=True)

In [None]:
# Run dsds.prescreen's discrete_inferral on `df` with its default settings.
# NOTE(review): presumably infers which columns look discrete -- confirm against dsds docs.
ps.discrete_inferral(df)

In [None]:
# Add a row-number column starting at 1 ("row_nr"), flag it as sorted, and ask dsds for
# the per-column unique counts (null counts included).
temp = ps.get_unique_count(
    df.with_row_count(offset=1).set_sorted("row_nr"),
    include_null_count=True,
)
# Take row 0 / column 1 of the "row_nr" entry -- presumably its unique count, which would
# equal the number of rows in `df` (TODO confirm the column layout of the result).
len_df = temp.filter(pl.col("column") == "row_nr").item(0, 1)
print(len_df)
temp

In [None]:
from sklearn.datasets import make_classification
# Synthetic classification problem: 50k rows, 50 features (20 informative + 30 redundant).
orig_x, orig_y = make_classification(
    n_samples=50_000,
    n_features=50,
    n_informative=20,
    n_redundant=30,
)
# Wrap the feature matrix in a Polars frame and put the label in front as column "target".
df_pl = pl.from_numpy(orig_x).insert_at_idx(0, pl.Series("target", orig_y))
# Every column except the label is a feature.
features = [name for name in df_pl.columns if name != "target"]

In [None]:
# Walk through the (train, test) pairs yielded by dsds' time-series splitter and report
# each frame's shape: train first, then test, one per line.
for train, test in sa.time_series_split(df_pl, n_splits=5, offset=1000):
    print(train.shape, test.shape, sep="\n")

In [None]:
%%timeit
# Benchmark fs.discrete_ig on `df_pl`, scoring every column in `features` against "target".
fs.discrete_ig(df_pl, target="target", cols=features)

In [None]:
%%timeit
# Benchmark fs.discrete_ig2 with the same inputs, for comparison against fs.discrete_ig.
# NOTE(review): presumably an alternative implementation of the same computation -- confirm.
fs.discrete_ig2(df_pl, target="target", cols=features)

In [None]:

# dsds log-loss on `actual` / `pred` with check_binary=False.
# NOTE(review): presumably skips validating that the labels are binary -- confirm in dsds.metrics.
me.logloss(actual, pred, check_binary=False)

In [None]:
import nltk
from nltk.corpus import stopwords