In [None]:
import polars as pl
from typing import *
import sys
# Put the sibling `../src` directory on the import path so the `dsds` imports in the
# following cells can be resolved (assumes the package lives under ../src relative to
# the notebook's working directory — TODO confirm).
sys.path.append('../src')

# General Usage

This notebook sketches a typical workflow from a data scientist's day-to-day job.

In [None]:
import polars as pl
from dsds.utils import get_numpy
from dsds.prescreen import *
from dsds.transform import *
import dsds.fs as fs
# from dsds.fs import discrete_ig, f_classification, mrmr

In [None]:
# Read the advertising dataset and append a constant string column holding an e-mail
# address. The literal is not aliased, so Polars assigns its default column name
# ("literal").
# NOTE(review): presumably this column only exists to exercise `email_removal` below — confirm.
df = pl.read_csv("../data/advertising.csv").with_columns(
    pl.lit("abc@google.com")
)
# Name of the label column; reused by the screening and feature-selection cells below.
target = "Clicked on Ad"
df.head() 

In [None]:
# Overview of the frame via `describe`, which comes from one of the star-imported dsds
# modules (its exact output is not visible from this notebook).
describe(df)

In [None]:
# String-column overview; additionally requests counts for the words "A" and "Visionary".
# NOTE(review): matching rules (case sensitivity, whole-word vs. substring) are defined by
# dsds `describe_str` — confirm there.
describe_str(df, words_to_count=["A", "Visionary"])

In [None]:
# Rebind `df` to the result of `email_removal`; presumably this drops columns that look
# like e-mail addresses, such as the literal column added at load time — TODO confirm.
df = email_removal(df)

In [None]:
# Rebind `df` to the result of `date_removal`. A later comment in this notebook states
# that "Timestamp" was removed at the beginning, which presumably happens here — confirm.
df = date_removal(df)

In [None]:
df.head()

In [None]:
# Variance-based screening with a cutoff of 0.5.
# NOTE(review): presumably columns whose variance falls below the threshold are dropped and
# `target` is passed so the label column is kept — confirm against dsds `var_removal`.
df = var_removal(df, threshold=0.5, target=target)

In [None]:
# Rebind `df` to the result of `constant_removal` (presumably drops single-valued
# columns — confirm).
df = constant_removal(df)

In [None]:
# Binary *string* columns are mapped to 0/1 according to sort order; binary columns that are
# already boolean or numeric are left untouched. The label column is excluded from encoding.
df = binary_encode(df, exclude=[target])

In [None]:
df.head()

In [None]:
# Second way (imo, better) to interact with TransformationResult objects
# NOTE(review): no TransformationResult object is visible in this notebook — the line above
# may be left over from an earlier version of the API; confirm or remove.
# Impute "Area Income" with the "mean" strategy and the remaining listed columns with the
# "median" strategy, rebinding `df` after each call.
df = impute(df, ["Area Income"], "mean")
df = impute(df, ["Daily Internet Usage", "Daily Internet Usage Band", "Area Income Band"], "median")
df 

In [None]:
# Scale the two continuous columns with the "standard" strategy
# (presumably zero-mean / unit-variance standardization — confirm against dsds `scale`).
df = scale(df, ["Area Income", "Daily Internet Usage"], "standard")
df 

In [None]:
# Some of these are numerical columns, but we can still treat them as "discrete" columns
# and feed them into the information gain algorithm.
discrete = ["Age Band", "Country", "Area Income Band", "Daily Internet Usage Band", "Test_Binary", "One_Hot_Test"]
# Re-assigned to the same value it was given when the data was loaded, so this cell can be
# read without scrolling back up.
target = "Clicked on Ad"

In [None]:
# Information gain of each explicitly listed discrete column with respect to the target.
fs.discrete_ig(df, target, discrete_cols=discrete)

In [None]:
# If nothing is given, automatically infers discrete columns. (See the docstring of discrete_inferral)
# Same call as the explicit version, but without `discrete_cols`.
fs.discrete_ig(df, target)

In [None]:
# df["Ad Topic Line"].unique() # has 1000 uniques. 
# There is no randomness at all because there are two distinct subsets that perfectly differentiate 0s from 1s

In [None]:
# dsds implementation of the F-test for classification; no `num_cols` is passed here,
# so the column selection is left to the function (presumably all numeric columns — confirm).
fs.f_classif(df, target=target)

In [None]:
# Names of the numeric columns with the label excluded; reused by the scikit-learn
# comparison and the timing cells below.
nums = get_numeric_cols(df, exclude=[target])
nums 

In [None]:
from sklearn.feature_selection import f_classif

# Reference result from scikit-learn's ANOVA F-test on the same numeric columns, to compare
# against `fs.f_classif`. Inputs are converted with `.to_numpy()` exactly as in the
# f_regression cell and in the timing cells, so the reference and the benchmark exercise
# the same call instead of relying on scikit-learn's implicit conversion of Polars objects.
f, pv = f_classif(df[nums].to_numpy(), df[target].to_numpy())
# One column per list entry: feature names, F statistics, p-values.
pl.from_records([nums, f, pv], schema=["feature", "f_value", "p_value"])

In [None]:
from sklearn.feature_selection import f_regression

# scikit-learn's univariate regression F-test on the same inputs; `result[0]` holds the
# F statistics and `result[1]` the p-values.
# NOTE(review): the target is a class label, so this is presumably run only as a
# comparison point for the classification F-test — confirm intent.
result = f_regression(df[nums].to_numpy(), df[target].to_numpy())
pl.from_records([nums, result[0], result[1]], schema=["feature", "f_value", "p_value"])

In [None]:
# Stack 500 copies of `df` on top of each other to get a larger frame for the timing
# comparison in the next cells.
df_test = pl.concat([df.clone()] * 500)
df_test.shape

In [None]:
%%timeit 
fs.f_classif(df_test, target="Clicked on Ad", num_cols=nums)

In [None]:
%%timeit 
# Time scikit-learn's f_classif on the enlarged frame, including the Polars -> NumPy
# conversion and the assembly of the result frame.
f, pv = f_classif(df_test[nums].to_numpy(), df_test[target].to_numpy())
pl.from_records([nums, f, pv], schema=["feature", "f_value", "p_value"])

In [None]:
%%timeit
# Time scikit-learn's f_regression on the enlarged frame, including the Polars -> NumPy
# conversion and the assembly of the result frame.
result = f_regression(df_test[nums].to_numpy(), df_test[target].to_numpy())
pl.from_records([nums, result[0], result[1]], schema=["feature", "f_value", "p_value"])

In [None]:
# Drop the enlarged benchmark frame; nothing after the timing cells uses it.
del df_test 

In [None]:
# MRMR Method 
# NOTE(review): the third positional argument (5) is presumably the number of features to
# select — confirm against the signature of dsds `fs.mrmr`.
fs.mrmr(df, target, 5)

In [None]:
# One-hot encode the "One_Hot_Test" column and rebind `df` to the result.
df = one_hot_encode(df, cols=["One_Hot_Test"])

In [None]:
df.head()

In [None]:
# Ordinal-encode "City" and "Country"; the ordering is chosen by `ordinal_auto_encode`
# itself (presumably sort order — TODO confirm).
df = ordinal_auto_encode(df, cols=["City", "Country"])

In [None]:
df.head()

In [None]:
# Drop the remaining columns we do not want in the final frame. `remove_if_exists` is used
# (rather than a plain drop) because "Timestamp" is already gone at this point.
final_df = remove_if_exists(df, ["Ad Topic Line", "Timestamp"]) # Timestamp was removed at the beginning
final_df.head() 

In [None]:
# Convert the final frame into the container returned by `get_numpy`, then pull its three
# attributes out into notebook-level names for the inspection cells below.
np_data = get_numpy(final_df, target)
X = np_data.X
y = np_data.y
features = np_data.features

In [None]:
X.shape 

In [None]:
y.shape 

In [None]:
features