In [1]:
import polars as pl
import numpy as np 
from typing import * 

# Testing text_data.py

In [None]:
from text_data import transform_text_data

In [None]:
# Toy corpus: three short sentences that become the rows of the text column below.
line1, line2, line3 = (
    "She told me that she likes hiking.",
    "He told me that he doesn't like me.",
    "Hello World!",
)

In [None]:
# Build the toy frame column-by-column: the two inner lists are columns, not rows,
# so a name -> values mapping states the orientation explicitly instead of leaving
# it to from_records' shape-based inference.
df = pl.DataFrame({
    "text_col": [line1, line2, line3],
    "category": ["A", "B", "C"],
}).with_columns([
    # Constant integer column carried alongside the text and category columns.
    pl.lit(1).alias("Whatever Value")
])
df

In [None]:
# Transform the text column of df; the call returns two objects, bound here to
# df_transformed and reverse_memo and displayed in the next two cells.
# NOTE(review): if min_df / max_df are forwarded to a scikit-learn vectorizer, the
# integer max_df=1 is an absolute document count (terms found in more than one
# document are ignored), whereas the float 1.0 means "no upper limit" — confirm
# which is intended.
df_transformed, reverse_memo = transform_text_data(df, text_cols=["text_col"], min_df=0, max_df=1)

In [None]:
df_transformed

In [None]:
reverse_memo

# Test Other EDA Methods (eda_utils)

In [2]:
import polars as pl
from eda_utils import *

In [3]:
# Load the advertising data and derive extra columns for the EDA helpers:
# coarse "band" versions of three numeric features plus synthetic Test_* columns.
male_is_zero = pl.col("Male") == 0

band_columns = [
    ((pl.col("Age") // 10) * 10).alias("Age Band"),
    (pl.col("Daily Internet Usage") // 20).alias("Daily Internet Usage Band"),
    (pl.col("Area Income") // 5000).alias("Area Income Band"),
]
synthetic_columns = [
    pl.lit(1).alias("Test_Constant"),  # same integer on every row
    pl.lit("SSS").alias("Test_Str_Constant"),  # same string on every row
    # 0 where Male == 0, null everywhere else
    pl.when(male_is_zero).then(0).otherwise(None).alias("Test_BadColumn"),
    # two-valued string column: "A" where Male == 0, "B" otherwise
    pl.when(male_is_zero).then("A").otherwise("B").alias("Test_Binary"),
]

df = pl.read_csv("../data/advertising.csv").with_columns(band_columns + synthetic_columns)
target = "Clicked on Ad"
df.head()

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad,One_Hot_Test,Age Band,Daily Internet Usage Band,Area Income Band,Test_Constant,Test_Str_Constant,Test_BadColumn,Test_Binary
f64,i64,f64,f64,str,str,i64,str,str,i64,str,i64,f64,f64,i32,str,i32,str
68.95,35,61833.9,256.09,"""Cloned 5thgene…","""Wrightburgh""",0,"""Tunisia""","""3/27/2016 0:53…",0,"""A""",30,12.0,12.0,1,"""SSS""",0.0,"""A"""
80.23,31,68441.85,193.77,"""Monitored nati…","""West Jodi""",1,"""Nauru""","""4/4/2016 1:39""",0,"""B""",30,9.0,13.0,1,"""SSS""",,"""B"""
69.47,26,59785.94,236.5,"""Organic bottom…","""Davidton""",0,"""San Marino""","""3/13/2016 20:3…",0,"""A""",20,11.0,11.0,1,"""SSS""",0.0,"""A"""
74.15,29,54806.18,245.89,"""Triple-buffere…","""West Terrifurt…",1,"""Italy""","""1/10/2016 2:31…",0,"""B""",20,12.0,10.0,1,"""SSS""",,"""B"""
68.37,35,73889.99,225.58,"""Robust logisti…","""South Manuel""",0,"""Iceland""","""6/3/2016 3:36""",0,"""A""",30,11.0,14.0,1,"""SSS""",0.0,"""A"""


In [4]:
describe(df)

column,count,null_count,null_pct,n_unique,unique_pct,mean,std,min,max,median,25%,75%
str,f64,f64,f64,u32,f64,f64,f64,str,str,f64,f64,f64
"""Daily Time Spe…",1000.0,0.0,0.0,900,0.9,65.0002,15.853615,"""32.6""","""91.43""",68.215,51.38,78.57
"""Age""",1000.0,0.0,0.0,43,0.043,36.009,8.785562,"""19.0""","""61.0""",35.0,29.0,42.0
"""Area Income""",1000.0,0.0,0.0,1000,1.0,55000.00008,13414.634022,"""13996.5""","""79484.8""",57012.3,47051.02,65496.78
"""Daily Internet…",1000.0,0.0,0.0,966,0.966,180.0001,43.902339,"""104.78""","""269.96""",183.13,138.87,218.8
"""Ad Topic Line""",1000.0,0.0,0.0,1000,1.0,,,"""Adaptive 24hou…","""Visionary reci…",,,
"""City""",1000.0,0.0,0.0,969,0.969,,,"""Adamsbury""","""Zacharyton""",,,
"""Male""",1000.0,0.0,0.0,2,0.002,0.481,0.499889,"""0.0""","""1.0""",0.0,0.0,1.0
"""Country""",1000.0,0.0,0.0,237,0.237,,,"""Afghanistan""","""Zimbabwe""",,,
"""Timestamp""",1000.0,0.0,0.0,997,0.997,,,"""1/1/2016 15:14…","""7/9/2016 16:23…",,,
"""Clicked on Ad""",1000.0,0.0,0.0,2,0.002,0.5,0.50025,"""0.0""","""1.0""",0.5,0.0,1.0


In [None]:
# Rebind df to the output of var_removal, called with threshold=0.5 and the target
# column name. NOTE(review): presumably drops columns whose variance is below the
# threshold while leaving the target alone — confirm against eda_utils.
# This cell overwrites df, so re-running it applies var_removal to its own output.
df = var_removal(df, threshold=0.5, target=target)

In [None]:
# Rebind df to the output of constant_removal. NOTE(review): presumably drops
# columns holding a single value (such as the Test_Constant / Test_Str_Constant
# columns created when the data was loaded) — confirm against eda_utils.
df = constant_removal(df)

In [None]:
# Run binary_encode over df with the target column excluded; the returned object
# exposes .transformed and .mapping, which the following cells inspect.
bin_result = binary_encode(df, exclude=[target])

In [None]:
bin_result.transformed.head() 

In [None]:
df = bin_result.transformed
bin_result.mapping 

In [None]:
# Columns to treat as categorical when computing information gain. Several of
# them are numeric, which is fine: they are simply fed in as categorical values.
cats = [
    "Age Band",
    "Country",
    "Area Income Band",
    "Daily Internet Usage Band",
    "Test_Binary",
    "One_Hot_Test",
]
target = "Clicked on Ad"

In [None]:
information_gain(df, target, cat_cols=cats)

In [None]:
# If nothing is given, automatically infers string columns as categorical columns.
information_gain(df, target)

In [None]:
# df["Ad Topic Line"].unique() # has 1000 unique values, one per row.
# With every value unique, the column splits the rows into two disjoint subsets that perfectly separate the 0s from the 1s, so no randomness is left at all.

In [None]:
f_test(df, target=target)

In [None]:
nums = get_numeric_cols(df, exclude=[target])
nums  

In [None]:
from sklearn.feature_selection import f_classif
 
# Reference run: scikit-learn's f_classif on the same numeric columns, returning
# the F-statistics and p-values as two arrays aligned with nums.
f, pv = f_classif(df[nums], df[target])
# One named column per result; a mapping makes the column orientation explicit
# rather than relying on from_records to infer it from the shapes.
pl.DataFrame({"feature": nums, "f_value": f, "p_value": pv})

In [None]:
# Stack 100 copies of df vertically to get a larger frame for the timing cells.
replicas = [df.clone()] * 100
df_test = pl.concat(replicas)
df_test.shape

In [None]:
%%timeit 
# Time f_test on the enlarged frame. Uses the shared `target` variable, like the
# other cells, instead of repeating the column name as a literal.
f_test(df_test, target=target, num_cols=nums)

In [None]:
%%timeit 
# Time the scikit-learn path on the enlarged frame, including construction of the
# result frame. The mapping states the column orientation explicitly rather than
# leaving it to from_records' inference.
f, pv = f_classif(df_test[nums], df_test[target])
pl.DataFrame({"feature": nums, "f_value": f, "p_value": pv})

In [None]:
pct_result = percentile_binning(df, num_cols=nums, exclude=[target])

In [None]:
pct_result.transformed.head() 

In [None]:
df = pct_result.transformed
pct_result.mapping.filter(pl.col("feature") == "Age")

In [None]:
one_hot_res = one_hot_encode(df, one_hot_columns=["One_Hot_Test"])

In [None]:
one_hot_res.transformed.head() 

In [None]:
df = one_hot_res.transformed
one_hot_res.mapping

In [None]:
ordinal_res = ordinal_auto_encode(df, ordinal_cols=["City", "Country"])

In [None]:
ordinal_res.transformed.head() 

In [None]:
ordinal_res.mapping.head() 

In [None]:
# Continue from the ordinal-encoded frame and drop two string columns that are
# not passed to any of the encoding calls in this notebook.
df = ordinal_res.transformed
unused_text_cols = ["Ad Topic Line", "Timestamp"]
final_df = df.drop(unused_text_cols)
final_df.head()

In [None]:
# Hand the final frame and target name to get_numpy, then pull the X, y and
# features attributes off the returned object for the cells below.
np_data = get_numpy(final_df, target)
X = np_data.X
y = np_data.y
features = np_data.features

In [None]:
X.shape 

In [None]:
y.shape 

In [None]:
features