In [1]:
import polars as pl
import numpy as np 
# import pandas as pd 
# from scipy.special import fdtrc
from typing import * 

# Testing text_data.py

In [2]:
from text_data import transform_text_data

In [3]:
# Three short sample sentences that serve as a toy corpus for the text-transform test.
line1, line2, line3 = (
    "She told me that she likes hiking.",
    "He told me that he doesn't like me.",
    "Hello World!",
)

In [4]:
# Build a three-row frame from column-wise records (one inner list per column),
# then append a constant integer column that is unrelated to the text.
df = (
    pl.from_records(
        [
            [line1, line2, line3],  # values for "text_col"
            ["A", "B", "C"],  # values for "category"
        ],
        schema=["text_col", "category"],
    )
    .with_columns([pl.lit(1).alias("Whatever Value")])
)
df

text_col,category,Whatever Value
str,str,i32
"""She told me th…","""A""",1
"""He told me tha…","""B""",1
"""Hello World!""","""C""",1


In [5]:
# Vectorize the free-text column. The call returns the transformed frame and a
# lookup that maps each stem to the original word forms it was derived from.
# NOTE(review): if max_df is forwarded to a scikit-learn style vectorizer, the
# integer 1 is an absolute document count, not the proportion 1.0 -- confirm
# that dropping every word that occurs in more than one row is intended.
df_transformed, reverse_memo = transform_text_data(
    df,
    text_cols=["text_col"],
    min_df=0,
    max_df=1,
)

Perfoming stemming...
Performing Count vectorization for text_col...


In [6]:
# Display the transformed frame: the text column is replaced by per-word count columns.
df_transformed

category,Whatever Value,text_col::word::doesnt,text_col::word::hello,text_col::word::hike,text_col::word::world
str,i32,i64,i64,i64,i64
"""A""",1,0,0,1,0
"""B""",1,1,0,0,0
"""C""",1,0,1,0,1


In [7]:
# Display the stem -> original-words lookup returned alongside the transformed frame.
reverse_memo

{'she': ['she'],
 'told': ['told'],
 'me': ['me'],
 'that': ['that'],
 'like': ['likes', 'like'],
 'hike': ['hiking'],
 'he': ['he'],
 'doesnt': ['doesnt'],
 'hello': ['hello'],
 'world': ['world']}

# Testing other EDA methods (eda.py)

In [8]:
import polars as pl
from eda import describe, information_gain, constant_removal, binary_transform, f_score, var_removal, get_numeric_cols, percentile_binning

In [9]:
# Name of the label column used by the feature-selection helpers.
target = "Clicked on Ad"

# Load the advertising data and add banded / synthetic columns that give the
# EDA helpers something to work on.
df = (
    pl.read_csv("./data/advertising.csv")
    .with_columns(
        [
            # Coarse bands obtained by floor division.
            ((pl.col("Age") // 10) * 10).alias("Age Band"),
            (pl.col("Daily Internet Usage") // 20).alias("Daily Internet Usage Band"),
            (pl.col("Area Income") // 5000).alias("Area Income Band"),
            # Synthetic columns: a constant, a column that is null wherever
            # Male != 0, and a two-valued string column.
            pl.lit(1).alias("Test_Constant"),
            pl.when(pl.col("Male") == 0).then(0).otherwise(None).alias("Test_BadColumn"),
            pl.when(pl.col("Male") == 0).then("A").otherwise("B").alias("Test_Binary"),
        ]
    )
)
df.head(5)

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad,Age Band,Daily Internet Usage Band,Area Income Band,Test_Constant,Test_BadColumn,Test_Binary
f64,i64,f64,f64,str,str,i64,str,str,i64,i64,f64,f64,i32,i32,str
68.95,35,61833.9,256.09,"""Cloned 5thgene…","""Wrightburgh""",0,"""Tunisia""","""2016-03-27 00:…",0,30,12.0,12.0,1,0.0,"""A"""
80.23,31,68441.85,193.77,"""Monitored nati…","""West Jodi""",1,"""Nauru""","""2016-04-04 01:…",0,30,9.0,13.0,1,,"""B"""
69.47,26,59785.94,236.5,"""Organic bottom…","""Davidton""",0,"""San Marino""","""2016-03-13 20:…",0,20,11.0,11.0,1,0.0,"""A"""
74.15,29,54806.18,245.89,"""Triple-buffere…","""West Terrifurt…",1,"""Italy""","""2016-01-10 02:…",0,20,12.0,10.0,1,,"""B"""
68.37,35,73889.99,225.58,"""Robust logisti…","""South Manuel""",0,"""Iceland""","""2016-06-03 03:…",0,30,11.0,14.0,1,0.0,"""A"""


In [11]:
# Per-column summary from eda.describe (counts, nulls, unique values, basic statistics).
describe(df)

column,count,null_count,null_pct,unique_count,mean,std,min,max,median,25%,75%
str,f64,f64,f64,u32,f64,f64,str,str,f64,f64,f64
"""Daily Time Spe…",1000.0,0.0,0.0,900,65.0002,15.853615,"""32.6""","""91.43""",68.215,51.38,78.57
"""Age""",1000.0,0.0,0.0,43,36.009,8.785562,"""19.0""","""61.0""",35.0,29.0,42.0
"""Area Income""",1000.0,0.0,0.0,1000,55000.00008,13414.634022,"""13996.5""","""79484.8""",57012.3,47051.02,65496.78
"""Daily Internet…",1000.0,0.0,0.0,966,180.0001,43.902339,"""104.78""","""269.96""",183.13,138.87,218.8
"""Ad Topic Line""",1000.0,0.0,0.0,1000,,,"""Adaptive 24hou…","""Visionary reci…",,,
"""City""",1000.0,0.0,0.0,969,,,"""Adamsbury""","""Zacharyton""",,,
"""Male""",1000.0,0.0,0.0,2,0.481,0.499889,"""0.0""","""1.0""",0.0,0.0,1.0
"""Country""",1000.0,0.0,0.0,237,,,"""Afghanistan""","""Zimbabwe""",,,
"""Timestamp""",1000.0,0.0,0.0,1000,,,"""2016-01-01 02:…","""2016-07-24 00:…",,,
"""Clicked on Ad""",1000.0,0.0,0.0,2,0.5,0.50025,"""0.0""","""1.0""",0.5,0.0,1.0


In [None]:
# Number of distinct values in every column, in df.columns order, flattened
# into a 1-D NumPy array.
(
    df.select([pl.col(name).n_unique().alias(f"{name}_unique_count") for name in df.columns])
    .to_numpy()
    .ravel()
)

In [None]:
# Apply eda.var_removal and rebind df to its result.
# NOTE(review): presumably drops columns whose variance is below `threshold`
# while leaving `target` alone -- confirm against eda.py.
# NOTE(review): rebinding df makes this cell non-idempotent when re-run.
df = var_removal(df, threshold=0.5, target=target)

In [None]:
# Apply eda.constant_removal and rebind df to its result.
# NOTE(review): presumably drops single-valued columns such as Test_Constant -- confirm against eda.py.
df = constant_removal(df)

In [None]:
# binary_transform returns the updated frame plus a second object, kept here as `table`.
# NOTE(review): presumably two-valued columns (e.g. Test_Binary) are encoded and
# `table` records the mapping -- confirm against eda.py.
df, table = binary_transform(df, exclude = [target])
df 

In [None]:
# Columns to score as categorical features. Several of them hold numbers, but
# their values can still be treated as categories by the information gain algorithm.
cats = [
    "Age Band",
    "Country",
    "Area Income Band",
    "Daily Internet Usage Band",
    "Test_Binary",
]
# Label column.
target = "Clicked on Ad"

In [None]:
# Score the explicitly listed categorical columns against the target with eda.information_gain.
information_gain(df, target, cat_cols=cats)

In [None]:
# When cat_cols is omitted, string columns are automatically inferred as the categorical columns.
information_gain(df, target)

In [None]:
# Inspect the distinct values of the "Ad Topic Line" column.
df["Ad Topic Line"].unique() 

In [None]:
# Run eda.f_score against the target; no num_cols is passed here.
# NOTE(review): presumably falls back to all numeric columns -- confirm against eda.py.
f_score(df, target=target)

In [None]:
# Collect the numeric columns via eda.get_numeric_cols, excluding the target.
nums = get_numeric_cols(df, exclude=[target])
nums  

In [None]:
# Reference computation with scikit-learn's f_classif, for comparison with eda.f_score.
from sklearn.feature_selection import f_classif
 
# f_classif returns two arrays (F-statistics and p-values); tabulate them next
# to the feature names.
f, pv = f_classif(df[nums], df[target])
pl.from_records([nums, f, pv], schema=["feature", "f_score", "p_value"])

In [None]:
# Stack 100 copies of df to get a larger frame for the timing cells.
df_test = pl.concat([df.clone()] * 100)
df_test.shape

In [None]:
%%timeit 
# Time eda.f_score on the enlarged frame.
f_score(df_test, target="Clicked on Ad", num_cols=nums)

In [None]:
%%timeit 
# Time the scikit-learn equivalent, including building the result frame.
f, pv = f_classif(df_test[nums], df_test[target])
pl.from_records([nums, f, pv], schema=["feature", "f_score", "p_value"])

In [None]:
# percentile_binning returns the binned frame plus a second frame, kept here as `ref`.
# NOTE(review): `ref` presumably holds the bin boundaries per feature -- confirm against eda.py.
transformed_df, ref = percentile_binning(df, num_cols=nums, exclude=[target])

In [None]:
# Display the frame returned by percentile_binning.
transformed_df 

In [None]:
# Show only the rows of `ref` whose "feature" value is "Age".
ref.filter(pl.col("feature") == "Age")