In [1]:
import polars as pl
import numpy as np 
# import pandas as pd 
from scipy.special import fdtrc
from typing import * 

# Testing text_data.py

In [None]:
from text_data import transform_text_data

In [None]:
# Three short sample sentences used as the text column of the toy frame.
line1, line2, line3 = (
    "She told me that she likes hiking.",
    "He told me that he doesn't like me.",
    "Hello World!",
)

In [None]:
# Toy frame: one text column, one category column and a constant column.
# Each inner list is a *column*, so pass orient="col" explicitly instead of
# relying on polars to infer the orientation from the shape of the data
# (the guess becomes ambiguous when the number of rows equals the number of
# schema entries).
df = pl.from_records(
    [[line1, line2, line3], ["A", "B", "C"]],
    schema=["text_col", "category"],
    orient="col",
).with_columns([
    pl.lit(1).alias("Whatever Value")
])
df

In [None]:
# transform_text_data returns two values: the transformed frame and a second
# object bound to reverse_memo (both are displayed in the following cells).
# NOTE(review): if min_df / max_df follow scikit-learn's vectorizer convention,
# an integer 1 is an absolute document count rather than 100% -- confirm
# against text_data.py.
df_transformed, reverse_memo = transform_text_data(
    df,
    text_cols=["text_col"],
    min_df=0,
    max_df=1,
)

In [None]:
# Display the first value returned by transform_text_data.
df_transformed

In [None]:
# Display the second value returned by transform_text_data.
# (Presumably a lookup from transformed features back to the source text --
# confirm in text_data.py.)
reverse_memo

# Testing Other EDA Methods (eda.py)

In [2]:
import polars as pl
from eda import information_gain, constant_removal, binary_transform, f_score, var_removal

In [3]:
# Derived columns added on top of the raw advertising data.
# Age floored to its decade (35 -> 30).
age_band = ((pl.col("Age") // 10) * 10).alias("Age Band")
# Usage and income bucketed by floor division into bands of 20 and 5000.
usage_band = (pl.col("Daily Internet Usage") // 20).alias("Daily Internet Usage Band")
income_band = (pl.col("Area Income") // 5000).alias("Area Income Band")
# "Test" is 0 where Male == 0 and null everywhere else.
test_flag = pl.when(pl.col("Male") == 0).then(0).otherwise(None).alias("Test")

df = pl.read_csv("./data/advertising.csv").with_columns(
    [age_band, usage_band, income_band, test_flag]
)
df.head()

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad,Age Band,Daily Internet Usage Band,Area Income Band,Test
f64,i64,f64,f64,str,str,i64,str,str,i64,i64,f64,f64,i32
68.95,35,61833.9,256.09,"""Cloned 5thgene…","""Wrightburgh""",0,"""Tunisia""","""2016-03-27 00:…",0,30,12.0,12.0,0.0
80.23,31,68441.85,193.77,"""Monitored nati…","""West Jodi""",1,"""Nauru""","""2016-04-04 01:…",0,30,9.0,13.0,
69.47,26,59785.94,236.5,"""Organic bottom…","""Davidton""",0,"""San Marino""","""2016-03-13 20:…",0,20,11.0,11.0,0.0
74.15,29,54806.18,245.89,"""Triple-buffere…","""West Terrifurt…",1,"""Italy""","""2016-01-10 02:…",0,20,12.0,10.0,
68.37,35,73889.99,225.58,"""Robust logisti…","""South Manuel""",0,"""Iceland""","""2016-06-03 03:…",0,30,11.0,14.0,0.0


In [4]:
# Drop columns whose variance is below the threshold; var_removal prints the
# names of the columns it removes. The result replaces df, so every later
# cell works on the reduced frame.
# (Presumably `target` names a column that is exempt from removal -- confirm
# in eda.py.)
df = var_removal(
    df,
    threshold=0.5,
    target="Clicked on Ad",
)

The following columns are dropped because they have lower than 0.5 variance. ['Male', 'Test']
Removed a total of 2 columns.


Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Country,Timestamp,Clicked on Ad,Age Band,Daily Internet Usage Band,Area Income Band
f64,i64,f64,f64,str,str,str,str,i64,i64,f64,f64
68.95,35,61833.9,256.09,"""Cloned 5thgene…","""Wrightburgh""","""Tunisia""","""2016-03-27 00:…",0,30,12.0,12.0
80.23,31,68441.85,193.77,"""Monitored nati…","""West Jodi""","""Nauru""","""2016-04-04 01:…",0,30,9.0,13.0
69.47,26,59785.94,236.5,"""Organic bottom…","""Davidton""","""San Marino""","""2016-03-13 20:…",0,20,11.0,11.0
74.15,29,54806.18,245.89,"""Triple-buffere…","""West Terrifurt…","""Italy""","""2016-01-10 02:…",0,20,12.0,10.0
68.37,35,73889.99,225.58,"""Robust logisti…","""South Manuel""","""Iceland""","""2016-06-03 03:…",0,30,11.0,14.0
59.99,23,59761.56,226.74,"""Sharable clien…","""Jamieberg""","""Norway""","""2016-05-19 14:…",0,20,11.0,11.0
88.91,33,53852.85,208.36,"""Enhanced dedic…","""Brandonstad""","""Myanmar""","""2016-01-28 20:…",0,30,10.0,10.0
66.0,48,24593.33,131.76,"""Reactive local…","""Port Jefferybu…","""Australia""","""2016-03-07 01:…",1,40,6.0,4.0
74.53,30,68862.0,221.51,"""Configurable c…","""West Colin""","""Grenada""","""2016-04-18 09:…",0,30,11.0,13.0
69.88,20,55642.32,183.82,"""Mandatory homo…","""Ramirezton""","""Ghana""","""2016-07-11 01:…",0,20,9.0,11.0


In [None]:
# Run eda.constant_removal and keep its result as the new df.
# (Presumably drops columns that hold a single constant value -- confirm in eda.py.)
df = constant_removal(df)

In [None]:
# binary_transform returns two values: the frame (rebound to df) and `table`,
# which is inspected in the next cell.
df, table = binary_transform(df)
df 

In [None]:
# This is obviously a redundant transformation,
# so we may set exclude = ["Male", "Clicked on Ad"].
# NOTE(review): according to the var_removal output above, "Male" has already
# been dropped from df by the time this cell runs.
table 

In [None]:
# Categorical feature columns to score against the target.
# "Male" is intentionally absent: the var_removal step earlier in this notebook
# reports dropping it (variance below 0.5), so listing it here would reference
# a column that no longer exists in df on a top-to-bottom run.
cats = ["Age Band", "Country", "Area Income Band", "Daily Internet Usage Band"]
# Name of the label column used by the feature-scoring cells below.
target = "Clicked on Ad"

In [None]:
# Run eda.information_gain for the columns listed in `cats` against `target`.
# NOTE(review): every name in `cats` must still exist in df at this point.
information_gain(df, target, cat_cols=cats)

In [None]:
# Run eda.f_score on the whole frame. num_cols is left at its default here,
# whereas the timed call further down passes num_cols=nums explicitly.
f_score(df, target=target)

In [None]:
# Collect every column that is neither a string nor a struct, skipping the
# label column (`target` is defined in an earlier cell).
nums = [
    name
    for name, dtype in zip(df.columns, df.dtypes)
    if dtype != pl.Utf8 and dtype != pl.Struct and name != target
]

nums

In [None]:
from sklearn.feature_selection import f_classif

# Reference result: scikit-learn's ANOVA F-test on the same columns, for
# comparison with eda.f_score.
f, pv = f_classif(df[nums], df[target])
# nums, f and pv are three *columns* of the result table. State that
# explicitly so polars does not have to guess the orientation from the data's
# shape (the guess is ambiguous when len(nums) equals the number of schema
# entries).
pl.from_records(
    [nums, f, pv],
    schema=["feature", "f_score", "p_value"],
    orient="col",
)

In [None]:
# Build a larger benchmark frame by stacking the data on top of itself.
N_COPIES = 100
df_test = pl.concat([df.clone()] * N_COPIES)
df_test.shape

In [None]:
%%timeit 
# Time eda.f_score on the enlarged frame df_test, restricted to the columns in nums.
f_score(df_test, target="Clicked on Ad", num_cols=nums)

In [None]:
%%timeit 
f, pv = f_classif(df_test[nums], df_test[target])
pl.from_records([nums, f, pv], schema=["feature", "f_score", "p_value"])