In [1]:
import polars as pl
import numpy as np 
# import pandas as pd 
# from scipy.special import fdtrc
from typing import * 

# Testing text_data.py

In [2]:
from text_data import transform_text_data

In [3]:
# Three short sample sentences used as the toy corpus for the text-transform test.
line1, line2, line3 = (
    "She told me that she likes hiking.",
    "He told me that he doesn't like me.",
    "Hello World!",
)

In [4]:
# Build a three-row toy frame: one text column, one category label column,
# and a constant integer column that the text transform should pass through.
sample_texts = [line1, line2, line3]
sample_labels = ["A", "B", "C"]
df = pl.from_records(
    [sample_texts, sample_labels],
    schema=["text_col", "category"],
).with_columns([pl.lit(1).alias("Whatever Value")])
df

text_col,category,Whatever Value
str,str,i32
"""She told me th…","""A""",1
"""He told me tha…","""B""",1
"""Hello World!""","""C""",1


In [5]:
# Vectorize "text_col" into per-word count columns (named text_col::word::<stem>)
# and get back a memo that maps each stem to the original words that produced it.
# NOTE(review): max_df=1 is an int. If transform_text_data forwards min_df/max_df
# to a CountVectorizer-style API, an int is an absolute document count ("keep terms
# found in at most 1 document"), whereas 1.0 would mean "100% of documents".
# The result is consistent with the count reading (stems such as "like" and "told",
# which occur in two sentences, get no column) — confirm which was intended.
df_transformed, reverse_memo = transform_text_data(df, text_cols=["text_col"], min_df=0, max_df=1)

Perfoming stemming...
Performing Count vectorization for text_col...


In [6]:
# Show the transformed frame: the text column is replaced by word-count columns,
# while the non-text columns are kept.
df_transformed

category,Whatever Value,text_col::word::doesnt,text_col::word::hello,text_col::word::hike,text_col::word::world
str,i32,i64,i64,i64,i64
"""A""",1,0,0,1,0
"""B""",1,1,0,0,0
"""C""",1,0,1,0,1


In [7]:
# Show the stem -> original-words mapping returned alongside the transformed frame.
reverse_memo

{'she': ['she'],
 'told': ['told'],
 'me': ['me'],
 'that': ['that'],
 'like': ['likes', 'like'],
 'hike': ['hiking'],
 'he': ['he'],
 'doesnt': ['doesnt'],
 'hello': ['hello'],
 'world': ['world']}

# Test Other EDA Methods (eda.py) 

In [8]:
import polars as pl
from eda import describe, information_gain, constant_removal, binary_transform, f_score, var_removal, get_numeric_cols, percentile_binning

In [9]:
# Load the advertising data set and append the derived columns that the
# EDA helpers are exercised on below.
male_is_zero = pl.col("Male") == 0
derived_columns = [
    # Coarse bands of numeric features, usable as categorical inputs.
    ((pl.col("Age") // 10) * 10).alias("Age Band"),
    (pl.col("Daily Internet Usage") // 20).alias("Daily Internet Usage Band"),
    (pl.col("Area Income") // 5000).alias("Area Income Band"),
    # Synthetic test columns: a constant, a 0-or-null column, and a two-valued string column.
    pl.lit(1).alias("Test_Constant"),
    pl.when(male_is_zero).then(0).otherwise(None).alias("Test_BadColumn"),
    pl.when(male_is_zero).then("A").otherwise("B").alias("Test_Binary"),
]
df = pl.read_csv("./data/advertising.csv").with_columns(derived_columns)
target = "Clicked on Ad"
df.head()

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad,Age Band,Daily Internet Usage Band,Area Income Band,Test_Constant,Test_BadColumn,Test_Binary
f64,i64,f64,f64,str,str,i64,str,str,i64,i64,f64,f64,i32,i32,str
68.95,35,61833.9,256.09,"""Cloned 5thgene…","""Wrightburgh""",0,"""Tunisia""","""2016-03-27 00:…",0,30,12.0,12.0,1,0.0,"""A"""
80.23,31,68441.85,193.77,"""Monitored nati…","""West Jodi""",1,"""Nauru""","""2016-04-04 01:…",0,30,9.0,13.0,1,,"""B"""
69.47,26,59785.94,236.5,"""Organic bottom…","""Davidton""",0,"""San Marino""","""2016-03-13 20:…",0,20,11.0,11.0,1,0.0,"""A"""
74.15,29,54806.18,245.89,"""Triple-buffere…","""West Terrifurt…",1,"""Italy""","""2016-01-10 02:…",0,20,12.0,10.0,1,,"""B"""
68.37,35,73889.99,225.58,"""Robust logisti…","""South Manuel""",0,"""Iceland""","""2016-06-03 03:…",0,30,11.0,14.0,1,0.0,"""A"""


In [10]:
# Per-column summary: count, null count, mean, std, min/max, median, quartiles and null percentage.
describe(df)

column,count,null_count,mean,std,min,max,median,25%,75%,null_pct
str,f64,f64,f64,f64,str,str,f64,f64,f64,f64
"""Daily Time Spe…",1000.0,0.0,65.0002,15.853615,"""32.6""","""91.43""",68.215,51.38,78.57,0.0
"""Age""",1000.0,0.0,36.009,8.785562,"""19.0""","""61.0""",35.0,29.0,42.0,0.0
"""Area Income""",1000.0,0.0,55000.00008,13414.634022,"""13996.5""","""79484.8""",57012.3,47051.02,65496.78,0.0
"""Daily Internet…",1000.0,0.0,180.0001,43.902339,"""104.78""","""269.96""",183.13,138.87,218.8,0.0
"""Ad Topic Line""",1000.0,0.0,,,"""Adaptive 24hou…","""Visionary reci…",,,,0.0
"""City""",1000.0,0.0,,,"""Adamsbury""","""Zacharyton""",,,,0.0
"""Male""",1000.0,0.0,0.481,0.499889,"""0.0""","""1.0""",0.0,0.0,1.0,0.0
"""Country""",1000.0,0.0,,,"""Afghanistan""","""Zimbabwe""",,,,0.0
"""Timestamp""",1000.0,0.0,,,"""2016-01-01 02:…","""2016-07-24 00:…",,,,0.0
"""Clicked on Ad""",1000.0,0.0,0.5,0.50025,"""0.0""","""1.0""",0.5,0.0,1.0,0.0


In [11]:
# Drop numeric columns whose variance is below 0.5. The target is passed in and is
# still present afterwards even though a 0/1 column has low variance, so it appears
# to be exempt from removal — confirm in eda.py.
# NOTE: this rebinds `df`; the dropped columns can only be recovered by re-running the load cell.
df = var_removal(df, threshold=0.5, target=target)

The following numeric columns are dropped because they have lower than 0.5 variance. ['Male', 'Test_Constant', 'Test_BadColumn']
Removed a total of 3 columns.


In [12]:
# Drop constant columns. Nothing is removed at this point because Test_Constant
# was already dropped by the variance filter in the previous cell.
df = constant_removal(df)

The following columns are dropped because they are constants. [].
Removed a total of 0 columns.


In [13]:
# Encode two-valued columns as integers, leaving the target untouched.
# Here only Test_Binary is transformed ("A"/"B" -> 0/1, stored as u8).
# `table` is the second return value — presumably the value-to-code mapping; verify in eda.py.
binary_result = binary_transform(df, exclude=[target])
df, table = binary_result
df

Transforming Test_Binary...


Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Country,Timestamp,Clicked on Ad,Age Band,Daily Internet Usage Band,Area Income Band,Test_Binary
f64,i64,f64,f64,str,str,str,str,i64,i64,f64,f64,u8
68.95,35,61833.9,256.09,"""Cloned 5thgene…","""Wrightburgh""","""Tunisia""","""2016-03-27 00:…",0,30,12.0,12.0,0
80.23,31,68441.85,193.77,"""Monitored nati…","""West Jodi""","""Nauru""","""2016-04-04 01:…",0,30,9.0,13.0,1
69.47,26,59785.94,236.5,"""Organic bottom…","""Davidton""","""San Marino""","""2016-03-13 20:…",0,20,11.0,11.0,0
74.15,29,54806.18,245.89,"""Triple-buffere…","""West Terrifurt…","""Italy""","""2016-01-10 02:…",0,20,12.0,10.0,1
68.37,35,73889.99,225.58,"""Robust logisti…","""South Manuel""","""Iceland""","""2016-06-03 03:…",0,30,11.0,14.0,0
59.99,23,59761.56,226.74,"""Sharable clien…","""Jamieberg""","""Norway""","""2016-05-19 14:…",0,20,11.0,11.0,1
88.91,33,53852.85,208.36,"""Enhanced dedic…","""Brandonstad""","""Myanmar""","""2016-01-28 20:…",0,30,10.0,10.0,0
66.0,48,24593.33,131.76,"""Reactive local…","""Port Jefferybu…","""Australia""","""2016-03-07 01:…",1,40,6.0,4.0,1
74.53,30,68862.0,221.51,"""Configurable c…","""West Colin""","""Grenada""","""2016-04-18 09:…",0,30,11.0,13.0,1
69.88,20,55642.32,183.82,"""Mandatory homo…","""Ramirezton""","""Ghana""","""2016-07-11 01:…",0,20,9.0,11.0,1


In [14]:
# Name of the label column used by the feature-scoring helpers.
target = "Clicked on Ad"

# Columns fed to the information gain algorithm as categorical features.
# Some of them are numeric, but they can still be treated as categories.
cats = [
    "Age Band",
    "Country",
    "Area Income Band",
    "Daily Internet Usage Band",
    "Test_Binary",
]

In [15]:
# Information gain of each listed column with respect to the target.
# The displayed result is ordered by descending gain. The target entropy of
# 0.693147 equals ln(2), which is consistent with natural-log entropy of a
# balanced binary target.
information_gain(df, target, cat_cols=cats)

Finished processing for Age Band. Progress: 1/5
Finished processing for Country. Progress: 2/5
Finished processing for Area Income Band. Progress: 3/5
Finished processing for Daily Internet Usage Band. Progress: 4/5
Finished processing for Test_Binary. Progress: 5/5


Predictive Variable,Conditional Entropy,Target Entropy,Information Gain
str,f64,f64,f64
"""Daily Internet…",0.263754,0.693147,0.429394
"""Area Income Ba…",0.551848,0.693147,0.141299
"""Country""",0.558555,0.693147,0.134592
"""Age Band""",0.569156,0.693147,0.123991
"""Test_Binary""",0.692424,0.693147,0.000723


In [16]:
# If cat_cols is not given, string columns are automatically inferred as categorical columns.
# NOTE(review): Ad Topic Line and Timestamp reach a gain equal to the full target entropy
# (conditional entropy 0), and City is close behind. This likely reflects near-unique values
# per row rather than real predictive power — interpret high-cardinality columns with care.
information_gain(df, target)

Finished processing for Ad Topic Line. Progress: 1/4
Finished processing for City. Progress: 2/4
Finished processing for Country. Progress: 3/4
Finished processing for Timestamp. Progress: 4/4


Predictive Variable,Conditional Entropy,Target Entropy,Information Gain
str,f64,f64,f64
"""Ad Topic Line""",0.0,0.693147,0.693147
"""Timestamp""",0.0,0.693147,0.693147
"""City""",0.017682,0.693147,0.675465
"""Country""",0.558555,0.693147,0.134592


In [17]:
# Inspect the distinct ad topic lines, to help judge whether the maximal information
# gain reported above is just an effect of (near-)unique values.
df["Ad Topic Line"].unique() 

Ad Topic Line
str
"""Down-sized wel…"
"""Optional multi…"
"""Future-proofed…"
"""Synergistic re…"
"""Profound maxim…"
"""Managed 6thgen…"
"""Fundamental ze…"
"""Seamless intan…"
"""Exclusive disi…"
"""Adaptive unifo…"


In [18]:
# F-score and p-value of every numeric feature against the target
# (cross-checked against scikit-learn's f_classif a few cells below).
f_score(df, target=target)

feature,f_score,p_value
str,f64,f64
"""Daily Time Spe…",1268.525161,5.8774e-180
"""Age""",319.643165,3.1298e-62
"""Area Income""",292.770617,9.4496e-58
"""Daily Internet…",1618.976456,3.8932e-211
"""Age Band""",269.875798,7.370400000000001e-54
"""Daily Internet…",1587.381637,1.6759000000000002e-208
"""Area Income Ba…",289.515369,3.3458000000000004e-57
"""Test_Binary""",1.445286,0.229571


In [19]:
# Names of the numeric columns, with the target left out; reused by the cells below.
nums = get_numeric_cols(df, exclude=[target])
nums  

['Daily Time Spent on Site',
 'Age',
 'Area Income',
 'Daily Internet Usage',
 'Age Band',
 'Daily Internet Usage Band',
 'Area Income Band',
 'Test_Binary']

In [20]:
from sklearn.feature_selection import f_classif

# Reference computation with scikit-learn: the resulting table should match
# the output of the project's f_score on the same columns.
f_stats, p_values = f_classif(df[nums], df[target])
pl.from_records(
    [nums, f_stats, p_values],
    schema=["feature", "f_score", "p_value"],
)

feature,f_score,p_value
str,f64,f64
"""Daily Time Spe…",1268.525161,5.8774e-180
"""Age""",319.643165,3.1298e-62
"""Area Income""",292.770617,9.4496e-58
"""Daily Internet…",1618.976456,3.8932e-211
"""Age Band""",269.875798,7.370400000000001e-54
"""Daily Internet…",1587.381637,1.6759000000000002e-208
"""Area Income Ba…",289.515369,3.3458000000000004e-57
"""Test_Binary""",1.445286,0.229571


In [21]:
# Stack 100 copies of the frame to get a larger input for the timing comparison.
replicas = [df.clone()] * 100
df_test = pl.concat(replicas)
df_test.shape

(100000, 13)

In [22]:
%%timeit 
f_score(df_test, target="Clicked on Ad", num_cols=nums)

1.79 ms ± 38.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [23]:
%%timeit 
# Baseline timing: scikit-learn's f_classif on the same replicated frame, including
# the construction of an equivalent result table so both benchmarks do comparable work.
f, pv = f_classif(df_test[nums], df_test[target])
pl.from_records([nums, f, pv], schema=["feature", "f_score", "p_value"])

11.4 ms ± 93.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [24]:
# Replace each numeric column with its percentile bin (columns get a "_percentile" suffix)
# and also get a reference table describing each bin (feature, percentile, min, max, cnt).
# NOTE(review): `nums` was already built with the target excluded, so exclude=[target]
# looks redundant here — confirm how percentile_binning combines num_cols and exclude.
transformed_df, ref = percentile_binning(df, num_cols=nums, exclude=[target])

In [25]:
# Show the binned frame; non-numeric columns and the target are left unchanged.
transformed_df 

Daily Time Spent on Site_percentile,Age_percentile,Area Income_percentile,Daily Internet Usage_percentile,Ad Topic Line,City,Country,Timestamp,Clicked on Ad,Age Band_percentile,Daily Internet Usage Band_percentile,Area Income Band_percentile,Test_Binary_percentile
u8,u8,u8,u8,str,str,str,str,i64,u8,u8,u8,u8
52,53,65,99,"""Cloned 5thgene…","""Wrightburgh""","""Tunisia""","""2016-03-27 00:…",0,68,100,74,52
81,37,86,56,"""Monitored nati…","""West Jodi""","""Nauru""","""2016-04-04 01:…",0,68,61,89,100
53,14,58,91,"""Organic bottom…","""Davidton""","""San Marino""","""2016-03-13 20:…",0,27,93,58,52
63,27,44,95,"""Triple-buffere…","""West Terrifurt…","""Italy""","""2016-01-10 02:…",0,27,100,45,100
51,53,96,82,"""Robust logisti…","""South Manuel""","""Iceland""","""2016-06-03 03:…",0,68,93,97,52
37,5,58,83,"""Sharable clien…","""Jamieberg""","""Norway""","""2016-05-19 14:…",0,27,93,58,100
99,45,43,68,"""Enhanced dedic…","""Brandonstad""","""Myanmar""","""2016-01-28 20:…",0,68,77,45,52
46,89,3,20,"""Reactive local…","""Port Jefferybu…","""Australia""","""2016-03-07 01:…",1,91,26,3,100
64,31,87,77,"""Configurable c…","""West Colin""","""Grenada""","""2016-04-18 09:…",0,68,93,89,100
54,2,46,51,"""Mandatory homo…","""Ramirezton""","""Ghana""","""2016-07-11 01:…",0,27,61,58,100


In [26]:
# Look up the bin boundaries for a single feature: each row gives the value range
# (min, max) and row count (cnt) that map to one percentile of "Age".
ref.filter(pl.col("feature") == "Age")

feature,percentile,min,max,cnt
str,u8,f64,f64,f64
"""Age""",1,19.0,19.0,19.0
"""Age""",2,20.0,21.0,41.0
"""Age""",4,22.0,22.0,22.0
"""Age""",5,23.0,23.0,23.0
"""Age""",8,24.0,24.0,24.0
"""Age""",10,25.0,25.0,25.0
"""Age""",14,26.0,26.0,26.0
"""Age""",17,27.0,27.0,27.0
"""Age""",22,28.0,28.0,28.0
"""Age""",27,29.0,29.0,29.0
