In [1]:
import polars as pl

# Testing text_data.py

In [2]:
from text_data import transform_text_data

In [3]:
# Two short sentences used as a miniature corpus for the text pipeline.
# They share most words; "likes"/"like" differ only by inflection.
line1, line2 = (
    "She told me that she likes hiking.",
    "He told me that he doesn't like me.",
)

In [4]:
# Build a two-row frame with one free-text column and one categorical column,
# then append a constant integer column.
#
# The nested list is column-wise (first inner list = text_col values, second =
# category values). Because it is 2 x 2 and the schema also has 2 names, the
# orientation cannot be told from the shape alone, so it is stated explicitly
# instead of being left to polars' inference.
df = pl.from_records(
    [[line1, line2], ["A", "B"]],
    schema=["text_col", "category"],
    orient="col",
).with_columns([
    pl.lit(1).alias("Whatever Value")
])
df

text_col,category,Whatever Value
str,str,i32
"""She told me th...","""A""",1
"""He told me tha...","""B""",1


In [5]:
# Run the text pipeline: `category` is passed as a one-hot column and
# `text_col` as a text column; the call returns two values.
# NOTE(review): min_df=0 and max_df=1 are ints. If transform_text_data forwards
# them to a CountVectorizer-style vectorizer, an int max_df is an absolute
# document count ("at most 1 document"), not a proportion (1.0) — confirm
# which one is intended.
df_transformed, reverse_memo = transform_text_data(
    df,
    one_hot_cols=["category"],
    text_cols=["text_col"],
    min_df=0,
    max_df=1,
)

Performing one-hot encoding and basic cleaning...
Perfoming stemming...
Performing Count vectorization for text_col...


In [6]:
# Display the first value returned by transform_text_data.
df_transformed

Whatever Value,category::A,category::B,text_col::word::doesnt,text_col::word::hike
i32,u8,u8,i64,i64
1,1,0,0,1
1,0,1,1,0


In [7]:
# Display the second value returned by transform_text_data. In the saved
# output it is a dict of stem -> list of original word forms
# (e.g. 'like' -> ['likes', 'like']) — confirm the exact contract in text_data.py.
reverse_memo

{'she': ['she'],
 'told': ['told'],
 'me': ['me'],
 'that': ['that'],
 'like': ['likes', 'like'],
 'hike': ['hiking'],
 'he': ['he'],
 'doesnt': ['doesnt']}

# Testing eda.py

In [8]:
from eda import information_gain

In [9]:
# Derived "band" columns that bucket continuous features into coarse groups.
# Age is floored to the decade (35 -> 30); the other two are bucket indices
# (value // width), so they keep the float dtype of their source columns.
age_band = ((pl.col("Age") // 10) * 10).alias("Age Band")
usage_band = (pl.col("Daily Internet Usage") // 20).alias("Daily Internet Usage Band")
income_band = (pl.col("Area Income") // 5000).alias("Area Income Band")

# Load the advertising dataset and append the three band columns in order.
df = pl.read_csv("./data/advertising.csv").with_columns(
    [age_band, usage_band, income_band]
)
df.head()


Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad,Age Band,Daily Internet Usage Band,Area Income Band
f64,i64,f64,f64,str,str,i64,str,str,i64,i64,f64,f64
68.95,35,61833.9,256.09,"""Cloned 5thgene...","""Wrightburgh""",0,"""Tunisia""","""2016-03-27 00:...",0,30,12.0,12.0
80.23,31,68441.85,193.77,"""Monitored nati...","""West Jodi""",1,"""Nauru""","""2016-04-04 01:...",0,30,9.0,13.0
69.47,26,59785.94,236.5,"""Organic bottom...","""Davidton""",0,"""San Marino""","""2016-03-13 20:...",0,20,11.0,11.0
74.15,29,54806.18,245.89,"""Triple-buffere...","""West Terrifurt...",1,"""Italy""","""2016-01-10 02:...",0,20,12.0,10.0
68.37,35,73889.99,225.58,"""Robust logisti...","""South Manuel""",0,"""Iceland""","""2016-06-03 03:...",0,30,11.0,14.0


In [10]:
# Number of distinct values in every column, as a one-row frame.
df.select(pl.all().n_unique())

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad,Age Band,Daily Internet Usage Band,Area Income Band
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
900,43,1000,966,1000,969,2,237,1000,2,6,9,14


In [11]:
# Label column, and the categorical predictor columns to evaluate against it.
target = "Clicked on Ad"
cats = [
    "Age Band",
    "Male",
    "Country",
    "Area Income Band",
    "Daily Internet Usage Band",
]

In [12]:
# Compute information gain of each column in `cats` with respect to `target`.
# NOTE(review): in the saved output the result rows are not in the same order
# as `cats` (the last two are swapped), so don't rely on row order — sort or
# filter by "Predictive Variable" if order matters.
information_gain(df, target, cat_cols=cats)

Finished processing for Age Band. 1/5
Finished processing for Male. 2/5
Finished processing for Country. 3/5
Finished processing for Area Income Band. 4/5
Finished processing for Daily Internet Usage Band. 5/5


Predictive Variable,Conditional Entropy,Target Entropy,Information Gain
str,f64,f64,f64
"""Age Band""",0.569156,0.693147,0.123991
"""Male""",0.692424,0.693147,0.000723
"""Country""",0.558555,0.693147,0.134592
"""Daily Internet...",0.263754,0.693147,0.429394
"""Area Income Ba...",0.551848,0.693147,0.141299
