In [14]:
import polars as pl
import pandas as pd 

# Testing text_data.py

In [2]:
from text_data import transform_text_data

In [3]:
# Three short sentences used as the text corpus for the transform test.
# The first two share several words; the third shares none with the others.
line1, line2, line3 = (
    "She told me that she likes hiking.",
    "He told me that he doesn't like me.",
    "Hello World!",
)

In [4]:
# Build the three-row demo frame. Columns are given by name so the layout does
# not rely on polars inferring the orientation of nested lists (a 2x2 sample
# would be ambiguous between row- and column-oriented data).
df = pl.DataFrame(
    {
        "text_col": [line1, line2, line3],
        "category": ["A", "B", "C"],
    }
).with_columns([
    # Constant non-text column, used to check that other columns pass through.
    pl.lit(1).alias("Whatever Value")
])
df

text_col,category,Whatever Value
str,str,i32
"""She told me th…","""A""",1
"""He told me tha…","""B""",1
"""Hello World!""","""C""",1


In [5]:
# NOTE(review): if min_df / max_df are forwarded to a scikit-learn style count
# vectorizer, the integer max_df=1 means "at most one document", not 100% --
# confirm this is intended (tokens shared by two lines are absent from the output).
df_transformed, reverse_memo = transform_text_data(
    df,
    text_cols=["text_col"],
    min_df=0,
    max_df=1,
)

Perfoming stemming...
Performing Count vectorization for text_col...


In [6]:
# Show the result: text_col is replaced by per-token count columns
# (named text_col::word::<token>); the remaining columns are kept.
df_transformed

category,Whatever Value,text_col::word::doesnt,text_col::word::hello,text_col::word::hike,text_col::word::world
str,i32,i64,i64,i64,i64
"""A""",1,0,0,1,0
"""B""",1,1,0,0,0
"""C""",1,0,1,0,1


In [7]:
# Show the mapping from each stem to the original word forms that produced it.
reverse_memo

{'she': ['she'],
 'told': ['told'],
 'me': ['me'],
 'that': ['that'],
 'like': ['likes', 'like'],
 'hike': ['hiking'],
 'he': ['he'],
 'doesnt': ['doesnt'],
 'hello': ['hello'],
 'world': ['world']}

# Testing eda.py

In [8]:
from eda import information_gain, constant_removal, binary_transform

In [15]:
# "Test" is 0 where Male == 0 and null everywhere else, so it never takes a
# second distinct value -- a fixture column for constant_removal.
test_flag = pl.when(pl.col("Male") == 0).then(0).otherwise(None)

df = pl.read_csv("./data/advertising.csv").with_columns([test_flag.alias("Test")])
df.head()

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad,Test
f64,i64,f64,f64,str,str,i64,str,str,i64,i32
68.95,35,61833.9,256.09,"""Cloned 5thgene…","""Wrightburgh""",0,"""Tunisia""","""2016-03-27 00:…",0,0.0
80.23,31,68441.85,193.77,"""Monitored nati…","""West Jodi""",1,"""Nauru""","""2016-04-04 01:…",0,
69.47,26,59785.94,236.5,"""Organic bottom…","""Davidton""",0,"""San Marino""","""2016-03-13 20:…",0,0.0
74.15,29,54806.18,245.89,"""Triple-buffere…","""West Terrifurt…",1,"""Italy""","""2016-01-10 02:…",0,
68.37,35,73889.99,225.58,"""Robust logisti…","""South Manuel""",0,"""Iceland""","""2016-06-03 03:…",0,0.0


In [10]:
# Drop constant columns. df is rebound here, so re-run the load cell first
# if this cell needs to be repeated against the original frame.
df = constant_removal(df)

The following columns are dropped because they are constants. ['Test'].
Removed a total of 1 columns.


In [12]:
# binary_transform returns the transformed frame plus a table describing the
# mapping applied to each binary column (the table is displayed in the next cell).
df, table = binary_transform(df)
df 

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
f64,i64,f64,f64,str,str,i32,str,str,i32
68.95,35,61833.9,256.09,"""Cloned 5thgene…","""Wrightburgh""",0,"""Tunisia""","""2016-03-27 00:…",0
80.23,31,68441.85,193.77,"""Monitored nati…","""West Jodi""",1,"""Nauru""","""2016-04-04 01:…",0
69.47,26,59785.94,236.5,"""Organic bottom…","""Davidton""",0,"""San Marino""","""2016-03-13 20:…",0
74.15,29,54806.18,245.89,"""Triple-buffere…","""West Terrifurt…",1,"""Italy""","""2016-01-10 02:…",0
68.37,35,73889.99,225.58,"""Robust logisti…","""South Manuel""",0,"""Iceland""","""2016-06-03 03:…",0
59.99,23,59761.56,226.74,"""Sharable clien…","""Jamieberg""",1,"""Norway""","""2016-05-19 14:…",0
88.91,33,53852.85,208.36,"""Enhanced dedic…","""Brandonstad""",0,"""Myanmar""","""2016-01-28 20:…",0
66.0,48,24593.33,131.76,"""Reactive local…","""Port Jefferybu…",1,"""Australia""","""2016-03-07 01:…",1
74.53,30,68862.0,221.51,"""Configurable c…","""West Colin""",1,"""Grenada""","""2016-04-18 09:…",0
69.88,20,55642.32,183.82,"""Mandatory homo…","""Ramirezton""",1,"""Ghana""","""2016-07-11 01:…",0


In [13]:
# Both columns were already 0/1, so the mapping below (to_0 = "0", to_1 = "1")
# is a redundant transformation.
# NOTE(review): presumably binary_transform accepts something like
# exclude=["Male", "Clicked on Ad"] to skip them -- confirm the parameter name.
table 

feature,to_0,to_1,dtype
str,str,str,str
"""Male""","""0""","""1""","""numeric"""
"""Clicked on Ad""","""0""","""1""","""numeric"""


In [None]:
# Bucket the continuous features into bands:
#   - Age is floored to its decade (e.g. 35 -> 30);
#   - Daily Internet Usage and Area Income become the index of their
#     20-wide / 5000-wide bucket.
band_exprs = [
    ((pl.col("Age") // 10) * 10).alias("Age Band"),
    (pl.col("Daily Internet Usage") // 20).alias("Daily Internet Usage Band"),
    (pl.col("Area Income") // 5000).alias("Area Income Band"),
]

df = pl.read_csv("./data/advertising.csv").with_columns(band_exprs)
df.head()


In [None]:
# Number of distinct values in every column, as a one-row frame.
df.select(pl.all().n_unique())

In [None]:
# Column to predict.
target = "Clicked on Ad"

# Categorical (or banded) feature columns to evaluate against the target.
cats = [
    "Age Band",
    "Male",
    "Country",
    "Area Income Band",
    "Daily Internet Usage Band",
]

In [None]:
# Evaluate the categorical columns in `cats` against `target`.
# NOTE(review): presumably returns one information-gain entry per column --
# no output was captured for this cell; confirm.
information_gain(df, target, cat_cols=cats)