In [1]:
import polars as pl
import numpy as np 
from typing import * 

# Testing text_data.py

In [2]:
from text_data import transform_text_data

In [3]:
# Tiny three-sentence corpus for the text-transform test. The first two
# sentences share several words ("told", "me", "that") and use different
# inflections of the same verb ("likes" / "like").
line1, line2, line3 = (
    "She told me that she likes hiking.",
    "He told me that he doesn't like me.",
    "Hello World!",
)

In [4]:
# Assemble the toy frame column-wise: the three sentences become "text_col",
# paired with a string label column, plus a constant integer column.
df = (
    pl.from_records(
        [[line1, line2, line3], ["A", "B", "C"]],
        schema=["text_col", "category"],
    )
    .with_columns([pl.lit(1).alias("Whatever Value")])
)
df

text_col,category,Whatever Value
str,str,i32
"""She told me th…","""A""",1
"""He told me tha…","""B""",1
"""Hello World!""","""C""",1


In [5]:
# Vectorize "text_col". Returns the transformed frame plus a dict keyed by stem
# that lists the original word forms seen for that stem (both displayed below).
# NOTE(review): min_df / max_df look like document-frequency bounds forwarded to
# the count vectorizer -- confirm their int-vs-float semantics in text_data.py.
df_transformed, reverse_memo = transform_text_data(df, text_cols=["text_col"], min_df=0, max_df=1)

Perfoming stemming...
Performing Count vectorization for text_col...


In [6]:
# Inspect the result: "text_col" is replaced by one integer count column per
# retained stem, named text_col::word::<stem>; the other columns are kept.
df_transformed

category,Whatever Value,text_col::word::doesnt,text_col::word::hello,text_col::word::hike,text_col::word::world
str,i32,i64,i64,i64,i64
"""A""",1,0,0,1,0
"""B""",1,1,0,0,0
"""C""",1,0,1,0,1


In [7]:
# Inspect the stem -> original word forms mapping
# (e.g. 'like' collects both 'likes' and 'like').
reverse_memo

{'she': ['she'],
 'told': ['told'],
 'me': ['me'],
 'that': ['that'],
 'like': ['likes', 'like'],
 'hike': ['hiking'],
 'he': ['he'],
 'doesnt': ['doesnt'],
 'hello': ['hello'],
 'world': ['world']}

# Test Other EDA Methods (eda_utils.py)

In [8]:
import polars as pl
from eda_utils import *

In [9]:
# Load the advertising data and add helper columns that the cells below use to
# exercise the individual EDA helpers.
df = pl.read_csv("../data/advertising.csv").with_columns([
    # Age floored to its decade (e.g. 35 -> 30).
    ((pl.col("Age") // 10) * 10).alias("Age Band"),
    # Coarse bucket indices of width 20 and 5000 (the index, not a rescaled value).
    (pl.col("Daily Internet Usage") // 20).alias("Daily Internet Usage Band"),
    (pl.col("Area Income") // 5000).alias("Area Income Band"),
    # Constant numeric / string columns: removed later by var_removal and
    # constant_removal respectively (see their printed logs).
    pl.lit(1).alias("Test_Constant"),
    pl.lit("SSS").alias("Test_Str_Constant"),
    # 0 where Male == 0, null everywhere else: one distinct value plus nulls.
    pl.when(pl.col("Male") == 0).then(0).otherwise(None).alias("Test_BadColumn"),
    # Two-valued string column, later turned into 0/1 by binary_encode.
    pl.when(pl.col("Male") == 0).then("A").otherwise("B").alias("Test_Binary"),
])
# Name of the label column passed to the helpers below.
target = "Clicked on Ad"
df.head() 

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad,One_Hot_Test,Age Band,Daily Internet Usage Band,Area Income Band,Test_Constant,Test_Str_Constant,Test_BadColumn,Test_Binary
f64,i64,f64,f64,str,str,i64,str,str,i64,str,i64,f64,f64,i32,str,i32,str
68.95,35,61833.9,256.09,"""Cloned 5thgene…","""Wrightburgh""",0,"""Tunisia""","""3/27/2016 0:53…",0,"""A""",30,12.0,12.0,1,"""SSS""",0.0,"""A"""
80.23,31,68441.85,193.77,"""Monitored nati…","""West Jodi""",1,"""Nauru""","""4/4/2016 1:39""",0,"""B""",30,9.0,13.0,1,"""SSS""",,"""B"""
69.47,26,59785.94,236.5,"""Organic bottom…","""Davidton""",0,"""San Marino""","""3/13/2016 20:3…",0,"""A""",20,11.0,11.0,1,"""SSS""",0.0,"""A"""
74.15,29,54806.18,245.89,"""Triple-buffere…","""West Terrifurt…",1,"""Italy""","""1/10/2016 2:31…",0,"""B""",20,12.0,10.0,1,"""SSS""",,"""B"""
68.37,35,73889.99,225.58,"""Robust logisti…","""South Manuel""",0,"""Iceland""","""6/3/2016 3:36""",0,"""A""",30,11.0,14.0,1,"""SSS""",0.0,"""A"""


In [10]:
# Per-column summary: counts, null share, uniqueness, mean/std, min/max and quartiles.
describe(df)

column,count,null_count,null_pct,n_unique,unique_pct,mean,std,min,max,median,25%,75%
str,f64,f64,f64,u32,f64,f64,f64,str,str,f64,f64,f64
"""Daily Time Spe…",1000.0,0.0,0.0,900,0.9,65.0002,15.853615,"""32.6""","""91.43""",68.215,51.38,78.57
"""Age""",1000.0,0.0,0.0,43,0.043,36.009,8.785562,"""19.0""","""61.0""",35.0,29.0,42.0
"""Area Income""",1000.0,0.0,0.0,1000,1.0,55000.00008,13414.634022,"""13996.5""","""79484.8""",57012.3,47051.02,65496.78
"""Daily Internet…",1000.0,0.0,0.0,966,0.966,180.0001,43.902339,"""104.78""","""269.96""",183.13,138.87,218.8
"""Ad Topic Line""",1000.0,0.0,0.0,1000,1.0,,,"""Adaptive 24hou…","""Visionary reci…",,,
"""City""",1000.0,0.0,0.0,969,0.969,,,"""Adamsbury""","""Zacharyton""",,,
"""Male""",1000.0,0.0,0.0,2,0.002,0.481,0.499889,"""0.0""","""1.0""",0.0,0.0,1.0
"""Country""",1000.0,0.0,0.0,237,0.237,,,"""Afghanistan""","""Zimbabwe""",,,
"""Timestamp""",1000.0,0.0,0.0,997,0.997,,,"""1/1/2016 15:14…","""7/9/2016 16:23…",,,
"""Clicked on Ad""",1000.0,0.0,0.0,2,0.002,0.5,0.50025,"""0.0""","""1.0""",0.5,0.0,1.0


In [11]:
# Drop numeric columns whose variance is below the threshold; df is reassigned.
# The target survives even though its variance is about 0.25 (std ~0.5 in the
# describe() output above), which is consistent with the column passed as
# `target` being exempt -- confirm in eda_utils.
df = var_removal(df, threshold=0.5, target=target)

The following numeric columns are dropped because they have lower than 0.5 variance. ['Male', 'Test_Constant', 'Test_BadColumn']
Removed a total of 3 columns.


In [12]:
# Drop columns that hold a single constant value (here: Test_Str_Constant); df is reassigned.
df = constant_removal(df)

The following columns are dropped because they are constants. ['Test_Str_Constant'].
Removed a total of 1 columns.


In [13]:
# Encode binary columns as 0/1, skipping the target. Per the log only
# Test_Binary is transformed. The result object exposes `.transformed`
# (the new frame) and `.mapping` (the value lookup), both used below.
bin_result = binary_encode(df, exclude = [target])

Transforming Test_Binary into a binary column with [0, 1] ...


In [14]:
# Preview the encoded frame: Test_Binary is now a u8 column of 0s and 1s.
bin_result.transformed.head() 

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Country,Timestamp,Clicked on Ad,One_Hot_Test,Age Band,Daily Internet Usage Band,Area Income Band,Test_Binary
f64,i64,f64,f64,str,str,str,str,i64,str,i64,f64,f64,u8
68.95,35,61833.9,256.09,"""Cloned 5thgene…","""Wrightburgh""","""Tunisia""","""3/27/2016 0:53…",0,"""A""",30,12.0,12.0,0
80.23,31,68441.85,193.77,"""Monitored nati…","""West Jodi""","""Nauru""","""4/4/2016 1:39""",0,"""B""",30,9.0,13.0,1
69.47,26,59785.94,236.5,"""Organic bottom…","""Davidton""","""San Marino""","""3/13/2016 20:3…",0,"""A""",20,11.0,11.0,0
74.15,29,54806.18,245.89,"""Triple-buffere…","""West Terrifurt…","""Italy""","""1/10/2016 2:31…",0,"""B""",20,12.0,10.0,1
68.37,35,73889.99,225.58,"""Robust logisti…","""South Manuel""","""Iceland""","""6/3/2016 3:36""",0,"""A""",30,11.0,14.0,0


In [15]:
# Adopt the encoded frame as df, then show which original value was mapped
# to 0 and which to 1 for each encoded feature.
df = bin_result.transformed
bin_result.mapping 

feature,to_0,to_1,dtype
str,str,str,str
"""Test_Binary""","""A""","""B""","""string"""


In [16]:
# Columns to treat as categorical for the information-gain test. Several of
# them are numeric bands, which is fine: the algorithm only needs discrete values.
cats = [
    "Age Band",
    "Country",
    "Area Income Band",
    "Daily Internet Usage Band",
    "Test_Binary",
    "One_Hot_Test",
]
# Label column.
target = "Clicked on Ad"

In [17]:
# Information gain of each listed categorical column with respect to the target.
# In the output, weighted_information_gain equals information_gain * (1 - unique_pct).
information_gain(df, target, cat_cols=cats)

Finished processing for Age Band. Progress: 1/6
Finished processing for Country. Progress: 2/6
Finished processing for Area Income Band. Progress: 3/6
Finished processing for Daily Internet Usage Band. Progress: 4/6
Finished processing for Test_Binary. Progress: 5/6
Finished processing for One_Hot_Test. Progress: 6/6


feature,target_entropy,conditional_entropy,unique_pct,information_gain,weighted_information_gain
str,f64,f64,f64,f64,f64
"""Daily Internet…",0.693147,0.263754,0.009,0.429394,0.425529
"""Area Income Ba…",0.693147,0.551848,0.014,0.141299,0.139321
"""Country""",0.693147,0.558555,0.237,0.134592,0.102694
"""Age Band""",0.693147,0.569156,0.006,0.123991,0.123247
"""One_Hot_Test""",0.693147,0.692173,0.003,0.000975,0.000972
"""Test_Binary""",0.693147,0.692424,0.002,0.000723,0.000722


In [18]:
# With no cat_cols given, the string columns are inferred as the categorical columns.
# Near-unique columns (Ad Topic Line, Timestamp, City) get a high raw gain, but the
# weighted gain -- which in this output equals information_gain * (1 - unique_pct) --
# discounts them heavily.
information_gain(df, target)

Finished processing for Ad Topic Line. Progress: 1/5
Finished processing for City. Progress: 2/5
Finished processing for Country. Progress: 3/5
Finished processing for Timestamp. Progress: 4/5
Finished processing for One_Hot_Test. Progress: 5/5


feature,target_entropy,conditional_entropy,unique_pct,information_gain,weighted_information_gain
str,f64,f64,f64,f64,f64
"""Ad Topic Line""",0.693147,0.0,1.0,0.693147,0.0
"""Timestamp""",0.693147,0.001386,0.997,0.691761,0.002075
"""City""",0.693147,0.017682,0.969,0.675465,0.020939
"""Country""",0.693147,0.558555,0.237,0.134592,0.102694
"""One_Hot_Test""",0.693147,0.692173,0.003,0.000975,0.000972


In [19]:
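# df["Ad Topic Line"].unique()  # has 1000 unique values, i.e. one per row.
# Because every value is unique, knowing "Ad Topic Line" leaves no uncertainty about the target
# (conditional entropy 0): the values fall into two disjoint subsets that perfectly separate the 0s from the 1s.
# That is why its raw information gain is maximal while its weighted information gain is 0.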

In [20]:
# F-test of each numeric feature against the target. The features reported are
# the same ones get_numeric_cols returns below, and the values are cross-checked
# against sklearn's f_classif / f_regression in the following cells.
f_classification(df, target=target)

feature,f_value,p_value
str,f64,f64
"""Daily Time Spe…",1268.525161,5.8774e-180
"""Age""",319.643165,3.1298e-62
"""Area Income""",292.770617,9.4496e-58
"""Daily Internet…",1618.976456,3.8932e-211
"""Age Band""",269.875798,7.370400000000001e-54
"""Daily Internet…",1587.381637,1.6759000000000002e-208
"""Area Income Ba…",289.515369,3.3458000000000004e-57
"""Test_Binary""",1.445286,0.229571


In [21]:
# Names of the numeric feature columns, with the target excluded; reused as
# the feature list for the sklearn cross-checks and the benchmarks.
nums = get_numeric_cols(df, exclude=[target])
nums  

['Daily Time Spent on Site',
 'Age',
 'Area Income',
 'Daily Internet Usage',
 'Age Band',
 'Daily Internet Usage Band',
 'Area Income Band',
 'Test_Binary']

In [22]:
from sklearn.feature_selection import f_classif

# Reference check: compute the same F-test with sklearn and lay the result out
# like f_classification's output so the two tables can be compared directly.
# The polars objects are converted with .to_numpy() explicitly, matching the
# f_regression check and the timing cells, so the call does not depend on how
# the installed sklearn version coerces polars inputs.
f, pv = f_classif(df[nums].to_numpy(), df[target].to_numpy())
pl.from_records([nums, f, pv], schema=["feature", "f_value", "p_value"])

feature,f_value,p_value
str,f64,f64
"""Daily Time Spe…",1268.525161,5.8774e-180
"""Age""",319.643165,3.1298e-62
"""Area Income""",292.770617,9.4496e-58
"""Daily Internet…",1618.976456,3.8932e-211
"""Age Band""",269.875798,7.370400000000001e-54
"""Daily Internet…",1587.381637,1.6759000000000002e-208
"""Area Income Ba…",289.515369,3.3458000000000004e-57
"""Test_Binary""",1.445286,0.229571


In [23]:
from sklearn.feature_selection import f_regression

# Second reference check: f_regression returns an (F statistics, p-values) pair,
# which is unpacked straight into the columns of the result frame. For this
# binary target the table matches the f_classif one above exactly.
result = f_regression(df[nums].to_numpy(), df[target].to_numpy())
pl.from_records([nums, *result], schema=["feature", "f_value", "p_value"])

feature,f_value,p_value
str,f64,f64
"""Daily Time Spe…",1268.525161,5.8774e-180
"""Age""",319.643165,3.1298e-62
"""Area Income""",292.770617,9.4496e-58
"""Daily Internet…",1618.976456,3.8932e-211
"""Age Band""",269.875798,7.370400000000001e-54
"""Daily Internet…",1587.381637,1.6759000000000002e-208
"""Area Income Ba…",289.515369,3.3458000000000004e-57
"""Test_Binary""",1.445286,0.229571


In [24]:
# Stack 500 copies of df vertically to get a larger frame for the timing
# comparison below; the shape confirms the row count.
df_test = pl.concat([df.clone()] * 500)
df_test.shape

(500000, 14)

In [25]:
%%timeit 
# Benchmark the package's own F-test on the enlarged frame.
f_classification(df_test, target="Clicked on Ad", num_cols=nums)

6.55 ms ± 260 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [26]:
%%timeit 
# sklearn f_classif on the same data. The measured time includes the
# to_numpy() conversions and building the result frame.
f, pv = f_classif(df_test[nums].to_numpy(), df_test[target].to_numpy())
pl.from_records([nums, f, pv], schema=["feature", "f_value", "p_value"])

57.6 ms ± 962 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [27]:
%%timeit
# sklearn f_regression on the same data. The measured time includes the
# to_numpy() conversions and building the result frame.
result = f_regression(df_test[nums].to_numpy(), df_test[target].to_numpy())
pl.from_records([nums, result[0], result[1]], schema=["feature", "f_value", "p_value"])

19 ms ± 1.02 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [28]:
# Drop the reference to the enlarged benchmark frame; it is not needed past the timing cells.
del df_test 

In [29]:
# Rank the top 5 features with MRMR. Per the log, the relevance scores used
# here are F-scores (MRMR_STRATEGY.F_SCORE).
mrmr(df, target, 5)

Feature importance (by strategy MRMR_STRATEGY.F_SCORE) for each feature is:
[('Daily Time Spent on Site', 1268.525160906559), ('Age', 319.6431651611977), ('Area Income', 292.7706167234066), ('Daily Internet Usage', 1618.9764559901303), ('Age Band', 269.8757975134233), ('Daily Internet Usage Band', 1587.3816374117), ('Area Income Band', 289.5153685950107), ('Test_Binary', 1.4452859859273572)]
Found 1st feature: Daily Internet Usage. 1/5
Found 2th feature: Daily Time Spent on Site. 2/5
Found 3th feature: Daily Internet Usage Band. 3/5
Found 4th feature: Area Income. 4/5
Found 5th feature: Age. 5/5


mrmr_rank,feature
i64,str
1,"""Daily Internet…"
2,"""Daily Time Spe…"
3,"""Daily Internet…"
4,"""Area Income"""
5,"""Age"""


In [30]:
# Replace each column in `nums` with its percentile bin. In the transformed
# frame these columns carry a "_percentile" suffix and are u8.
# `nums` was already built without the target, so exclude=[target] is a second guard.
pct_result = percentile_binning(df, num_cols=nums, exclude=[target])

In [31]:
# Preview the binned frame; non-numeric columns and the target are unchanged.
pct_result.transformed.head() 

Daily Time Spent on Site_percentile,Age_percentile,Area Income_percentile,Daily Internet Usage_percentile,Ad Topic Line,City,Country,Timestamp,Clicked on Ad,One_Hot_Test,Age Band_percentile,Daily Internet Usage Band_percentile,Area Income Band_percentile,Test_Binary_percentile
u8,u8,u8,u8,str,str,str,str,i64,str,u8,u8,u8,u8
52,53,65,99,"""Cloned 5thgene…","""Wrightburgh""","""Tunisia""","""3/27/2016 0:53…",0,"""A""",68,100,74,52
81,37,86,56,"""Monitored nati…","""West Jodi""","""Nauru""","""4/4/2016 1:39""",0,"""B""",68,61,89,100
53,14,58,91,"""Organic bottom…","""Davidton""","""San Marino""","""3/13/2016 20:3…",0,"""A""",27,93,58,52
63,27,44,95,"""Triple-buffere…","""West Terrifurt…","""Italy""","""1/10/2016 2:31…",0,"""B""",27,100,45,100
51,53,96,82,"""Robust logisti…","""South Manuel""","""Iceland""","""6/3/2016 3:36""",0,"""A""",68,93,97,52


In [32]:
# Adopt the binned frame as df, then show the bins recorded for "Age":
# one row per percentile with the min/max original value and the row count.
# The mapping is keyed by the original feature name, not the "_percentile" name.
df = pct_result.transformed
pct_result.mapping.filter(pl.col("feature") == "Age")

feature,percentile,min,max,cnt
str,u8,f32,f32,u32
"""Age""",1,19.0,19.0,6
"""Age""",2,20.0,21.0,12
"""Age""",4,22.0,22.0,13
"""Age""",5,23.0,23.0,19
"""Age""",8,24.0,24.0,21
"""Age""",10,25.0,25.0,27
"""Age""",14,26.0,26.0,37
"""Age""",17,27.0,27.0,33
"""Age""",22,28.0,28.0,48
"""Age""",27,29.0,29.0,48


In [33]:
# One-hot encode "One_Hot_Test"; the result exposes `.transformed` and `.mapping`.
one_hot_res = one_hot_encode(df, one_hot_columns=["One_Hot_Test"])

In [34]:
# Preview: "One_Hot_Test" is replaced by the u8 indicator columns
# One_Hot_Test_A / _B / _C.
one_hot_res.transformed.head() 

Daily Time Spent on Site_percentile,Age_percentile,Area Income_percentile,Daily Internet Usage_percentile,Ad Topic Line,City,Country,Timestamp,Clicked on Ad,One_Hot_Test_A,One_Hot_Test_B,One_Hot_Test_C,Age Band_percentile,Daily Internet Usage Band_percentile,Area Income Band_percentile,Test_Binary_percentile
u8,u8,u8,u8,str,str,str,str,i64,u8,u8,u8,u8,u8,u8,u8
52,53,65,99,"""Cloned 5thgene…","""Wrightburgh""","""Tunisia""","""3/27/2016 0:53…",0,1,0,0,68,100,74,52
81,37,86,56,"""Monitored nati…","""West Jodi""","""Nauru""","""4/4/2016 1:39""",0,0,1,0,68,61,89,100
53,14,58,91,"""Organic bottom…","""Davidton""","""San Marino""","""3/13/2016 20:3…",0,1,0,0,27,93,58,52
63,27,44,95,"""Triple-buffere…","""West Terrifurt…","""Italy""","""1/10/2016 2:31…",0,0,1,0,27,100,45,100
51,53,96,82,"""Robust logisti…","""South Manuel""","""Iceland""","""6/3/2016 3:36""",0,1,0,0,68,93,97,52


In [35]:
# Adopt the one-hot frame as df, then list which derived columns came from
# which original feature.
df = one_hot_res.transformed
one_hot_res.mapping

feature,one_hot_derived
str,str
"""One_Hot_Test""","""One_Hot_Test_A…"
"""One_Hot_Test""","""One_Hot_Test_B…"
"""One_Hot_Test""","""One_Hot_Test_C…"


In [36]:
# Replace the string columns "City" and "Country" with integer codes.
# The result exposes `.transformed` and `.mapping`.
ordinal_res = ordinal_auto_encode(df, ordinal_cols=["City", "Country"])

In [37]:
# Preview: City / Country are now the u32 columns City_ordinal / Country_ordinal.
ordinal_res.transformed.head() 

Daily Time Spent on Site_percentile,Age_percentile,Area Income_percentile,Daily Internet Usage_percentile,Ad Topic Line,City_ordinal,Country_ordinal,Timestamp,Clicked on Ad,One_Hot_Test_A,One_Hot_Test_B,One_Hot_Test_C,Age Band_percentile,Daily Internet Usage Band_percentile,Area Income Band_percentile,Test_Binary_percentile
u8,u8,u8,u8,str,u32,u32,str,i64,u8,u8,u8,u8,u8,u8,u8
52,53,65,99,"""Cloned 5thgene…",961,215,"""3/27/2016 0:53…",0,1,0,0,68,100,74,52
81,37,86,56,"""Monitored nati…",903,147,"""4/4/2016 1:39""",0,0,1,0,68,61,89,100
53,14,58,91,"""Organic bottom…",111,184,"""3/13/2016 20:3…",0,1,0,0,27,93,58,52
63,27,44,95,"""Triple-buffere…",939,103,"""1/10/2016 2:31…",0,0,1,0,27,100,45,100
51,53,96,82,"""Robust logisti…",805,96,"""6/3/2016 3:36""",0,1,0,0,68,93,97,52


In [38]:
# Peek at the value -> code lookup; the first rows show codes assigned from 0
# upward over alphabetically ordered values.
ordinal_res.mapping.head() 

feature,value,mapped_to
str,str,i64
"""City""","""Adamsbury""",0
"""City""","""Adamside""",1
"""City""","""Adamsstad""",2
"""City""","""Alanview""",3
"""City""","""Alexanderfurt""",4


In [39]:
# Adopt the ordinal-encoded frame, then drop the two remaining string columns
# so that only numeric columns (features plus target) are left.
df = ordinal_res.transformed
final_df = df.drop(["Ad Topic Line", "Timestamp"])
final_df.head() 

Daily Time Spent on Site_percentile,Age_percentile,Area Income_percentile,Daily Internet Usage_percentile,City_ordinal,Country_ordinal,Clicked on Ad,One_Hot_Test_A,One_Hot_Test_B,One_Hot_Test_C,Age Band_percentile,Daily Internet Usage Band_percentile,Area Income Band_percentile,Test_Binary_percentile
u8,u8,u8,u8,u32,u32,i64,u8,u8,u8,u8,u8,u8,u8
52,53,65,99,961,215,0,1,0,0,68,100,74,52
81,37,86,56,903,147,0,0,1,0,68,61,89,100
53,14,58,91,111,184,0,1,0,0,27,93,58,52
63,27,44,95,939,103,0,0,1,0,27,100,45,100
51,53,96,82,805,96,0,1,0,0,68,93,97,52


In [40]:
# Convert the final frame into model-ready pieces and pull them out of the
# result object one attribute at a time.
np_data = get_numpy(final_df, target)
X = np_data.X  # feature matrix
y = np_data.y  # target vector
features = np_data.features  # feature names

In [41]:
# Sanity check: one row per record, one column per feature (target excluded).
X.shape 

(1000, 13)

In [42]:
# Sanity check: y is one-dimensional with one entry per record.
y.shape 

(1000,)

In [43]:
# Feature names: the columns of final_df in order, with the target removed.
features

['Daily Time Spent on Site_percentile',
 'Age_percentile',
 'Area Income_percentile',
 'Daily Internet Usage_percentile',
 'City_ordinal',
 'Country_ordinal',
 'One_Hot_Test_A',
 'One_Hot_Test_B',
 'One_Hot_Test_C',
 'Age Band_percentile',
 'Daily Internet Usage Band_percentile',
 'Area Income Band_percentile',
 'Test_Binary_percentile']