In [1]:
import polars as pl
from typing import *
import sys
# Add the sibling `src` directory to the import path (presumably where the `eda`
# package imported below lives). The path is relative, so it depends on the
# notebook's working directory.
sys.path.append('../src')

# Testing eda_text.py

In [2]:
from eda.eda_text import transform_text_data

In [3]:
# Three short sample sentences used as the text corpus for the demo below.
line1, line2, line3 = (
    "She told me that she likes hiking.",
    "He told me that he doesn't like me.",
    "Hello World!",
)

In [4]:
# Small demo frame: one text column, one category label per row, plus a
# constant integer column that is unrelated to the text.
df = pl.DataFrame(
    {
        "text_col": [line1, line2, line3],
        "category": ["A", "B", "C"],
    }
).with_columns([pl.lit(1).alias("Whatever Value")])
df

text_col,category,Whatever Value
str,str,i32
"""She told me th…","""A""",1
"""He told me tha…","""B""",1
"""Hello World!""","""C""",1


In [5]:
# Stem and count-vectorize `text_col`; returns the transformed frame together
# with a stem -> original-words lookup (both are displayed in the next cells).
# NOTE(review): min_df/max_df are passed as ints. If they are forwarded to a
# scikit-learn-style CountVectorizer, an int max_df=1 means "appears in at most
# one document" rather than a proportion — confirm this is intended.
df_transformed, reverse_memo = transform_text_data(df, text_cols=["text_col"], min_df=0, max_df=1)

Perfoming stemming...
Performing Count vectorization for text_col...


In [6]:
# The text column is replaced by per-stem count columns named `text_col::word::<stem>`;
# the other columns are carried through.
df_transformed

category,Whatever Value,text_col::word::doesnt,text_col::word::hello,text_col::word::hike,text_col::word::world
str,i32,i64,i64,i64,i64
"""A""",1,0,0,1,0
"""B""",1,1,0,0,0
"""C""",1,0,1,0,1


In [7]:
# Lookup from each stem to the original word forms that reduced to it.
reverse_memo

{'she': ['she'],
 'told': ['told'],
 'me': ['me'],
 'that': ['that'],
 'like': ['likes', 'like'],
 'hike': ['hiking'],
 'he': ['he'],
 'doesnt': ['doesnt'],
 'hello': ['hello'],
 'world': ['world']}

# Testing other EDA methods (eda_misc, eda_prescreen, eda_transformations, eda_selection)

In [8]:
import polars as pl
from eda.eda_misc import get_numpy
from eda.eda_prescreen import *
from eda.eda_transformations import *
from eda.eda_selection import naive_sample_ig, f_classification, mrmr

In [9]:
# Predicate shared by the two synthetic columns derived from "Male".
male_is_zero = pl.col("Male") == 0

# Derived band columns plus synthetic test columns that give the prescreen and
# encoding helpers used later in the notebook something to act on.
extra_columns = [
    ((pl.col("Age") // 10) * 10).alias("Age Band"),
    (pl.col("Daily Internet Usage") // 20).alias("Daily Internet Usage Band"),
    (pl.col("Area Income") // 5000).alias("Area Income Band"),
    pl.lit(1).alias("Test_Constant"),
    pl.lit("SSS").alias("Test_Str_Constant"),
    pl.when(male_is_zero).then(0).otherwise(None).alias("Test_BadColumn"),
    pl.when(male_is_zero).then("A").otherwise("B").alias("Test_Binary"),
]

df = pl.read_csv("../data/advertising.csv").with_columns(extra_columns)
target = "Clicked on Ad"
df.head()

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad,One_Hot_Test,Age Band,Daily Internet Usage Band,Area Income Band,Test_Constant,Test_Str_Constant,Test_BadColumn,Test_Binary
f64,i64,f64,f64,str,str,i64,str,str,i64,str,i64,f64,f64,i32,str,i32,str
68.95,35,61833.9,256.09,"""Cloned 5thgene…","""Wrightburgh""",0,"""Tunisia""","""3/27/2016 0:53…",0,"""A""",30,12.0,12.0,1,"""SSS""",0.0,"""A"""
80.23,31,68441.85,193.77,"""Monitored nati…","""West Jodi""",1,"""Nauru""","""4/4/2016 1:39""",0,"""B""",30,9.0,13.0,1,"""SSS""",,"""B"""
69.47,26,59785.94,236.5,"""Organic bottom…","""Davidton""",0,"""San Marino""","""3/13/2016 20:3…",0,"""A""",20,11.0,11.0,1,"""SSS""",0.0,"""A"""
74.15,29,54806.18,245.89,"""Triple-buffere…","""West Terrifurt…",1,"""Italy""","""1/10/2016 2:31…",0,"""B""",20,12.0,10.0,1,"""SSS""",,"""B"""
68.37,35,73889.99,225.58,"""Robust logisti…","""South Manuel""",0,"""Iceland""","""6/3/2016 3:36""",0,"""A""",30,11.0,14.0,1,"""SSS""",0.0,"""A"""


In [10]:
# Per-column summary: null counts, uniqueness, basic moments, quantiles and a dtype label.
describe(df)

column,is_binary,count,null_count,null_pct,n_unique,unique_pct,mean,std,min,max,25%,median,75%,skew,kurtosis,dtype
str,i32,f64,f64,f64,u32,f64,f64,f64,str,str,f64,f64,f64,f64,f64,str
"""Daily Time Spe…",0,1000.0,0.0,0.0,900,0.9,65.0002,15.853615,"""32.6""","""91.43""",51.38,68.215,78.57,-0.371203,-1.096058,"""numeric"""
"""Age""",0,1000.0,0.0,0.0,43,0.043,36.009,8.785562,"""19.0""","""61.0""",29.0,35.0,42.0,0.478423,-0.404518,"""numeric"""
"""Area Income""",0,1000.0,7.0,0.007,994,0.994,54980.756103,13434.996236,"""13996.5""","""79484.8""",47051.02,57009.76,65461.92,-0.648682,-0.110118,"""numeric"""
"""Daily Internet…",0,1000.0,11.0,0.011,957,0.957,180.024985,43.903142,"""104.78""","""269.96""",138.71,183.42,218.79,-0.04022,-1.277134,"""numeric"""
"""Ad Topic Line""",0,1000.0,0.0,0.0,1000,1.0,,,"""Adaptive 24hou…","""Visionary reci…",,,,,,"""string"""
"""City""",0,1000.0,0.0,0.0,969,0.969,,,"""Adamsbury""","""Zacharyton""",,,,,,"""string"""
"""Male""",1,1000.0,0.0,0.0,2,0.002,0.481,0.499889,"""0.0""","""1.0""",0.0,0.0,1.0,0.076055,-1.994216,"""numeric"""
"""Country""",0,1000.0,0.0,0.0,237,0.237,,,"""Afghanistan""","""Zimbabwe""",,,,,,"""string"""
"""Timestamp""",0,1000.0,0.0,0.0,997,0.997,,,"""1/1/2016 15:14…","""7/9/2016 16:23…",,,,,,"""string"""
"""Clicked on Ad""",1,1000.0,0.0,0.0,2,0.002,0.5,0.50025,"""0.0""","""1.0""",0.0,0.5,1.0,0.0,-2.0,"""numeric"""


In [11]:
# Drop numeric columns whose variance is below the threshold.
# Note: with threshold=0.5 the 0/1 column "Male" (std ≈ 0.5, i.e. variance ≈ 0.25)
# is dropped too, alongside the synthetic Test_Constant and Test_BadColumn.
df = var_removal(df, threshold=0.5, target=target)

The following numeric columns are dropped because they have lower than 0.5 variance. ['Male', 'Test_Constant', 'Test_BadColumn']
Removed a total of 3 columns.


In [12]:
# Drop columns that hold a single constant value (here the synthetic Test_Str_Constant).
df = constant_removal(df)

The following columns are dropped because they are constants. ['Test_Str_Constant'].
Removed a total of 1 columns.


In [13]:
# One way to interact with TransformationResult objects: keep the result object
# and read its `.transformed` / `.mapping` attributes afterwards.
bin_result = binary_encode(df, exclude = [target])

Transforming Test_Binary into a binary column with [0, 1] ...


In [14]:
# The transformed frame: Test_Binary is now a u8 0/1 column.
bin_result.transformed.head() 

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Country,Timestamp,Clicked on Ad,One_Hot_Test,Age Band,Daily Internet Usage Band,Area Income Band,Test_Binary
f64,i64,f64,f64,str,str,str,str,i64,str,i64,f64,f64,u8
68.95,35,61833.9,256.09,"""Cloned 5thgene…","""Wrightburgh""","""Tunisia""","""3/27/2016 0:53…",0,"""A""",30,12.0,12.0,0
80.23,31,68441.85,193.77,"""Monitored nati…","""West Jodi""","""Nauru""","""4/4/2016 1:39""",0,"""B""",30,9.0,13.0,1
69.47,26,59785.94,236.5,"""Organic bottom…","""Davidton""","""San Marino""","""3/13/2016 20:3…",0,"""A""",20,11.0,11.0,0
74.15,29,54806.18,245.89,"""Triple-buffere…","""West Terrifurt…","""Italy""","""1/10/2016 2:31…",0,"""B""",20,12.0,10.0,1
68.37,35,73889.99,225.58,"""Robust logisti…","""South Manuel""","""Iceland""","""6/3/2016 3:36""",0,"""A""",30,11.0,14.0,0


In [15]:
# Continue with the encoded frame and inspect the record describing the encoding.
df = bin_result.transformed
bin_result.mapping 

EncoderRecord(features=['Test_Binary'], strategy=<EncodingStrategy.BINARY: 'BINARY'>, mappings=[{'A': 0, 'B': 1, 'orig_dtype': 'string'}])

In [16]:
# print() renders the record as a JSON string rather than its repr.
print(bin_result.mapping)

{"features":["Test_Binary"],"strategy":"BINARY","mappings":[{"A":0,"B":1,"orig_dtype":"string"}]}


In [17]:
# Tabular view of the encoding record: one row per encoded feature.
bin_result.materialize()

feature,encoding_strategy,maps
str,object,str
"""Test_Binary""",EncodingStrategy.BINARY,"""{""A"":0,""B"":1,""…"


In [18]:
# Second way (imo, better) to interact with TransformationResult objects:
# unpack the result directly into (transformed frame, mapping record).
# `ImputationStartegy` (sic) is how the enum is spelled in the library.
df, mapping1 = impute(df, ["Area Income"], ImputationStartegy.MEAN)
df, mapping2 = impute(df, ["Daily Internet Usage", "Daily Internet Usage Band", "Area Income Band"], ImputationStartegy.MEDIAN)
df 

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Country,Timestamp,Clicked on Ad,One_Hot_Test,Age Band,Daily Internet Usage Band,Area Income Band,Test_Binary
f64,i64,f64,f64,str,str,str,str,i64,str,i64,f64,f64,u8
68.95,35,61833.9,256.09,"""Cloned 5thgene…","""Wrightburgh""","""Tunisia""","""3/27/2016 0:53…",0,"""A""",30,12.0,12.0,0
80.23,31,68441.85,193.77,"""Monitored nati…","""West Jodi""","""Nauru""","""4/4/2016 1:39""",0,"""B""",30,9.0,13.0,1
69.47,26,59785.94,236.5,"""Organic bottom…","""Davidton""","""San Marino""","""3/13/2016 20:3…",0,"""A""",20,11.0,11.0,0
74.15,29,54806.18,245.89,"""Triple-buffere…","""West Terrifurt…","""Italy""","""1/10/2016 2:31…",0,"""B""",20,12.0,10.0,1
68.37,35,73889.99,225.58,"""Robust logisti…","""South Manuel""","""Iceland""","""6/3/2016 3:36""",0,"""A""",30,11.0,14.0,0
59.99,23,59761.56,226.74,"""Sharable clien…","""Jamieberg""","""Norway""","""5/19/2016 14:3…",0,"""B""",20,11.0,11.0,1
88.91,33,54980.756103,208.36,"""Enhanced dedic…","""Brandonstad""","""Myanmar""","""1/28/2016 20:5…",0,"""A""",30,10.0,11.0,0
66.0,48,24593.33,131.76,"""Reactive local…","""Port Jefferybu…","""Australia""","""3/7/2016 1:40""",1,"""A""",40,6.0,4.0,1
74.53,30,68862.0,221.51,"""Configurable c…","""West Colin""","""Grenada""","""4/18/2016 9:33…",0,"""A""",30,11.0,13.0,1
69.88,20,55642.32,183.82,"""Mandatory homo…","""Ramirezton""","""Ghana""","""7/11/2016 1:42…",0,"""A""",20,9.0,11.0,1


In [19]:
# Record of the mean imputation: the feature and the fill value that was used.
mapping1

ImputationRecord(features=['Area Income'], strategy=<ImputationStartegy.MEAN: 'MEAN'>, values=array([54980.75610272]))

In [20]:
# The same record as a one-row-per-feature table.
mapping1.materialize() 

feature,imputation_strategy,value_used
str,object,f64
"""Area Income""",ImputationStartegy.MEAN,54980.756103


In [21]:
# Record of the median imputation across the three listed features.
mapping2 

ImputationRecord(features=['Daily Internet Usage', 'Daily Internet Usage Band', 'Area Income Band'], strategy=<ImputationStartegy.MEDIAN: 'MEDIAN'>, values=array([183.42,   9.  ,  11.  ]))

In [22]:
# One row per imputed feature with the median that was filled in.
mapping2.materialize()

feature,imputation_strategy,value_used
str,object,f64
"""Daily Internet…",ImputationStartegy.MEDIAN,183.42
"""Daily Internet…",ImputationStartegy.MEDIAN,9.0
"""Area Income Ba…",ImputationStartegy.MEDIAN,11.0


In [23]:
# Scale the two continuous columns; the NORMALIZE record stores a mean and a std per feature.
# Note: the name `mapping` is reused (overwritten) by the encoding cells further down.
df, mapping = scale(df, ["Area Income", "Daily Internet Usage"], ScalingStrategy.NORMALIZE)
df 

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Country,Timestamp,Clicked on Ad,One_Hot_Test,Age Band,Daily Internet Usage Band,Area Income Band,Test_Binary
f64,i64,f64,f64,str,str,str,str,i64,str,i64,f64,f64,u8
68.95,35,0.511893,1.74127,"""Cloned 5thgene…","""Wrightburgh""","""Tunisia""","""3/27/2016 0:53…",0,"""A""",30,12.0,12.0,0
80.23,31,1.005471,0.313948,"""Monitored nati…","""West Jodi""","""Nauru""","""4/4/2016 1:39""",0,"""B""",30,9.0,13.0,1
69.47,26,0.358921,1.292598,"""Organic bottom…","""Davidton""","""San Marino""","""3/13/2016 20:3…",0,"""A""",20,11.0,11.0,0
74.15,29,-0.01304,1.507658,"""Triple-buffere…","""West Terrifurt…","""Italy""","""1/10/2016 2:31…",0,"""B""",20,12.0,10.0,1
68.37,35,1.412418,1.042496,"""Robust logisti…","""South Manuel""","""Iceland""","""6/3/2016 3:36""",0,"""A""",30,11.0,14.0,0
59.99,23,0.3571,1.069064,"""Sharable clien…","""Jamieberg""","""Norway""","""5/19/2016 14:3…",0,"""B""",20,11.0,11.0,1
88.91,33,-1.0869e-15,0.648105,"""Enhanced dedic…","""Brandonstad""","""Myanmar""","""1/28/2016 20:5…",0,"""A""",30,10.0,11.0,0
66.0,48,-2.269777,-1.106273,"""Reactive local…","""Port Jefferybu…","""Australia""","""3/7/2016 1:40""",1,"""A""",40,6.0,4.0,1
74.53,30,1.036854,0.94928,"""Configurable c…","""West Colin""","""Grenada""","""4/18/2016 9:33…",0,"""A""",30,11.0,13.0,1
69.88,20,0.049415,0.086062,"""Mandatory homo…","""Ramirezton""","""Ghana""","""7/11/2016 1:42…",0,"""A""",20,9.0,11.0,1


In [24]:
# JSON form of the scaling record (per-feature mean and std).
print(mapping)

{"features":["Area Income","Daily Internet Usage"],"strategy":"NORMALIZE","values":[{"mean":54980.75610271904,"std":13387.843935572519},{"mean":180.06232999999997,"std":43.662201276291505}]}


In [25]:
# Tabular form: one row per scaled feature.
mapping.materialize()

feature,scaling_strategy,scaling_meta_data
str,object,str
"""Area Income""",ScalingStrategy.NORMALIZE,"""{""mean"":54980.…"
"""Daily Internet…",ScalingStrategy.NORMALIZE,"""{""mean"":180.06…"


In [26]:
# Columns handed to the information gain algorithm as "discrete" features.
# Some of them are numerical, but they can still be treated as discrete.
discrete = [
    "Age Band",
    "Country",
    "Area Income Band",
    "Daily Internet Usage Band",
    "Test_Binary",
    "One_Hot_Test",
]
target = "Clicked on Ad"

In [27]:
# Information gain of each listed discrete column with respect to the target.
naive_sample_ig(df, target, discrete_cols=discrete)

100%|██████████| 6/6 [00:00<00:00, 59.99it/s]


feature,target_entropy,conditional_entropy,unique_pct,information_gain,weighted_information_gain
str,f64,f64,f64,f64,f64
"""Daily Internet…",0.693147,0.272341,0.009,0.420806,0.417019
"""Area Income Ba…",0.693147,0.553505,0.014,0.139642,0.137687
"""Country""",0.693147,0.558555,0.237,0.134592,0.102694
"""Age Band""",0.693147,0.569156,0.006,0.123991,0.123247
"""One_Hot_Test""",0.693147,0.692173,0.003,0.000975,0.000972
"""Test_Binary""",0.693147,0.692424,0.002,0.000723,0.000722


In [28]:
# If nothing is given, automatically infers string columns as discrete columns.
# Near-unique columns such as "Ad Topic Line" get the largest raw information_gain
# but a weighted_information_gain close to 0.
naive_sample_ig(df, target)

100%|██████████| 5/5 [00:00<00:00, 44.43it/s]


feature,target_entropy,conditional_entropy,unique_pct,information_gain,weighted_information_gain
str,f64,f64,f64,f64,f64
"""Ad Topic Line""",0.693147,0.0,1.0,0.693147,0.0
"""Timestamp""",0.693147,0.001386,0.997,0.691761,0.002075
"""City""",0.693147,0.017682,0.969,0.675465,0.020939
"""Country""",0.693147,0.558555,0.237,0.134592,0.102694
"""One_Hot_Test""",0.693147,0.692173,0.003,0.000975,0.000972


In [29]:
# df["Ad Topic Line"].unique() # has 1000 uniques. 
# There is no uncertainty left at all: because every value is unique, the values split into two distinct subsets that perfectly differentiate the 0s from the 1s.

In [30]:
# F-statistic and p-value of each numeric feature against the target
# (no num_cols given here; the output covers the same columns as `nums` below).
f_classification(df, target=target)

feature,f_value,p_value
str,f64,f64
"""Daily Time Spe…",1268.525161,5.8774e-180
"""Age""",319.643165,3.1298e-62
"""Area Income""",288.179499,5.6266e-57
"""Daily Internet…",1567.672992,7.6499e-207
"""Age Band""",269.875798,7.370400000000001e-54
"""Daily Internet…",1528.282249,1.733e-203
"""Area Income Ba…",285.144765,1.8363e-56
"""Test_Binary""",1.445286,0.229571


In [31]:
# Names of the numeric columns, leaving out the target.
nums = get_numeric_cols(df, exclude=[target])
nums  

['Daily Time Spent on Site',
 'Age',
 'Area Income',
 'Daily Internet Usage',
 'Age Band',
 'Daily Internet Usage Band',
 'Area Income Band',
 'Test_Binary']

In [32]:
from sklearn.feature_selection import f_classif
 
# Cross-check against scikit-learn's ANOVA F-test; f_classif returns the pair
# (F-values, p-values), which is spliced in after the feature names.
anova = f_classif(df.select(nums), df[target])
pl.from_records([nums, *anova], schema=["feature", "f_value", "p_value"])

feature,f_value,p_value
str,f64,f64
"""Daily Time Spe…",1268.525161,5.8774e-180
"""Age""",319.643165,3.1298e-62
"""Area Income""",288.179499,5.6266e-57
"""Daily Internet…",1567.672992,7.6499e-207
"""Age Band""",269.875798,7.370400000000001e-54
"""Daily Internet…",1528.282249,1.733e-203
"""Area Income Ba…",285.144765,1.8363e-56
"""Test_Binary""",1.445286,0.229571


In [33]:
from sklearn.feature_selection import f_regression

# Same comparison with the univariate regression F-test; for this 0/1 target
# the statistics come out identical to the ANOVA ones.
f_values, p_values = f_regression(df[nums].to_numpy(), df[target].to_numpy())
pl.from_records([nums, f_values, p_values], schema=["feature", "f_value", "p_value"])

feature,f_value,p_value
str,f64,f64
"""Daily Time Spe…",1268.525161,5.8774e-180
"""Age""",319.643165,3.1298e-62
"""Area Income""",288.179499,5.6266e-57
"""Daily Internet…",1567.672992,7.6499e-207
"""Age Band""",269.875798,7.370400000000001e-54
"""Daily Internet…",1528.282249,1.733e-203
"""Area Income Ba…",285.144765,1.8363e-56
"""Test_Binary""",1.445286,0.229571


In [34]:
# Stack 500 references to one clone of df to get a 500,000-row frame for benchmarking.
df_test = pl.concat([df.clone()] * 500)
df_test.shape

(500000, 14)

In [35]:
%%timeit 
f_classification(df_test, target="Clicked on Ad", num_cols=nums)

6.22 ms ± 67.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [36]:
%%timeit 
# Baseline: scikit-learn f_classif, timed together with the polars -> NumPy
# conversion and the construction of the result frame.
f, pv = f_classif(df_test[nums].to_numpy(), df_test[target].to_numpy())
pl.from_records([nums, f, pv], schema=["feature", "f_value", "p_value"])

66.6 ms ± 669 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [37]:
%%timeit
# Baseline: scikit-learn f_regression, timed together with the polars -> NumPy
# conversion and the construction of the result frame.
result = f_regression(df_test[nums].to_numpy(), df_test[target].to_numpy())
pl.from_records([nums, result[0], result[1]], schema=["feature", "f_value", "p_value"])

21.4 ms ± 632 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [38]:
# Release the 500,000-row benchmark frame.
del df_test 

In [39]:
# MRMR method: rank the candidate features and keep the top 5.
mrmr(df, target, 5)

Top 5 feature importance is (by MRMR_STRATEGY.F_SCORE):
shape: (5, 2)
┌───────────────────────────┬───────────────────────┐
│ feature                   ┆ MRMR_STRATEGY.F_SCORE │
│ ---                       ┆ ---                   │
│ str                       ┆ f64                   │
╞═══════════════════════════╪═══════════════════════╡
│ Daily Internet Usage      ┆ 1567.672992           │
│ Daily Internet Usage Band ┆ 1528.282249           │
│ Daily Time Spent on Site  ┆ 1268.525161           │
│ Age                       ┆ 319.643165            │
│ Area Income               ┆ 288.179499            │
└───────────────────────────┴───────────────────────┘
Found 8 total features to select from. Proceeding to select top 5 features.


100%|██████████| 5/5 [00:00<00:00, 4999.17it/s]


mrmr_rank,feature
i64,str
1,"""Daily Internet…"
2,"""Daily Time Spe…"
3,"""Daily Internet…"
4,"""Age"""
5,"""Area Income"""


In [40]:
# Replace each numeric column with a percentile value (u8 in the output below).
# NOTE(review): `nums` includes "Test_Binary", so the 0/1 column is bucketed as
# well (it shows up as 52/100 in the next cell) — confirm that is intended.
df, mapping = percentile_encode(df, num_cols=nums, exclude=[target])

In [41]:
# The numeric columns now hold u8 percentile values.
df.head() 

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Country,Timestamp,Clicked on Ad,One_Hot_Test,Age Band,Daily Internet Usage Band,Area Income Band,Test_Binary
u8,u8,u8,u8,str,str,str,str,i64,str,u8,u8,u8,u8
52,53,65,99,"""Cloned 5thgene…","""Wrightburgh""","""Tunisia""","""3/27/2016 0:53…",0,"""A""",68,100,75,52
81,37,86,57,"""Monitored nati…","""West Jodi""","""Nauru""","""4/4/2016 1:39""",0,"""B""",68,62,89,100
53,14,58,91,"""Organic bottom…","""Davidton""","""San Marino""","""3/13/2016 20:3…",0,"""A""",27,93,59,52
63,27,44,95,"""Triple-buffere…","""West Terrifurt…","""Italy""","""1/10/2016 2:31…",0,"""B""",27,100,44,100
51,53,96,82,"""Robust logisti…","""South Manuel""","""Iceland""","""6/3/2016 3:36""",0,"""A""",68,93,97,52


In [42]:
# How to read the percentile mapping table: each key is an upper bound and the
# value is the percentile assigned to it.
# E.g. for "Age": age <= 19 is mapped to 1, 19 < age <= 21 is mapped to 2, ...
mapping.materialize()

feature,encoding_strategy,maps
str,object,str
"""Daily Time Spe…",EncodingStrategy.PERCENTILE,"""{""34.3"":1,""35.…"
"""Age""",EncodingStrategy.PERCENTILE,"""{""19"":1,""21"":2…"
"""Area Income""",EncodingStrategy.PERCENTILE,"""{""-2.613492976…"
"""Daily Internet…",EncodingStrategy.PERCENTILE,"""{""-1.697631540…"
"""Age Band""",EncodingStrategy.PERCENTILE,"""{""10"":1,""20"":2…"
"""Daily Internet…",EncodingStrategy.PERCENTILE,"""{""5.0"":10,""6.0…"
"""Area Income Ba…",EncodingStrategy.PERCENTILE,"""{""3.0"":1,""4.0""…"
"""Test_Binary""",EncodingStrategy.PERCENTILE,"""{""0"":52,""1"":10…"


In [43]:
# Expand "One_Hot_Test" into one u8 indicator column per category.
df, mapping = one_hot_encode(df, one_hot_columns=["One_Hot_Test"])

In [44]:
# One_Hot_Test is replaced by One_Hot_Test_A / _B / _C.
df.head()

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Country,Timestamp,Clicked on Ad,One_Hot_Test_A,One_Hot_Test_B,One_Hot_Test_C,Age Band,Daily Internet Usage Band,Area Income Band,Test_Binary
u8,u8,u8,u8,str,str,str,str,i64,u8,u8,u8,u8,u8,u8,u8
52,53,65,99,"""Cloned 5thgene…","""Wrightburgh""","""Tunisia""","""3/27/2016 0:53…",0,1,0,0,68,100,75,52
81,37,86,57,"""Monitored nati…","""West Jodi""","""Nauru""","""4/4/2016 1:39""",0,0,1,0,68,62,89,100
53,14,58,91,"""Organic bottom…","""Davidton""","""San Marino""","""3/13/2016 20:3…",0,1,0,0,27,93,59,52
63,27,44,95,"""Triple-buffere…","""West Terrifurt…","""Italy""","""1/10/2016 2:31…",0,0,1,0,27,100,44,100
51,53,96,82,"""Robust logisti…","""South Manuel""","""Iceland""","""6/3/2016 3:36""",0,1,0,0,68,93,97,52


In [45]:
# JSON string mapping each category to the indicator column created for it.
mapping.materialize()["maps"][0]

'{"A":"One_Hot_Test_A","B":"One_Hot_Test_B","C":"One_Hot_Test_C"}'

In [46]:
# Replace the City and Country strings with integer codes (u32 in the output below).
df, mapping = ordinal_auto_encode(df, ordinal_cols=["City", "Country"])

In [47]:
# City and Country are now u32 codes.
df.head()

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Country,Timestamp,Clicked on Ad,One_Hot_Test_A,One_Hot_Test_B,One_Hot_Test_C,Age Band,Daily Internet Usage Band,Area Income Band,Test_Binary
u8,u8,u8,u8,str,u32,u32,str,i64,u8,u8,u8,u8,u8,u8,u8
52,53,65,99,"""Cloned 5thgene…",961,215,"""3/27/2016 0:53…",0,1,0,0,68,100,75,52
81,37,86,57,"""Monitored nati…",903,147,"""4/4/2016 1:39""",0,0,1,0,68,62,89,100
53,14,58,91,"""Organic bottom…",111,184,"""3/13/2016 20:3…",0,1,0,0,27,93,59,52
63,27,44,95,"""Triple-buffere…",939,103,"""1/10/2016 2:31…",0,0,1,0,27,100,44,100
51,53,96,82,"""Robust logisti…",805,96,"""6/3/2016 3:36""",0,1,0,0,68,93,97,52


In [48]:
# Prints the full JSON mapping (one entry per distinct City / Country value),
# so the output is very long.
print(mapping)

{"features":["City","Country"],"strategy":"ORDINAL_AUTO","mappings":[{"Adamsbury":0,"Adamside":1,"Adamsstad":2,"Alanview":3,"Alexanderfurt":4,"Alexanderview":5,"Alexandrafort":6,"Alexisland":7,"Aliciatown":8,"Alvaradoport":9,"Alvarezland":10,"Amandafort":11,"Amandahaven":12,"Amandaland":13,"Amyfurt":14,"Amyhaven":15,"Andersonchester":16,"Andersonfurt":17,"Andersonton":18,"Andrewborough":19,"Andrewmouth":20,"Angelhaven":21,"Anthonyfurt":22,"Ashleychester":23,"Ashleymouth":24,"Austinborough":25,"Austinland":26,"Bakerhaven":27,"Barbershire":28,"Beckton":29,"Benjaminchester":30,"Bernardton":31,"Bethburgh":32,"Birdshire":33,"Blairborough":34,"Blairville":35,"Blevinstown":36,"Bowenview":37,"Boyerberg":38,"Bradleyborough":39,"Bradleyburgh":40,"Bradleyside":41,"Bradshawborough":42,"Bradyfurt":43,"Brandiland":44,"Brandonbury":45,"Brandonstad":46,"Brandymouth":47,"Brendaburgh":48,"Brendachester":49,"Brianabury":50,"Brianfurt":51,"Brianland":52,"Brittanyborough":53,"Brownbury":54,"Brownport":55,"

In [49]:
# Drop the two remaining string columns so that every column left is numeric.
final_df = df.drop(["Ad Topic Line", "Timestamp"])
final_df.head() 

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,City,Country,Clicked on Ad,One_Hot_Test_A,One_Hot_Test_B,One_Hot_Test_C,Age Band,Daily Internet Usage Band,Area Income Band,Test_Binary
u8,u8,u8,u8,u32,u32,i64,u8,u8,u8,u8,u8,u8,u8
52,53,65,99,961,215,0,1,0,0,68,100,75,52
81,37,86,57,903,147,0,0,1,0,68,62,89,100
53,14,58,91,111,184,0,1,0,0,27,93,59,52
63,27,44,95,939,103,0,0,1,0,27,100,44,100
51,53,96,82,805,96,0,1,0,0,68,93,97,52


In [50]:
# Split the frame into a feature matrix X, a target vector y and the list of
# feature names (presumably NumPy arrays, going by the helper's name and the shapes below).
np_data = get_numpy(final_df, target)
X, y, features = np_data.X, np_data.y, np_data.features

In [51]:
# 1000 rows x 13 feature columns (the target is not part of X).
X.shape 

(1000, 13)

In [52]:
# One target value per row.
y.shape 

(1000,)

In [53]:
# Feature names (presumably in the column order of X — verify against get_numpy).
features

['Daily Time Spent on Site',
 'Age',
 'Area Income',
 'Daily Internet Usage',
 'City',
 'Country',
 'One_Hot_Test_A',
 'One_Hot_Test_B',
 'One_Hot_Test_C',
 'Age Band',
 'Daily Internet Usage Band',
 'Area Income Band',
 'Test_Binary']