In [1]:
import narwhals as nw
from pyarrow import csv
from tests.utils import compare_dicts
# Load the benchmark dataset with pyarrow's CSV reader.
# NOTE(review): absolute devcontainer path - the notebook only runs where this file exists.
table = csv.read_csv("/workspaces/narwhals/analysis_df.csv")
# Displaying the bare Table prints its schema followed by the column data.
table

pyarrow.Table
job_name: string
hours: string
remote: string
company_name: string
education: string
seniority: string
language: string
city: string
country: string
job_published_at: date32[day]
sample_date: int64
comp_dol: int64
all_tags: string
----
job_name: [["estágio em integração de software","manager - engineering","fullstack principal engineer","structural engineer ii (hybrid)","consultor de relacionamento e negócios pj - novo hamburgo/rs",...,"program manager","senior piping engineer","software engineer - ii","product manager for life science—government","mechanical design engineer"],["software engineer","junior test automation engineer (115038)","senior software engineer - cryptography","limited energy (06) field engineer","cluster security manager, hyd, dc security apjc",...,"(senior) software engineer (golang) - cloud","3928 project controls engineer - r2062526","research associate (electrical/electronics engineering)","cloud product development manager","qa engineer (manufac

In [2]:
# (rows, columns) of the loaded table.
table.shape

(83472, 13)

In [3]:
@nw.narwhalify
def encode_one_v1(df, col):
    """Return the dummy-variable frame of one column, built with ``to_dummies``.

    Args:
        df: Frame to read the column from (wrapped by ``@nw.narwhalify``).
        col: Name of the column to encode.

    Returns:
        The result of ``df[col].to_dummies()``; the other columns of ``df``
        are not included.
    """
    return df[col].to_dummies()

@nw.narwhalify
def encode_one_v2(df, col):
    """Return the dummy-variable frame of one column, built with ``to_dummies_v2``.

    NOTE(review): ``to_dummies_v2`` is not defined in this notebook; it is
    presumably the alternative implementation benchmarked against
    ``to_dummies`` - confirm where it lives.

    Args:
        df: Frame to read the column from (wrapped by ``@nw.narwhalify``).
        col: Name of the column to encode.

    Returns:
        The result of ``df[col].to_dummies_v2()``; the other columns of ``df``
        are not included.
    """
    return df[col].to_dummies_v2()

@nw.narwhalify
def encode_more_v1(df, cols):
    """Extend ``df`` with the ``to_dummies`` columns of every column in ``cols``.

    Args:
        df: Frame to extend (wrapped by ``@nw.narwhalify``).
        cols: Iterable of column names, processed in order.

    Returns:
        The frame obtained by horizontally concatenating, one column at a
        time, the dummy frame of each entry of ``cols`` onto ``df``. When
        ``cols`` is empty, ``df`` is returned as received.
    """
    encoded = df
    for name in cols:
        # The dummies are read from the frame built so far, then concatenated
        # horizontally with it.
        encoded = nw.concat([encoded, encoded[name].to_dummies()], how="horizontal")
    return encoded

@nw.narwhalify
def encode_more_v2(df, cols):
    """Extend ``df`` with the ``to_dummies_v2`` columns of every column in ``cols``.

    NOTE(review): ``to_dummies_v2`` is not defined in this notebook; it is
    presumably the alternative implementation benchmarked against
    ``to_dummies`` - confirm where it lives.

    Args:
        df: Frame to extend (wrapped by ``@nw.narwhalify``).
        cols: Iterable of column names, processed in order.

    Returns:
        The frame obtained by horizontally concatenating, one column at a
        time, the dummy frame of each entry of ``cols`` onto ``df``. When
        ``cols`` is empty, ``df`` is returned as received.
    """
    for col in cols:
        # The dummies are read from the frame built so far, then concatenated
        # horizontally with it.
        oh = df[col].to_dummies_v2()
        df = nw.concat([df, oh], how="horizontal")
    return df



In [4]:
# Cardinality of `hours`: value_counts yields one entry per distinct value.
len(table["hours"].value_counts())

15

In [5]:
# Cardinality of `language`: value_counts yields one entry per distinct value.
len(table["language"].value_counts())

40

In [6]:
# Cardinality of `country`: value_counts yields one entry per distinct value.
len(table["country"].value_counts())

174

In [7]:
# Cardinality of `job_name` - by far the highest of the columns checked in this notebook.
len(table["job_name"].value_counts())

56052

In [8]:
# Cardinality of `city`: value_counts yields one entry per distinct value.
len(table["city"].value_counts())

7299

In [9]:
# Columns to encode, ordered by increasing cardinality
# (15, 40, 174 and 7299 distinct values respectively).
col1 = "hours"
col2 = "language"
col3 = "country"
# "job_name" (56052 distinct values) is deliberately left out: encoding it
# crashes the kernel.
col4 = "city"
cols = [col1, col2, col3, col4]

In [10]:
# Encode the lowest-cardinality column (`hours`) with v1, to compare against the v2 result.
t1 = encode_one_v1(table,col1)

In [11]:
# Encode the same column (`hours`) with v2, to compare against the v1 result.
t2 = encode_one_v2(table,col1)

In [12]:
# Sanity check: both implementations agree on `.columns`.
# NOTE(review): if t1/t2 are native pyarrow Tables here, `.columns` is the list of
# column data, not the names (`column_names`) - confirm which comparison is intended.
t1.columns == t2.columns

True

In [13]:
# Sanity check: both results have the same (rows, columns) shape.
t1.shape == t2.shape

True

In [14]:
# v1 on `hours` (15 distinct values); -o returns the TimeitResult so it can be reused in the summary table.
timing_one_v1_col1 = %timeit -o encode_one_v1(table,col1)


6.62 ms ± 477 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [15]:
# v2 on `hours` (15 distinct values); -o returns the TimeitResult so it can be reused in the summary table.
timing_one_v2_col1 = %timeit -o encode_one_v2(table,col1)


6.45 ms ± 266 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [16]:
# v1 on `language` (40 distinct values).
timing_one_v1_col2 = %timeit -o encode_one_v1(table,col2)


16.2 ms ± 598 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [17]:
# v2 on `language` (40 distinct values).
timing_one_v2_col2 = %timeit -o encode_one_v2(table,col2)


16 ms ± 668 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [18]:
# v1 on `country` (174 distinct values).
timing_one_v1_col3 = %timeit -o encode_one_v1(table,col3)


45.8 ms ± 2.84 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [19]:
# v2 on `country` (174 distinct values).
timing_one_v2_col3 = %timeit -o encode_one_v2(table,col3)

46.5 ms ± 7.73 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [20]:
# v1 on `city` (7299 distinct values) - the highest-cardinality column benchmarked.
timing_one_v1_col4 = %timeit -o encode_one_v1(table,col4)

1.85 s ± 101 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
# v2 on `city` (7299 distinct values) - the highest-cardinality column benchmarked.
timing_one_v2_col4 = %timeit -o encode_one_v2(table,col4)

1.92 s ± 53.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
# v1 over all four columns in `cols`, each dummy frame concatenated onto the table.
timing_more_v1 = %timeit -o encode_more_v1(table, cols)

1.95 s ± 53.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
# v2 over all four columns in `cols`, each dummy frame concatenated onto the table.
timing_more_v2= %timeit -o encode_more_v2(table, cols)

2.01 s ± 88.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [27]:
# Collected %timeit results: one row per implementation, one column per benchmark.
# Within every list, index 0 is the v1 timing and index 1 the v2 timing.
data = {
    "name": ["Francesco", "StackOverflow"],
    # Single-column encodings; the key prefix is the column's distinct-value count.
    "15_val": [timing_one_v1_col1, timing_one_v2_col1],
    "40_val": [timing_one_v1_col2, timing_one_v2_col2],
    "174_val": [timing_one_v1_col3, timing_one_v2_col3],
    "7299_val": [timing_one_v1_col4, timing_one_v2_col4],
    # All four columns in one call (15 + 40 + 174 + 7299 = 7528 distinct values).
    "4col_7528val": [timing_more_v1, timing_more_v2],
}

In [28]:
import polars as pl

In [29]:
# Show the timings side by side. The TimeitResult objects are stored as-is,
# which is why every timing column has the `object` dtype.
pl.DataFrame(data)

name,15_val,40_val,174_val,7299_val,4col_7528val
str,object,object,object,object,object
"""Francesco""","6.62 ms ± 477 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)","16.2 ms ± 598 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)","45.8 ms ± 2.84 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)","1.85 s ± 101 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)","1.95 s ± 53.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)"
"""StackOverflow""","6.45 ms ± 266 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)","16 ms ± 668 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)","46.5 ms ± 7.73 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)","1.92 s ± 53.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)","2.01 s ± 88.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)"
