# Pandas 2.0

In [1]:
import pandas as pd
import numpy as np
import polars as pl
# Report the version of every library the notebook relies on, one per line.
print(*(f'{lib.__name__} {lib.__version__}' for lib in (pd, np, pl)), sep='\n')

pandas 2.0.2
numpy 1.24.1
polars 0.18.1


## Pandas Backend
- NumPy / Arrow

In [2]:
# Load the CSV with the default reader settings; the resulting columns use
# NumPy-backed dtypes (int64 / object).
df = pd.read_csv('StateNames.csv')

In [3]:
# Preview the first five rows.
df.head()

Unnamed: 0,Id,Name,Year,Gender,State,Count
0,1,Mary,1910,F,AK,14
1,2,Annie,1910,F,AK,12
2,3,Anna,1910,F,AK,10
3,4,Margaret,1910,F,AK,8
4,5,Helen,1910,F,AK,7


In [4]:
# Summarise the column dtypes and the frame's memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5647426 entries, 0 to 5647425
Data columns (total 6 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   Id      int64 
 1   Name    object
 2   Year    int64 
 3   Gender  object
 4   State   object
 5   Count   int64 
dtypes: int64(3), object(3)
memory usage: 258.5+ MB


In [5]:
# Expose the column's underlying NumPy array. `to_numpy()` is the accessor the
# pandas documentation recommends over the legacy `.values` attribute.
df['Count'].to_numpy()

array([14, 12, 10, ...,  5,  5,  5], dtype=int64)

## Creating Series

In [6]:
# Without an explicit dtype, integers land in a NumPy-backed int64 Series.
pd.Series([1, 2, 3, 4])

0    1
1    2
2    3
3    4
dtype: int64

In [7]:
# Strings fall back to the generic `object` dtype on the NumPy backend.
pd.Series(['food', 'sleep', 'eat'])

0     food
1    sleep
2      eat
dtype: object

## Using the Arrow Backend

In [8]:
# Request Arrow-backed storage by appending "[pyarrow]" to the dtype name.
pd.Series([1, 2, 3, 4], dtype='int64[pyarrow]')

0    1
1    2
2    3
3    4
dtype: int64[pyarrow]

PyArrow is the Python interface to Apache Arrow. It provides efficient, interoperable columnar data structures for handling large datasets, and it works especially well with Arrow-native formats such as Apache Parquet. By using PyArrow-backed data types in pandas, you can speed up many data manipulation and processing operations on large datasets.

In [9]:
# 'string[pyarrow]' selects pandas' StringDtype with PyArrow storage, which is
# why the repr below reports the dtype simply as `string`.
pd.Series(['food', 'sleep', 'eat'], dtype='string[pyarrow]')

0     food
1    sleep
2      eat
dtype: string

In [10]:
# NumPy's int64 has no way to represent a missing value, so the NaN forces the
# whole Series to be upcast to float64.
pd.Series([1, 2, 3, np.nan])

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64

In [11]:
# The Arrow-backed dtype is nullable: the values stay int64 and the NaN is
# stored as a missing value (<NA>).
pd.Series([1, 2, 3, np.nan], dtype='int64[pyarrow]')

0       1
1       2
2       3
3    <NA>
dtype: int64[pyarrow]

## Loading data with the Arrow backend (`engine` and `dtype_backend`)

In [12]:
fname = 'StateNames.csv'
# engine='pyarrow' parses the file with PyArrow's CSV reader;
# dtype_backend='pyarrow' makes the resulting columns Arrow-backed.
df_arrow = pd.read_csv(fname, engine='pyarrow', dtype_backend='pyarrow')

In [13]:
# Same summary as for `df`, now showing the Arrow-backed dtypes.
df_arrow.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5647426 entries, 0 to 5647425
Data columns (total 6 columns):
 #   Column  Dtype          
---  ------  -----          
 0   Id      int64[pyarrow] 
 1   Name    string[pyarrow]
 2   Year    int64[pyarrow] 
 3   Gender  string[pyarrow]
 4   State   string[pyarrow]
 5   Count   int64[pyarrow] 
dtypes: int64[pyarrow](3), string[pyarrow](3)
memory usage: 241.7 MB


In [14]:
# NumPy-backed frame again for a side-by-side comparison. The "+" in its
# memory figure means the object (string) columns are not fully counted, so it
# is a lower bound; `df.info(memory_usage='deep')` would give the real size.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5647426 entries, 0 to 5647425
Data columns (total 6 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   Id      int64 
 1   Name    object
 2   Year    int64 
 3   Gender  object
 4   State   object
 5   Count   int64 
dtypes: int64(3), object(3)
memory usage: 258.5+ MB


## Speed Comparison

In [15]:
# NumPy backend: mean of an int64 column
%timeit df['Count'].mean()

8.61 ms ± 977 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [16]:
# PyArrow backend: mean of an int64[pyarrow] column
%timeit df_arrow['Count'].mean()

7.54 ms ± 1.1 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [17]:
%%timeit
# Reading the CSV with the default engine and NumPy-backed dtypes.
# NOTE: names assigned inside a %%timeit cell are not kept afterwards, so this
# does not replace the notebook-level `df`.
df = pd.read_csv(fname)

4.88 s ± 124 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
%%timeit
# Reading the same CSV with the PyArrow parser and Arrow-backed dtypes.
df_arrow = pd.read_csv(fname, engine='pyarrow', dtype_backend='pyarrow')

674 ms ± 33.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
%%timeit
# String method on the NumPy-backed (object dtype) column.
df['Name'].str.startswith('A')

2.74 s ± 118 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
%%timeit
# Same string method on the Arrow-backed (string[pyarrow]) column.
df_arrow['Name'].str.startswith('A')

55.4 ms ± 2.73 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [21]:
# polars is already imported as `pl` in the first cell, so it is not imported
# again here; this just records which version the timings below were run on.
print('polars', pl.__version__)

polars 0.18.1


In [22]:
%%timeit
# Convert the NumPy-backed pandas frame to a polars DataFrame.
pl.from_pandas(df)

2.12 s ± 109 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
%%timeit
# Convert the Arrow-backed pandas frame to a polars DataFrame.
polar_df = pl.from_pandas(df_arrow)

281 ms ± 79.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
# Build the polars frame outside of %%timeit so `polar_df` stays available to
# the following cells (names assigned inside a %%timeit cell are discarded).
polar_df = pl.from_pandas(df_arrow)

In [25]:
# Total `Count` per name, computed in polars and exposed as `Totals`.
polars_agg = (
    polar_df
    .groupby('Name')
    .agg(pl.col(['Count']).sum().alias('Totals'))
)

In [26]:
# Convert the polars result to a pandas DataFrame for display. `to_pandas()`
# needs no argument: the original call passed the frame itself positionally,
# and polars forwards extra positional arguments on to PyArrow's own
# `to_pandas`, where a DataFrame is not a meaningful value.
polars_agg.to_pandas()

Unnamed: 0,Name,Totals
0,Curt,12646
1,Kendrick,22651
2,Marnie,3491
3,Doyne,151
4,Sabah,54
...,...,...
30269,Anneth,41
30270,Taysen,13
30271,Anthonyjr,12
30272,Fredna,94


In [27]:
# Write the aggregated table to out.tex as LaTeX, via pandas' `to_latex`.
polars_agg.to_pandas().to_latex('out.tex')

## Polars vs pandas (NumPy backend) vs pandas (Arrow backend)

In [28]:
%%timeit
# pandas groupby-sum on the NumPy-backed frame.
_= df.groupby('Name')['Count'].sum()

1.18 s ± 348 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
%%timeit
# Same groupby-sum on the Arrow-backed frame. In the run recorded below it is
# slower than the NumPy-backed version (5.98 s vs 1.18 s).
a = df_arrow.groupby('Name')['Count'].sum()

5.98 s ± 166 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## PyArrow data types

In [30]:
import datetime
import pyarrow
import pandas


# Small demo frame whose columns all use Arrow-backed dtypes, including a
# list-of-strings column and a date-only column.
articles = pandas.DataFrame({
    # Arrow-backed strings.
    'title': pandas.Series(['pandas 2.0 and the Arrow revolution',
                            'What I did this weekend'],
                           dtype='string[pyarrow]'),
    # Each row holds a variable-length list of strings; the dtype is built
    # explicitly by wrapping the PyArrow list type in pandas.ArrowDtype.
    'tags': pandas.Series([['pandas', 'arrow', 'data'],
                           ['scuba-diving', 'rock-climbing']],
                          dtype=pandas.ArrowDtype(pyarrow.list_(pyarrow.string()))),
    # Calendar dates (no time component) stored as Arrow date32.
    'date': pandas.Series([datetime.date(2023, 2, 22),
                           datetime.date(2022, 11, 3)],
                          dtype='date32[pyarrow]')
})

In [31]:
# Display the demo frame.
articles

Unnamed: 0,title,tags,date
0,pandas 2.0 and the Arrow revolution,['pandas' 'arrow' 'data'],2023-02-22
1,What I did this weekend,['scuba-diving' 'rock-climbing'],2022-11-03


In [32]:
# Convert the date32[pyarrow] column to pandas' datetime64[ns] dtype.
pandas.to_datetime(articles['date'])

0   2023-02-22
1   2022-11-03
Name: date, dtype: datetime64[ns]

In [33]:
# The converted Series supports the `.dt` accessors, e.g. day of the year.
pandas.to_datetime(articles['date']).dt.dayofyear

0     53
1    307
Name: date, dtype: int32