# Data type optimization and parquet storage

- Data types automatically chosen by `pandas.read_csv()` may not always be optimal.
  - leading zeros in ZIP codes are lost when the column is parsed as a number
  - default 64-bit types use 8 bytes per value where 1 byte would suffice
- String columns use a lot of memory; convert them to categoricals when the number of unique values is small relative to the number of observations.
- Parquet storage format preserves dtype information and enables partitioning.

# Data types

`pandas` uses NumPy data types internally.

### Bits and bytes

### Integers

### Floats

### Missing values

### Strings and other objects

### Date and time

### Categoricals

### Experimental nullable dtypes

# Parquet

- Binary data: data type is preserved
- Columnar storage: efficient reading of subset of columns and dtype-specific compression
- Partitioning: only read chunks that satisfy a given condition
  - Every partition adds metadata overhead. With too many partitions, this can incur a significant performance cost. For example, if SynIG is partitioned by YEAR, STATE and SECTOR (about 17,000 partitions), it becomes much slower.

In [None]:
import pandas as pd
import fastparquet

from tools import ResourceMonitor, state_00_aa
from time import sleep


## Convert SynIG from CSV to parquet

In [None]:
# Category set for the SECTOR column
# (presumably 2-digit NAICS sector codes — confirm against the data source).
sectors = '11 21 22 23 31 42 44 48 51 52 53 54 55 56 61 62 71 72 81 92 99'.split()
# Category set for the STATE column: the values of the `state_00_aa`
# mapping imported from `tools`, in the mapping's own order.
states = [*state_00_aa.values()]

def convert_synig_dtypes(df):
    """Convert known SynIG columns of `df` to compact dtypes, in place.

    Every column is converted only if it is present, so the function also
    works on frames that were loaded with a subset of columns.  Columns
    not listed below are left untouched.  Returns None.

    - STATE, SECTOR: unordered categoricals over the module-level
      `states` and `sectors` lists.
    - EMPLOYEES_CODE: ordered categorical over the letters A..K.
    - EMPLOYEES, LONGITUDE, LATITUDE: float64.

    Values that are not in a column's category list become missing (NaN),
    as documented for `pandas.Categorical`.
    """
    if 'STATE' in df:
        df['STATE'] = pd.Categorical(df['STATE'], categories=states)
    if 'SECTOR' in df:
        df['SECTOR'] = pd.Categorical(df['SECTOR'], categories=sectors)
    if 'EMPLOYEES_CODE' in df:
        code_levels = list('ABCDEFGHIJK')
        df['EMPLOYEES_CODE'] = pd.Categorical(
            df['EMPLOYEES_CODE'], categories=code_levels, ordered=True)

    float_columns = [name for name in ('EMPLOYEES', 'LONGITUDE', 'LATITUDE')
                     if name in df]
    for name in float_columns:
        df[name] = df[name].astype('float64')

In [None]:
%%time

# Convert the first five yearly CSV files (2001-2005) to parquet, writing
# one hive-style YEAR=<year> directory per input file.
years = range(2001, 2021)[:5]
paths = []
for year in years:
    print(year, end=' ')
    # Read every column as a string; dtypes are then set explicitly by
    # convert_synig_dtypes() instead of being inferred by read_csv().
    df = pd.read_csv(f'data/synig/{year}.csv', dtype=str)
    # The year is carried by the directory name, so the column is removed.
    del df['YEAR']
    convert_synig_dtypes(df)
    path = f'data/synig.pq/YEAR={year}'
    fastparquet.write(path, df, partition_on=['STATE'], file_scheme='hive',
                      write_index=False)
    paths.append(path)
# Merge the per-year outputs into a single dataset
# (see fastparquet.writer.merge for what exactly is written).
pf = fastparquet.writer.merge(paths)
print()

## Compare performance

### Read one year

In [None]:
# Monitor shared by the two readers below, which tag their phases on it.
# NOTE(review): presumably samples resource usage every `interval` seconds —
# confirm in tools.ResourceMonitor.
mon = ResourceMonitor(interval=0.3)
def read_csv():
    """Load the 2001 CSV file as strings, then convert its dtypes.

    Loading and conversion are tagged as separate phases on the
    module-level monitor `mon`.  The resulting frame is discarded.
    """
    mon.tag('read csv')
    frame = pd.read_csv('data/synig/2001.csv', dtype=str)

    mon.tag('convert')
    convert_synig_dtypes(frame)
def read_pq():
    """Load the rows with YEAR == 2001 from the parquet dataset.

    The read is tagged on the module-level monitor `mon`.  The resulting
    frame is discarded.
    """
    mon.tag('read pq')
    year_filter = [('YEAR', '==', 2001)]
    pd.read_parquet('data/synig.pq', filters=year_filter)

# Run both readers under the monitor, with one-second idle pauses before,
# between and after them (presumably to separate the phases in the plot).
mon.start()
try:
    sleep(1)
    read_csv()
    sleep(1)
    read_pq()
    sleep(1)
finally:
    # Stop the monitor even when a reader raises (e.g. a missing data
    # file), so it is not left running in the notebook kernel.
    mon.stop()
mon.plot()

### Read one state

Only a subset of columns is read.

In [None]:
# Settings shared by both readers of the one-state benchmark.
state = 'WI'
cols = ['YEAR', 'STATE', 'SECTOR', 'EMPLOYEES', 'NAICS']
years = range(2001, 2021)[:5]  # 2001-2005
# Fresh monitor so that this benchmark is recorded separately.
mon = ResourceMonitor(interval=0.3)

def read_csv():
    """Load the rows of one state from the yearly CSV files.

    For every year in the module-level `years`, the columns listed in
    `cols` are read as strings, converted with convert_synig_dtypes() and
    filtered to `state`; the yearly pieces are then concatenated.  The
    shape of the result is printed and the frame is discarded after a
    one-second pause.
    """
    mon.tag('read csv')
    pieces = []
    for year in years:
        print(year, end=' ')
        yearly = pd.read_csv(f'data/synig/{year}.csv', usecols=cols, dtype=str)
        convert_synig_dtypes(yearly)
        # Rebind to the filtered rows so the full-year frame can be freed
        # before the next file is read.
        yearly = yearly[yearly['STATE'] == state]
        pieces.append(yearly)
    combined = pd.concat(pieces, ignore_index=True)
    # Release the per-year pieces; only the combined frame is held during
    # the final pause.
    del pieces
    print()
    print(combined.shape)
    sleep(1)
    
def read_pq():
    """Load the rows of one state from the parquet dataset.

    Reads only the columns listed in the module-level `cols`, restricted
    by filters to the years in `years` and to `state`.  The shape of the
    result is printed and the frame is discarded after a one-second pause.
    """
    mon.tag('read pq')
    # Filter on the shared `state` setting rather than a literal, so this
    # reader always loads the same state as the CSV reader it is compared to.
    df = pd.read_parquet('data/synig.pq', columns=cols,
                         filters=[('YEAR', 'in', years), ('STATE', '==', state)])
    print(df.shape)
    sleep(1)

# Run both readers under the monitor.  Each reader ends with its own
# one-second pause, so no extra pause is added after the last one.
mon.start()
try:
    sleep(1)
    read_csv()
    sleep(1)
    read_pq()
finally:
    # Stop the monitor even when a reader raises (e.g. a missing data
    # file), so it is not left running in the notebook kernel.
    mon.stop()
mon.plot()