# III. Binary File Formats

## 1. pickle
The `pickle` module implements binary protocols for serializing and de-serializing a Python object structure. Only Python can properly read and write pickle files

In [1]:
# Let's create a data frame first
import numpy as np
import pandas as pd

values = np.array([
    [100, 80, 95, 'A'],
    [55, 60, 45, 'F'],
    [70, 75, 90, 'A'],
    [75, 70, 60, 'D'],
    [60, 73, 75, 'C'],
    [72, 63, -1, 'NA']
])
df = pd.DataFrame(values,
                   columns=['Midterm', 'Project', 'Final', 'LetterGrade'],
                   index=['Alex', 'Bob', 'Chris', 'Doug', 'Eva', "Frank"])
df

Unnamed: 0,Midterm,Project,Final,LetterGrade
Alex,100,80,95,A
Bob,55,60,45,F
Chris,70,75,90,A
Doug,75,70,60,D
Eva,60,73,75,C
Frank,72,63,-1,


In [3]:
# Save as a .pickle file
df.to_pickle("data/data.pickle")

In [5]:
# Remember that a data frame can also be saved as a csv file
# The csv file will take more disk space than the pickle file
df.to_csv("data/data.csv")

In [6]:
# Load the pickle file
df_pickle = pd.read_pickle("data/data.pickle")
df_pickle

Unnamed: 0,Midterm,Project,Final,LetterGrade
Alex,100,80,95,A
Bob,55,60,45,F
Chris,70,75,90,A
Doug,75,70,60,D
Eva,60,73,75,C
Frank,72,63,-1,


In [8]:
# Besides data frames, we can put other objects in a pickle file.
a = 5
b = 2.3
c = True

import pickle
with open("data/vars.pickle", "wb") as file:
    pickle.dump(a, file)
    pickle.dump(b, file)
    pickle.dump(c, file)

In [12]:
with open("data/vars.pickle", "rb") as file:
    b = pickle.load(file)
    a = pickle.load(file)
    c = pickle.load(file)
print(a, b, c)

2.3 5 True


## 2. HDF5
The "HDF" stands for "hierarchical data format". HDF5 can be a good choice for working with very large datasets that don't fit into memory, as you can efficiently read and write small sections of large arrays.

In [13]:
df = pd.DataFrame({
    'Col1': np.random.randn(100),
    'Col2': np.random.randn(100)
})
df.head(5)

Unnamed: 0,Col1,Col2
0,-0.253726,0.862323
1,-0.695217,-0.488114
2,-0.142903,0.934577
3,-1.167336,-0.973305
4,1.168855,0.416972


In [14]:
# The PyTable package may require update
!pip3 install --upgrade tables

Collecting tables
  Downloading tables-3.8.0-cp39-cp39-win_amd64.whl (3.6 MB)
     ---------------------------------------- 3.6/3.6 MB 22.9 MB/s eta 0:00:00
Collecting blosc2~=2.0.0
  Downloading blosc2-2.0.0-cp39-cp39-win_amd64.whl (2.0 MB)
     ---------------------------------------- 2.0/2.0 MB 25.0 MB/s eta 0:00:00
Collecting py-cpuinfo
  Downloading py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)
Installing collected packages: py-cpuinfo, blosc2, tables
  Attempting uninstall: tables
    Found existing installation: tables 3.6.1
    Uninstalling tables-3.6.1:
      Successfully uninstalled tables-3.6.1
Successfully installed blosc2-2.0.0 py-cpuinfo-9.0.0 tables-3.8.0


In [15]:
df.to_hdf('data.h5', 'obj1', format='table')

In [16]:
df_hdf5 = pd.read_hdf('data.h5', 'obj1', where=['index < 3'])
df_hdf5

Unnamed: 0,Col1,Col2
0,-0.253726,0.862323
1,-0.695217,-0.488114
2,-0.142903,0.934577


## 3. feather
The feather format is adapted from the R statistical language. It has extremely high read and write performance.

In [17]:
!pip install -U pyarrow

Collecting pyarrow
  Downloading pyarrow-11.0.0-cp39-cp39-win_amd64.whl (20.6 MB)
     --------------------------------------- 20.6/20.6 MB 23.3 MB/s eta 0:00:00
Installing collected packages: pyarrow
Successfully installed pyarrow-11.0.0


In [23]:
import time
start = time.time()
# df.to_feather('data/data.feather')
# df.to_pickle('data/data2.pickle')
# df.to_csv('data/data.csv')
end = time.time()
print("Time cost:", (end - start))

Time cost: 0.0061986446380615234


In [27]:
import time
start = time.time()
# df_feather = pd.read_feather('data/data.feather')
# df_pickle = pd.read_pickle('data/data2.pickle')
df_csv = pd.read_csv('data/data.csv')
end = time.time()
print("Time cost:", (end - start))
# df_feather

Time cost: 0.02503204345703125


# III. Interacting with Databases
In a business setting, most data may not be stored in text or binary files. SQL-based relational databases (such as mySQL) are in wide use.

Python has sqlite3 package to interact with databases, and Pandas has some functions to simplify the process.

In [28]:
# Create a SQLite database
import sqlite3
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
 c REAL,        d INTEGER
);"""
con = sqlite3.connect('data.sqlite')
con.execute(query)
con.commit()

In [28]:
# query = """
# DROP TABLE test
# """
# con.execute(query)
# con.commit()

In [29]:
# Insert a few rows of data
data = [('Atlanta', 'Georgia', 1.25, 6),
        ('Tallahassee', 'Florida', 2.6, 3),
        ('Sacramento', 'California', 1.7, 5)]
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
con.executemany(stmt, data)
con.commit()

In [30]:
# Select data
cursor = con.execute('select * from test')
rows = cursor.fetchall()
rows

[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]

In [31]:
# Retrieve columns names
cursor.description

(('a', None, None, None, None, None, None),
 ('b', None, None, None, None, None, None),
 ('c', None, None, None, None, None, None),
 ('d', None, None, None, None, None, None))

In [32]:
# Extract column names from description

# Solution 1: use a loop
desc = cursor.description
cols = []
for elt in desc:
#     print(elt[0])
    cols.append(elt[0])
print(cols)

['a', 'b', 'c', 'd']


In [33]:
cols = [elt[0] for elt in cursor.description]
print(cols)

['a', 'b', 'c', 'd']


In [34]:
# Create a pandas data frame
columns = [x[0] for x in cursor.description]
df = pd.DataFrame(rows, columns=columns)
df

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5


Use package `sqlalchemy` to create a data frame directly from a database.

In [35]:
!pip install sqlalchemy



In [36]:
import sqlalchemy as sqla
db = sqla.create_engine('sqlite:///data.sqlite')
df = pd.read_sql('select * from test', db)
df

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5
