# Saving and Serialising a dataframe


In [41]:
import numpy as np
import pandas as pd

In [42]:
# Build a 100,000 x 4 frame of uniform random floats in [0, 1) that the
# following cells write out in several file formats.
df = pd.DataFrame(
    data=np.random.random((100_000, 4)),
    columns=list("ABCD"),
)
df.head()

Unnamed: 0,A,B,C,D
0,0.988132,0.847373,0.459506,0.731611
1,0.382254,0.851524,0.540134,0.21491
2,0.231858,0.294461,0.841481,0.926529
3,0.719755,0.035272,0.178269,0.28616
4,0.669532,0.539739,0.33335,0.800274


In [43]:
# Write the frame as CSV without the index column. float_format rounds every
# float to four decimal places, so the file is a lossy copy of df.
df.to_csv(
    "save.csv",
    float_format="%0.4f",
    index=False,
)

In [44]:
# Serialise the whole frame with Python's pickle protocol (binary, not human readable).
df.to_pickle("save.pkl")

In [45]:
# HDF5 support needs PyTables: pip install tables
# format="table" stores a PyTables Table structure instead of the default
# "fixed" layout.
df.to_hdf(
    "save.hdf",
    format="table",
    key="data",
)

In [46]:
# Feather I/O in pandas is backed by pyarrow: pip install pyarrow
# (the older feather-format package is only a thin wrapper around pyarrow).
df.to_feather("save.fth")

In [47]:
# To see how long each cell takes to run, enable the ExecuteTime notebook extension:
# https://jupyter-contrib-nbextensions.readthedocs.io/en/latest/nbextensions/execute_time/readme.html

Once strings and categorical data are added, things can slow down a lot! Let's try this on the mixed-type Astronaut data from Kaggle: https://www.kaggle.com/nasa/astronaut-yearbook

In [48]:
# Rebind df to the astronaut table read from the local astronauts.csv; every
# later cell saves and reloads this frame instead of the random one.
df = pd.read_csv("astronauts.csv")
df.head()

Unnamed: 0,Name,Year,Group,Status,Birth Date,Birth Place,Gender,Alma Mater,Undergraduate Major,Graduate Major,Military Rank,Military Branch,Space Flights,Space Flight (hr),Space Walks,Space Walks (hr),Missions,Death Date,Death Mission
0,Joseph M. Acaba,2004.0,19.0,Active,5/17/1967,"Inglewood, CA",Male,University of California-Santa Barbara; Univer...,Geology,Geology,,,2,3307,2,13.0,"STS-119 (Discovery), ISS-31/32 (Soyuz)",,
1,Loren W. Acton,,,Retired,3/7/1936,"Lewiston, MT",Male,Montana State University; University of Colorado,Engineering Physics,Solar Physics,,,1,190,0,0.0,STS 51-F (Challenger),,
2,James C. Adamson,1984.0,10.0,Retired,3/3/1946,"Warsaw, NY",Male,US Military Academy; Princeton University,Engineering,Aerospace Engineering,Colonel,US Army (Retired),2,334,0,0.0,"STS-28 (Columbia), STS-43 (Atlantis)",,
3,Thomas D. Akers,1987.0,12.0,Retired,5/20/1951,"St. Louis, MO",Male,University of Missouri-Rolla,Applied Mathematics,Applied Mathematics,Colonel,US Air Force (Retired),4,814,4,29.0,"STS-41 (Discovery), STS-49 (Endeavor), STS-61 ...",,
4,Buzz Aldrin,1963.0,3.0,Retired,1/20/1930,"Montclair, NJ",Male,US Military Academy; MIT,Mechanical Engineering,Astronautics,Colonel,US Air Force (Retired),2,289,2,8.0,"Gemini 12, Apollo 11",,


In [49]:
# Same CSV settings as used for the random frame: no index column, and float
# columns written with four decimal places.
df.to_csv(
    "save.csv",
    float_format="%0.4f",
    index=False,
)

In [50]:
# Read the CSV back in; the trailing semicolon keeps the returned frame from being displayed.
pd.read_csv("save.csv");

In [51]:
# Pickle the astronaut frame, overwriting the save.pkl written for the random frame.
df.to_pickle("save.pkl")

In [52]:
# Load the pickle back (output suppressed by the trailing semicolon). This is safe here
# because the notebook wrote the file itself; never unpickle data from an untrusted source.
pd.read_pickle("save.pkl");

In [53]:
# Start from a fresh file instead of appending: to_hdf defaults to mode="a", and
# HDF5 does not give back the space taken by the 100,000-row frame written to
# this same file earlier, which would inflate the size seen in the directory
# listing. format="table" stores a PyTables Table structure.
df.to_hdf("save.hdf", key="data", format="table", mode="w")

In [54]:
# Read the HDF5 file back (output suppressed by the trailing semicolon). No key is
# passed, which pandas allows when the file holds a single object.
pd.read_hdf("save.hdf");

In [55]:
# Feather I/O needs pyarrow: pip install pyarrow
df.to_feather("save.fth")

In [56]:
# Read the Feather file back in; the trailing semicolon suppresses the displayed frame.
pd.read_feather("save.fth");

In [57]:
# Writing the legacy .xls format relies on the xlwt package; it is not installed
# here, so this call fails with the ModuleNotFoundError shown in the output.
df.to_excel("save.xls")

  df.to_excel("save.xls")


ModuleNotFoundError: No module named 'xlwt'

In [58]:
# `pip install xlwt` would make the .xls write work on older pandas (the xlwt
# writer was removed in pandas 2.0). Writing .xlsx instead avoids xlwt
# altogether: pandas uses openpyxl or xlsxwriter for that format.
df.to_excel("save.xlsx")

In [59]:
# List the working directory so the sizes of the files written above can be compared.
%ls

 Volume in drive C is Windows-SSD
 Volume Serial Number is BE4F-719D

 Directory of C:\Development\PYTHON\Pandas\00-Data Manipulation in Python Pandas\Dataset Basics

07/05/2023  04:35 PM    <DIR>          .
07/05/2023  03:47 PM    <DIR>          ..
07/05/2023  04:04 PM    <DIR>          .ipynb_checkpoints
07/05/2023  03:48 PM            64,887 1_Loading.ipynb
07/05/2023  03:48 PM             7,060 1_Read HDF5 Files with Pandas.ipynb
07/05/2023  03:49 PM            37,981 2_NumpyVPandas.ipynb
07/05/2023  04:14 PM            12,781 3_CreatingDataFrames.ipynb
07/05/2023  04:35 PM            26,840 4_SavingAndSerialising.ipynb
07/04/2023  05:19 PM            29,596 5_Inspecting.ipynb
07/04/2023  05:19 PM            81,593 astronauts.csv
07/05/2023  12:39 PM           244,968 hdf_file.h5
07/04/2023  05:19 PM            11,328 heart.csv
07/05/2023  12:39 PM           722,712 heart.h5
07/04/2023  05:19 PM            35,216 heart.pkl
07/05/2023  03:48 PM           933,807 heart_HDF5.h5
07/05/

In [61]:
# In terms of file size, HDF5 is the largest for this example.
# Everything else is approximately equal. For small data sizes, CSV is often the easiest as it's human readable.
# HDF5 is great for *loading* huge amounts of data quickly. Pickle is faster than CSV, but not human readable.