In [1]:
import random

import numpy as numpy
import pandas as pd
from pylab import mpl, plt
import seaborn as sns
# Notebook-wide plotting configuration.
# Select seaborn's "whitegrid" style for subsequent figures.
sns.set(style="whitegrid") 
# Use a serif font family for all matplotlib text.
mpl.rcParams['font.family'] = 'serif'
# Render matplotlib figures inline in the cell output.
%matplotlib inline

### Writing Objects to Disk

In [2]:
import pickle
import numpy as np 
from random import gauss

In [3]:
# Build the sample that is pickled below: one million draws from a normal
# distribution with mean 1.5 and standard deviation 2.
# Seed the module-level generator first so that a fresh kernel
# (Restart & Run All) reproduces exactly the same list.
random.seed(42)
a = [gauss(1.5, 2) for _ in range(1_000_000)]

In [4]:
import os
# Current working directory of the kernel, as returned by os.getcwd().
curDir = os.getcwd()

In [5]:
# Directory prefix used to build the pickle file name further down.
# The later cells build the file name with plain string concatenation
# (path + 'data.pkl'), so the prefix must end with a path separator;
# os.path.join(dir, '') appends one (and leaves a root directory untouched).
# Without it the file would land next to the working directory, named
# "<cwd>data.pkl", instead of inside it.
path = os.path.join(curDir, '')

In [6]:
# Open data.pkl for binary writing ('wb' truncates any existing file).
# The name is built by plain string concatenation, so `path` has to end
# with a path separator for the file to be created inside that directory.
# The handle stays open across the next cells and is closed explicitly later.
pkl_file = open(path + 'data.pkl', 'wb')

In [7]:
# Serialize the list `a` into the open file and report how long it takes.
%time pickle.dump(a, pkl_file)

CPU times: user 9.16 ms, sys: 4.8 ms, total: 14 ms
Wall time: 19.3 ms


In [8]:
# Close the write handle so buffered data is flushed to disk before reading.
pkl_file.close()

In [9]:
# Reopen the same file, this time in binary read mode.
pkl_file = open(path + 'data.pkl', 'rb')

In [10]:
%time b = pickle.load(pkl_file)

CPU times: user 10.8 ms, sys: 5.7 ms, total: 16.5 ms
Wall time: 15.9 ms


In [11]:
# First three elements of the original list.
a[:3]

[1.6158467244080117, -2.6119968073065074, 2.2408710203159683]

In [12]:
# First three elements of the object read back from the pickle file.
b[:3]

[1.6158467244080117, -2.6119968073065074, 2.2408710203159683]

In [13]:
# Round-trip check: convert both sequences to arrays and compare them
# element-wise (np.allclose compares within a floating-point tolerance).
np.allclose(np.array(a), np.array(b))

True

### Storing and retrieving two objects

In [14]:
# Reopen data.pkl for binary writing; 'wb' discards the previous contents.
pkl_file = open(path + 'data.pkl', 'wb')

In [15]:
# First object written to the file: the list `a` converted to an ndarray.
%time pickle.dump(np.array(a), pkl_file)

CPU times: user 23 ms, sys: 3.52 ms, total: 26.5 ms
Wall time: 25.6 ms


In [16]:
# Second object written to the same open file: the element-wise square
# of the array built from `a`.
%time pickle.dump(np.array(a)  ** 2, pkl_file)

CPU times: user 23.5 ms, sys: 6 ms, total: 29.5 ms
Wall time: 34.7 ms


In [17]:
# Close the write handle now that both objects have been dumped.
pkl_file.close()

In [18]:
# Open the file for binary reading; the two ndarray objects written above
# are read back into memory by the successive pickle.load calls that follow.
pkl_file = open(path + 'data.pkl', 'rb')

In [19]:
# The first load call returns the first object that was dumped to the file.
x = pickle.load(pkl_file)
# Show the first four elements.
x[:4]

array([ 1.61584672, -2.61199681,  2.24087102,  0.97300347])

In [20]:
# A second load on the same handle returns the next object in the file.
y = pickle.load(pkl_file)
# Show the first four elements.
y[:4]

array([2.61096064, 6.82252732, 5.02150293, 0.94673575])

In [21]:
# Both objects have been read; close the read handle.
pkl_file.close()

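`pickle` stores objects on a first-in, first-out (FIFO) basis: objects are read back in the same order in which they were written. The one major problem with this is that there is no meta-information available to the user beforehand about what the file contains. A helpful workaround is to not store single objects, but a `dict` object containing all the other objects.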

In [22]:
# Store both arrays in a single dict so the file is self-describing:
# the keys tell the reader what each stored object is.
# The context manager closes the file even if pickle.dump raises.
with open(path + 'data.pkl', 'wb') as pkl_file:
    pickle.dump({'x': x, 'y': y}, pkl_file)

In [23]:
# Read the dict back; the context manager guarantees the handle is closed
# even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code, so only use it on files
# you trust (here: the file written by this notebook).
with open(path + 'data.pkl', 'rb') as pkl_file:
    data = pickle.load(pkl_file)

# Print each stored name together with the first four elements of its value.
for key, value in data.items():
    print(key, value[:4])

x [ 1.61584672 -2.61199681  2.24087102  0.97300347]
y [2.61096064 6.82252732 5.02150293 0.94673575]


### Reading and Writing Text Files

In [24]:
# Number of rows in the sample data set.
rows = 5000
# Seed NumPy's global generator so the sample is identical on every
# Restart & Run All.
np.random.seed(42)
# rows x 5 matrix of standard-normal draws, rounded to four decimals.
a = np.random.standard_normal((rows, 5)).round(4)

In [25]:
# Display the array (NumPy abbreviates large arrays in the repr).
a

array([[-1.1516,  0.4849,  1.1392,  1.8825, -0.3436],
       [ 0.482 , -0.6477, -0.6776, -0.5553, -0.7812],
       [-0.2236, -1.2459, -2.5698, -1.2417, -0.4184],
       ...,
       [ 0.5211,  0.5266,  1.4101,  1.8913,  0.4282],
       [ 1.0018,  0.8307,  2.2326,  2.795 ,  0.3055],
       [-0.2651,  2.2501,  0.6635,  0.7437, -0.7298]])

In [26]:
# Hourly timestamps, one per data row, starting at 2024-01-01 00:00.
# 'h' is the current alias for hourly frequency; the uppercase 'H' spelling
# is deprecated as of pandas 2.2.
t = pd.date_range(start='2024/1/1', periods=rows, freq='h')

In [27]:
# Display the generated DatetimeIndex.
t

DatetimeIndex(['2024-01-01 00:00:00', '2024-01-01 01:00:00',
               '2024-01-01 02:00:00', '2024-01-01 03:00:00',
               '2024-01-01 04:00:00', '2024-01-01 05:00:00',
               '2024-01-01 06:00:00', '2024-01-01 07:00:00',
               '2024-01-01 08:00:00', '2024-01-01 09:00:00',
               ...
               '2024-07-26 22:00:00', '2024-07-26 23:00:00',
               '2024-07-27 00:00:00', '2024-07-27 01:00:00',
               '2024-07-27 02:00:00', '2024-07-27 03:00:00',
               '2024-07-27 04:00:00', '2024-07-27 05:00:00',
               '2024-07-27 06:00:00', '2024-07-27 07:00:00'],
              dtype='datetime64[ns]', length=5000, freq='H')