In [1]:
import pandas as pd
from zipfile import ZipFile, ZIP_DEFLATED
from io import TextIOWrapper

# Create a small text file on disk.
with open("hello.txt", "w") as f:
    f.write("hello world")

# Build example.zip with three deflate-compressed members.
with ZipFile("example.zip", "w", compression=ZIP_DEFLATED) as zf:
    # Members opened through ZipFile.open() are binary streams, so text
    # has to be encoded to bytes before it is written.
    with zf.open("hello.txt", "w") as f:
        f.write("hello world".encode("utf-8"))
    # Highly repetitive content, to show a large compression ratio.
    with zf.open("ha.txt", "w") as f:
        f.write(("ha" * 10000).encode("utf-8"))
    with zf.open("bugs.csv", "w") as f:
        bugs_df = pd.DataFrame(
            [["Mon", 7], ["Tue", 4], ["Wed", 3], ["Thu", 6], ["Fri", 9]],
            columns=["day", "bugs"],
        )
        # Close the text wrapper explicitly so its buffer is flushed into the
        # zip member before the member is finalized, instead of relying on
        # garbage collection of a temporary. encoding is pinned so the result
        # does not depend on the locale, and newline="" disables newline
        # translation so pandas' own line terminator is written unchanged.
        with TextIOWrapper(f, encoding="utf-8", newline="") as tio:
            bugs_df.to_csv(tio, index=False)


In [3]:
# A ZipFile is a context manager, just like a regular file object.
# Open the archive by path, collect the metadata record (ZipInfo) of
# every member, then print one record per line.
with ZipFile("example.zip") as zf:
    entries = zf.infolist()
for info in entries:
    print(info)

<ZipInfo filename='hello.txt' compress_type=deflate filemode='?rw-------' file_size=11 compress_size=13>
<ZipInfo filename='ha.txt' compress_type=deflate filemode='?rw-------' file_size=20000 compress_size=39>
<ZipInfo filename='bugs.csv' compress_type=deflate filemode='?rw-------' file_size=39 compress_size=41>


In [13]:
# Report per-member and overall compression statistics for example.zip.
BYTES_PER_MB = 1024**2  # there are 1024**2 bytes in a MB

with ZipFile('example.zip') as zf:
    total_bytes = 0      # uncompressed size of all members, in bytes
    total_compress = 0   # compressed size of all members, in bytes
    for info in zf.infolist():
        orig_mb = info.file_size / BYTES_PER_MB
        # An entry can report compress_size == 0 (e.g. a directory or an
        # empty stored member); the ratio is undefined there, so show nan
        # instead of raising ZeroDivisionError.
        if info.compress_size:
            ratio = info.file_size / info.compress_size
        else:
            ratio = float("nan")
        s = "file {name:s}, {mb:.3f} MB (uncompressed), {ratio:.1f} compression ratio"
        print(s.format(name=info.filename, mb=orig_mb, ratio=ratio))
        # Accumulate exact integer byte counts; convert to MB only once at
        # the end to avoid a float round trip.
        total_bytes += info.file_size
        total_compress += info.compress_size

total_size = total_bytes / BYTES_PER_MB  # total uncompressed size in MB
# Same guard as above, for an archive whose members compress to 0 bytes in total.
total_ratio = total_bytes / total_compress if total_compress else float("nan")
print(f"total file size: {total_size:.3f} MB, total compressed file size: {total_compress:.1f} B, total compression ratio: {total_ratio:.1f}")

file hello.txt, 0.000 MB (uncompressed), 0.8 compression ratio
file ha.txt, 0.019 MB (uncompressed), 512.8 compression ratio
file bugs.csv, 0.000 MB (uncompressed), 1.0 compression ratio
total file size: 0.019 MB, total compressed file size: 93.0 B, total compression ratio: 215.6


In [16]:
from io import TextIOWrapper

# Text mode vs. binary mode
# The default mode ('r') decodes the file and returns str ...
with open("hello.txt") as f:
    data1 = f.read()

# ... while 'rb' returns the raw bytes.
with open("hello.txt", mode="rb") as f:
    data2 = f.read()

print(type(data1), type(data2))

# Wrapping a binary stream in TextIOWrapper gives text back again.
with open("hello.txt", mode="rb") as f:
    tio = TextIOWrapper(f)
    data3 = tio.read()

print(type(data3))


<class 'str'> <class 'bytes'>
<class 'str'>


In [20]:
# Reading Files
# - For regular files, open() in 'r' mode reads text.
# - For files inside a zip, ZipFile.open() in 'r' mode yields a binary stream.
# - So for zip members, wrap the stream in TextIOWrapper to get text.
with ZipFile('example.zip') as zf:
    with zf.open("hello.txt", "r") as f:
        # The member was written as UTF-8 bytes, so decode it as UTF-8
        # explicitly instead of using the locale-dependent default encoding.
        tio = TextIOWrapper(f, encoding="utf-8")
        print(tio.read())

hello world


In [21]:
# Pandas
# pd.read_csv accepts a binary stream directly, so the zip member can be
# passed to it without wrapping it in TextIOWrapper first.
with ZipFile("example.zip") as zf, zf.open("bugs.csv") as f:
    df = pd.read_csv(f)
df

Unnamed: 0,day,bugs
0,Mon,7
1,Tue,4
2,Wed,3
3,Thu,6
4,Fri,9
