### Лекция 5 - Работа с файлами и файловой системой в Python

- работа с файлами
- модули os, shutil
- модули glob, pathlib
- сериализация: pickle, JSON
- ZIP-архивация, модуль zipfile

In [1]:
# Open a text file for writing; open() returns a file object.
file = open(r'data\temp.txt', 'wt')

# ... do smth. with opened file

# The file is closed explicitly here; nothing else in this cell closes it.
file.close()

# List the attributes and methods of the file object.
dir(file)

['_CHUNK_SIZE',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__iter__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__next__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '_checkClosed',
 '_checkReadable',
 '_checkSeekable',
 '_checkWritable',
 '_finalizing',
 'buffer',
 'close',
 'closed',
 'detach',
 'encoding',
 'errors',
 'fileno',
 'flush',
 'isatty',
 'line_buffering',
 'mode',
 'name',
 'newlines',
 'read',
 'readable',
 'readline',
 'readlines',
 'seek',
 'seekable',
 'tell',
 'truncate',
 'writable',
 'write',
 'writelines']

In [2]:
file.closed

True

In [3]:
file.encoding

'cp1251'

In [4]:
file.name

'data\\temp.txt'

In [5]:
file.mode

'wt'

In [6]:
help(open)

Help on built-in function open in module io:

open(file, mode='r', buffering=-1, encoding=None, errors=None, newline=None, closefd=True, opener=None)
    Open file and return a stream.  Raise IOError upon failure.
    
    file is either a text or byte string giving the name (and the path
    if the file isn't in the current working directory) of the file to
    be opened or an integer file descriptor of the file to be
    wrapped. (If a file descriptor is given, it is closed when the
    returned I/O object is closed, unless closefd is set to False.)
    
    mode is an optional string that specifies the mode in which the file
    is opened. It defaults to 'r' which means open for reading in text
    mode.  Other common values are 'w' for writing (truncating the file if
    it already exists), 'x' for creating and writing to a new file, and
    'a' for appending (which on some Unix systems, means that all writes
    append to the end of the file regardless of the current seek position

In [7]:
s = ['Hello ', 'World\n', '(c) Admin\n']

# Write a list of strings to the text file demo.txt.
# f starts as None so that the finally clause can tell whether open()
# succeeded; without it a failed open() would raise NameError in finally.
f = None
try:
    f = open(r'data\demo.txt', 'wt')
    f.writelines(s)
except IOError as err:
    print(err)
except Exception:
    # Exception (not a bare except) keeps Ctrl+C / SystemExit working.
    print('Something\'s gone wrong...')
else:
    print('File\'s been updated succesfully')
finally:
    if f is not None:
        f.close()

File's been updated succesfully


In [8]:
# The point of the with statement is a shorter way to write:
#
# -- setup --
# try:
#     -- do_smth --
# finally:
#     -- tear down --
#

# Read the lines of the file (using the with statement)
try:
    with open(r'data\demo.txt', 'rt') as f:
        lines = f.readlines()
except IOError as err:
    print(err)
    lines = []

# Print the lines read from the file in "raw" form
print('Read from file:', lines)


# 1 - strip the trailing \n from every line with rstrip and map
# (rstrip() without arguments removes any trailing whitespace, not only \n)
newlines = list(map(lambda x: x.rstrip(), lines))
# 2 - strip the trailing \n from every line with rstrip and a list comprehension
newlines = [x.rstrip() for x in lines]

print('After post-processing:', newlines)

Read from file: ['Hello World\n', '(c) Admin\n']
After post-processing: ['Hello World', '(c) Admin']


In [9]:
# Random access inside a file. The with block closes the file even if
# seek/read/tell raises; the name `file` stays bound to the closed object.
with open(r'data\demo.txt', 'r+') as file:
    # Jump to offset 6 and read up to 9 characters from there.
    file.seek(6)
    word = file.read(9)
    print(word)

    # tell() reports the current position in the file.
    pos = file.tell()
    print('Current file position:', pos)

World
(c)
Current file position: 16


#### Работа с файловой системой и ОС

In [10]:
import os

os.getcwd()

'C:\\Users\\Tim\\Documents\\Python Scripts'

In [11]:
# A quirky way to create an empty file: open it for writing and close it at once
open(r'data\ghost.txt', 'wt').close()

# Rename it
os.rename(r'data\ghost.txt', r'data\to_delete.txt')

# ...and then delete it
os.remove(r'data\to_delete.txt')

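Сторонний модуль **send2trash** (устанавливается отдельно, например через pip) отвечает за безопасное удаление: файлы перемещаются в корзину, а не удаляются безвозвратно: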

```
import send2trash

# Raw string: in a plain literal '\d' is an invalid escape sequence, which
# Python reports with a warning and plans to turn into an error.
send2trash.send2trash(r'data\demo.txt')
```

In [12]:
os.path.exists('What.txt')

False

In [13]:
os.path.isdir('data')

True

In [14]:
os.path.isfile('Useful Resources.ipynb')

True

In [15]:
os.path.join('D:\\', 'Program Files', 'Projects')

'D:\\Program Files\\Projects'

In [16]:
os.path.basename(os.getcwd())

'Python Scripts'

In [17]:
os.path.dirname(os.getcwd())

'C:\\Users\\Tim\\Documents'

In [18]:
'{} bytes'.format(os.path.getsize('Useful Resources.ipynb'))

'28801 bytes'

Еще часть функций из модуля os:
- os.listdir
- os.mkdir
- os.makedirs
- os.system - не рекомендуется; вместо него лучше использовать модуль subprocess (см. следующую лекцию)

In [19]:
os.listdir(os.getcwd())

['.git',
 '.gitignore',
 '.ipynb_checkpoints',
 'algo',
 'data',
 'lec01 - Intro.ipynb',
 'lec02 - Data types.ipynb',
 'lec03 - Modules and packages.ipynb',
 'lec04 - Declarative Python, CSharp, Java.pdf',
 'lec04 - Elements of functional programming.ipynb',
 'lec05 - Working with Files.ipynb',
 'lec06 - Processes and IPC.ipynb',
 'README.md',
 'Useful Resources.ipynb']

In [20]:
# Keep only the directories among the entries of the current directory
folders = list(filter(os.path.isdir, os.listdir(os.getcwd())))
folders

['.git', '.ipynb_checkpoints', 'algo', 'data']

In [21]:
# Directory used as the prefix for the destination paths built below
COPY_DIR = r'D:\PythonCourse'

# Nothing is copied in this cell: every entry name of the current directory
# is only joined with COPY_DIR to form a destination path.
files = os.listdir(os.getcwd())
file_copies = [os.path.join(COPY_DIR, f) for f in files]
file_copies

['D:\\PythonCourse\\.git',
 'D:\\PythonCourse\\.gitignore',
 'D:\\PythonCourse\\.ipynb_checkpoints',
 'D:\\PythonCourse\\algo',
 'D:\\PythonCourse\\data',
 'D:\\PythonCourse\\lec01 - Intro.ipynb',
 'D:\\PythonCourse\\lec02 - Data types.ipynb',
 'D:\\PythonCourse\\lec03 - Modules and packages.ipynb',
 'D:\\PythonCourse\\lec04 - Declarative Python, CSharp, Java.pdf',
 'D:\\PythonCourse\\lec04 - Elements of functional programming.ipynb',
 'D:\\PythonCourse\\lec05 - Working with Files.ipynb',
 'D:\\PythonCourse\\lec06 - Processes and IPC.ipynb',
 'D:\\PythonCourse\\README.md',
 'D:\\PythonCourse\\Useful Resources.ipynb']

In [22]:
# mkdir creates a single directory; makedirs also creates every missing
# intermediate directory of the given path.
try:
    os.mkdir('test1')
    os.makedirs(r'test2\we_need\to_go_deeper')
except OSError as err:
    # Only filesystem errors (e.g. the directory already exists) are
    # reported; any other exception is a real bug and should propagate.
    print(err)

In [23]:
# rmdir removes one empty directory (the last component of the path), so
# test2 and test2\we_need are left in place by this cell.
# To delete the whole test2 tree at once, use shutil.rmtree('test2').
try:
    os.rmdir('test1')
    os.rmdir(r'test2\we_need\to_go_deeper')
except OSError as err:
    # Only filesystem errors (missing or non-empty directory) are reported;
    # any other exception is a real bug and should propagate.
    print(err)

In [24]:
import itertools

folder = os.getcwd()

# os.walk returns an iterator that recursively traverses the directory
dir_iterator = os.walk(folder)

# Show at most the first four steps of the walk. islice just stops early
# when the tree has fewer than four directories, whereas calling next()
# a fixed number of times would raise StopIteration.
for i, (root, dirs, files) in enumerate(itertools.islice(dir_iterator, 4)):
    print('Iteration {}:\n'.format(i+1))

    # Exclude service folders whose names start with a dot. The slice
    # assignment changes the list object that os.walk itself holds, so the
    # excluded folders are not descended into on the following steps.
    dirs[:] = [d for d in dirs if not d.startswith('.')]

    print('Root:', root)
    print('Subfolders:', dirs)
    print('Files:', files)
    print()

Iteration 1:

Root: C:\Users\Tim\Documents\Python Scripts
Subfolders: ['algo', 'data', 'test2']
Files: ['.gitignore', 'lec01 - Intro.ipynb', 'lec02 - Data types.ipynb', 'lec03 - Modules and packages.ipynb', 'lec04 - Declarative Python, CSharp, Java.pdf', 'lec04 - Elements of functional programming.ipynb', 'lec05 - Working with Files.ipynb', 'lec06 - Processes and IPC.ipynb', 'README.md', 'Useful Resources.ipynb']

Iteration 2:

Root: C:\Users\Tim\Documents\Python Scripts\algo
Subfolders: ['__pycache__']
Files: ['search.py', 'sort.py', '__init__.py']

Iteration 3:

Root: C:\Users\Tim\Documents\Python Scripts\algo\__pycache__
Subfolders: []
Files: ['search.cpython-35.pyc', '__init__.cpython-35.pyc']

Iteration 4:

Root: C:\Users\Tim\Documents\Python Scripts\data
Subfolders: []
Files: ['data.json', 'data.pkl', 'demo.txt', 'demo.xml', 'democopy.txt', 'temp.txt']



In [25]:
# A module with even more functions for working with files and the OS
import shutil

# Copy demo.txt to democopy.txt inside the data folder
shutil.copy(r'data\demo.txt', r'data\democopy.txt')

# List everything the module exposes
dir(shutil)

['Error',
 'ExecError',
 'ReadError',
 'RegistryError',
 'SameFileError',
 'SpecialFileError',
 '_ARCHIVE_FORMATS',
 '_BZ2_SUPPORTED',
 '_LZMA_SUPPORTED',
 '_UNPACK_FORMATS',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_basename',
 '_check_unpack_options',
 '_copyxattr',
 '_destinsrc',
 '_ensure_directory',
 '_find_unpack_format',
 '_get_gid',
 '_get_uid',
 '_make_tarball',
 '_make_zipfile',
 '_ntuple_diskusage',
 '_rmtree_safe_fd',
 '_rmtree_unsafe',
 '_samefile',
 '_unpack_tarfile',
 '_unpack_zipfile',
 '_use_fd_functions',
 'chown',
 'collections',
 'copy',
 'copy2',
 'copyfile',
 'copyfileobj',
 'copymode',
 'copystat',
 'copytree',
 'disk_usage',
 'errno',
 'fnmatch',
 'get_archive_formats',
 'get_terminal_size',
 'get_unpack_formats',
 'getgrnam',
 'getpwnam',
 'ignore_patterns',
 'make_archive',
 'move',
 'nt',
 'os',
 'register_archive_format',
 'register_unpack_format',
 'rmtree',
 'stat',
 'sy

In [26]:
shutil.disk_usage('G:')

usage(total=209711706112, used=153112027136, free=56599678976)

In [27]:
def formatted(s, sep=','):
    """Insert *sep* between groups of three characters, counting from the right.

    s   -- string of decimal digits, optionally with a leading '+' or '-'
    sep -- separator placed between the groups (',' by default)

    Returns the grouped string, e.g. formatted('1234567') -> '1,234,567'.
    A leading sign is kept out of the grouping, so '-123' stays '-123'
    instead of turning into '-,123'.
    """
    sign = ''
    if s[:1] in ('-', '+'):
        sign, s = s[0], s[1:]

    # Peel three-character groups off the right end; whatever is left (one to
    # three characters, or nothing for an empty input) is the leading group.
    groups = []
    while len(s) > 3:
        groups.append(s[-3:])
        s = s[:-3]
    groups.append(s)

    return sign + sep.join(reversed(groups))


# disk_usage result unpacks into the total, used and free byte counts
total, used, free = shutil.disk_usage('G:')

# Print every counter with thousands separators
for label, amount in (('Total bytes:', total),
                      ('Used bytes:', used),
                      ('Free bytes:', free)):
    print(label, formatted(str(amount)))

Total bytes: 209,711,706,112
Used bytes: 153,112,027,136
Free bytes: 56,599,678,976


In [28]:
# не шибко разнообразный модуль для поиска
# файлов и директорий по unix-фильтрам
import glob

dir(glob)

['__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_iglob',
 '_ishidden',
 '_isrecursive',
 '_rlistdir',
 'escape',
 'fnmatch',
 'glob',
 'glob0',
 'glob1',
 'glob2',
 'has_magic',
 'iglob',
 'magic_check',
 'magic_check_bytes',
 'os',
 're']

In [29]:
glob.glob('*.ipynb')

# еще варианты:
# glob.glob('?.ipynb')
# glob.glob('**/*.ipynb', recursive=True)
# glob.glob('./**/', recursive=True)

['lec01 - Intro.ipynb',
 'lec02 - Data types.ipynb',
 'lec03 - Modules and packages.ipynb',
 'lec04 - Elements of functional programming.ipynb',
 'lec05 - Working with Files.ipynb',
 'lec06 - Processes and IPC.ipynb',
 'Useful Resources.ipynb']

In [30]:
glob.glob('lec[0-9]*.*')

['lec01 - Intro.ipynb',
 'lec02 - Data types.ipynb',
 'lec03 - Modules and packages.ipynb',
 'lec04 - Declarative Python, CSharp, Java.pdf',
 'lec04 - Elements of functional programming.ipynb',
 'lec05 - Working with Files.ipynb',
 'lec06 - Processes and IPC.ipynb']

In [31]:
# еще модуль для работы с путями файловой системы
# (в ООП-стиле, для разных платформ)
import pathlib

p = pathlib.PureWindowsPath(r'D:\Program Files\Python')
p.parts

('D:\\', 'Program Files', 'Python')

In [32]:
p = pathlib.PurePath('/usr/bin/executable')
p.parts

('\\', 'usr', 'bin', 'executable')

In [33]:
# A plain split gives a similar result, but not an identical one: the drive
# comes back as 'D:' rather than 'D:\\', and os.path.sep depends on the
# platform the code runs on.
path = r'D:\Program Files\Python'
path.split(os.path.sep)

['D:', 'Program Files', 'Python']

В pathlib присутствует весь функционал os.path и более того.

Подробнее см. здесь:

https://docs.python.org/3/library/pathlib.html#module-pathlib

#### Сериализация / десериализация

In [34]:
# pickle module: serialization / deserialization of Python objects
import pickle

data = (1, 3.15, 'Hello', [1, 4, 5])

# Serialization. The with block closes the file, so the pickled bytes are
# guaranteed to be on disk before the file is read back.
with open(r'data\data.pkl', 'wb') as pkl_file:
    pickle.dump(data, pkl_file)

# Deserialization.
# NOTE: unpickling can execute arbitrary code, so pickle.load must only be
# used on files from a trusted source (here: the file written just above).
with open(r'data\data.pkl', 'rb') as pkl_file:
    obj = pickle.load(pkl_file)
print(obj)

(1, 3.15, 'Hello', [1, 4, 5])


In [35]:
import simplejson

# JSON serialization
with open(r'data\data.json', 'w') as f:
    simplejson.dump(data, f, indent=4)

# JSON deserialization; the with block closes the file once it is parsed.
# NOTE(review): the result name `json` would hide the standard json module
# if that module were imported in the same session.
with open(r'data\data.json', 'r') as f:
    json = simplejson.load(f)
print(json)

[1, 3.15, 'Hello', [1, 4, 5]]


In [36]:
with open(r'data\data.json', 'r') as f:
    for line in f:
        print(line.rstrip())

[
    1,
    3.15,
    "Hello",
    [
        1,
        4,
        5
    ]
]


In [37]:
# XML
# Пример взят из документации:
# https://docs.python.org/3.5/library/xml.etree.elementtree.html

xml = '''<?xml version="1.0"?>
<data>
    <country name="Liechtenstein">
        <rank>1</rank>
        <year>2008</year>
        <gdppc>141100</gdppc>
        <neighbor name="Austria" direction="E"/>
        <neighbor name="Switzerland" direction="W"/>
    </country>
    <country name="Singapore">
        <rank>4</rank>
        <year>2011</year>
        <gdppc>59900</gdppc>
        <neighbor name="Malaysia" direction="N"/>
    </country>
    <country name="Panama">
        <rank>68</rank>
        <year>2011</year>
        <gdppc>13600</gdppc>
        <neighbor name="Costa Rica" direction="W"/>
        <neighbor name="Colombia" direction="E"/>
    </country>
</data>
'''

# Save the XML text to a file; the with block closes it even if write() fails.
with open(r'data\demo.xml', 'w') as xmlfile:
    xmlfile.write(xml)

In [38]:
import xml.etree.ElementTree as ET

tree = ET.parse(r'data\demo.xml')
root = tree.getroot()
root.tag

'data'

In [39]:
for child in root:
    print(child.tag, child.attrib)

country {'name': 'Liechtenstein'}
country {'name': 'Singapore'}
country {'name': 'Panama'}


In [40]:
# демонстрация адресации XPath

root.findall(".")

[<Element 'data' at 0x02CBB5A0>]

In [41]:
neighbors = root.findall('./country/neighbor')
for neighbor in neighbors:
    print(neighbor.get('name'))

Austria
Switzerland
Malaysia
Costa Rica
Colombia


In [42]:
root.findall(".//year/..[@name='Singapore']")

[<Element 'country' at 0x02D77D80>]

In [43]:
print(root.findall(".//*[@name='Singapore']/year")[0].text)

2011


In [44]:
second_neighbors = root.findall(".//neighbor[2]")
for neighbor in second_neighbors:
    print(neighbor.get('name'))

Switzerland
Colombia


In [45]:
# Еще модуль с DOM-парсером:
import xml.dom.minidom


DOMTree = xml.dom.minidom.parse(r'data\demo.xml')
collection = DOMTree.documentElement
print('Root element: {}'.format(collection.tagName))

Root element: data


In [46]:
countries = collection.getElementsByTagName("country")

for country in countries:
    print("Country: {}".format(country.getAttribute('name')))

    print('Neighbors:')
    neighbors = country.getElementsByTagName('neighbor')
    for neighbor in neighbors:
        print('\t', neighbor.getAttribute('name'))
   

Country: Liechtenstein
Neighbors:
	 Austria
	 Switzerland
Country: Singapore
Neighbors:
	 Malaysia
Country: Panama
Neighbors:
	 Costa Rica
	 Colombia


In [47]:
# ZIP archiving
# More details here: https://pymotw.com/3/zipfile/
import zipfile
import datetime

files = [r'data\data.json', r'data\data.pkl', r'data\demo.txt']

print('crating arhive...')

# The with block closes the archive even if adding one of the files fails;
# the inner finally keeps the 'closing...' message printed right before that.
with zipfile.ZipFile(r'data\archive.zip', mode='w') as archive:
    try:
        for file in files:
            print('adding {}'.format(file))
            archive.write(file)
    finally:
        print('closing...\n')


# Print the metadata of every archive member (the entries are read from the
# already closed archive object).
for info in archive.infolist():
    print(info.filename)
    print('\tComment:\t', info.comment)
    print('\tModified:\t', datetime.datetime(*info.date_time))
    print('\tSystem:\t\t', info.create_system, '(0 = Windows, 3 = Unix)')
    print('\tZIP version:\t', info.create_version)
    print('\tCompressed:\t', info.compress_size, 'bytes')
    # No '\n' inside the label: it used to push the size onto the next line.
    print('\tUncompressed:\t', info.file_size, 'bytes')

crating arhive...
adding data\data.json
adding data\data.pkl
adding data\demo.txt
closing...

data/data.json
	Comment:	 b''
	Modified:	 2016-09-13 12:39:04
	System:		 0 (0 = Windows, 3 = Unix)
	ZIP version:	 20
	Compressed:	 86 bytes
	Uncompressed:	
 86 bytes
data/data.pkl
	Comment:	 b''
	Modified:	 2016-09-13 12:39:03
	System:		 0 (0 = Windows, 3 = Unix)
	ZIP version:	 20
	Compressed:	 41 bytes
	Uncompressed:	
 41 bytes
data/demo.txt
	Comment:	 b''
	Modified:	 2016-09-13 12:39:01
	System:		 0 (0 = Windows, 3 = Unix)
	ZIP version:	 20
	Compressed:	 24 bytes
	Uncompressed:	
 24 bytes
