# Chapter 2. Automating Files and the Filesystem


## Reading and Writing Files

In [26]:
# Open the script in read-only text mode. The handle stays open until it is
# closed explicitly in a later cell.
file_path = 'print_time.py'
open_file = open(file_path, 'r')
# read() returns the entire file as a single string, so len() counts characters.
text = open_file.read()
len(text)

FileNotFoundError: [Errno 2] No such file or directory: 'print_time.py'

In [None]:
# `text` is one string here, so indexing returns a single character.
text[56]

't'

In [None]:
# Displaying the handle shows its repr: file name, mode and encoding.
open_file

<_io.TextIOWrapper name='print_time.py' mode='r' encoding='UTF-8'>

In [None]:
# A file opened with a bare open() call has to be closed by hand.
open_file.close()

In [None]:
# readlines() returns a list with one string per line (newline characters
# included), so len() now counts lines rather than characters.
file_path = 'print_time.py'
open_file = open(file_path, 'r')
text = open_file.readlines()
len(text)

3

In [None]:
# `text` is now a list of lines, so index 2 is the third line of the file.
text[2]

'print(datetime.datetime.now().time())\n'

In [None]:
# Close the handle opened for the readlines() example.
open_file.close()

In [None]:
# A `with` statement closes the file automatically when the block exits,
# so no explicit close() call is needed.
with open(file_path, mode='r') as open_file:
    # Iterating a text file yields its lines, exactly like readlines().
    text = list(open_file)

print(text[1])
# True once the `with` block has finished.
open_file.closed


import datetime 



True

In [None]:
# Appending 'b' to the mode opens the file in binary mode: read() then returns
# a bytes object instead of a str.
file_path = '20221025_GID_Storage.pptx'
with open(file_path, 'rb') as open_file: 
    btext = open_file.read()

# Indexing a bytes object yields an int (the value of that byte).
btext[1]

75

In [None]:
# The file was read in binary mode ('b' appended to the mode), so slicing
# yields a bytes object. The leading b'PK\x03\x04' is the ZIP signature:
# .pptx files are ZIP archives.
btext[:25]

b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\x00\xe65\xf2\x04\x8c\x04\x00\x00\x9cP\x00'

In [None]:
# Writing to a file: mode 'w' creates the file or truncates an existing one.
text = '''export STAGE=PROD
export TABLE_ID=token-storage-1234'''
with open('.envrc', 'w') as open_file:
    open_file.write(text)

# Shell escape: print the file to confirm what was written.
!cat .envrc

export STAGE=PROD
export TABLE_ID=token-storage-1234

In [None]:
# pathlib handles the file object behind the scenes: Path.read_text() opens
# the file, returns its contents as a string, and closes it again.
import pathlib

# Use a path relative to the notebook's working directory (the same
# 'print_time.py' the earlier cells open) rather than an absolute path that
# only exists on one machine.
path = pathlib.Path('print_time.py')

path.read_text()

'#!/usr/bin/env python\nimport datetime \nprint(datetime.datetime.now().time())\n'

## Handling IAM policy (json)

In [None]:
# Reading a JSON file line by line only gives raw text fragments, not a
# usable data structure.
with open('./sample/service_policy.json', mode='r') as opened_file:
    policy = list(opened_file)

print(policy)

['{\n', '    "Version": "2012-10-17",\n', '    "Statement": {\n', '        "Effect": "Allow",\n', '        "Action": "service-prefix:action-name",\n', '        "Resource": "*",\n', '        "Condition": {\n', '            "DateGreaterThan": {"aws:CurrentTime": "2017-07-01T00:00:00Z"},\n', '            "DateLessThan": {"aws:CurrentTime": "2017-12-31T23:59:59Z"}\n', '        }\n', '    }\n', '}']


In [None]:
# Properly read json using json and pprint packages.
# json.load parses the file contents into Python objects (a nested dict for
# this policy) instead of a list of raw lines.
import json
with open('./sample/service_policy.json', 'r') as opened_file:
    policy  = json.load(opened_file)

In [None]:
# pprint lays nested structures out over several lines; the output shows the
# dict keys in sorted order.
from pprint import pprint
pprint(policy)

{'Statement': {'Action': 'service-prefix:action-name',
               'Condition': {'DateGreaterThan': {'aws:CurrentTime': '2017-07-01T00:00:00Z'},
                             'DateLessThan': {'aws:CurrentTime': '2017-12-31T23:59:59Z'}},
               'Effect': 'Allow',
               'Resource': '*'},
 'Version': '2012-10-17'}


In [None]:
# The parsed policy is an ordinary nested dict, so it can be edited in place.
policy['Statement']['Resource'] = 'S3'
pprint(policy)

{'Statement': {'Action': 'service-prefix:action-name',
               'Condition': {'DateGreaterThan': {'aws:CurrentTime': '2017-07-01T00:00:00Z'},
                             'DateLessThan': {'aws:CurrentTime': '2017-12-31T23:59:59Z'}},
               'Effect': 'Allow',
               'Resource': 'S3'},
 'Version': '2012-10-17'}


In [None]:
# Write a Python dictionary as a JSON file by using the json.dump method.
# json.dump writes to the file object and returns None, so its result must not
# be assigned back to `policy` (that would throw the dictionary away).
# indent=4 keeps the file on multiple indented lines instead of collapsing it
# onto a single line.
with open('./sample/service_policy.json', 'w') as opened_file:
    json.dump(policy, opened_file, indent=4)

pprint(policy)

{'Statement': {'Action': 'service-prefix:action-name',
               'Condition': {'DateGreaterThan': {'aws:CurrentTime': '2017-07-01T00:00:00Z'},
                             'DateLessThan': {'aws:CurrentTime': '2017-12-31T23:59:59Z'}},
               'Effect': 'Allow',
               'Resource': 'S3'},
 'Version': '2012-10-17'}


## Handling YAML

In [None]:
import yaml 

# safe_load only constructs plain Python data from standard YAML tags, which
# makes it the appropriate loader for files you do not fully control.
with open('./sample/ansible.yaml', 'r') as opened_file: 
    verify_apache = yaml.safe_load(opened_file)

# The data loads as familiar Python data structures (a list containing a dict):
pprint(verify_apache)

[{'hosts': 'webservers',
  'remote_user': 'root',
  'tasks': [{'name': 'ensure apache is at the latest version',
             'yum': {'name': 'httpd', 'state': 'latest'}}],
  'vars': {'http_port': 80, 'max_clients': 200}}]


## Handling XML

In [30]:
import xml.etree.ElementTree as ET
# ET.parse reads the XML file into an ElementTree object.
tree = ET.parse('./sample/library.xml')

# getroot() returns the top-level element of the document (<library> here).
root = tree.getroot()
root 

<Element 'library' at 0x107f6f420>

In [31]:
# Iterating an element yields its direct children; .tag is the element name
# and .attrib is a dict of its XML attributes (empty for these <book> nodes).
for child in root:
    print(child.tag, child.attrib)

book {}
book {}
book {}


## Handling CSV

In [32]:
import csv
import itertools

file_path = './sample/test.csv'

# newline='' lets the csv module do its own newline handling, as the csv
# documentation recommends for files passed to csv.reader.
with open(file_path, newline='') as csv_file:
    off_reader = csv.reader(csv_file, delimiter=',')
    # Preview at most the first five rows. islice simply stops at end-of-file,
    # whereas calling next() five times raises StopIteration on shorter files.
    # Using a named loop variable also avoids overwriting IPython's `_`.
    for row in itertools.islice(off_reader, 5):
        print(row)

['Name', 'Age', 'Location', 'Occupation']
['John Smith', '32', 'New York', 'Engineer']
['Jane Doe', '28', 'Los Angeles', 'Designer']
['Michael Johnson', '45', 'Chicago', 'Teacher']
['Emily Williams', '22', 'San Francisco', 'Student']


In [34]:
import pandas as pd

# read_csv parses the whole file, using the first row as column names, and
# returns a DataFrame.
df = pd.read_csv('./sample/test.csv')
type(df)

pandas.core.frame.DataFrame

In [35]:
# Show only the first three rows instead of the whole frame.
df.head(3)

Unnamed: 0,Name,Age,Location,Occupation
0,John Smith,32,New York,Engineer
1,Jane Doe,28,Los Angeles,Designer
2,Michael Johnson,45,Chicago,Teacher


In [37]:
# Summary statistics; only the numeric column (Age) appears in the output.
df.describe()

Unnamed: 0,Age
count,10.0
mean,34.6
std,8.946756
min,22.0
25%,28.25
50%,33.5
75%,40.75
max,50.0


In [38]:
# Selecting a single column by name returns it as a Series.
df['Location']

0         New York
1      Los Angeles
2          Chicago
3    San Francisco
4            Miami
5           Boston
6          Houston
7          Seattle
8          Atlanta
9           Dallas
Name: Location, dtype: object

## Using Regular Expressions to Search Text

In [12]:
# Pull the client IP address out of one access-log line.
import re

line = '127.0.0.1 - rj [13/Nov/2019:14:34:30 -0000] "GET HTTP/1.0" 200'

# (?P<IP>...) is a named group: four dot-separated runs of digits.
m = re.compile(r'(?P<IP>\d+\.\d+\.\d+\.\d+)').search(line)

# Indexing a match by group name is equivalent to m.group('IP').
m['IP']

'127.0.0.1'

In [28]:
# The same approach extracts the timestamp: the pattern anchors on the
# opening '[' and captures date and time without the UTC offset.
r = r'\[(?P<Time>\d\d/\w{3}/\d{4}:\d{2}:\d{2}:\d{2})'
m = re.compile(r).search(line)

print(m['Time'])

13/Nov/2019:14:34:30


Parsing a single line of a log is interesting but not terribly useful. However, you can use this regular expression as a basis for designing one to pull information from the whole log. Let’s say you want to pull all of the IP addresses for GET requests that happened on August 3, 2023.

In [48]:
# Sample access log: one request per line.
apache_log = '''
127.0.0.1 - jinhwan [03/Aug/2023:10:15:25 +0000] "GET /index.html HTTP/1.1" 200 3548
192.168.1.1 - jeongeuun [03/Aug/2023:10:20:12 +0000] "GET /images/logo.png HTTP/1.1" 304 0
10.0.0.2 - jinhwan [03/Aug/2023:10:25:41 +0000] "POST /submit_form HTTP/1.1" 302 220
172.16.0.1 - leah [03/Aug/2023:10:30:18 +0000] "GET /about.html HTTP/1.1" 200 1805
192.168.2.2 - jinhwan [03/Aug/2023:10:35:57 +0000] "GET /styles/main.css HTTP/1.1" 200 1124
'''

# One pattern per field, joined by implicit string-literal concatenation:
# client IP, user name, a timestamp restricted to 03/Aug, and GET requests only.
r = (
    r'(?P<IP>\d+\.\d+\.\d+\.\d+) '
    r'- (?P<User>\w+)'
    r' \[(?P<Time>03/Aug/\d{4}:\d{2}:\d{2}:\d{2} [-+]\d{4})\]'
    r' (?P<Request>"GET .+")'
)

In [50]:
# finditer returns a lazy iterator of match objects rather than a list,
# which is why printing it only shows an iterator repr.
matched = re.finditer(r, apache_log)
print(matched)
for m in matched:
    # Group lookup by name; same as m.group('IP').
    print(m['IP'])

<callable_iterator object at 0x10777d150>
127.0.0.1
192.168.1.1
172.16.0.1
192.168.2.2


## Dealing with Large Files

In [51]:
# Copy the log one line at a time so the whole file never has to fit in
# memory. A single `with` manages both handles.
with open('./sample/apache.log', 'r') as source_file, \
        open('./sample/apache_log_new.log', 'w') as target_file:
    for line in source_file:
        target_file.write(line)


In [52]:
# Shell escape: print the copied file to confirm it matches the source log.
!cat ./sample/apache_log_new.log

127.0.0.1 - jinhwan [03/Aug/2023:10:15:25 +0000] "GET /index.html HTTP/1.1" 200 3548
192.168.1.1 - jeongeuun [03/Aug/2023:10:20:12 +0000] "GET /images/logo.png HTTP/1.1" 304 0
10.0.0.2 - jinhwan [03/Aug/2023:10:25:41 +0000] "POST /submit_form HTTP/1.1" 302 220
172.16.0.1 - leah [03/Aug/2023:10:30:18 +0000] "GET /about.html HTTP/1.1" 200 1805
192.168.2.2 - jinhwan [03/Aug/2023:10:35:57 +0000] "GET /styles/main.css HTTP/1.1" 200 1124


## Encrypting Text
In addition to Python’s built-in `hashlib` package, which computes one-way hashes (digests) rather than reversible encryption, there is a widely used third-party package called `cryptography`.

In [53]:
import hashlib

secret = "This is the password or document text"

# hashlib operates on bytes, so a str has to be encoded first.
bsecret = secret.encode()

# Passing the data to the constructor is equivalent to calling md5() and then
# update(data).
# NOTE: MD5 is used here for illustration only; it is not collision-resistant
# and should not be relied on to protect real passwords.
m = hashlib.md5(bsecret)

# digest() returns the raw 16-byte hash.
m.digest()

b' \xf5\x06\xe6\xfc\x1c\xbe\x86\xddj\x96C\x10\x0f5E'

### Encryption with Cryptography

In [54]:

!pip install cryptography



In [56]:
# import Fernet for symmetric encryption
from cryptography.fernet import Fernet

# generate_key() returns a fresh random key as base64-encoded bytes.
# The same key both encrypts and decrypts, so it must be kept secret.
key = Fernet.generate_key()
key

b'ykCaMAmEhjlKHtzYsmFztEIt-zUgTITyBD7Brj6emmc='

In [57]:
# Build a Fernet object from the key generated above.
f = Fernet(key)

# encrypt() takes bytes and returns the encrypted token as bytes.
message = b'I\'m going home on Thursday'
encrypted = f.encrypt(message)

encrypted

b'gAAAAABky3dInmxb1Fi8n_dz1LIWKEL8pwWAVvgyHMpE0lDE0NyU0ByNqVKDI0pWraPrFRhlBOw0qEEnVM726Wzoln4AIJaOZ5Yh94MWh0PjGEY_Op8EF-A='

In [58]:
# You can decrypt the data using a Fernet object created with the same key.
# decrypt() returns the original message bytes.
f = Fernet(key)
decrypted = f.decrypt(encrypted)
decrypted

b"I'm going home on Thursday"

In [60]:
# Asymmetric key encryption uses a pair of keys, one public and one private.
# The public key is designed to be widely shared, while a single user holds the private one.
# The only way you can decrypt messages that have been encrypted using your public
# key is by using your private key.
from cryptography.hazmat.backends import default_backend
# One very popular asymmetric key algorithm is Rivest-Shamir-Adleman (RSA)
from cryptography.hazmat.primitives.asymmetric import rsa

# Generate a new 4096-bit RSA private key with public exponent 65537.
private_key = rsa.generate_private_key(public_exponent=65537,
                                       key_size=4096,
                                       backend=default_backend())

# Displaying the key shows only an opaque object repr, not the key material.
private_key

<cryptography.hazmat.backends.openssl.rsa._RSAPrivateKey at 0x107d2a8f0>

In [62]:
# The matching public key is obtained from the private key object.
public_key = private_key.public_key()
public_key

<cryptography.hazmat.backends.openssl.rsa._RSAPublicKey at 0x106a491e0>

In [None]:
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.asymmetric import padding

message = b"More secrets go here"

# Encrypt with the public key using OAEP padding; SHA-256 is used both for
# the MGF1 mask generation function and for OAEP itself, with no label.
encrypted = public_key.encrypt(
    message,
    padding=padding.OAEP(
        mgf=padding.MGF1(algorithm=hashes.SHA256()),
        algorithm=hashes.SHA256(),
        label=None,
    ),
)

# Padding in encryption refers to adding extra bits or bytes to the plaintext message before encryption 
# in order to ensure that the message meets specific length or formatting requirements imposed by the 
# encryption algorithm.