# <font color=#FF1F58> Python Basics (Part III): Working with files </font>

In [None]:
## This shell command clones the course repository so that its folders and files
## (under "intro_to_python/") are available for us to work with
!git clone https://github.com/aralittle/intro_to_python.git

In [None]:
## Cleanup only: do not run this during the session. These shell commands delete
## the "first/" directory tree and the cloned "intro_to_python/" folder after the demo
!rm -rf first/
!rm -rf intro_to_python/

### Opening & writing files

In [None]:
## Recommended option: the "with" block closes the file for us as soon as the
## block ends, which avoids having too many files opened and losing data
with open('intro_to_python/data/data_to_read.txt', mode='r') as f:
    data = f.read()

# the file is already closed at this point; its contents are kept in "data"
print(data)
print('hello')

In [None]:
## NOT recommended option: the file has to be closed by hand, and if read() or
## print() raises an error, f.close() is never reached and the file stays open
f = open('intro_to_python/data/data_to_read.txt', 'r')
data = f.read()
print(data)
f.close()

In [None]:
## write to a file: mode 'w' creates the file, or replaces its contents if it already exists
with open('intro_to_python/data/data.txt', mode='w') as f:
    f.write('Hello world!')

## read the file back to check what was written
with open('intro_to_python/data/data.txt', mode='r') as f:
    data = f.read()
print(data)

In [None]:
## append to a file: mode 'a' adds to the end of the file instead of overwriting it
with open('intro_to_python/data/data.txt', 'a') as corpus:
    # write() does not add line breaks by itself, so the text starts with '\n'
    # to place it on its own line instead of gluing it to the existing content
    corpus.write('\nThis is an appended line')

In [None]:
## read the file again to see its contents after appending
with open('intro_to_python/data/data.txt', mode='r') as f:
    data = f.read()
print(data)

#### Specify encoding
Find the list of standard encodings here: https://docs.python.org/3/library/codecs.html#standard-encodings

In [None]:
# pass the encoding of the file explicitly so that its bytes are decoded as latin1
with open('intro_to_python/data/data_latin1.txt', mode='r', encoding='latin1') as f:
    data = f.read()
print(data)


In [None]:
# if we open it with the wrong encoding it is expected to fail: without an
# "encoding" argument, open() uses the platform default (usually utf-8)
with open('intro_to_python/data/data_latin1.txt', mode='r') as f:
    data = f.read()
print(data)

#### Iterate through contents

In [None]:
# .read() returns a single string containing the whole file
with open('intro_to_python/data/data_to_read.txt', mode='r') as f:
    data = f.read()
# show the contents first and then the type of the object that holds them
print(data, type(data), sep='\n')

In [None]:
# .readlines() returns a list of strings, one per line of the file
# (each string keeps its line ending)
with open('intro_to_python/data/data_to_read.txt', mode='r') as f:
    data = f.readlines()
# show the list first and then the type of the object that holds it
print(data, type(data), sep='\n')
# a list can be iterated, so we can work with the file line by line
for line in data:
    print(f"working with line: {line}")

In [None]:
# Practice:
# here is our document (the path is stored once and reused below):
document = 'intro_to_python/data/exercises.txt'
# 0. Open the document using ".read()" and save the contents to "data"
#    ("data" is a single string with the whole file)
with open(document, 'r') as f:
    data = f.read()
print(data)
# 1. Open the document using ".readlines()" and save the contents to "datalines"
#    ("datalines" is a list with one string per line)
with open(document, 'r') as f:
    datalines = f.readlines()
print(datalines)

In [None]:
# 2. How many times does <unk> appear in the document?
#    Hint1: if you want to search in the whole document use the "data" variable that you just created
#    Hint2: You can use the "count" method (check notebook 2 or https://www.programiz.com/python-programming/methods/string/count)


# 3. Where is the <unk> sequence present for the first time?
#    Hint1: if you want to search in the whole document use the "data" variable that you just created
#    Hint2: You can use the "find" method (check notebook 2 or https://www.programiz.com/python-programming/methods/string/find)



# 4. Remove all the <unk> in the document
#    Hint1: if you want to search in the whole document use the "data" variable that you just created
#    Hint2: You can use the "replace" method (check notebook 2 or https://www.programiz.com/python-programming/methods/string/replace)


# 5. Remove the possible (extra) newlines, spaces, \r characters... at the end of each sentence
#    Hint1: in this case you need to go line by line, use the "datalines" variable that you just created
#    Hint2: You can use the "rstrip" method (check notebook 2 or https://www.programiz.com/python-programming/methods/string/rstrip)
#    Hint3: Use a for loop (we saw it before)


## !! remember that these operations won't change the original string: you have to save the result in a variable
# 6. Remove all the <unk> in the document, using the data variable; this time save the result to another variable


### Working with directories

In [None]:
# list the contents of a directory
import os

# os.listdir returns the names of the entries found in the given directory
entries = os.listdir(path='intro_to_python/data')
print(entries)

In [None]:
# create directories and directory trees
# exist_ok=True keeps this cell re-runnable: without it, os.makedirs raises
# FileExistsError when 'first/second/third' already exists
os.makedirs('first/second/third', exist_ok=True)

#### Traversing directories

In [None]:
# os.walk() yields three values on each iteration of the loop:
# - The path of the current folder
# - A list of the folder names inside the current folder
# - A list of the file names inside the current folder
# official documentation: https://docs.python.org/3/library/os.html#os.walk
for dirpath, dirnames, files in os.walk('intro_to_python/data'):
    print(f'I am in: {dirpath}')
    print(f'Directories at my current location: {dirnames}')
    # file names are printed one per line
    for file_name in files:
        print(file_name)

In [None]:
# process all of these files using "os.path.join": it builds the full path of
# each file from the folder that os.walk is visiting and the bare file name
for dirpath, dirnames, files in os.walk('intro_to_python/data'):
    for file_name in files:
        # skip the latin1 file (presumably because it cannot be read without
        # passing its encoding explicitly)
        if 'latin1.txt' in file_name:
            continue
        with open(os.path.join(dirpath, file_name), "r") as f:
            contents = f.read()
        # the file is closed by now; the processing only needs the string
        new_contents = contents.upper()
        print(f"--> Processing {file_name} :")
        print(new_contents)


In [None]:
# Practice: you have your data in data_latin1.txt somewhere within "intro_to_python/data" and you want to process
# it (do some modification: remove characters, change to upper case, etc.)
# and save it all to another file
