In [1]:
# Record when this notebook was run and with which Python version.
import sys
import time

run_date = time.ctime()
# Only the part of the version string located before the first '|' is kept.
python_version = sys.version.split('|')[0]
print(run_date)
print(python_version)

Mon May  7 08:28:39 2018
3.6.4 


# How to read and write files (Text)

Adapted from Python lecture given by Christophe Morisset at IA-UNAM.

More information is available here: http://www.tutorialspoint.com/python/python_files_io.htm

## Reading a simple ascii file

First of all, we need some files on the hard drive in order to read them. The following notebook cell will write a file in the directory where the notebook has been started.

In [2]:
%%writefile data1.dat
January
February
March
April
May
June
July
August
September
October
November
December

Writing data1.dat


Now the goal is to read this file. The first way is to open the file, read it completely in a variable and close the file. Then we can play with the content of the file.

In [3]:
datafile = open('data1.dat', 'r') # Open the file to read it

In [4]:
data = datafile.readlines() # readlines() reads the whole file at once: the variable data receives its content.

In [5]:
datafile.close() # The file is not needed anymore.

In [6]:
print(type(data)) # The content of the file is stored in the form of a list, each element of the list corresponding to a row of the file.

<class 'list'>


In [7]:
print(data) # Each row is a string; rows terminate with \n, symbol of END OF LINE (the last row may lack it, as seen in the output).

['January\n', 'February\n', 'March\n', 'April\n', 'May\n', 'June\n', 'July\n', 'August\n', 'September\n', 'October\n', 'November\n', 'December']


In [8]:
print(len(data)) # number of rows

12


In [9]:
print(data[0], 'tralala')

January
 tralala


In [10]:
# print() appends its own newline, so every row that already ends with \n
# is followed by an empty line in the output.
for row in data:
    print(row)

January

February

March

April

May

June

July

August

September

October

November

December


In [11]:
# In Python 2, a trailing comma after the print statement suppressed the newline.
# In Python 3 (used to run this notebook), print is a function and the trailing comma
# has no such effect: the output is the same as in the previous loop.
for row in data:
    print(row),

January

February

March

April

May

June

July

August

September

October

November

December


In [12]:
# In Python 3: end='' stops print() from appending its own newline,
# so only the \n already present at the end of each row is printed.
for row in data:
    print(row, end='')

January
February
March
April
May
June
July
August
September
October
November
December

In [21]:
print(type(data[0])) # Each element is a string

<class 'str'>


Now it is easy to separate each field with the split command: 

In [22]:
# split() without argument cuts the row at whitespace (the trailing \n disappears)
# and returns the fields as a list of strings.
for row in data:
    print(row.split())

['January']
['February']
['March']
['April']
['May']
['June']
['July']
['August']
['September']
['October']
['November']
['December']


## How to treat special rows (headers, comments)

In [24]:
%%writefile data2.dat
# The following data are for test purpose
N    f   x   y type
1   2.3  6   8 star
2   3.5  7   9 galaxy
3  -4.2  5   7 cluster
#4  -10.5  5  7 test

Overwriting data2.dat


In [25]:
!cat data2.dat # Just to check that the # comments are also in the file

# The following data are for test purpose
N    f   x   y type
1   2.3  6   8 star
2   3.5  7   9 galaxy
3  -4.2  5   7 cluster
#4  -10.5  5  7 test

The file has to be read row by row, to be sure that special cases are treated.

In [26]:
datafile = open('data2.dat', 'r') # Open the file to read it

# The first row of the file is a comment: it is read and stored separately.
row = datafile.readline() # readline() reads only one line
first_comment = row
print(first_comment, end='') # end='' because the row already ends with \n

# The second row of the file holds the column names.
row = datafile.readline() # reads the next line only
header = row
print(header, end='')

# All the remaining rows are read one by one.
data = []
while True: # loops until exit by the break command
    row = datafile.readline()
    if row == '': # readline() returns an empty string at the end of the file
        break
    if row[0] != '#' and row[0] != '\n': # comment lines and empty lines are skipped
        data.append(row)
datafile.close()
print(data)

# The following data are for test purpose
N    f   x   y type
['1   2.3  6   8 star\n', '2   3.5  7   9 galaxy\n', '3  -4.2  5   7 cluster\n']


In [27]:
datafile = open('data2.dat', 'r') # Open the file to read it
row = datafile.readline() # this reads only one line (here the first comment)
first_comment = row
print(first_comment, end='')
row = datafile.readline() # this reads only one line (here the header)
header = row
print(header, end='')
data = []
row = datafile.readline() # the first data row is read before entering the loop
while row != '': # loops until readline() returns '' (end of the file); no break needed
    if row[0] != '#': # comment lines are skipped (empty lines are kept)
        data.append(row)
    row = datafile.readline() # next row, tested by the while condition
datafile.close()
print(data)

# The following data are for test purpose
N    f   x   y type
['1   2.3  6   8 star\n', '2   3.5  7   9 galaxy\n', '3  -4.2  5   7 cluster\n']


In [28]:
# A much shorter way to deal with the file: looping over the file object
# gives its rows one by one and stops by itself at the end of the file.
datafile = open('data2.dat', 'r')
data = []
for row in datafile:
    if row[0] == '#':
        continue  # comment rows are not kept
    data.append(row)
datafile.close()
print(data)
# Caveat: the header does not start with '#', so it is included in data... Not what we want

['N    f   x   y type\n', '1   2.3  6   8 star\n', '2   3.5  7   9 galaxy\n', '3  -4.2  5   7 cluster\n']


In [29]:
# Short way to deal with the file, this time separating the header:
# we know that the header is the first row of the file which is not a comment.
datafile = open('data2.dat', 'r')
data = []
comments = []  # the comment rows are kept, they may be useful later
header_read = False  # becomes True as soon as the header has been stored
for row in datafile:
    if row[0] == '#':
        comments.append(row)
    elif not header_read:
        header = row
        header_read = True  # from now on, non-comment rows are data
    else:
        data.append(row)
datafile.close()
print(header, end='')
print(data)
print(comments)

N    f   x   y type
['1   2.3  6   8 star\n', '2   3.5  7   9 galaxy\n', '3  -4.2  5   7 cluster\n']
['# The following data are for test purpose\n', '#4  -10.5  5  7 test']


In [30]:
# Alternative way using "with": no need to close the file explicitly, it is closed when the "with" block terminates.
data = []
comments = []
header_read = False
def change_type(row_split):
    """Convert the fields of one split row from strings to their final types.

    The first field is converted to int, the three following ones to float,
    and the fifth one is kept unchanged. Any additional field is ignored.

    Parameters
    ----------
    row_split : sequence
        Fields of a row, e.g. the result of row.split(); at least 5 elements.

    Returns
    -------
    tuple
        (int, float, float, float, fifth field as given).
    """
    # The local names follow the columns of the header: N, f, x, y, type.
    n_value = int(row_split[0])
    f_value = float(row_split[1])
    x_value = float(row_split[2])
    y_value = float(row_split[3])
    type_name = row_split[4]
    return (n_value, f_value, x_value, y_value, type_name)
# The file is closed automatically when leaving the "with" block.
with open('data2.dat', 'r') as datafile:
    for row in datafile:
        if row[0] in ('#', '\n'):
            # rows starting with '#' and empty rows are stored apart
            comments.append(row)
        elif not header_read:
            # the first remaining row is the header
            header = row
            header_read = True
        else:
            # every other row is data: split it and convert each field
            data.append(change_type(row.split()))
print(header)
print(data)
print(comments)

N    f   x   y type

[(1, 2.3, 6.0, 8.0, 'star'), (2, 3.5, 7.0, 9.0, 'galaxy'), (3, -4.2, 5.0, 7.0, 'cluster')]
['# The following data are for test purpose\n', '#4  -10.5  5  7 test']


## Writing files

### Simple "write" method of the file object returned by "open"

In [33]:
f = open('data10.dat', 'w')

In [34]:
f.write('tralala') # write() does not add any end of line
f.write('trololo') # write() returns the number of characters written: the notebook displays the last value, 7

7

In [35]:
f.close()

In [36]:
!cat 'data10.dat' # the writing method put everything together.

tralalatrololo

In [37]:
f = open('data11.dat', 'w') # 'w': open the file for writing
f.write('tralala\n') # \n to indicate end of line
f.write('trololo\n')
f.close() # the file must be closed before its content is checked
!cat 'data11.dat'

tralala
trololo


In [38]:
f = open('data11.dat', 'a') # 'a': append to the end of the file
f.write('trilili\n') # \n to indicate end of line
f.write('trululu\n')
f.close()
!cat 'data11.dat'

tralala
trololo
trilili
trululu


In [39]:
a = 'Smith'
b = 3
# The whole text is built first: {0} and {1} are replaced by a and b.
text = """Hola Sr. {0}
This is a file
with a lot of lines.
It is easy to write it.
The value of your data is {1}.
""".format(a, b)
# A single call to write() is enough for a string containing several lines.
with open('data12.dat', 'w') as datafile:
    datafile.write(text)
!cat "data12.dat"

Hola Sr. Smith
This is a file
with a lot of lines.
It is easy to write it.
The value of your data is 3.


### Using pickle (cPickle in Python 2), a Python-specific format

In [41]:
import numpy as np
# Let's define some objects we want to keep in a file: an integer, a string, a numpy array and a function
a = 5
b = 'Hola'
c = np.array([1,2,3,4,5])
def d(x):
    """ Function mia"""
    # Returns the square of x.
    return x**2

In [42]:
import pickle # The module we will use for this

In [43]:
# Writing the variables: they are grouped in a tuple so that a single dump() call stores them all.
# The "with" block closes the file, which guarantees that everything is written on the disk.
# Note: a function such as d is pickled by reference (its name), not by value.
with open('Demo.pickle', 'wb') as pickle_file:
    pickle.dump((a, b, c, d), pickle_file)

In [44]:
# Reading the variables back; the "with" block closes the file once it has been read.
# Only load pickle files you trust: unpickling can execute arbitrary code.
with open('Demo.pickle', 'rb') as pickle_file:
    res = pickle.load(pickle_file)

In [45]:
type(res)

tuple

In [46]:
print(res[0])
print(res[1])
print(res[2])

5
Hola
[1 2 3 4 5]


In [47]:
res[3](5)

25

In [48]:
# The loaded tuple can be unpacked directly into new variables.
# The "with" block closes the file once it has been read.
with open('Demo.pickle', 'rb') as pickle_file:
    a2, b2, c2, d2 = pickle.load(pickle_file)

In [49]:
a2

5

In [50]:
d2(10)

100

In [51]:
help(d2)

Help on function d in module __main__:

d(x)
    Function mia



In [52]:
%timeit res = pickle.load(open('Demo.pickle', 'rb'))

36.3 µs ± 2.76 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [53]:
import gzip
# Writing the variables in a gzip-compressed pickle file.
# The "with" block closes the file, so the compressed stream is complete on the disk.
with gzip.open('Demo.pklz', 'wb') as gzip_file:
    pickle.dump((a, b, c, d), gzip_file)

In [54]:
# Reading the variables back from the compressed pickle file.
# The "with" block closes the file even if the loading fails.
with gzip.open('Demo.pklz', 'rb') as f:
    a, b, c, d = pickle.load(f)