# The `os` module
"This module provides a portable way of using operating system dependent functionality" such as setting the current working directory and other methods for accessing the filesystem.

https://docs.python.org/3/library/os.html

In [1]:
import os
# get current working directory
os.getcwd()

'/home/aditya/trainings/handling_data_in_python'

In [3]:
os.chdir('img')  # change the working directory
os.listdir()  # execute the 'ls' command: list files and folders

['python_types.png',
 'conda_logo.svg',
 'test_file.png',
 'pycharm_screenshot.jpg',
 'spyder_logo.png',
 'vis_landscape.jpg',
 'spyder_screenshot.png',
 'python-logo.png',
 'anaconda_logo.png',
 'MuliIndexDataFrame.png',
 'matplotlib_anatomy.webp',
 'pycharm_logo.png']

In [4]:
os.chdir('..')  # go one directory up
os.listdir()

['4_advanced_methods_and_descriptive_statistics_with_pandas.ipynb',
 'jsonfile.json',
 '6_visualization_with_matplotlib_and_pandas.ipynb',
 'img_ML',
 '3_introduction_to_numpy_and_pandas.ipynb',
 'data',
 '7_introduction_to_ML.ipynb',
 'textfile.txt',
 '5_reading_and_writing_data.ipynb',
 '1_data_structures_in_python.ipynb',
 'img',
 '2_control_flows.ipynb',
 'picklefile.pickle',
 '0_introduction.ipynb',
 '.ipynb_checkpoints']

In [5]:
os.mkdir('test_dir')  # create a folder
os.listdir()

['4_advanced_methods_and_descriptive_statistics_with_pandas.ipynb',
 'jsonfile.json',
 '6_visualization_with_matplotlib_and_pandas.ipynb',
 'img_ML',
 'test_dir',
 '3_introduction_to_numpy_and_pandas.ipynb',
 'data',
 '7_introduction_to_ML.ipynb',
 'textfile.txt',
 '5_reading_and_writing_data.ipynb',
 '1_data_structures_in_python.ipynb',
 'img',
 '2_control_flows.ipynb',
 'picklefile.pickle',
 '0_introduction.ipynb',
 '.ipynb_checkpoints']

In [6]:
os.rename('test_dir', 'test_dir_new_name')  # rename a file or folder
os.listdir()

['4_advanced_methods_and_descriptive_statistics_with_pandas.ipynb',
 'jsonfile.json',
 '6_visualization_with_matplotlib_and_pandas.ipynb',
 'img_ML',
 '3_introduction_to_numpy_and_pandas.ipynb',
 'data',
 '7_introduction_to_ML.ipynb',
 'textfile.txt',
 '5_reading_and_writing_data.ipynb',
 '1_data_structures_in_python.ipynb',
 'img',
 '2_control_flows.ipynb',
 'test_dir_new_name',
 'picklefile.pickle',
 '0_introduction.ipynb',
 '.ipynb_checkpoints']

In [7]:
os.removedirs('test_dir_new_name')  # delete a folder
os.listdir()

['4_advanced_methods_and_descriptive_statistics_with_pandas.ipynb',
 'jsonfile.json',
 '6_visualization_with_matplotlib_and_pandas.ipynb',
 'img_ML',
 '3_introduction_to_numpy_and_pandas.ipynb',
 'data',
 '7_introduction_to_ML.ipynb',
 'textfile.txt',
 '5_reading_and_writing_data.ipynb',
 '1_data_structures_in_python.ipynb',
 'img',
 '2_control_flows.ipynb',
 'picklefile.pickle',
 '0_introduction.ipynb',
 '.ipynb_checkpoints']

# The `os.path` module
"This module implements some useful functions on pathnames."


"The path parameters can be passed as either strings, or bytes. Applications are encouraged to represent file names as (Unicode) character strings. Unfortunately, some file names may not be representable as strings on Unix, so applications that need to support arbitrary file names on Unix should use bytes objects to represent path names. Vice versa, using bytes objects cannot represent all file names on Windows (in the standard mbcs encoding), hence Windows applications should use string objects to access all files."

https://docs.python.org/3/library/os.path.html

Many methods accept *path-like objects* as input:
"**path-like object:** An object representing a file system path. A path-like object is either a ```str``` or ```bytes``` object representing a path, or an object implementing the ```os.PathLike``` protocol. An object that supports the ```os.PathLike``` protocol can be converted to a ```str``` or ```bytes``` file system path by calling the ```os.fspath()``` function; ```os.fsdecode()``` and ```os.fsencode()``` can be used to guarantee a ```str``` or ```bytes``` result instead, respectively. Introduced by PEP 519."

https://docs.python.org/3/glossary.html#term-path-like-object

The following cells introduce some basic functions. See https://docs.python.org/3/library/os.path.html for a complete list.

In [8]:
# get the current working directory
current_dir = os.getcwd()
current_dir

'/home/dan/git/u42/trainings/handling_data_in_python'

In [9]:
# check if a given object is a directory
os.path.isdir(current_dir)

True

In [10]:
# get the name of the directory
os.path.dirname(current_dir)

'/home/dan/git/u42/trainings'

In [11]:
# combine a directory and a file name
current_file = os.path.join(current_dir, 
                            '5_reading_and_writing_data.ipynb')
current_file

'/home/dan/git/u42/trainings/handling_data_in_python/5_reading_and_writing_data.ipynb'

In [12]:
# check if a file exists
os.path.isfile(current_file)

True

In [13]:
# check if a directory exists
os.path.exists(os.path.join(current_dir, 'img'))

True

In [14]:
# Return the canonical path of the specified filename, eliminating any symbolic links 
# encountered in the path (if they are supported by the operating system).

os.path.realpath('img')

'/home/dan/git/u42/trainings/handling_data_in_python/img'

In [15]:
# Return a normalized absolutized version of the pathname path. 
# On most platforms, this is equivalent to calling the function normpath() as follows:
# normpath(join(os.getcwd(), path)).

os.path.abspath('img')

'/home/dan/git/u42/trainings/handling_data_in_python/img'

In [16]:
# Normalize a pathname by collapsing redundant separators and up-level references 
# so that A//B, A/B/, A/./B and A/foo/../B all become A/B. 
# This string manipulation may change the meaning of a path that contains symbolic links. 
# On Windows, it converts forward slashes to backward slashes.

os.path.normpath('img')

'img'

In [17]:
# get the basename of a directory
os.path.basename(current_dir)

'handling_data_in_python'

In [18]:
# split file into directory path and file name
os.path.split(current_file)

('/home/dan/git/u42/trainings/handling_data_in_python',
 '5_reading_and_writing_data.ipynb')

# The `with` statement
"The with statement is used to wrap the execution of a block with methods defined by a context manager. This allows common `try…except…finally` usage patterns to be encapsulated for convenient reuse."
The context manager defines `__exit__()` and `__enter__()` methods.

After running the code in the with statement, a clean up process is executed. " The `with` statement guarantees that if the `__enter__()` method returns without an error, then `__exit__()` will always be called"

This is often used when working with files as described in the next section.

https://docs.python.org/3/reference/compound_stmts.html#the-with-statement

https://www.python.org/dev/peps/pep-0343/

`with expression as variable:
    with-block`

# Reading and Writing Files
This section describes how to read and write files (e.g. CSV files). The full documentation of reading and writing files with Python 3 can be found at https://docs.python.org/3/tutorial/inputoutput.html.


## The `open()` method
"`open()` returns a file object, and is most commonly used with two arguments: `open(filename, mode)`."
Mode can be one of the following:
* `'r'`: only reading (default)
* `'w'`: only writing (overwrites existing file)
* `'a'`: appending
* `'r+'`: reading and writing

Appending `'b'` to the mode "opens the file in *binary mode*" that "should be used for all files that don't contain text".

https://docs.python.org/3/tutorial/inputoutput.html#reading-and-writing-files


## Methods of File Objects

* `read(size)`: reads `size` bytes of the file (`size` by default reads the entire file into memory)
* `readline()`: "reads a single line"
* `write(string)`: :writes content of *string* to the file"

https://docs.python.org/3/tutorial/inputoutput.html#methods-of-file-objects

In [19]:
filename = 'textfile.txt'

f = open(filename, 'w')
f.write('my first line\n')
f.write('second line')
f.close()  # close file and free up system resources

In [20]:
f = open(filename, 'r')
print(f.read())
f.close()

my first line
second line


In [21]:
f = open(filename, 'r')
print(f.readline())
print(f.readline())
f.close()

my first line

second line


In [22]:
f = open(filename, 'r')
for line in f:
    print(line)
f.close()

my first line

second line


In [23]:
# Better: use the with statement to close the files automatically

with open(filename, 'r') as f:
    print(f.read())

my first line
second line


## The `json` Module
* "The standard module called `json` can take Python data hierarchies, and convert them to string representations; this process is called *serializing*."
* "Reconstructing the data from the string representation is called *deserializing*."
* "Between *serializing* and *deserializing*, the string representing the object may have been stored in a file or data, or sent over a network connection to some distant machine."

https://docs.python.org/3.3/tutorial/inputoutput.html#saving-structured-data-with-json

In [24]:
import json
# convert an object to its JSON string representation
x = [1, 'simple', 'list']
json.dumps(x)

'[1, "simple", "list"]'

In [25]:
# Serialize an object to a text file
jsonfile = 'jsonfile.json'
with open(jsonfile, 'w') as f:
    json.dump(x, f)

In [26]:
with open(jsonfile, 'r') as f:
    j = json.load(f)  # read a JSON file from disk
j

[1, 'simple', 'list']

## Reading and Writing Files with pandas
pandas can be used to read many formats such as CSV, Excel, JSON, Parquet, SQL, HDF, and many more. A full list of methods to read data is provided at http://pandas.pydata.org/pandas-docs/stable/reference/io.html. 

### Writing CSV

"Write object to a comma-separated values (csv) file."

```DataFrame.to_csv(path_or_buf=None, sep=', ', na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, mode='w', encoding=None, compression='infer', quoting=None, quotechar='"', line_terminator=None, chunksize=None, tupleize_cols=None, date_format=None, doublequote=True, escapechar=None, decimal='.')```

"**Parameters:**
* **path_or_buf:** *str or file handle, default ```None```.* File path or object, if ```None``` is provided the result is returned as a string. If a file object is passed it should be opened with ```newline=’‘```, disabling universal newlines.
* **sep:** *str, default ‘,’.* String of length 1. Field delimiter for the output file.
* **na_rep:** *str, default ‘’.* Missing data representation.
* **float_format:** *str, default None.* Format string for floating point numbers.
* **columns:** *sequence, optional.* Columns to write.
* **header:** *bool or list of str, default ```True```.* Write out the column names. If a list of strings is given it is assumed to be aliases for the column names.
* **index:** *bool, default ```True```.* Write row names (index).
* ...

**Returns: ```None``` or str.** If path_or_buf is ```None```, returns the resulting csv format as a string. Otherwise returns ```None```."

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html

In [27]:
import pandas as pd
# create a DataFrame
df = pd.DataFrame({'name': ['Raphael', 'Donatello'],
                   'mask': ['red', 'purple'],
                   'weapon': ['sai', 'bo staff']})
# write the DataFrame to disk
df.to_csv('data/turtles.csv', index=False)

In [29]:
df

Unnamed: 0,name,mask,weapon
0,Raphael,red,sai
1,Donatello,purple,bo staff


### Reading CSV

"Read a comma-separated values (csv) file into DataFrame. Also supports optionally iterating or breaking of the file into chunks. Additional help can be found in the online docs for IO Tools."

```pandas.read_csv(filepath_or_buffer, sep=', ', delimiter=None, header='infer', names=None, index_col=None, usecols=None, squeeze=False, prefix=None, mangle_dupe_cols=True, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, skipfooter=0, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, dayfirst=False, iterator=False, chunksize=None, compression='infer', thousands=None, decimal=b'.', lineterminator=None, quotechar='"', quoting=0, doublequote=True, escapechar=None, comment=None, encoding=None, dialect=None, tupleize_cols=None, error_bad_lines=True, warn_bad_lines=True, delim_whitespace=False, low_memory=True, memory_map=False, float_precision=None)```


"**Parameters:**
* **filepath_or_buffer:** *str, path object, or file-like object*. Any valid string path is acceptable. The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. A local file could be: file://localhost/path/to/table.csv. If you want to pass in a path object, pandas accepts either ```pathlib.Path``` or ```py._path.local.LocalPath```. By file-like object, we refer to objects with a ```read()``` method, such as a file handler (e.g. via builtin open function) or ```StringIO```.
* **sep:** *str, default ‘,’*. Delimiter to use. If sep is ```None```, the C engine cannot automatically detect the separator, but the Python parsing engine can, meaning the latter will be used and automatically detect the separator by Python’s builtin sniffer tool, ```csv.Sniffer```. In addition, separators longer than 1 character and different from ```'\s+'``` will be interpreted as regular expressions and will also force the use of the Python parsing engine. Note that regex delimiters are prone to ignoring quoted data. Regex example: ```'\r\t'```.
* **delimiter:** *str, default ```None```*. Alias for sep.
* ...

**Returns: DataFrame or TextParser**. A comma-separated values (csv) file is returned as two-dimensional data structure with labeled axes."

http://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html

In [32]:
# read a csv file from disk
pd.read_csv('data/turtles.csv')

Unnamed: 0,name,mask,weapon
0,Raphael,red,sai
1,Donatello,purple,bo staff


### Writing Excel

"Write object to an Excel sheet. To write a single object to an Excel *.xlsx* file it is only necessary to specify a target file name. To write to multiple sheets it is necessary to create an ```ExcelWriter``` object with a target file name, and specify a sheet in the file to write to. Multiple sheets may be written to by specifying unique ```sheet_name```. With all data written to the file it is necessary to save the changes. Note that creating an ```ExcelWriter``` object with a file name that already exists will result in the contents of the existing file being erased."

```DataFrame.to_excel(excel_writer, sheet_name='Sheet1', na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, startrow=0, startcol=0, engine=None, merge_cells=True, encoding=None, inf_rep='inf', verbose=True, freeze_panes=None)```

"**Parameters:**
* **excel_writer:** *str or ```ExcelWriter``` object*. File path or existing ```ExcelWriter```.
* **sheet_name:** *str, default ‘Sheet1’*. Name of sheet which will contain DataFrame.
* **na_rep:** *str, default ‘’*. Missing data representation.
* **float_format:** *str, optional*. Format string for floating point numbers. For example ```float_format="%.2f"``` will format 0.1234 to 0.12.
* **columns:** *sequence or list of str, optional*. Columns to write.
* **header:** *bool or list of str, default True*. Write out the column names. If a list of string is given it is assumed to be aliases for the column names.
* **index:** *bool, default ```True```*. Write row names (index).
* **index_label:** *str or sequence, optional*. Column label for index column(s) if desired. If not specified, and header and index are ```True```, then the index names are used. A sequence should be given if the DataFrame uses ```MultiIndex```.
* ..."

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_excel.html

In [35]:
# write DataFrame in .xlsx format to disk
df.to_excel('data/turtles.xlsx')

### Reading Excel

"Read an Excel file into a pandas DataFrame.

Support both *xls* and *xlsx* file extensions from a local filesystem or URL. Support an option to read a single sheet or a list of sheets."

```pandas.read_excel(io, sheet_name=0, header=0, names=None, index_col=None, parse_cols=None, usecols=None, squeeze=False, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skiprows=None, nrows=None, na_values=None, keep_default_na=True, verbose=False, parse_dates=False, date_parser=None, thousands=None, comment=None, skip_footer=0, skipfooter=0, convert_float=True, mangle_dupe_cols=True, **kwds)```

"**Parameters:**
* **io:** *str, file descriptor, pathlib.Path, ExcelFile or xlrd.Book*. The string could be a URL. Valid URL schemes include http, ftp, s3, gcs, and file. For file URLs, a host is expected. For instance, a local file could be /path/to/workbook.xlsx.
* **sheet_name:** *str, int, list, or ```None```, default 0*. Strings are used for sheet names. Integers are used in zero-indexed sheet positions. Lists of strings/integers are used to request multiple sheets. Specify ```None``` to get all sheets. Available cases:
  * Defaults to 0: 1st sheet as a DataFrame
  * 1: 2nd sheet as a DataFrame
  * "Sheet1": Load sheet with name “Sheet1”
  * [0, 1, "Sheet5"]: Load first, second and sheet named “Sheet5” as a dict of DataFrame
  * ```None```: All sheets.
* **header:** *int, list of int, default 0*. Row (0-indexed) to use for the column labels of the parsed DataFrame. If a list of integers is passed those row positions will be combined into a ```MultiIndex```. Use ```None``` if there is no header.
* **names:** *array-like, default ```None```*. List of column names to use. If file contains no header row, then you should explicitly pass ```header=None```.
* ...

**Returns: DataFrame or dict of DataFrames**. DataFrame from the passed in Excel file. See notes in ```sheet_name``` argument for more information on when a dict of DataFrames is returned."
`
http://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html

In [38]:
# read excel file from disk
pd.read_excel('data/turtles.xlsx')

Unnamed: 0.1,Unnamed: 0,name,mask,weapon
0,0,Raphael,red,sai
1,1,Donatello,purple,bo staff


### Writing JSON

"Convert the object to a JSON string. Note ```NaN```’s and ```None``` will be converted to null and datetime objects will be converted to UNIX timestamps."

```DataFrame.to_json(path_or_buf=None, orient=None, date_format=None, double_precision=10, force_ascii=True, date_unit='ms', default_handler=None, lines=False, compression='infer', index=True)```

"**Parameters:**
* **path_or_buf:** *string or file handle, optional*. File path or object. If not specified, the result is returned as a string.
* **orient:** *string*. Indication of expected JSON string format.
  * Series
    * default is ‘index’
    * allowed values are: {‘split’,’records’,’index’,’table’}
  * DataFrame
    * default is ‘columns’
    * allowed values are: {‘split’,’records’,’index’,’columns’,’values’,’table’}
  * The format of the JSON string
    * ‘split’ : dict like {‘index’ -> [index], ‘columns’ -> [columns], ‘data’ -> [values]}
    * ‘records’ : list like [{column -> value}, … , {column -> value}]
    * ‘index’ : dict like {index -> {column -> value}}
    * ‘columns’ : dict like {column -> {index -> value}}
    * ‘values’ : just the values array
    * ‘table’ : dict like {‘schema’: {schema}, ‘data’: {data}} describing the data, and the data component is like orient='records'.
* ..."


https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html

In [39]:
# write DataFrame to disk with records indication 
df.to_json('data/turtles.json', orient='records')

### Reading JSON

"Convert a JSON string to pandas object."

```pandas.read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=False, precise_float=False, date_unit=None, encoding=None, lines=False, chunksize=None, compression='infer')```

"**Parameters:**
* **path_or_buf:** *a valid JSON string or file-like, default: ```None```*. The string could be a URL. Valid URL schemes include http, ftp, s3, gcs, and file. For file URLs, a host is expected. For instance, a local file could be file://localhost/path/to/table.json
* **orient:** *string*. Indication of expected JSON string format. Compatible JSON strings can be produced by ```to_json()``` with a corresponding orient value. The set of possible orients is:
  * 'split' : dict like {index -> [index], columns -> [columns], data -> [values]}
  * 'records' : list like [{column -> value}, ... , {column -> value}]
  * 'index' : dict like {index -> {column -> value}}
  * 'columns' : dict like {column -> {index -> value}}
  * 'values' : just the values array

  The allowed and default values depend on the value of the typ parameter.
  * when typ == 'series',
    * allowed orients are {'split','records','index'}
    * default is 'index'
    * The Series index must be unique for orient 'index'.
  * when typ == 'frame',
    * allowed orients are {'split','records','index', 'columns','values', 'table'}
    * default is 'columns'
    * The DataFrame index must be unique for orients 'index' and 'columns'.
    * The DataFrame columns must be unique for orients 'index', 'columns', and 'records'.
* ...

**Returns: result:** *Series or DataFrame*, depending on the value of typ."

http://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_json.html

In [40]:
# read json from disk
pd.read_json('data/turtles.json')

Unnamed: 0,mask,name,weapon
0,red,Raphael,sai
1,purple,Donatello,bo staff


# Pickling
* "The `pickle` module implements binary protocols for serializing and de-serializing a Python object structure."
* "*Pickling* is the process whereby a Python object hierarchy is converted into a byte stream, and *unpickling* is the inverse operation, whereby a byte stream (from a binary file or bytes-like object) is converted back into an object hierarchy."
* "*Pickling* (and *unpickling*) is alternatively known as “serialization”, “marshalling,” or “flattening”."

https://docs.python.org/3.3/library/pickle.html#module-pickle

## Pickle vs JSON

| -                     | Pickle | JSON               |
|:----------------------|:-------|:-------------------|
| Serialization         | Binary | Text               |
| Human-readable        | No     | Yes                |
| Python-specific       | Yes    | No (interoperable) |
| Python build-in types | Yes    | No (default)       |
| Custom classes        | Yes    | No                 |

https://docs.python.org/3.3/library/pickle.html#comparison-with-json


## What can be pickled and unpickled?
* "`None`, `True`, and `False`
* integers, floating point numbers, complex numbers
* strings, bytes, bytearrays
* tuples, lists, sets, and dictionaries containing only picklable objects
* functions defined at the top level of a module
* built-in functions defined at the top level of a module
* classes that are defined at the top level of a module
* instances of such classes whose `__dict__` or the result of calling `__getstate__()` is picklable (see section [Pickling Class Instances](https://docs.python.org/3.3/library/pickle.html#pickle-inst) for details)."

`PicklingError` will be raised when trying to pickle unpickle objects.

https://docs.python.org/3.3/library/pickle.html#what-can-be-pickled-and-unpickled

In [42]:
x

[1, 'simple', 'list']

In [41]:
import pickle
pickle.dumps(x)

b'\x80\x03]q\x00(K\x01X\x06\x00\x00\x00simpleq\x01X\x04\x00\x00\x00listq\x02e.'

In [43]:
picklefile = 'picklefile.pickle'
# open file in write and binary mode: 'wb'
with open(picklefile, 'wb') as f:
    pickle.dump(x, f)

In [44]:
# open the file in read and binary mode
with open(picklefile, 'rb') as f:
    p = pickle.load(f)
p

[1, 'simple', 'list']

# The `try` statement
* "The `try` statement specifies exception handlers and/or cleanup code for a group of statements"
* "The `except` clause(s) specify one or more exception handlers. When no exception occurs in the `try` clause, no exception handler is executed"`
* "An expression-less `except` clause, if present, must be last; it matches any exception"
* "The optional `else` clause is executed if the control flow leaves the `try` suite, no exception was raised, and no `return`, `continue`, or `break` statement was executed"
* "If `finally` is present, it specifies a ‘cleanup’ handler"

https://docs.python.org/3/reference/compound_stmts.html#the-try-statement

In [45]:
def f(x):
    try:
        print("Starting...")
        print("1/x={} with x={}".format(1/x, x))
        return x
    except ZeroDivisionError:
        print("ZeroDivisionError raised...")
    except Exception as e:
        print("Exception: {}".format(e))
    finally:
        print("Finishing up...")
f(1)

Starting...
1/x=1.0 with x=1
Finishing up...


1

In [46]:
f(0)

Starting...
ZeroDivisionError raised...
Finishing up...


In [48]:
f('a')

Starting...
Exception: unsupported operand type(s) for /: 'int' and 'str'
Finishing up...


More exceptions are documented at https://docs.python.org/3/tutorial/errors.html#exceptions