# pandas I/O tools and examples

In [1]:
import addutils.toc ; addutils.toc.js(ipy_notebook=True)

## 1 Matlab Variables

### 1.1 Import a Matlab variable from file

In [2]:
import re
import os 
import scipy.io
import numpy as np
import pandas as pd
import sqlite3
from pandas.io.data import DataFrame, read_csv
import pandas.io.sql as psql
from time import time
from IPython.display import display
from addutils import css_notebook
css_notebook()

In [3]:
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

Import from '.mat' files

In [4]:
x = scipy.io.loadmat('example_data/matlab_variable.mat')

IOError: [Errno 2] No such file or directory: 'example_data/matlab_variable.mat'

In [5]:
pyA = x['a']
pyA

NameError: name 'x' is not defined

The Matlab variable is passed to a pandas DataFrame:

In [6]:
df = pd.DataFrame(pyA)
df

NameError: name 'pyA' is not defined

## 2 Importing a compressed CSV

The following example shows how to import directly a compressed csv file, in this case with multiple separators:

In [7]:
df_csv = pd.read_csv('example_data/pd12_peopl.csv.gz', sep=r'\,\;\.',
                     skipinitialspace=True, compression='gzip')
df_csv.head()

IOError: [Errno 2] No such file or directory: 'example_data/pd12_peopl.csv.gz'

In [8]:
coor = df_csv['Coordinates']
df_csv['lat'] = ''
df_csv['lon'] = ''
for j, coo in enumerate(coor):
    spl = re.split(',', str(coo))
    df_csv['lat'][j] = spl[0]
    df_csv['lon'][j] = spl[1]
df_csv.head()

NameError: name 'df_csv' is not defined

In [9]:
df_csv.to_json('converted_json.json')

NameError: name 'df_csv' is not defined

## 3 Importing JSON files

In [10]:
#dfjson = pd.read_json(fg.get_example('generatedSimple.json'))
dfjson = pd.read_json('converted_json.json')
dfjson.head()

ValueError: Expected object or value

Since the import reordered columns in alphabetical order, we can choose a preferred column order:

In [11]:
dfjson = dfjson.ix[:, ['ID', 'Names', 'Phone', 'Income', 'Town', 'lat', 'lon']]
dfjson.head()

NameError: name 'dfjson' is not defined

## 4 Importing HTML

Note: `read_html` returns a **list** of DataFrame objects, even if there is only a single table contained in the `HTML` content. Infer_types avoids the function trying to automatically detect numeric and date types (this generated an error with coordinates)

In [12]:
dfhtml = pd.read_html('example_data/generated.html', header=0)
dfhtml[0].head()

ImportError: html5lib not found, please install it

## 5 Importing Excel

In [13]:
dfxl = pd.read_excel('example_data/generated2.xls', 'foglio')
dfxl.head()

IOError: [Errno 2] No such file or directory: 'example_data/generated2.xls'

## 6 Working with SQL and databases

### 6.1 Write SQL

Let's store the DataFrame opened from excel in a database. We use SQLite, a database engine library suitable for storing data in a single-file database. 'Names' is the name we chose for the database table we are creating:

In [14]:
con = sqlite3.connect("temp.sql")
sqlfile = dfxl.to_sql('Names', con, flavor='sqlite')

NameError: name 'dfxl' is not defined

### 6.2 Import SQL

In [15]:
con = sqlite3.connect('temp.sql')
with con:
    sql = "SELECT * FROM Names;"
    df = psql.frame_query(sql, con)
    print df.shape

DatabaseError: Execution failed on sql 'SELECT * FROM Names;': no such table: Names

In [16]:
df.head()

NameError: name 'df' is not defined

In [17]:
con.close()
os.remove("temp.sql")

## 7 Working with HDF5

### 7.1 Storer format

**HDFStore** is a dict-like object used by pandas to store datasets as **HDF5** files using the **PyTables** library. **HDF5** is a scientific hierarchical data format suitable for storing in a file very large and multi-dimensional data arrays. The **Storer** format stores fixed arrays, which are queryiable and must be retrieved in their entirety.

Add DataFrames to the HDFStore object:

In [18]:
samples_01 = 3e5
samples_02 = 1e5
idx1 = pd.date_range('1/1/2000 12:00:00', periods=samples_01, freq='50ms', tz='Europe/Rome')
idx2 = pd.date_range('1/1/2000 12:05:00', periods=samples_02, freq='100ms', tz='Europe/Rome')
randn = np.random.randn
df1 = pd.DataFrame(randn(samples_01, 3), index=idx1, columns=['A', 'B', 'C'])
df2 = pd.DataFrame(randn(samples_02, 4), index=idx2, columns=['A', 'B', 'C', 'D'])

In [19]:
print 'Size of the Dataset: ', (df1.values.nbytes+df1.values.nbytes)/2**20, ' MB'

In [20]:
%%timeit
with pd.get_store('store5.h5') as store:
    store.put('storer/df1', df1)
    store.put('storer/df2', df2)
    store.put('to_remove', df2)

Retrieve stored objects:

In [21]:
with pd.get_store('store5.h5') as store:
    print store.keys()
    df1_retrieved = store.get('storer/df1')
    print df1_retrieved[1:3]
    print df1[1:3]
    print 'Check retrieved data equal to original data: '
    print df1_retrieved[1:3].equals(df1[1:3])

Delete objects:

In [22]:
with pd.get_store('store5.h5') as store:
    try:
        store.remove('to_remove')
    except:
        pass
    print store

### 7.2 Table format

The table format conceptually is shaped very much like a DataFrame and may be appended to in the same or other sessions. In addition, delete & query type operations are supported.

In [23]:
with pd.get_store('store5.h5') as store:
    # store.append creates a table automatically:
    store.append('table/df1_appended', df1.ix[:10000])

In [24]:
with pd.get_store('store5.h5') as store:
    store.append('table/df1_appended', df1.ix[10001:20000])
    store.append('table/df1_appended', df1.ix[20001:50000])

### 7.3 Querying a Table

Query the table using boolean expression with in-line function evaluation:

In [25]:
with pd.get_store('store5.h5') as store:
    query01 = store.select('table/df1_appended',
                           "index>=Timestamp('2000-01-01 12:00:00.20+01:00') \
                           & index<Timestamp('2000-01-01 12:00:00.40+01:00') \
                           & columns=['A', 'B']")
query01

Unnamed: 0,A,B
2000-01-01 12:00:00.200000+01:00,-1.101898,-0.315307
2000-01-01 12:00:00.250000+01:00,0.320722,-0.566988
2000-01-01 12:00:00.300000+01:00,1.555872,0.361491
2000-01-01 12:00:00.350000+01:00,-0.977529,0.670213
2000-01-01 12:00:00.200000+01:00,1.548555,0.415718
2000-01-01 12:00:00.250000+01:00,-1.090527,0.357786
2000-01-01 12:00:00.300000+01:00,-0.264072,-0.181619
2000-01-01 12:00:00.350000+01:00,-0.227602,1.047986


---

Visit [www.add-for.com](<http://www.add-for.com/IT>) for more tutorials and updates.

This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International License</a>.