## Accessing Data

* from files
* CSV, JSON, HTML, Excel, and HDF5
* web and cloud
* SQL DB
* web services

In [2]:
import numpy as np
import pandas as pd

pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)

## CSV and Text/Tabular formats

The most commonly used format.

In [3]:
!head -n 5 data/msft.csv

Date,Open,High,Low,Close,Volume,Adj Close
2014-07-21,83.46,83.53,81.81,81.93,2359300,81.93
2014-07-18,83.30,83.40,82.52,83.35,4020800,83.35
2014-07-17,84.35,84.63,83.33,83.63,1974000,83.63
2014-07-16,83.77,84.91,83.66,84.91,1755600,84.91


In [4]:
msft = pd.read_csv('data/msft.csv')
msft.head()

         Date   Open   High    Low  Close   Volume  Adj Close
0  2014-07-21  83.46  83.53  81.81  81.93  2359300      81.93
1  2014-07-18  83.30  83.40  82.52  83.35  4020800      83.35
2  2014-07-17  84.35  84.63  83.33  83.63  1974000      83.63
3  2014-07-16  83.77  84.91  83.66  84.91  1755600      84.91
4  2014-07-15  84.30  84.38  83.20  83.58  1874700      83.58

In [5]:
msft = pd.read_csv('data/msft.csv', index_col=0)
msft.head()

             Open   High    Low  Close   Volume  Adj Close
Date                                                      
2014-07-21  83.46  83.53  81.81  81.93  2359300      81.93
2014-07-18  83.30  83.40  82.52  83.35  4020800      83.35
2014-07-17  84.35  84.63  83.33  83.63  1974000      83.63
2014-07-16  83.77  84.91  83.66  84.91  1755600      84.91
2014-07-15  84.30  84.38  83.20  83.58  1874700      83.58

In [6]:
msft.dtypes

Open         float64
High         float64
Low          float64
Close        float64
Volume         int64
Adj Close    float64
dtype: object

In [7]:
# specify data types
msft = pd.read_csv('data/msft.csv', index_col=0, dtype={'Volume': np.float64})
msft.dtypes

Open         float64
High         float64
Low          float64
Close        float64
Volume       float64
Adj Close    float64
dtype: object

In [8]:
# specify column names
msft = pd.read_csv('data/msft.csv', 
                   index_col=0, 
                   dtype={'Volume': np.float64},
                   names=['open', 'high', 'low', 'close', 'volume', 'adjclose'],
                   header=0)
msft.head()

             open   high    low  close   volume  adjclose
2014-07-21  83.46  83.53  81.81  81.93  2359300     81.93
2014-07-18  83.30  83.40  82.52  83.35  4020800     83.35
2014-07-17  84.35  84.63  83.33  83.63  1974000     83.63
2014-07-16  83.77  84.91  83.66  84.91  1755600     84.91
2014-07-15  84.30  84.38  83.20  83.58  1874700     83.58

In [9]:
# specify specific columns
df2 = pd.read_csv('data/msft.csv', 
                   index_col=['Date'], 
                   usecols=['Date', 'Close'])
df2.head()

            Close
Date             
2014-07-21  81.93
2014-07-18  83.35
2014-07-17  83.63
2014-07-16  84.91
2014-07-15  83.58

## Saving DataFrame to a CSV file


In [10]:
df2.to_csv('data/msft_modified.csv', index_label='date')

## General field-delimited data

CSV is actually a specific implementation of what is referred to as field-dellimited data.

In [11]:
df = pd.read_table('data/msft.csv', sep=',')
df.head()

         Date   Open   High    Low  Close   Volume  Adj Close
0  2014-07-21  83.46  83.53  81.81  81.93  2359300      81.93
1  2014-07-18  83.30  83.40  82.52  83.35  4020800      83.35
2  2014-07-17  84.35  84.63  83.33  83.63  1974000      83.63
3  2014-07-16  83.77  84.91  83.66  84.91  1755600      84.91
4  2014-07-15  84.30  84.38  83.20  83.58  1874700      83.58

In [12]:
# and write it to a file
df.to_csv('data/msft_piped.txt', sep='|')

## Handling noise rows in field-delimited data


In [13]:
# !head data/msft2.csv
df = pd.read_csv('data/msft2.csv', skiprows=[0, 2, 3])
df

         Date   Open   High    Low  Close   Volume  Adj Close
0  2014-07-21  83.46  83.53  81.81  81.93  2359300      81.93
1  2014-07-18  83.30  83.40  82.52  83.35  4020800      83.35
2  2014-07-17  84.35  84.63  83.33  83.63  1974000      83.63
3  2014-07-16  83.77  84.91  83.66  84.91  1755600      84.91
4  2014-07-15  84.30  84.38  83.20  83.58  1874700      83.58
5  2014-07-14  83.66  84.64  83.11  84.40  1432100      84.40
6  2014-07-11  83.55  83.98  82.85  83.35  2001400      83.35
7  2014-07-10  85.20  85.57  83.36  83.42  2713300      83.42
8  2014-07-09  84.83  85.79  84.76  85.50  1540700      85.50

In [14]:
df = pd.read_csv('data/msft_with_footer.csv', skip_footer=2, engine='python')
df

         Date   Open   High    Low  Close   Volume  Adj Close
0  2014-07-21  83.46  83.53  81.81  81.93  2359300      81.93
1  2014-07-18  83.30  83.40  82.52  83.35  4020800      83.35

In [15]:
# limit rows
pd.read_csv('data/msft.csv', nrows=3)

         Date   Open   High    Low  Close   Volume  Adj Close
0  2014-07-21  83.46  83.53  81.81  81.93  2359300      81.93
1  2014-07-18  83.30  83.40  82.52  83.35  4020800      83.35
2  2014-07-17  84.35  84.63  83.33  83.63  1974000      83.63

In [16]:
pd.read_csv('data/msft.csv', skiprows=100, nrows=5,
           header=0,
           names=['open', 'high', 'low', 'close', 'vol', 'adjclose'])

             open   high    low  close      vol  adjclose
2014-03-03  80.35  81.31  79.91  79.97  5004100     77.40
2014-02-28  82.40  83.42  82.17  83.42  2853200     80.74
2014-02-27  84.06  84.63  81.63  82.00  3676800     79.36
2014-02-26  82.92  84.03  82.43  83.81  2623600     81.12
2014-02-25  83.80  83.80  81.72  83.08  3579100     80.41

## Reading and writing data in an Excel format

use pd.read_excel() or `ExcelFile` class, both use either XLRD or OpenPyXL packages.

In [17]:
df = pd.read_excel('data/stocks.xlsx')
df.head()

        Date   Open   High    Low  Close   Volume  Adj Close
0 2014-07-21  83.46  83.53  81.81  81.93  2359300      81.93
1 2014-07-18  83.30  83.40  82.52  83.35  4020800      83.35
2 2014-07-17  84.35  84.63  83.33  83.63  1974000      83.63
3 2014-07-16  83.77  84.91  83.66  84.91  1755600      84.91
4 2014-07-15  84.30  84.38  83.20  83.58  1874700      83.58

## Reading and writing JSON files

In [18]:
df.head().to_json('data/stocks.json')

## Read HTML

In [19]:
url = 'http://www.fdic.gov/bank/individual/failed/banklist.html'
banks = pd.read_html(url)

In [21]:
banks[0][:5].ix[:, 0:4]

                Bank Name            City  ST   CERT
0  Hometown National Bank        Longview  WA  35156
1     The Bank of Georgia  Peachtree City  GA  35259
2            Premier Bank          Denver  CO  34112
3          Edgebrook Bank         Chicago  IL  57772
4    Doral BankEn Espanol        San Juan  PR  32102

## HDF5

## Web and cloud



In [22]:
df = pd.read_csv('http://ichart.yahoo.com/table.csv?s=MSFT&' +
                'a=5&b=1&c=2014' +
                'd=5&e=30&f=2014&' + 
                'g=d&ignore=.csv')

In [23]:
df[:5]

         Date       Open       High        Low      Close    Volume  Adj Close
0  2015-12-18  55.770000  56.000000  54.029999  54.130001  83629500  54.130001
1  2015-12-17  56.360001  56.790001  55.529999  55.700001  41206000  55.700001
2  2015-12-16  55.540001  56.250000  54.759998  56.130001  37245900  56.130001
3  2015-12-15  55.660000  55.900002  55.090000  55.200001  39459000  55.200001
4  2015-12-14  54.330002  55.209999  53.680000  55.139999  46768900  55.139999

## SQL databases

* SQLite

In [25]:
import sqlite3

msft = pd.read_csv('data/msft.csv')
msft['Symbol'] = 'MSFT'

aapl = pd.read_csv('data/aapl.csv')
aapl['Symbol'] = 'AAPL'

conn = sqlite3.connect('data/stocks.sqlite')
msft.to_sql('STOCK_DATA', conn, if_exists='replace')
aapl.to_sql('STOCK_DATA', conn, if_exists='append')

conn.commit()
conn.close()



In [26]:
conn = sqlite3.connect('data/stocks.sqlite')
stocks = pd.io.sql.read_sql('SELECT * FROM STOCK_DATA;',
                           conn, index_col='index')
conn.close()
stocks.head()


             Date   Open   High    Low  Close   Volume  Adj Close Symbol
index                                                                   
0      2014-07-21  83.46  83.53  81.81  81.93  2359300      81.93   MSFT
1      2014-07-18  83.30  83.40  82.52  83.35  4020800      83.35   MSFT
2      2014-07-17  84.35  84.63  83.33  83.63  1974000      83.63   MSFT
3      2014-07-16  83.77  84.91  83.66  84.91  1755600      84.91   MSFT
4      2014-07-15  84.30  84.38  83.20  83.58  1874700      83.58   MSFT

## Remote data services

In [28]:
import pandas.io.data as web
import datetime

The pandas.io.data module is moved to a separate package (pandas-datareader) and will be removed from pandas in a future version.
After installing the pandas-datareader package (https://github.com/pydata/pandas-datareader), you can change the import ``from pandas.io import data, wb`` to ``from pandas_datareader import data, wb``.


In [29]:
start = datetime.datetime(2012, 1, 1)
end = datetime.datetime(2012, 1, 27)

yahoo = web.DataReader('MSFT', 'yahoo', start, end)
yahoo.head()

                 Open       High        Low      Close    Volume  Adj Close
Date                                                                       
2012-01-03  26.549999  26.959999  26.389999  26.770000  64731500  23.943792
2012-01-04  26.820000  27.469999  26.780001  27.400000  80516100  24.507280
2012-01-05  27.379999  27.730000  27.290001  27.680000  56081400  24.757720
2012-01-06  27.530001  28.190001  27.530001  28.110001  99455500  25.142323
2012-01-09  28.049999  28.100000  27.719999  27.740000  59706800  24.811385

## World Bank

In [30]:
import pandas.io.wb as wb
all_indicators = wb.get_indicators()
all_indicators.ix[:, 0:1]

The pandas.io.wb module is moved to a separate package (pandas-datareader) and will be removed from pandas in a future version.
After installing the pandas-datareader package (https://github.com/pydata/pandas-datareader), you can change the import ``from pandas.io import data, wb`` to ``from pandas_datareader import data, wb``.
  data = data.sort(columns='id')


                                  id
0                 1.0.HCount.1.25usd
1                   1.0.HCount.10usd
2                  1.0.HCount.2.5usd
3               1.0.HCount.Mid10to50
4                    1.0.HCount.Ofcl
...                              ...
15260      per_sionl.overlap_pop_urb
15261  per_sionl.overlap_q1_preT_tot
15262       per_sionl.overlap_q1_rur
15263       per_sionl.overlap_q1_tot
15264       per_sionl.overlap_q1_urb

[15265 rows x 1 columns]