# Chapter 6: Data loading, storage, file formats

In [2]:
import pandas as pd

## Read files
+ `.read_csv(path, sep= , header= , names=[list of column names], index_col=[list of index col names],`
    `skiprows=[rows to skip], na_values=[values considered as NA, or a dict], parse_dates=[list of columns to parse],`
    `date_parser=func to parse date, nrows=rows to be read, chunksize=number of rows per chunk)` - and many other formats  

In [6]:
# Stream the CSV in chunks of 1000 rows instead of loading it all at once.
chunker = pd.read_csv('data/ex6.csv', chunksize=1000)
# Running total of how often each value of 'key' occurs. The dtype is given
# explicitly because an empty Series has no data to infer it from (newer
# pandas warns about this, and pandas 2.0 changed the default to object).
tot = pd.Series([], dtype='float64')
for pieces in chunker:
    # fill_value=0 treats a key missing from either side as a zero count,
    # so keys first seen in a later chunk are added instead of becoming NaN.
    tot = tot.add(pieces['key'].value_counts(), fill_value=0)

In [9]:
# Series.order() was removed from pandas; sort_values() is its replacement.
tot.sort_values(ascending=False)

E    368
X    364
L    346
O    343
Q    340
M    338
J    337
F    335
K    334
H    330
V    328
I    327
U    326
P    324
D    320
A    320
R    318
Y    314
G    308
S    308
N    306
W    305
T    304
B    302
Z    288
C    286
4    171
6    166
7    164
8    162
3    162
5    157
2    152
0    151
9    150
1    146
dtype: float64

### Manually process data example
+ `csv.reader(open file, dialect=)`

In [11]:
import csv

In [16]:
# `f` is deliberately left open here: later cells pass it to csv.reader again.
f = open('data/ex7.csv')
reader = csv.reader(f)
for line in reader:
    # Each row is yielded as a list of strings. print(...) with a single
    # argument behaves the same on Python 2 and Python 3.
    print(line)

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3', '4']


In [18]:
# Read every row up front; the context manager closes the file handle, which
# an anonymous open(...) passed straight to csv.reader never does.
with open('data/ex7.csv') as csv_file:
    lines = list(csv.reader(csv_file))
lines

[['a', 'b', 'c'], ['1', '2', '3'], ['1', '2', '3', '4']]

In [23]:
# The first row holds the column names; everything after it is data.
header = lines[0]
values = lines[1:]
# zip(*values) transposes the rows into columns (stopping at the shortest
# row); pairing each column with its header name builds the mapping.
data_dict = dict(zip(header, zip(*values)))
data_dict

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

In [22]:
# Transpose rows into columns. list(...) materialises the result so the cell
# displays the tuples on Python 3 as well, where zip() returns a lazy iterator.
list(zip(*values))

[('1', '1'), ('2', '2'), ('3', '3')]

In [25]:
class my_dialect(csv.Dialect):
    """CSV dialect for semicolon-delimited text with double-quoted fields."""
    lineterminator = '\n'
    delimiter = ';'
    quotechar = '"'
    # csv.Dialect leaves `quoting` as None, which the csv module rejects with
    # TypeError('"quoting" must be an integer') as soon as the dialect is
    # handed to csv.reader / csv.writer, so it has to be set explicitly.
    quoting = csv.QUOTE_MINIMAL

In [None]:
# Two alternative ways to configure a reader (the second rebinds `reader`):
# 1) pass a csv.Dialect subclass
reader = csv.reader(f, dialect=my_dialect)
# 2) pass individual formatting parameters as keyword arguments
reader = csv.reader(f, delimiter='|')

## Write data

+ `df/series.to_csv('path' or sys.stdout, sep='', na_rep='NULL', index=False, header=False, columns=[keys of columns desired])` - `columns` was called `cols` in old pandas versions  

writer = csv.writer(f, dialect=my_dialect)
writer.writerow((row content))

### JSON data

In [None]:
import json
# JSON text -> Python object
# (`obj` is not defined in the visible cells; presumably a JSON string -- confirm)
result = json.loads(obj)
# Python object -> JSON text
asjson = json.dumps(result)

### XML and HTML for web scraping

In [None]:
from lxml.html import parse
from lxml import objectify
from urllib2 import urlopen
# Open the URL and hand the response to lxml.html's parse() to build a tree.
parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
# Root element of the parsed tree; the queries in later cells start from it.
doc = parsed.getroot() 

In [35]:
# Hyperlinks are 'a' elements, so './/a' collects every link element found
# anywhere below the root.
links = doc.findall('.//a') 
# Show a small slice of the matched elements rather than the whole list.
links[15:20]

[<Element a at 0xa7fc228>,
 <Element a at 0xa7fc278>,
 <Element a at 0xa7fc2c8>,
 <Element a at 0xa7fc318>,
 <Element a at 0xa7fc368>]

In [36]:
# Pick a single link element to inspect (index 28 is an arbitrary example).
link = links[28]
link

<Element a at 0xa7fc638>

In [37]:
# The element's 'href' attribute, i.e. the URL the link points to.
link.get('href')

'https://autos.yahoo.com/'

In [38]:
# The text content of the link element.
link.text_content()

'Autos'

In [41]:
# Use a distinct loop variable: on Python 2 a list-comprehension variable
# leaks into the enclosing scope, so reusing the name `link` would silently
# rebind the `link` picked in an earlier cell to the last anchor on the page.
urls = [anchor.get('href') for anchor in doc.findall('.//a')]
urls[:10]

['https://www.yahoo.com/',
 'https://mail.yahoo.com/?.intl=us&.lang=en-US&.src=ym',
 'https://search.yahoo.com/search',
 'http://news.yahoo.com/',
 'http://sports.yahoo.com/',
 'http://finance.yahoo.com/',
 'https://weather.yahoo.com/',
 'https://games.yahoo.com/',
 'https://answers.yahoo.com/',
 'https://screen.yahoo.com/']

In [45]:
# Collect every 'table' element below the root, then look at the third one.
tables = doc.findall('.//table')
tables[2]

<Element table at 0xa8fd188>

### Binary data format
+ HDF5 _hierarchical data format_, best for write-once, read-many

In [None]:
# DataFrame.save() and pd.load() were removed from pandas; to_pickle() and
# read_pickle() are their replacements.
df.to_pickle('path/file_name_pickle')
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
pd.read_pickle('path/file_name_pickle')

In [54]:
# The store is left open because later cells read from it; call
# store.close() once finished with it.
store = pd.HDFStore('data/mydata.h5')
store['obj1'] = tot
# Series.order() was removed from pandas; sort_values() is its replacement.
store['obj1_col1'] = tot.sort_values(ascending=False)

In [51]:
# Displaying the store lists the objects saved in it.
store

<class 'pandas.io.pytables.HDFStore'>
File path: data/mydata.h5
/obj1                 series       (shape->[36])
/obj1_col1            series       (shape->[36])

In [55]:
# Read the stored object back by key and show its first ten entries.
store['obj1'][:10]

0    151
1    146
2    152
3    162
4    171
5    157
6    166
7    164
8    162
9    150
dtype: float64

### Excel

In [None]:
# `path` is a placeholder for the workbook location (not defined in the
# visible cells).
xls_file = pd.ExcelFile(path) # read into excel class
# Fixed typo: the original called `xle_file.parse`, which raises NameError.
table = xls_file.parse('sheet_name') # read sheet into dataframe

### Interacting with HTML and Web APIs

In [60]:
import requests
import json

Search for 'pandas' on twitter

In [76]:
# The query string is already URL-encoded (%20 is a space).
url = 'https://twitter.com/search?q=python%20pandas'
# NOTE(review): no timeout is passed to requests.get -- consider adding one
# so a stalled connection cannot hang the notebook.
resp = requests.get(url)
resp

<Response [200]>

In [None]:
# Decode the response body as JSON and list its top-level keys.
# NOTE(review): the URL requested above looks like a regular search page
# rather than a JSON API endpoint, in which case json.loads would raise --
# confirm the endpoint before relying on this cell.
data = json.loads(resp.text)
data.keys()

### Interacting with database

In [94]:
import sqlite3
import pandas.io.sql as sql

# Open a throwaway in-memory SQLite database.
con = sqlite3.connect(':memory:')

# DDL for a small four-column demo table: two text columns, one real, one integer.
query = """
CREATE TABLE test (a VARCHAR(20), b VARCHAR(20),
                   c REAL,        d INTEGER
                   );"""

# Create the table and commit.
con.execute(query)
con.commit()

In [95]:
# insert data: one tuple per row, in the table's column order (a, b, c, d)
data = [('a','b', 12.3, 4), ('c','d', 56.7, 8)]
# One '?' placeholder per column; the values are bound by sqlite3 rather than
# being formatted into the SQL string.
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
# Run the statement once for each tuple in `data`, then commit.
con.executemany(stmt, data)
con.commit()

In [96]:
# Run the query and pull every result row into a list of tuples.
cursor = con.execute('select * from test')
rows = cursor.fetchall()
rows

[(u'a', u'b', 12.3, 4), (u'c', u'd', 56.7, 8)]

In [97]:
# Column metadata for the executed query: one 7-item tuple per column, with
# the column name in the first position.
cursor.description

(('a', None, None, None, None, None, None),
 ('b', None, None, None, None, None, None),
 ('c', None, None, None, None, None, None),
 ('d', None, None, None, None, None, None))

In [98]:
# cursor.description holds one tuple per column with the column name first.
# Taking item 0 of each tuple avoids subscripting zip(), which only works on
# Python 2 (on Python 3 zip() returns an iterator and cannot be indexed).
pd.DataFrame(rows, columns=[col[0] for col in cursor.description])

Unnamed: 0,a,b,c,d
0,a,b,12.3,4
1,c,d,56.7,8


In [100]:
# Reuse the existing `con`: every sqlite3.connect(':memory:') call opens a
# brand-new, empty database, so reconnecting here would lose the `test` table
# and the query would fail with "no such table".
# pd.read_sql replaces pandas.io.sql.read_frame, which was removed from pandas.
pd.read_sql('select * from test', con)

Unnamed: 0,a,b,c,d
0,a,b,12.3,4
1,c,d,56.7,8
