#  READING AND WRITING DATA USING PANDAS

## Reading Data in CSV or Text Files

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the chapter's first sample CSV; the file name follows the 'ch05_'
# prefix used by every other read of this data set in the notebook.
csvframe = pd.read_csv('ch05_01.csv')
csvframe

In [None]:
pd.read_table('ch05_01.csv', sep=',')

In [None]:
pd.read_csv('ch05_02.csv')

In [None]:
pd.read_csv('ch05_02.csv', header=None)

In [None]:
pd.read_csv('ch05_02.csv', names=['white','red','blue','green','animal'])

In [None]:
pd.read_csv('ch05_03.csv', index_col=['color','status'])

###  Using RegExp for Parsing TXT Files

In [None]:
# Raw string so the regex escape \s reaches pandas unchanged; in a plain
# string literal '\s' is an invalid escape sequence and triggers a warning.
pd.read_table('ch05_04.txt', sep=r'\s+', engine='python')

In [None]:
# Split on any run of non-digit characters. Raw string so the regex escape \D
# is not treated as an (invalid) string escape sequence.
pd.read_table('ch05_05.txt', sep=r'\D+', header=None, engine='python')

In [None]:
pd.read_table('ch05_06.txt', sep=',', skiprows=[0,1,3,6])

### Reading TXT Files Partially or in Chunks

In [None]:
pd.read_csv('ch05_02.csv',skiprows=[2],nrows=3,header=None)

In [None]:
# Read the file three rows at a time and collect the per-chunk sum of the
# 'white' column. Series.set_value() was removed in pandas 1.0, so the sums
# are gathered first and the Series is built once at the end.
pieces = pd.read_csv('ch05_01.csv', chunksize=3)
out = pd.Series([piece['white'].sum() for piece in pieces])
out

### Writing Data in CSV

In [None]:
# 4x4 frame holding the integers 0..15, rows labelled by colour and columns
# labelled by object.
row_labels = ['red', 'blue', 'yellow', 'white']
col_labels = ['ball', 'pen', 'pencil', 'paper']
values = np.arange(16).reshape((4, 4))
frame = pd.DataFrame(values, index=row_labels, columns=col_labels)
frame

In [None]:
frame.to_csv('ch05_07.csv')
frame2 = pd.read_csv('ch05_07.csv')
frame2

In [None]:
# The file is written without index and header, so it has to be read back
# with header=None; otherwise the first data row is consumed as column names.
frame.to_csv('ch05_07b.csv', index=False, header=False)
frame2 = pd.read_csv('ch05_07b.csv', header=None)
frame2

In [None]:
# Mostly-empty 5x5 frame: only the 'ball' and 'pen' columns carry values in
# three of the rows; every other cell is NaN.
nan = np.nan
frame3 = pd.DataFrame(
    [
        [6, nan, nan, 6, nan],
        [nan] * 5,
        [nan] * 5,
        [20, nan, nan, 20.0, nan],
        [19, nan, nan, 19.0, nan],
    ],
    index=['blue', 'green', 'red', 'white', 'yellow'],
    columns=['ball', 'mug', 'paper', 'pen', 'pencil'],
)

In [None]:
frame3.to_csv('ch5_08.csv')
frame4 = pd.read_csv('ch5_08.csv')
frame4

In [None]:
frame3.to_csv('ch5_09.csv', na_rep = 'NaN')
frame5 = pd.read_csv('ch5_09.csv')
frame5

## Reading and Writing HTML Files

### Writing Data in HTML

In [None]:
frame = pd.DataFrame(np.arange(4).reshape(2,2))
print(frame.to_html())

In [None]:
frame = pd.DataFrame( np.random.random((4,4)),
                    index = ['white','black','red','blue'],
                    columns = ['up','down','right','left'])
frame

In [None]:
# Wrap the frame's HTML table in a minimal page skeleton and concatenate the
# pieces into a single string.
s = [
    '<HTML>',
    '<HEAD><TITLE>My DataFrame</TITLE></HEAD>',
    '<BODY>',
    frame.to_html(),
    '</BODY></HTML>',
]
html = ''.join(s)

In [None]:
# The context manager guarantees the file is flushed and closed even if the
# write raises.
with open('myFrame.html', 'w') as html_file:
    html_file.write(html)

### Reading Data from an HTML File

In [None]:
web_frames = pd.read_html('myFrame.html')
web_frames[0]

In [None]:
ranking = pd.read_html('https://www.meccanismocomplesso.org/en/meccanismo-complesso-sito-2/classifica-punteggio/')
ranking[0]

### Reading Data from XML

In [None]:
from lxml import objectify
xml = objectify.parse('books.xml')
xml

In [None]:
root = xml.getroot()

In [None]:
root.Book.Author

In [None]:
root.Book.PublishDate

In [None]:
root.getchildren()

In [None]:
[child.tag for child in root.Book.getchildren()]

In [None]:
[child.text for child in root.Book.getchildren()]

In [None]:
def _element_children(node):
    """Return the direct children of *node* as a list.

    lxml elements expose ``getchildren()`` (for ``objectify`` elements plain
    iteration walks same-tag siblings rather than children, so the method is
    required there). The standard library's ElementTree dropped the method in
    Python 3.9, so fall back to ``list(node)`` when it is absent.
    """
    getchildren = getattr(node, 'getchildren', None)
    return getchildren() if getchildren is not None else list(node)


def etree2df(root):
    """Convert a two-level XML tree into a DataFrame.

    Each child of *root* becomes one row, indexed 0..n-1 in document order.
    Column names are the tags of the first record's children; cell values are
    the ``text`` of each record's children, matched to columns by position.

    Args:
        root: Root element whose children are the records.

    Returns:
        pandas.DataFrame with one row per record. An empty frame is returned
        if *root* has no children; a record with fewer fields than the first
        one gets missing values in its trailing columns.
    """
    records = _element_children(root)
    if not records:
        return pd.DataFrame()
    column_names = [field.tag for field in _element_children(records[0])]
    # Collect every row first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on each call.
    rows = [
        dict(zip(column_names, (field.text for field in _element_children(record))))
        for record in records
    ]
    return pd.DataFrame(rows, columns=column_names)

In [None]:
etree2df(root)

## Reading and Writing Data on Microsoft Excel Files

In [None]:
pd.read_excel('ch05_data.xlsx')

In [None]:
pd.read_excel('ch05_data.xlsx','Sheet2')

In [None]:
pd.read_excel('ch05_data.xlsx',1)

In [None]:
frame = pd.DataFrame(np.random.random((4,4)),
                    index = ['exp1','exp2','exp3','exp4'],
                    columns = ['Jan2015','Feb2015','Mar2015','Apr2015'])
frame

In [None]:
frame.to_excel('ch05_data02.xlsx')

## JSON Data

In [None]:
frame = pd.DataFrame(np.arange(16).reshape(4,4),
                    index=['white','black','red','blue'],
                    columns=['up','down','right','left'])
frame.to_json('frame.json')

In [None]:
pd.read_json('frame.json')

In [None]:
# json_normalize has been a top-level pandas function since 1.0; the old
# pandas.io.json location was deprecated at that point and later removed.
try:
    from pandas import json_normalize
except ImportError:  # pandas < 1.0
    from pandas.io.json import json_normalize

In [None]:
# Parse the file with the standard-library json module: recent pandas
# releases no longer expose pd.io.json.loads. The context manager closes the
# file handle once the document has been parsed.
import json
with open('books.json', 'r') as file:
    text = json.load(file)

In [None]:
json_normalize(text,'books')

In [None]:
json_normalize(text,'books',['writer','nationality'])

## The HDF5 Format

In [None]:
from pandas.io.pytables import HDFStore

In [None]:
frame = pd.DataFrame(np.arange(16).reshape(4,4),
                    index=['white','black','red','blue'],
                    columns=['up','down','right','left'])
store = HDFStore('ch05_data.h5')
store['obj1'] = frame

In [None]:
frame2 = pd.DataFrame(np.arange(0,8,0.5).reshape(4,4))
frame2

In [None]:
store['obj2'] = frame2

In [None]:
store

In [None]:
store['obj2']

## Pickle - Python Object Serialization

### Serialize a Python Object with pickle (cPickle in Python 2)

In [None]:
# In Python 3 the C-accelerated implementation (Python 2's cPickle, now the
# private _pickle module) is picked up automatically by the standard pickle
# module, so import the public module rather than the private one.
import pickle

In [None]:
data = { 'color': ['white','red'], 'value': [5, 7]}
pickled_data = pickle.dumps(data)
pickled_data

In [None]:
print(pickled_data)

In [None]:
nframe = pickle.loads(pickled_data)
nframe

### Pickling with pandas

In [None]:
frame = pd.DataFrame(np.arange(16).reshape(4,4),
                    index=['up','down','left','right'])
frame.to_pickle('frame.pkl')

In [None]:
pd.read_pickle('frame.pkl')

## Interacting with Databases

In [None]:
from sqlalchemy import create_engine

In [None]:
engine = create_engine('postgresql://scott:tiger@localhost:5432/mydatabase')

### Loading and Writing Data with SQLite3

In [None]:
frame = pd.DataFrame(np.arange(20).reshape(4,5),
                    columns=['white','red','blue','black','green'])
frame

In [None]:
engine = create_engine('sqlite:///foo.db')

In [None]:
frame.to_sql('colors',engine)

In [None]:
pd.read_sql('colors',engine)

In [None]:
import sqlite3
# DDL for a four-column table named 'test'.
query = """
         CREATE TABLE test
         (a VARCHAR(20), b VARCHAR(20),
          c REAL,        d INTEGER
         );"""
# ':memory:' opens a database that lives only in RAM.
con = sqlite3.connect(':memory:')
# Create the table on that connection.
con.execute(query)

In [None]:
con.commit()

In [None]:
# One tuple per row to insert, four values each.
data = [('white','up',1,3),
        ('black','down',2,8),
        ('green','up',4,4),
        ('red','down',5,5)]
# Parameterised INSERT: each '?' is bound to one value of a tuple.
stmt = "INSERT INTO test VALUES(?,?,?,?)"
# Run the statement once for every tuple in data.
con.executemany(stmt, data)

In [None]:
con.commit()

In [None]:
cursor = con.execute('select * from test')
cursor

In [None]:
rows = cursor.fetchall()
rows

In [None]:
cursor.description

In [None]:
# The first item of every cursor.description entry is the column name.
# zip() returns an iterator in Python 3 and cannot be subscripted, so the
# names are picked out directly.
pd.DataFrame(rows, columns=[column[0] for column in cursor.description])

### Loading and Writing Data with PostgreSQL

In [None]:
pd.__version__

In [None]:
import psycopg2
engine = create_engine('postgresql://postgres:password@localhost:5432/postgres')

In [None]:
# Random 4x4 frame (experiments x months) written to the 'dataframe' table
# through the SQLAlchemy engine.
experiments = ['exp1', 'exp2', 'exp3', 'exp4']
months = ['feb', 'mar', 'apr', 'may']
frame = pd.DataFrame(np.random.random((4, 4)), index=experiments, columns=months)
frame.to_sql('dataframe', engine)

In [None]:
pd.read_sql_table('dataframe',engine)

In [None]:
pd.read_sql_query('SELECT index,apr,may FROM DATAFRAME WHERE apr > 0.5', engine)

## Reading and Writing Data with a NoSQL Database: MongoDB

In [None]:
import pymongo
# Only the pymongo module is imported, so MongoClient has to be reached
# through it; the bare name raised NameError.
client = pymongo.MongoClient('localhost', 27017)

In [None]:
db = client.mydatabase
db

In [None]:
client['mydatabase']

In [None]:
collection = db.mycollection
db['mycollection']

In [None]:
collection

In [None]:
frame = pd.DataFrame( np.arange(20).reshape(4,5), 
                      columns = ['white','red','blue','black','green'])
frame

In [None]:
import json
# Transposing makes every original row a top-level JSON object keyed by its
# index label. values() must be *called* (the bare attribute is just the
# bound method), and list() turns the view into a list of row documents.
record = list(json.loads(frame.T.to_json()).values())
record

In [None]:
# Collection.insert() was removed in PyMongo 4. insert_many() expects a list
# of documents, so the records are materialised into a list first.
collection.mydocument.insert_many(list(record))

In [None]:
# Load the query results into a DataFrame (a plain list cannot be indexed by
# column name), then drop the '_id' column before displaying the result.
cursor = collection['mydocument'].find()
dataframe = pd.DataFrame(list(cursor))
del dataframe['_id']
dataframe