# 6.1 Reading and Writing Data in Text Format

In [1]:
import numpy as np
import pandas as pd

Input and output typically fall into a few main categories: reading text files and other
more efficient on-disk formats, loading data from databases, and interacting with network
sources like web APIs.

---

The optional arguments for parsing functions may fall into
a few categories:

- Indexing

    Can treat one or more columns as the index of the returned DataFrame, and whether to get
    column names from the file, the user, or not at all.

- Type inference and data conversion

    This includes the user-defined value conversions and custom list of missing value
    markers.

- Datetime parsing

    Includes combining capability, including combining date and time information
    spread over multiple columns into a single column in the result.
    
- Iterating

    Support for iterating over chunks of very large files.
    
- Unclean data issues

    Skipping rows or a footer, comments, or other minor things like numeric data
    with thousands separated by commas.

In [2]:
df = pd.read_csv('ex1.csv')

In [3]:
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [4]:
pd.read_table('ex1.csv', sep=',')

  """Entry point for launching an IPython kernel.


Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [5]:
pd.read_csv('ex2.csv', header=None)

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [6]:
pd.read_csv('ex2.csv', names=['a', 'b', 'c', 'd', 'message'])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [8]:
names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('ex2.csv', names=names, index_col='message')

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


Form a hierarchical index from multiple columns

In [10]:
parsed = pd.read_csv('csv_mindex.csv', index_col=['key1', 'key2'])

In [11]:
parsed

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [14]:
# Fields are separated by runs of whitespace. sep is interpreted as a regular
# expression, so it is written as a raw string to keep the backslash intact
# (a plain '\s' is an invalid string escape).
result = pd.read_table('ex3.txt', sep=r'\s+')

  """Entry point for launching an IPython kernel.


In [15]:
result

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [16]:
pd.read_csv('ex4.csv', skiprows=[0, 2, 3])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [18]:
result = pd.read_csv('ex5.csv') # empty fields and 'NA' markers are parsed as NaN

In [19]:
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [20]:
pd.isnull(result)

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


The na_values option can take either a list or set of strings to consider missing
values:

In [21]:
result = pd.read_csv('ex5.csv', na_values=['NULL'])

In [22]:
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [23]:
# Per-column NA sentinels: each key is a column label and each value lists
# the strings that should be read as missing in that column.
sentinels = {
    'message': ['foo', 'NA'],
    'something': ['two'],
}
pd.read_csv('ex5.csv', na_values=sentinels)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


---

## Writing Data to Text Format

In [24]:
data = pd.read_csv('ex5.csv')

In [25]:
data

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [26]:
data.to_csv('out.csv')

In [27]:
import sys

In [28]:
data.to_csv(sys.stdout, sep='|') # writing to sys.stdout prints the delimited text instead of saving a file

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo


In [29]:
data.to_csv(sys.stdout, na_rep='NULL')
# missing values will become NULL (string)

,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo


In [30]:
data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])

a,b,c
1,2,3.0
5,6,
9,10,11.0


In [31]:
dates = pd.date_range('1/1/2000', periods=7)

In [33]:
dates

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07'],
              dtype='datetime64[ns]', freq='D')

In [34]:
ts = pd.Series(np.arange(7), index=dates)

In [35]:
ts

2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
Freq: D, dtype: int32

In [36]:
# Pass header explicitly: the default for Series.to_csv has differed between
# pandas releases, so pinning it keeps the written file (index and values
# only, no header row) the same everywhere.
ts.to_csv('tseries.csv', header=False)

  """Entry point for launching an IPython kernel.


---

## Working with Delimited Formats

Using Python's built-in csv module

In [37]:
import csv

In [44]:
# The csv module expects files opened with newline='' so that it can handle
# line endings (including newlines embedded in quoted fields) itself.
f = open('ex7.csv', newline='')
type(f)

_io.TextIOWrapper

In [45]:
reader = csv.reader(f)
type(reader)

_csv.reader

In [46]:
for line in reader:
    print(line)
# The file behind `reader` was opened without a context manager, so close it
# explicitly once every row has been consumed.
f.close()

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3']


In [47]:
# First, read the whole file into a list of rows. newline='' lets the csv
# module handle line endings itself, as its documentation recommends.
with open('ex7.csv', newline='') as f:
    lines = list(csv.reader(f))

In [48]:
lines

[['a', 'b', 'c'], ['1', '2', '3'], ['1', '2', '3']]

In [52]:
# The first row holds the column names; every remaining row is data.
header = lines[0]
values = lines[1:]

In [53]:
header

['a', 'b', 'c']

In [54]:
values

[['1', '2', '3'], ['1', '2', '3']]

In [57]:
list(zip(*values))

[('1', '1'), ('2', '2'), ('3', '3')]

In [58]:
# Starred unpacking over the values 1..6: the first two bind to a and b,
# and c collects everything left over as a list.
a, b, *c = range(1, 7)

In [59]:
c

[3, 4, 5, 6]

In [62]:
# zip(*values) transposes the data rows into columns; pairing each column
# with its header name yields a column-name -> column-values mapping.
data_dict = dict(zip(header, zip(*values)))

In [63]:
data_dict

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}


---

## JSON Data

JSON: JavaScript Object Notation. Example:

In [66]:
obj = """
{"name": "Wes",
"places_lived": ["United States", "Spain", "Germany"],
"pet": null,
"siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
             {"name": "Katie", "age": 38,
              "pets": ["Sixes", "Stache", "Cisco"]}]
}
"""

JSON is very nearly valid Python code with the exception of its null value null and
some other nuances (such as disallowing trailing commas at the end of lists). The
basic types are objects (dicts), arrays (lists), strings, numbers, booleans, and nulls. All
of the keys in an object must be strings.

In [67]:
import json

In [72]:
type(obj)

str

In [68]:
result = json.loads(obj) # parse the JSON string into Python objects (here, a dict)

In [69]:
result

{'name': 'Wes',
 'places_lived': ['United States', 'Spain', 'Germany'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 30, 'pets': ['Zeus', 'Zuko']},
  {'name': 'Katie', 'age': 38, 'pets': ['Sixes', 'Stache', 'Cisco']}]}

In [71]:
type(result)

dict

In [73]:
# convert a Python object back to a JSON string
asjson = json.dumps(result)

In [74]:
asjson

'{"name": "Wes", "places_lived": ["United States", "Spain", "Germany"], "pet": null, "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]}, {"name": "Katie", "age": 38, "pets": ["Sixes", "Stache", "Cisco"]}]}'

In [75]:
type(asjson)

str

In [77]:
siblings = pd.DataFrame(result['siblings'], columns=['name', 'age'])

In [78]:
siblings

Unnamed: 0,name,age
0,Scott,30
1,Katie,38


pandas.read_json can automatically convert JSON datasets in specific arrangements
into a Series or DataFrame.

In [79]:
data = pd.read_json('example.json')

In [80]:
data

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [81]:
print(data.to_json())

{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}


---

## XML and HTML: Web Scraping

Python has many libraries for reading and writing data in the ubiquitous HTML and
XML formats. Examples include lxml, Beautiful Soup, and html5lib. While lxml is
comparatively much faster in general, the other libraries can better handle malformed
HTML or XML files.

pd.read_html automatically parses tables out of HTML files as DataFrame objects. The pandas.read_html function has a number of options, but by default it searches for and attempts to parse all tabular data contained within table tags.

In [82]:
tables = pd.read_html('fdic_failed_bank_list.html')
# read_html returns a list holding the tables it parsed from the file

In [87]:
tables

[                                            Bank Name                City  ST  \
 0                                The Enloe State Bank              Cooper  TX   
 1                 Washington Federal Bank for Savings             Chicago  IL   
 2     The Farmers and Merchants State Bank of Argonia             Argonia  KS   
 3                                 Fayette County Bank          Saint Elmo  IL   
 4   Guaranty Bank, (d/b/a BestBank in Georgia & Mi...           Milwaukee  WI   
 5                                      First NBC Bank         New Orleans  LA   
 6                                       Proficio Bank  Cottonwood Heights  UT   
 7                       Seaway Bank and Trust Company             Chicago  IL   
 8                              Harvest Community Bank          Pennsville  NJ   
 9                                         Allied Bank            Mulberry  AR   
 10                       The Woodbury Banking Company            Woodbury  GA   
 11             