## DataFrames
-------
Explore all methods and feautures available for DataFrame based operations

### Background
-----

In [28]:
import sys
import pandas as pd
import numpy as np
from io import StringIO
from pandas.io.json import json_normalize
import json

versions = [sys.version, pd.__version__, np.__version__, json.__version__]
packages = ['Python', 'Pandas', 'Numpy', 'json']
pd.DataFrame( data = {'Module' : packages, 'Version' : versions})

Unnamed: 0,Module,Version
0,Python,"3.5.2 |Anaconda 4.2.0 (64-bit)| (default, Jul ..."
1,Pandas,0.18.1
2,Numpy,1.14.2
3,json,2.0.9


### Create a DataFrame
------

- _DataFrame of m rows and n cols_

In [5]:
pd.DataFrame(np.random.randn(6,4), columns=list('ABCD'))

Unnamed: 0,A,B,C,D
0,-1.147596,-1.031972,-0.431476,0.878827
1,-1.089836,-0.274343,-0.413604,0.011334
2,0.594276,-0.103832,1.974724,2.190938
3,0.757672,0.947165,-0.179716,0.986005
4,0.137946,0.183068,0.021536,-0.534207
5,-0.402312,-0.183192,-0.616182,0.85309


In [12]:
pd.DataFrame({ 'A' : 1,
                'B1' : pd.Timestamp('20130102'), 
                'B2' : pd.date_range('20130101', periods=4), 
                'C' : pd.Series(1, index=list(range(4)), dtype='float32'), 
                'D' : np.array([3] * 4,dtype='int32'), 
                'E' : pd.Categorical(["test","train","test","train"]) })

Unnamed: 0,A,B1,B2,C,D,E
0,1,2013-01-02,2013-01-01,1.0,3,test
1,1,2013-01-02,2013-01-02,1.0,3,train
2,1,2013-01-02,2013-01-03,1.0,3,test
3,1,2013-01-02,2013-01-04,1.0,3,train


In [14]:
data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [15]:
data = 'col1;col2;col3\na;b;1\na;b;2\nc;d;3'
pd.read_csv(StringIO(data), sep=";")

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [16]:
example = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
        'last_name': ['Miller', 'Jacobson', ".", 'Milner', 'Cooze'], 
        'age': [42, 52, 36, 24, 73], 
        'preTestScore': [4, 24, 31, ".", "."],
        'postTestScore': ["25,000", "94,000", 57, 62, 70]}
df = pd.DataFrame(example, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df.to_csv('../data/example.csv')
df

Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore
0,Jason,Miller,42,4,25000
1,Molly,Jacobson,52,24,94000
2,Tina,.,36,31,57
3,Jake,Milner,24,.,62
4,Amy,Cooze,73,.,70


In [19]:
pd.read_csv

<function pandas.io.parsers._make_parser_function.<locals>.parser_f>

In [17]:
pd.read_csv('../data/example.csv')

Unnamed: 0.1,Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore
0,0,Jason,Miller,42,4,25000
1,1,Molly,Jacobson,52,24,94000
2,2,Tina,.,36,31,57
3,3,Jake,Milner,24,.,62
4,4,Amy,Cooze,73,.,70


In [18]:
pd.read_csv('../data/example.csv', header=None)

Unnamed: 0,0,1,2,3,4,5
0,,first_name,last_name,age,preTestScore,postTestScore
1,0.0,Jason,Miller,42,4,25000
2,1.0,Molly,Jacobson,52,24,94000
3,2.0,Tina,.,36,31,57
4,3.0,Jake,Milner,24,.,62
5,4.0,Amy,Cooze,73,.,70


In [20]:
pd.read_csv('../data/example.csv', names=['UID', 'First Name', 'Last Name', 'Age', 'Pre-Test Score', 'Post-Test Score'])

Unnamed: 0,UID,First Name,Last Name,Age,Pre-Test Score,Post-Test Score
0,,first_name,last_name,age,preTestScore,postTestScore
1,0.0,Jason,Miller,42,4,25000
2,1.0,Molly,Jacobson,52,24,94000
3,2.0,Tina,.,36,31,57
4,3.0,Jake,Milner,24,.,62
5,4.0,Amy,Cooze,73,.,70


In [21]:
 pd.read_csv('../data/example.csv', 
                 index_col=['First Name', 'Last Name'], 
                 names=['UID', 'First Name', 'Last Name', 'Age', 'Pre-Test Score', 'Post-Test Score'])

Unnamed: 0_level_0,Unnamed: 1_level_0,UID,Age,Pre-Test Score,Post-Test Score
First Name,Last Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
first_name,last_name,,age,preTestScore,postTestScore
Jason,Miller,0.0,42,4,25000
Molly,Jacobson,1.0,52,24,94000
Tina,.,2.0,36,31,57
Jake,Milner,3.0,24,.,62
Amy,Cooze,4.0,73,.,70
