In [0]:
import pandas as pd
import numpy as np

# pandas Data Structures

## Series
A Series is a one-dimensional array-like object containing a sequence of values (similar to a NumPy array) and an associated array of labels, called its *index*.

In [0]:
# A Series has two parts: the values (a NumPy array, here [4, 5, -3, 2]) and an index.
# With no explicit index, pandas uses the integers 0..N-1, where N is the Series length.
obj = pd.Series([4, 5, -3, 2])
obj

0    4
1    5
2   -3
3    2
dtype: int64

In [0]:
# The data part of the Series, as a NumPy array.
obj.values

array([ 4,  5, -3,  2])

In [0]:
# The index part of the Series; the default index shows up as a RangeIndex.
obj.index

RangeIndex(start=0, stop=4, step=1)

In [0]:
# Supply custom labels instead of the default 0..N-1 index.
obj2 = pd.Series(
    [4, 5, -3, 2],
    index=['d', 'c', 'a', 'b'],
)
obj2

d    4
c    5
a   -3
b    2
dtype: int64

In [0]:
# dtype 'object' means the labels are stored as generic Python objects (here, strings).
obj2.index

Index([u'd', u'c', u'a', u'b'], dtype='object')

In [0]:
# pandas indexing is more flexible than NumPy's: a value can be selected by its label.
obj2['c']

5

In [0]:
# Select by integer position explicitly with .iloc. A plain obj2[1] on a
# label-indexed Series relies on a positional fallback that recent pandas
# versions deprecate, so .iloc keeps this cell working across versions.
obj2.iloc[1]

5

In [0]:
# ['a','d'] is a list of index labels. The result is a subset of the original
# Series (itself a Series), with entries in the order requested.
obj2[['a','d']]

a   -3
d    4
dtype: int64

### NumPy-like operations

In [0]:
# Boolean-mask filtering: keep only the positive entries; their labels are preserved.
obj2[obj2>0]

d    4
c    5
b    2
dtype: int64

In [0]:
# NumPy functions apply element-wise and the result keeps the Series index.
np.exp(obj2)

d     54.598150
c    148.413159
a      0.049787
b      7.389056
dtype: float64

### Create a Series from Dict
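The index of the resulting Series is built from the dict's keys. In the output below (produced with an older pandas/Python) the keys appear in sorted order; recent pandas versions running on Python 3.6+ keep the dict's insertion order instead.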

In [0]:
# Dict keys become the index labels; dict values become the data.
food = {
    'ham': 100,
    'egg': 200,
}
obj3 = pd.Series(food)
obj3

egg    200
ham    100
dtype: int64

In [0]:
# Passing an explicit index selects and orders the dict keys to use.
# 'bread' has no value in `food`, so it shows up as NaN ("not a number").
# 'egg' is not in the new index list, so it is left out of the Series.
new = ['bread', 'ham']
obj4 = pd.Series(food, index=new)
obj4

bread      NaN
ham      100.0
dtype: float64

### missing data: 'missing' or 'NA'

In [0]:
# Flag missing entries: True where the value is NaN.
pd.isnull(obj4)
# The method form, obj4.isnull(), gives the same result.

bread     True
ham      False
dtype: bool

In [0]:
# The complement of isnull: True where a value is present.
pd.notnull(obj4)

bread    False
ham       True
dtype: bool

In [0]:
# Fill the missing 'bread' entry by assigning through its label.
# This modifies obj4 in place; the Series stays float64, so 300 is stored as 300.0.
obj4['bread'] = 300
obj4

bread    300.0
ham      100.0
dtype: float64

In [0]:
# Arithmetic aligns the operands on index labels; a label that is missing from
# either Series ('bread', 'egg') yields NaN in the result.
obj4+obj3

bread      NaN
egg        NaN
ham      200.0
dtype: float64

## DataFrame

There are many possible data inputs to DataFrame, such as a NumPy array, a dict of lists or tuples, a dict of Series, a dict of dicts, and so on.

We only introduce how to construct a DataFrame from a dict of lists.

In [0]:
# Build a DataFrame from a dict of equal-length lists (NumPy arrays work too):
# each key becomes a column name and each list becomes that column's values.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2000, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2000
4,2.9,Nevada,2001
5,3.2,Nevada,2002


In [0]:
# head() returns only the first rows of the frame (5 by default).
frame.head()

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2000
4,2.9,Nevada,2001


In [0]:
# Rebuild the frame with explicit row labels 1..6 instead of the default 0..5.
frame = pd.DataFrame(data, index=[1, 2, 3, 4, 5, 6])
frame

Unnamed: 0,pop,state,year
1,1.5,Ohio,2000
2,1.7,Ohio,2001
3,3.6,Ohio,2002
4,2.4,Nevada,2000
5,2.9,Nevada,2001
6,3.2,Nevada,2002


In [0]:
# `columns` fixes the column order. 'debt' is not a key of `data`, so that
# column is created and filled with missing values.
frame2 = pd.DataFrame(
    data,
    index=[1, 2, 3, 4, 5, 6],
    columns=['year', 'state', 'pop', 'debt'],
)
frame2

Unnamed: 0,year,state,pop,debt
1,2000,Ohio,1.5,
2,2001,Ohio,1.7,
3,2002,Ohio,3.6,
4,2000,Nevada,2.4,
5,2001,Nevada,2.9,
6,2002,Nevada,3.2,


In [0]:
# Attribute access returns the column as a Series; note that it carries the
# custom row index (1..6) rather than the default one.
frame.year
# Equivalent dict-style access: frame['year']

1    2000
2    2001
3    2002
4    2000
5    2001
6    2002
Name: year, dtype: int64

In [0]:
# Retrieve a specific row by its index label; the row comes back as a Series.
frame2.loc[6]

year       2002
state    Nevada
pop         3.2
debt        NaN
Name: 6, dtype: object

In [0]:
# Assigning a scalar to a column sets that value on every row.
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
1,2000,Ohio,1.5,16.5
2,2001,Ohio,1.7,16.5
3,2002,Ohio,3.6,16.5
4,2000,Nevada,2.4,16.5
5,2001,Nevada,2.9,16.5
6,2002,Nevada,3.2,16.5


In [0]:
# Assign the floats 1.0 through 6.0 to the 'debt' column.
# The array length must match the number of rows in the frame.
frame2['debt'] = np.arange(1.0, 7.0, 1.0)
frame2

Unnamed: 0,year,state,pop,debt
1,2000,Ohio,1.5,1.0
2,2001,Ohio,1.7,2.0
3,2002,Ohio,3.6,3.0
4,2000,Nevada,2.4,4.0
5,2001,Nevada,2.9,5.0
6,2002,Nevada,3.2,6.0


In [0]:
# When a Series is assigned to a column, its labels are aligned with the
# DataFrame's index; rows with no matching label receive missing values.
val = pd.Series([-1.2, -1.5, -1.7], index=[2, 4, 5])
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt
1,2000,Ohio,1.5,
2,2001,Ohio,1.7,-1.2
3,2002,Ohio,3.6,
4,2000,Nevada,2.4,-1.5
5,2001,Nevada,2.9,-1.7
6,2002,Nevada,3.2,


In [0]:
# Assigning to a column name that does not exist yet adds a new column,
# much like setting a new key on a dict.
frame2['eastern'] = (frame2['state'] == 'Ohio')
frame2

Unnamed: 0,year,state,pop,debt,eastern
1,2000,Ohio,1.5,,True
2,2001,Ohio,1.7,-1.2,True
3,2002,Ohio,3.6,,True
4,2000,Nevada,2.4,-1.5,False
5,2001,Nevada,2.9,-1.7,False
6,2002,Nevada,3.2,,False


In [0]:
# Remove the column in place with del, as with a dict key, then list what is left.
del frame2['eastern']
frame2.columns
# Alternative that was left commented out here: frame2.drop(columns=['eastern'])

Index([u'year', u'state', u'pop', u'debt'], dtype='object')

In [0]:
# Transpose the DataFrame (rows become columns). Note that "eastern" is gone.
frame2.T

Unnamed: 0,1,2,3,4,5,6
year,2000,2001,2002,2000,2001,2002
state,Ohio,Ohio,Ohio,Nevada,Nevada,Nevada
pop,1.5,1.7,3.6,2.4,2.9,3.2
debt,,-1.2,,-1.5,-1.7,


In [0]:
# The values attribute returns the data in the DataFrame as a two-dimensional
# NumPy array; with mixed column types the array dtype is object.
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2000, 'Nevada', 2.4, -1.5],
       [2001, 'Nevada', 2.9, -1.7],
       [2002, 'Nevada', 3.2, nan]], dtype=object)

## Index Objects
Index objects in pandas are different from the regular array indices we have talked about so far.

Any array or other sequence of labels used when constructing a Series or DataFrame is internally converted to an Index.

In [0]:
# An Index can mix label types and may contain duplicates ('foo' appears twice).
labels = pd.Index(['foo', 'foo', 1, 2, 3, 4, 7])

In [0]:
# Reading an element of an Index by position works like a regular sequence.
labels[1]

'foo'

In [0]:
# An Index is immutable: item assignment raises TypeError. The error is the
# point of this cell, so catch it and print it; that way the notebook still
# runs top to bottom without stopping here.
try:
    labels[1] = 'boo'
except TypeError as exc:
    print('{}: {}'.format(type(exc).__name__, exc))

TypeError: Index does not support mutable operations

In [0]:
# Show the current frame again (row labels 1..6) before re-indexing it below.
frame

Unnamed: 0,pop,state,year
1,1.5,Ohio,2000
2,1.7,Ohio,2001
3,3.6,Ohio,2002
4,2.4,Nevada,2000
5,2.9,Nevada,2001
6,3.2,Nevada,2002


In [0]:
# A pandas Index can contain duplicate labels ('foo' twice). Rows whose label
# is not present in `frame` come out filled with missing values.
pd.DataFrame(frame,index=labels)

Unnamed: 0,pop,state,year
foo,,,
foo,,,
1,1.5,Ohio,2000.0
2,1.7,Ohio,2001.0
3,3.6,Ohio,2002.0
4,2.4,Nevada,2000.0
7,,,


# Essential Functionality

## 1. Reindexing
Calling reindex on a Series or DataFrame rearranges the data according to the new index, introducing missing values for any index values that were not already present.

In [0]:
# Create a Series that will be reindexed in the next cells.
obj = pd.Series(
    [4.5, .3, -2],
    index=['a', 'c', 'd'],
)
obj

a    4.5
c    0.3
d   -2.0
dtype: float64

In [0]:
#calling reindex
obj2=obj.reindex(['a','b','c','d'])
obj2

a    4.5
b    NaN
c    0.3
d   -2.0
dtype: float64

In [0]:
# For ordered data such as time series, missing entries can be filled while
# reindexing. method='ffill' is a forward fill: each new label takes the value
# of the closest preceding label.
obj3 = obj.reindex(
    ['a', 'b', 'c', 'd', 'e'],
    method='ffill',
)
obj3

a    4.5
b    4.5
c    0.3
d   -2.0
e   -2.0
dtype: float64

In [0]:
# Create a 3x3 DataFrame; the following cells reindex its rows and columns.
# (pandas and numpy are already imported in the notebook's first cell, so they
# are not re-imported here.)
frame = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=['a', 'b', 'c'],
    columns=['Ohio', 'Texas', 'California'],
)
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
b,3,4,5
c,6,7,8


In [0]:
# reindex returns a new object, so `frame` itself is left unchanged.
# The new row 'd' has no data and is filled with missing values.
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,6.0,7.0,8.0
d,,,


In [0]:
# Dict-style access to a single column; the result is a Series.
frame2['Ohio']

a    0.0
b    3.0
c    6.0
d    NaN
Name: Ohio, dtype: float64

In [0]:
# Attribute-style access to the same column; same result as frame2['Ohio'].
frame2.Ohio

a    0.0
b    3.0
c    6.0
d    NaN
Name: Ohio, dtype: float64

In [0]:
# Fill in the empty row 'd' by assigning one value per column through .loc.
frame2.loc['d'] = [9.0, 0.5, 8.0]
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,6.0,7.0,8.0
d,9.0,0.5,8.0


In [0]:
# Reindex rows and columns in one call. None of these city names is a column
# of `frame`, so every cell of the result starts out missing.
cities = ['SF', 'NYC', 'Chichago']
frame3 = frame.reindex(index=['a', 'c', 'b'], columns=cities)
# Fill the 'SF' column with bracket assignment, which works for both new and
# existing columns (attribute assignment only works for existing ones).
frame3['SF'] = [9.0, 1, 2.5]
frame3

Unnamed: 0,SF,NYC,Chichago
a,9.0,,
c,1.0,,
b,2.5,,


## 2. Dropping Entries from an Axis

The _drop_ method will return a new object with the indicated value deleted from an axis.  

In [0]:
# Create a Series, then drop the entry labelled 'c'.
# drop returns a new object; obj itself is not modified.
obj = pd.Series(
    np.arange(5),
    index=['a', 'b', 'c', 'd', 'e'],
)
obj.drop('c')

a    0
b    1
d    3
e    4
dtype: int64

In [0]:
# obj still contains 'c': the drop above returned a new Series.
obj

a    0
b    1
c    2
d    3
e    4
dtype: int64

In [0]:
# To modify the current object instead of getting a copy, pass inplace=True.
# Use this with care: it destroys data, and this cell cannot be re-run once
# 'c' has been removed.
obj.drop('c', inplace=True)
obj

a    0
b    1
d    3
e    4
dtype: int64

In [0]:
# Create a 4x4 DataFrame to demonstrate drop() on rows and columns.
frame = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['A', 'B', 'C', 'D'],
    columns=['one', 'two', 'three', 'four'],
)
frame

Unnamed: 0,one,two,three,four
A,0,1,2,3
B,4,5,6,7
C,8,9,10,11
D,12,13,14,15


In [0]:
# With no axis given, drop works along axis 0, i.e. it removes rows by label.
frame.drop('A')

Unnamed: 0,one,two,three,four
B,4,5,6,7
C,8,9,10,11
D,12,13,14,15


In [0]:
# A list of labels drops several rows at once.
frame.drop(['A','B'])

Unnamed: 0,one,two,three,four
C,8,9,10,11
D,12,13,14,15


In [0]:
# axis=1 refers to the columns, so this removes the column named 'one'.
frame.drop('one',axis=1)

Unnamed: 0,two,three,four
A,1,2,3
B,5,6,7
C,9,10,11
D,13,14,15


In [0]:
# A list of labels with axis=1 drops several columns at once.
frame.drop(['one','two'],axis=1)

Unnamed: 0,three,four
A,2,3
B,6,7
C,10,11
D,14,15


## 3. Indexing and Selection

Series indexing works analogously to NumPy array indexing, except that you can use the Series' index labels instead of only integers.

In [0]:
# Recreate the five-element Series with labels 'a'..'e' for the indexing examples.
# (pandas and numpy are already imported in the notebook's first cell, so they
# are not re-imported here.)
obj = pd.Series(np.arange(5), index=['a', 'b', 'c', 'd', 'e'])
obj

a    0
b    1
c    2
d    3
e    4
dtype: int64

In [0]:
# Select by integer position explicitly with .iloc. A plain obj[2] on a
# label-indexed Series relies on a positional fallback that recent pandas
# versions deprecate, so .iloc keeps this cell working across versions.
obj.iloc[2]

2

In [0]:
# Select the same element by its index label.
obj['c']

2

Indexing into a DataFrame retrieves one or more columns, either with a single value or with a sequence of values.

In [0]:
# Recreate the 4x4 DataFrame used for the selection examples below.
frame = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['A', 'B', 'C', 'D'],
    columns=['one', 'two', 'three', 'four'],
)
frame

Unnamed: 0,one,two,three,four
A,0,1,2,3
B,4,5,6,7
C,8,9,10,11
D,12,13,14,15


In [0]:
# Unlike a single label, a slice inside [] selects along axis 0 (rows):
# this returns the first two rows.
frame[:2]

Unnamed: 0,one,two,three,four
A,0,1,2,3
B,4,5,6,7


In [0]:
# A single column name inside [] returns that column as a Series.
frame['two']

A     1
B     5
C     9
D    13
Name: two, dtype: int64

In [0]:
# Attribute-style access to the same column; same result as frame['two'].
frame.two

A     1
B     5
C     9
D    13
Name: two, dtype: int64

In [0]:
# A list of column names inside [] returns a DataFrame with just those columns.
frame[['one','two']]

Unnamed: 0,one,two
A,0,1
B,4,5
C,8,9
D,12,13
