### Introduction
* Pandas is a newer package built on top of NumPy, and provides an efficient implementation of a DataFrame. 
* DataFrames are essentially multidimensional arrays with attached row and column labels, and often with heterogeneous types and/or missing data. 
* As well as offering a convenient storage interface for labeled data, Pandas implements a number of powerful data operations familiar to users of both database frameworks and spreadsheet programs.

In [1]:
import pandas as pd

#### Pandas Objects
* Series - one-dimensional labeled data (a single column): a NumPy array of values plus an index
* DataFrame - a collection of Series

In [2]:
# Build a Series from a plain list; no index is passed, so the labels default to 0..4.
s = pd.Series(data=[5, 6, 7, 8, 9])

In [3]:
s

0    5
1    6
2    7
3    8
4    9
dtype: int64

In [5]:
type(s.values)

numpy.ndarray

In [6]:
s.values

array([5, 6, 7, 8, 9], dtype=int64)

In [7]:
s.index

RangeIndex(start=0, stop=5, step=1)

In [8]:
# Series with explicit string labels instead of the default integer index.
t = pd.Series(
    data=[7, 8, 9, 10, 11, 12],
    index=['abi', 'def', 'ghi', 'jik', 'kjh', 'qwr'],
)

In [9]:
t

abi     7
def     8
ghi     9
jik    10
kjh    11
qwr    12
dtype: int64

In [10]:
s[1]

6

In [11]:
t['abi']

7

In [14]:
# Unlike dictionary keys, Series index labels need not be unique: 'jik' appears twice here,
# so looking up 'jik' returns every matching entry.
t = pd.Series(
    data=[7, 8, 9, 10, 11, 12],
    index=['abi', 'def', 'ghi', 'jik', 'kjh', 'jik'],
)

In [13]:
t['jik']

jik    10
jik    12
dtype: int64

In [16]:
# A dictionary converts directly to a Series: the keys become the index labels.
# Newer pandas keeps the dict's insertion order, so sort the index explicitly;
# label slices such as sb_s['delhi':'kashmir'] rely on the labels being ordered.
db = {'mumbai':10000, 'kashmir':5000,'delhi':3000}
sb_s = pd.Series(db).sort_index()

In [17]:
sb_s

delhi       3000
kashmir     5000
mumbai     10000
dtype: int64

In [18]:
sb_s.index

Index([u'delhi', u'kashmir', u'mumbai'], dtype='object')

In [21]:
sb_s['delhi':'kashmir']

delhi      3000
kashmir    5000
dtype: int64

In [23]:
sb_s['delhi'] = 7000

In [24]:
sb_s

delhi       7000
kashmir     5000
mumbai     10000
dtype: int64

In [25]:
sb_s['ghi'] = 89

In [26]:
sb_s

delhi       7000
kashmir     5000
mumbai     10000
ghi           89
dtype: int64

In [28]:
# Three values labeled 'a', 'b', 'c'.
s = pd.Series(data=[1, 2, 3], index=['a', 'b', 'c'])

In [29]:
s['a']

1

In [32]:
# A scalar value is repeated for every label in the index.
t = pd.Series(data=5, index=[100, 200, 300])

In [33]:
t[100]

5

In [34]:
t

100    5
200    5
300    5
dtype: int64

#### DataFrame Object
* A Series is analogous to a 1-d array, whereas a DataFrame is analogous to a 2-d array
* A DataFrame is a collection of Series objects

In [35]:
# Neither Series is given an index, so both get the default labels 0, 1, 2, 3.
s1 = pd.Series(data=[1, 2, 3, 4])
s2 = pd.Series(data=['a', 'b', 'c', 'd'])

In [36]:
# Combine the two Series into a DataFrame, one column per Series.
# The column order is given explicitly: newer pandas would otherwise keep the
# dict's insertion order (name, age) rather than the alphabetical order.
df = pd.DataFrame({'name':s2,'age':s1}, columns=['age', 'name'])

In [37]:
df

Unnamed: 0,age,name
0,1,a
1,2,b
2,3,c
3,4,d


In [38]:
# s1 keeps the default labels 0..3 while s2 is labeled 4..7,
# so the two Series have no index label in common.
s1 = pd.Series(data=[1, 2, 3, 4])
s2 = pd.Series(data=['a', 'b', 'c', 'd'], index=[4, 5, 6, 7])

In [39]:
# Columns are aligned on index labels: a row whose label is missing from one
# Series gets NaN in that column.
# The column order is given explicitly: newer pandas would otherwise keep the
# dict's insertion order (name, age) rather than the alphabetical order.
df = pd.DataFrame({'name':s2,'age':s1}, columns=['age', 'name'])

In [40]:
df

Unnamed: 0,age,name
0,1.0,
1,2.0,
2,3.0,
3,4.0,
4,,a
5,,b
6,,c
7,,d


In [41]:
df.index

Int64Index([0, 1, 2, 3, 4, 5, 6, 7], dtype='int64')

In [44]:
df['age']

0    1.0
1    2.0
2    3.0
3    4.0
4    NaN
5    NaN
6    NaN
7    NaN
Name: age, dtype: float64

In [45]:
import numpy as np

In [47]:
# 5x5 array of random integers below 10. No seed is set, so the values
# (and every output derived from them) differ from run to run.
data = np.random.randint(10,size=(5,5))

In [48]:
data

array([[9, 4, 6, 6, 3],
       [6, 4, 4, 3, 7],
       [1, 0, 5, 5, 4],
       [9, 2, 3, 2, 7],
       [0, 3, 3, 5, 7]])

In [53]:
# Wrap the NumPy array in a DataFrame, labeling the columns a-e and the rows r1-r5.
df1 = pd.DataFrame(
    data=data,
    index=['r1', 'r2', 'r3', 'r4', 'r5'],
    columns=['a', 'b', 'c', 'd', 'e'],
)

In [54]:
type(df1)

pandas.core.frame.DataFrame

In [55]:
df1

Unnamed: 0,a,b,c,d,e
r1,9,4,6,6,3
r2,6,4,4,3,7
r3,1,0,5,5,4
r4,9,2,3,2,7
r5,0,3,3,5,7


In [59]:
# Column-oriented access: df1['a'] selects the column 'a' as a Series,
# and ['r1'] then picks the entry labeled 'r1' from that column.
df1['a']['r1']

9

In [62]:
# Every second row by position (r1, r3, r5), written with the explicit positional indexer.
df1.iloc[::2]

Unnamed: 0,a,b,c,d,e
r1,9,4,6,6,3
r3,1,0,5,5,4
r5,0,3,3,5,7


### Pandas Index Object

In [65]:
# A standalone Index object holding the labels 0..4; it can be passed as the index of a Series or DataFrame.
idx = pd.Index(range(5))

In [68]:
# One-column DataFrame ('num' = 10..14) whose row labels come from idx.
# NOTE(review): this rebinds the name `data`, which previously held the random NumPy array.
data = pd.DataFrame({'num':range(10,15)}, index=idx)

### Data Indexing & Selection

In [77]:
# Ten values 10..19 labeled with the even integers 100..118,
# so an entry's label and its position are different numbers.
d = pd.Series(data=range(10, 20), index=range(100, 120, 2))

In [78]:
d

100    10
102    11
104    12
106    13
108    14
110    15
112    16
114    17
116    18
118    19
dtype: int64

In [87]:
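### loc, iloc & ix
# loc  - access elements by index label (the configured index)
# iloc - access elements by integer position
# ix   - label-based like loc; with an integer key on a non-integer index it falls back to
#        position like iloc (deprecated in pandas 0.20, removed in pandas 1.0)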

In [79]:
d.loc[110]

15

In [80]:
d.iloc[2]

12

In [81]:
# Default integer index 0..3, so labels and positions coincide.
d = pd.Series(data=[1, 2, 3, 4])

In [82]:
# Integer-labeled index: look the value up by label 3 with .loc.
# (.ix did a label lookup here too, but it was removed in pandas 1.0.)
d.loc[3]

4

In [83]:

# Integer labels 4..7 that do not match the positions 0..3.
d = pd.Series(data=[1, 2, 3, 4], index=[4, 5, 6, 7])

In [84]:
# Integer-labeled index: 6 is treated as a label, not a position, so use .loc.
# (.ix behaved the same way on integer indexes, but it was removed in pandas 1.0.)
d.loc[6]

3

# 1 is not one of the labels, and on an integer-labeled index there is no
# fallback to position, so the label lookup raises KeyError.
# The error is caught and shown so the notebook still runs top to bottom.
try:
    d.loc[1]
except KeyError as err:
    print('KeyError:', err)

### Data Selection in DataFrame

In [89]:
# Area and population per state, each keyed by state name.
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})
# Combine both Series into one DataFrame, one column per Series.
# Newer pandas keeps the dicts' insertion order, so sort the index explicitly:
# label slices such as data.loc['Florida':'New York'] depend on ordered labels.
data = pd.DataFrame({'area':area, 'pop':pop}).sort_index()
data

Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [90]:
data['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [91]:
data.area

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [93]:
data['pop']

California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
Texas         26448193
Name: pop, dtype: int64

In [94]:
# Gotcha: attribute access resolves to the DataFrame.pop method, not the 'pop' column,
# because the column name collides with an existing method. Use data['pop'] for the column.
data.pop

<bound method DataFrame.pop of               area       pop
California  423967  38332521
Florida     170312  19552860
Illinois    149995  12882135
New York    141297  19651127
Texas       695662  26448193>

In [95]:
# Add a derived column computed element-wise from two existing columns.
# Bracket access is required for 'pop' because data.pop is a DataFrame method.
data['density'] = data['pop'] / data['area']

In [96]:
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


In [97]:
data.values

array([[  4.23967000e+05,   3.83325210e+07,   9.04139261e+01],
       [  1.70312000e+05,   1.95528600e+07,   1.14806121e+02],
       [  1.49995000e+05,   1.28821350e+07,   8.58837628e+01],
       [  1.41297000e+05,   1.96511270e+07,   1.39076746e+02],
       [  6.95662000e+05,   2.64481930e+07,   3.80187404e+01]])

In [98]:
data.T

Unnamed: 0,California,Florida,Illinois,New York,Texas
area,423967.0,170312.0,149995.0,141297.0,695662.0
pop,38332520.0,19552860.0,12882140.0,19651130.0,26448190.0
density,90.41393,114.8061,85.88376,139.0767,38.01874


Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


In [106]:
data.iloc[2:4,:2]

Unnamed: 0,area,pop
Illinois,149995,12882135
New York,141297,19651127


In [105]:
# Third entry of the 'area' column by position. .iloc makes the positional lookup
# explicit; a bare integer key on a string-labeled Series is deprecated in newer pandas.
data['area'].iloc[2]

149995

In [121]:
# Single cell selected by row label and column label in one .loc lookup.
data.loc['Florida', 'pop']

19552860

In [125]:
data.loc['Florida':'New York']

Unnamed: 0,area,pop,density
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746


In [108]:
# First three rows by position, then the columns from the start through 'pop' by label
# (a label slice includes its end point). The removed .ix accessor mixed both kinds of
# key in one call; .iloc and .loc keep them separate.
data.iloc[:3].loc[:, :'pop']

Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135


In [109]:
# String labels 'a'..'d', so integer keys can only refer to positions.
d = pd.Series(data=[1, 2, 3, 4], index=['a', 'b', 'c', 'd'])

In [110]:
# Label slice: everything up to and including the label 'c' (.ix was removed in pandas 1.0).
d.loc[:'c']

a    1
b    2
c    3
dtype: int64

In [111]:
# Positional slice: the first three entries. The index holds strings, so the removed .ix
# accessor treated the integer bound as a position; .iloc states that explicitly.
d.iloc[:3]

a    1
b    2
c    3
dtype: int64

In [112]:
# Entry at position 3. The index holds strings, so the removed .ix accessor treated
# the integer key as a position; .iloc states that explicitly.
d.iloc[3]

4

In [115]:
# .loc accepts a boolean row condition together with a list of column labels:
# keep the rows whose density exceeds 100 and show only 'pop' and 'density'.
data.loc[data['density'].gt(100), ['pop', 'density']]

Unnamed: 0,pop,density
Florida,19552860,114.806121
New York,19651127,139.076746


In [114]:
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


In [116]:
# Boolean row filter: keep every column for the rows whose density exceeds 100.
data.loc[data['density'] > 100]

Unnamed: 0,area,pop,density
Florida,170312,19552860,114.806121
New York,141297,19651127,139.076746


In [117]:
# Rows at positions 1 and 2, written with the explicit positional indexer.
data.iloc[1:3]

Unnamed: 0,area,pop,density
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [126]:
# Integer labels 5..8, none of which coincides with a position 0..3.
d = pd.Series(data=[1, 2, 3, 4], index=[5, 6, 7, 8])

In [132]:
# Label slice on an integer-labeled index: every entry whose label lies between 1 and 9
# inclusive, which here is all of them. (.ix sliced by label here too; it was removed in pandas 1.0.)
d.loc[1:9]

5    1
6    2
7    3
8    4
dtype: int64

In [138]:
# The labels are the strings '5'..'9', not integers, even though they look numeric.
d = pd.Series(data=range(5, 10), index=['5', '6', '7', '8', '9'])

In [134]:
d

5    5
6    6
7    7
8    8
9    9
dtype: int64

In [139]:
# Positional slice: the first three entries. The labels are strings, so the removed .ix
# accessor treated the integer bound as a position; .iloc states that explicitly.
d.iloc[:3]

5    5
6    6
7    7
dtype: int64

In [140]:
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


In [143]:
# Rows whose density exceeds 100, restricted to the 'area' and 'pop' columns.
data.loc[data['density'].gt(100), ['area', 'pop']]

Unnamed: 0,area,pop
Florida,170312,19552860
New York,141297,19651127
