In [1]:
import pandas as pd
import numpy as np

# An empty DataFrame: no rows and no columns yet.
df = pd.DataFrame()

# Rebind df to a small sample frame: an int, a float and a string column,
# with explicit row labels instead of the default 0..n-1 index.
df = pd.DataFrame(
    data={
        'a': [1, 2, 3, 4],
        'b': [1.0, 1.5, 2.0, 2.5],
        'c': ['a', 'b', 'c', 'd'],
    },
    index=['r1', 'r2', 'r3', 'r4'],
)

df

Unnamed: 0,a,b,c
r1,1,1.0,a
r2,2,1.5,b
r3,3,2.0,c
r4,4,2.5,d


## Series objects as Columns or Rows of DataFrame

In [None]:
# A DataFrame column is a Series; bracket and attribute access are equivalent.
s1 = df['a']       # Series([1, 2, 3, 4]):  column in dataframe
s1 = df.a          # same column, via attribute access
print('s1: {}'.format(s1))

print('')          # blank separator line

# A single row is a Series too.  .iloc selects by integer position; the old
# .ix indexer no longer exists in current pandas.
s2 = df.iloc[0]    # Series([1, 1.0, 'a'])
print('s2: {}'.format(s2))

### Series:  Initializing with Index and Name

In [None]:
# A Series carries values, one label per value, and an optional name.
s1 = pd.Series(
    data=[5, 6, 7, 8],
    index=['r1', 'r2', 'r3', 'r4'],
    name='numbers',
)
s1

### Series:  Accessing Element with indexing and slicing

In [None]:
# First element by position.  s1 is labelled with strings, so a bare s1[0]
# would depend on pandas' deprecated integer-position fallback; .iloc is explicit.
s1.iloc[0]

In [None]:
# Positional slice: the first three elements (the stop position is excluded).
s1[:3]

### Series:  setting element values and type

In [None]:
# A Series has one dtype for all of its values.  Here it is float64, so
# storing a string in it does not fit.
s1 = pd.Series([1.5, 2.4, 3.3, 4.2, 5.1], index=['r1', 'r2', 'r3', 'r4', 'r5'])
print('s1 dtype:   {}'.format(s1.dtype))
try:
    # .iloc targets position 0 explicitly; with string labels a bare s1[0]
    # relies on a deprecated positional fallback.
    s1.iloc[0] = 'hello'
except (ValueError, TypeError) as err:
    # Which exception type is raised (if any) depends on the pandas version,
    # so report the actual type together with the message.
    print('{}:  {}'.format(type(err).__name__, err))

In [None]:
# Cast to the generic 'object' dtype so the Series may hold any Python value,
# then overwrite the first element by position.
s2 = s1.astype('object')
s2.iloc[0] = 'hello'   # explicit position; s2[0] would depend on a deprecated fallback
s2

### Series:  Vectorized Operations

In [None]:
# Arithmetic with a scalar is applied to every element at once; the result
# is a new Series with the same labels.
si = pd.Series([1, 2, 3], index=['r1', 'r2', 'r3'])
print('si:')
print(si)
print('')              # blank separator line
sia = si + 1
print('sia:')
print(sia)

### Vectorization with Two or More Series

In [None]:
# Two Series with identical labels: addition pairs the values label by label.
si = pd.Series(data=[1, 2, 3],
               index=['r1', 'r2', 'r3'])
si2 = pd.Series(data=[100, 200, 300],
                index=['r1', 'r2', 'r3'])

si + si2

##### ...but note what happens when indices do not match:  pandas lines up rows by label and fills the missing cells with NaN

In [None]:
# The label sets only partly overlap (r2 and r3 are shared); the sum is
# indexed by the union of both label sets.
si = pd.Series(data=[1, 2, 3],
               index=['r1', 'r2', 'r3'])
si2 = pd.Series(data=[100, 200, 300],
                index=['r2', 'r3', 'r4'])

si + si2

### Mask with Series

In [None]:
si3 = pd.Series([1, 5, 100, 0, -6, -10, -100])

# A comparison yields a boolean Series; indexing with it selects the
# matching elements, here to overwrite every negative value with 0.
is_negative = si3 < 0
si3[is_negative] = 0
si3

### Series.apply()

In [None]:
# apply() calls the given function once per element and collects the results
# into a new Series.
ss = pd.Series(['a', 'b', 'C', 'd'])
ssc = ss.apply(lambda letter: letter.upper())
ssc

In [None]:
# apply() with a lambda: turn each number into a 'num_<n>' string.
si = pd.Series([1, 2, 3, 4, 5])
sj = si.apply(lambda x: 'num_' + str(x))
print(sj)

### DataFrame as a container of Series objects

In [None]:
# 6 x 5 grid holding 0..29 row by row, with row and column labels supplied
# directly to the constructor.
dfi = pd.DataFrame(
    np.arange(30).reshape(6, 5),
    index=['r1', 'r2', 'r3', 'r4', 'r5', 'r6'],
    columns=['c1', 'c2', 'c3', 'c4', 'c5'],
)
dfi

In [None]:
# Selecting one column by name gives back a Series, as the type shows.
print(dfi['c1'])
print('')              # blank separator line
print('type: {}'.format(type(dfi['c1'])))

### DataFrame initializations

In [None]:
# Dict of equal-length lists: each key becomes a column.  `columns` states
# the column order explicitly; rows get the default integer index.
df6 = pd.DataFrame(
    data={
        'a': [1, 2, 3, 4],
        'b': [1.0, 1.5, 2.0, 2.5],
        'c': ['a', 'b', 'c', 'd'],
    },
    columns=['a', 'b', 'c'],
)
df6

In [None]:
# List of lists: each inner list is one row, so the column and row labels
# are passed separately.
dflol = pd.DataFrame(
    data=[
        [1, 0.5, 'a'],
        [2, 0.6, 'b'],
        [3, 0.7, 'c'],
    ],
    index=['r1', 'r2', 'r3'],
    columns=['col1', 'col2', 'col3'],
)
dflol

In [None]:
# Dict of dicts: outer keys become columns, inner keys become row labels.
# Nevada has no entry for 2000, so that cell comes out as NaN.
df6 = pd.DataFrame(
    {
        'Nevada': {2001: 2.4, 2002: 2.9},
        'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
    }
)

df6

### Standard Python operations with DataFrames

In [None]:
# Sample frame for the built-in Python operations below.  Column 'a' holds
# the value 3 twice, which is what makes the set() example interesting.
df = pd.DataFrame( {'a': [1, 2, 3, 3], 
                    'b': [1.0, 1.5, 2.0, 2.5], 
                    'c': ['a', 'b', 'c', 'd'] }, index=['r1', 'r2', 'r3', 'r4'] )


# len() of a DataFrame counts rows; len() of .columns counts columns.
print('len: {}'.format(len(df)))

print('len df.columns: {}'.format(len(df.columns)))

print('max (col a): {}'.format(max(df['a'])))

print('col "a" as list: {}'.format(list(df['a'])))

# .loc selects a row by label (the old .ix indexer no longer exists).
print('row "r2" as list: {}'.format(list(df.loc['r2'])))

print('set (col a): {}'.format(set(df['a'])))        # distinct values only: 1, 2, 3

print('')
print('looping through columns:')
# Iterating over a DataFrame yields its column labels; df[colname] is the
# matching column as a Series.
for colname in df:
    print('{}:  {}'.format(colname, list(df[colname])))
print('')

print('looping through rows with iterrows():')
# iterrows() yields (row label, row as a Series) pairs.
for index, row in df.iterrows():
    print('row {}:  {}'.format(index, list(row)))

### Index and Column Manipulation

In [None]:
# rename individual labels; rename() returns a new DataFrame, and a key that
# matches no existing label is silently ignored, so use labels that exist
df = df.rename(columns={'a': 'A'})
df = df.rename(index={'r1': 'R1'})

# change labels wholesale: exactly one new label per existing column / row
df.columns = ['col1', 'col2', 'col3']
df.index = ['a', 'b', 'c', 'd']

# reset indices to integer starting with 0: reset_index() returns a NEW frame
# (old labels become an ordinary column) and leaves df unchanged, so the
# result is kept under its own name
df_reset = df.reset_index()

# set name for index and columns
df.index.name = 'year'
df.columns.name = 'state'

print('before reordering')
print(df)
print('')

# reindex ordering by index:  rows in reverse label order
df = df.reindex(df.index[::-1])
print('reindex index')
print(df)
print('')

# same idea for the columns
df = df.reindex(columns=df.columns[::-1])
print('reindex columns')
print(df)

### Slice and Dice a DataFrame

In [None]:
# Fresh sample frame for the selection examples: columns a, b, c and row
# labels r1..r4.
df = pd.DataFrame(
    data={
        'a': [1, 2, 3, 4],
        'b': [1.0, 1.5, 2.0, 2.5],
        'c': ['a', 'b', 'c', 'd'],
    },
    index=['r1', 'r2', 'r3', 'r4'],
)
df

In [None]:
# All rows, only columns b and c (a list of labels returns a DataFrame).
df.loc[:, ['b', 'c']]

In [None]:
# One row by label, returned as a Series.  .loc is the label-based indexer;
# the old .ix indexer no longer exists in current pandas.
df.loc['r1']

In [None]:
# Rows r1 through r3 (label slices include the end label) and columns a, b,
# selected in a single .loc call instead of two chained lookups.
df.loc['r1':'r3', ['a', 'b']]

In [None]:
# Explicit lists of row labels and column labels in one .loc call
# (replaces the old .ix indexer, which no longer exists in current pandas).
df.loc[['r1', 'r2', 'r3'], ['a', 'b']]

### Vectorized Operations on Dataframes

In [None]:
# 6 x 5 integer grid (0..29, filled row by row) with row/column labels.
dfi = pd.DataFrame(
    data=np.arange(30).reshape(6, 5),
    index=['r1', 'r2', 'r3', 'r4', 'r5', 'r6'],
    columns=['c1', 'c2', 'c3', 'c4', 'c5'],
)

# Work on an independent copy so that dfi itself stays untouched.
dfi2 = dfi.copy(deep=True)
dfi2

In [None]:
# Scalar arithmetic applies to every cell; the result is a new DataFrame.
2 * dfi2

In [None]:
# The same works on a single column; the result is a new Series and
# dfi2 itself is not modified.
100 * dfi2['c1']

### Column-to-column DataFrame operations

In [None]:
# Overwrite c1 with 100 times its original values.  The values are read from
# the untouched dfi rather than from dfi2['c1'] itself, so running this cell
# more than once does not keep multiplying the column.
dfi2['c1'] = dfi['c1'] * 100
dfi2

In [None]:
# Replace column c1 with values computed from a different column (c3).
scaled_c3 = 100 * dfi2['c3']
dfi2['c1'] = scaled_c3
dfi2

In [None]:
# Assigning to a column name that does not exist yet appends a new column.
c5_values = dfi2['c5']
dfi2['c6'] = c5_values
dfi2

In [None]:
# Mixed-type frame: one float, one int and one string column.
dfm = pd.DataFrame(
    data={
        'floats': [1.3, 2.3, 3.3, 4.3],
        'ints': [1, 2, 3, 4],
        'strs': ['a', 'b', 'c', 'd'],
    }
)
dfm

### apply() and applymap()

In [None]:
# Convert the ints to text element by element, then concatenate them with
# the string column to form the new column y.
ints_as_text = dfm['ints'].apply(str)
dfm['y'] = dfm['strs'] + ints_as_text
dfm

In [None]:
# Independent working copy of dfi.
dfi3 = dfi.copy(deep=True)
dfi3

In [None]:
# Every cell times 100.  Computed from the untouched dfi rather than from
# dfi3 itself, so re-running this cell always yields the same frame instead
# of multiplying again.
dfi3 = dfi * 100
dfi3

In [None]:
def _text_width(value):
    """Return the number of characters in the string form of ``value``."""
    return len(str(value))


# applymap() calls the function on every individual cell of the frame.
# NOTE(review): newer pandas releases offer DataFrame.map for the same
# purpose and deprecate applymap - confirm against the installed version.
dfilen = dfi.applymap(_text_width)
dfilen

### mask

In [None]:
# Boolean Series marking the rows whose c1 value is below 20.
mask = dfi3['c1'] < 20

# Write through a single .loc call.  The chained form dfi3['c1'][mask] = 0
# assigns into an intermediate object, which pandas warns about and which
# may leave dfi3 itself unchanged.
# NOTE(review): if dfi3 still holds dfi * 100, the only c1 value below 20 is
# already 0, so the displayed frame will look unchanged - consider a larger
# threshold for the demonstration.
dfi3.loc[mask, 'c1'] = 0
dfi3

### fillna()

In [None]:
# Small frame with one missing value (NaN) in c1 and one in c2.
nandf = pd.DataFrame(
    data={
        'c1': [6, 6, np.nan],
        'c2': [np.nan, 1, 3],
        'c3': [2, 2, 2],
    }
)
nandf

In [None]:
# fillna() returns a new frame with every missing value replaced by the
# given value; nandf itself keeps its NaNs.
ndf = nandf.fillna(value=0)
ndf

### merge

In [None]:
# Display dfi again: it is the left-hand frame of the merge in the next cell.
dfi

In [None]:
# Right-hand frame: shares the key column c1 with dfi and contributes the
# new columns c6 and c7.
dfi2 = pd.DataFrame(
    data={
        'c1': [0, 5, 10, 15, 20, 25],
        'c6': [41, 51, 61, 71, 81, 91],
        'c7': [42, 52, 62, 72, 82, 92],
    }
)

# Left join on c1: every row of dfi is kept and gains the c6/c7 values of
# the dfi2 row with the same c1.
dfi3 = dfi.merge(right=dfi2, how='left', on='c1')
dfi3

### group by

In [None]:
# c1 is the grouping key (three groups of two rows each); c2 holds the
# numbers to aggregate.
dfgb = pd.DataFrame(
    data={
        'c1': ['a', 'a', 'b', 'b', 'c', 'c'],
        'c2': [6, 11, 16, 21, 26, 36],
    },
    index=['r1', 'r2', 'r3', 'r4', 'r5', 'r6'],
)
dfgb

In [None]:
# Split the rows by their c1 value, then sum the remaining column per group.
rows_by_c1 = dfgb.groupby(by='c1')
rows_by_c1.sum()

### List of selected groupby functions

count<BR>
mean<BR>
sum<BR>
size<BR>
describe<BR>
min<BR>
max<BR>