# 5. Getting Started with pandas

In [75]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

# Series basics: construction, positional/label access, boolean selection,
# automatic index alignment and reindexing.
ss = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
ss1 = Series({'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000})

# name, index.name and index can all be assigned in place
ss.name = 'series name'
ss.index.name = 'index name'
# assigning a plain list installs a brand-new index, so the index name set
# on the previous line does not survive this assignment
ss.index = ['d', 'b', 'a', 'c']

# the underlying data as a NumPy array; treat it as read only
ss.values

# positional get/slice: go through .iloc explicitly. Integer keys in plain
# ss[...] on a label index rely on a deprecated positional fallback.
ss.iloc[1]
ss.iloc[[1, 3]]
ss.iloc[2:4]

# label get/slice. With labels the endpoint is inclusive
ss['c']
ss[['c', 'a', 'd']]
ss['b':'c']

# selection with a boolean mask
ss[ss > 3]

# membership is tested against the index labels
'b' in ss

# element-wise arithmetic and NumPy ufuncs keep the index
ss * 2
np.exp(ss)

# indexes are aligned automatically; by default a+b = NaN if either a or b is missing
ss + ss1

ss.isnull()

# Reindexing: auto-alignment to the specified label order.
# reindex accepts fill_value or method ('ffill' for forward fill, 'bfill' for
# backward fill); if neither is specified, new labels get NaN.
ss = ss.reindex(['a', 'b', 'c', 'd', 'e'])
ss = ss.reindex(['a', 'b', 'c', 'd', 'e', 'f'], fill_value=0)
# .loc raises KeyError when any requested label is absent from the index, so
# growing the index with the new labels 'g' and 'h' also has to use reindex
ss = ss.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

# Arithmetic and data alignment
# by default the result is NaN wherever either operand has no value;
# the .add method can treat missing values as fill_value instead
ss1 = Series([1, 2, 3], index=['a', 'b', 'c'])
ss2 = Series([4, 5, 6], index=['b', 'c', 'd'])
ss1 + ss2
ss1.add(ss2, fill_value=0)

# end the cell with a call so the last expression's repr is not echoed
# (a bare `print` is only a name lookup on Python 3)
print()




In [7]:
# Element-wise missing-value masks for ss: first the "is missing" mask, then
# its complement (the last expression is what the notebook displays)
ss.isnull()
ss.notnull()

d    True
b    True
a    True
c    True
Name: series name, dtype: bool

In [80]:
# DataFrame basics: construction, row/column access, column assignment and
# deletion, reindexing, and moving a column into/out of the index.
#
# When initializing a df, columns and index follow the given order if
# specified. If not, columns can come out in any order.
# For other DataFrame constructor inputs see Table 5-1 on p.120
df = DataFrame(data={'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
                     'year': [2000, 2001, 2002, 2001, 2002],
                     'pop': [1.5, 1.7, 3.6, 2.4, 2.9]},
               # 'debt' is not a key of data, so that column starts out all-missing
               columns=['year', 'state', 'pop', 'debt'],
               index=['one', 'two', 'three', 'four', 'five']
               )

# column labels and the (initially unset) axis names
df.columns
df.columns.name
df.index.name

# values is a 2D numpy array; treat it as read only
df.values

# the state column is a Series with name 'state'
df['state']
df.state

# get rows; slicing with [] means row slicing!
df.loc['three']
df[:2]

# selection with a boolean mask
df[df['pop'] > 2]

# see head or tail
df[:10]
df[-10:]
df.head()
df.tail()

# set a column: scalar, array of matching length, or a Series.
# With a Series the data alignment is done automatically (rows whose label is
# absent from the Series get NaN)
df['debt'] = 16.5
df['debt'] = np.arange(5.)
df['debt'] = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])

# delete a column (both forms modify df)
del df['state']
df.drop('debt', axis=1, inplace=True)
# drop rows by label; returns a new frame, df itself is left untouched
df.drop(['two', 'three', 'four'])

df.T

# reindex rows and/or columns; labels that do not exist yet are filled with
# NaN unless fill_value or method (say 'ffill') is given.
# there is also iloc, which only accepts integer positions instead of labels
df = df.reindex(index=['one', 'two', 'three', 'four', 'five', 'six'])
df = df.reindex(index=['one', 'two', 'three', 'four', 'five', 'six', 'seven'], columns=['year', 'pop', 'area'])
# .loc raises KeyError for labels that are missing ('eight', 'debt'), so
# adding new rows/columns has to go through reindex as well
df = df.reindex(index=['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight'],
                columns=['year', 'pop', 'area', 'debt'])

# swap the contents of two columns
df[['year', 'pop']] = df[['pop', 'year']]

# Arithmetic; similar to Series. There is also broadcasting with DataFrame-Series operations, skipped

# use a column as the index, and the inverse operation
df = df.set_index('year')
# returns a new frame with 'year' back as a column; df is not modified
df.reset_index()

# end the cell with a call so the last expression's repr is not echoed
# (a bare `print` is only a name lookup on Python 3)
print()




In [14]:
# Index objects: the type of df.columns / df.index, immutability, slicing
# and membership tests.
# both df.columns and df.index are pandas Index objects
type(df.columns)
type(df.index)

index = df.index

# Index objects are read only; the following won't work!
# index[1] = 'd'
# Use rename instead, e.g. relabel the row 'one' as 'd'. The mapping is passed
# as index= because it targets a row label; columns= would only look at the
# column labels and leave the index unchanged.
df.rename(index={'one': 'd'}, inplace=True)

# slicing an Index returns another Index
index[1:]

# membership
'year' in df.columns

# end the cell with a call so the last expression's repr is not echoed
# (a bare `print` is only a name lookup on Python 3)
print()


