# Pandas

While pandas adopts many coding idioms from NumPy, the biggest difference is that pandas is designed for working with tabular or heterogeneous data. NumPy, by con‐ trast, is best suited for working with homogeneous numerical array data

In [2]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

## Series

In [4]:
# A Series is a one-dimensional array-like object containing a sequence of values (of similar types to NumPy types) and an associated array of data labels, called its index
obj = pd.Series([4,7,-5,3])
print(obj)

0    4
1    7
2   -5
3    3
dtype: int64


In [7]:
# You can get the array representation and index object of the Series via its values and index attributes, respectively
print(obj.values)
print(obj.index)

[ 4  7 -5  3]
RangeIndex(start=0, stop=4, step=1)


In [10]:
# Often it will be desirable to create a Series with an index identifying each data point with a label
obj2 = pd.Series([4,7,-5,3], index = ['d','b','a','c'])
print(obj2)
print(obj2.index)
print(obj2.values)

d    4
b    7
a   -5
c    3
dtype: int64
Index(['d', 'b', 'a', 'c'], dtype='object')
[ 4  7 -5  3]


In [14]:
# Compared with NumPy arrays, you can use labels in the index when selecting single values or a set of values:
print(obj2['a'])
print(obj2[2])
print(obj2[['c','a','d']])

-5
-5
c    3
a   -5
d    4
dtype: int64


In [15]:
print(obj2[obj2 > 0])

d    4
b    7
c    3
dtype: int64


In [16]:
print(obj2 * 2)

d     8
b    14
a   -10
c     6
dtype: int64


In [18]:
import numpy as np
print(np.exp(obj2))

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64


In [21]:
# Another way to think about a Series is as a fixed-length, ordered dict, as it is a map‐ ping of index values to data values. It can be used in many contexts where you might use a dict
print('b' in obj2)
print('e' in obj2)

True
False


In [26]:
# we can create Series from Dictionary
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
print(obj3)
print(obj3.index)
print(obj3.values)

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64
Index(['Ohio', 'Oregon', 'Texas', 'Utah'], dtype='object')
[35000 16000 71000  5000]


In [27]:
# Here dictionary keys in sorted order. You can override this by passing the dict keys in the order you want them to appear in the resulting Series
states = ['California', 'Ohio', 'Texas', 'Oregon']
obj4 = pd.Series(sdata, index = states)
print(obj4)
# three values found in sdata were placed in the appropriate locations, but since no value for 'California' was found, it appears as NaN (not a number), which is con‐ sidered in pandas to mark missing or NA values. Since 'Utah' was not included in states, it is excluded from the resulting object


California        NaN
Ohio          35000.0
Texas         71000.0
Oregon        16000.0
dtype: float64


In [33]:
# The isnull and notnull functions in pandas should be used to detect missing data
print(pd.isnull(obj4))
print(obj4.isnull())
print(obj4.notnull())

California     True
Ohio          False
Texas         False
Oregon        False
dtype: bool
California     True
Ohio          False
Texas         False
Oregon        False
dtype: bool
California    False
Ohio           True
Texas          True
Oregon         True
dtype: bool


In [34]:
# Both the Series object itself and its index have a name attribute, which integrates with other key areas of pandas functionality
obj4.name = 'population'
obj4.index.name = 'state'
print(obj4)

state
California        NaN
Ohio          35000.0
Texas         71000.0
Oregon        16000.0
Name: population, dtype: float64


In [36]:
# A Series’s index can be altered in-place by assignment
print(obj)
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
print(obj)

0    4
1    7
2   -5
3    3
dtype: int64
Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64


## DataFrame

A DataFrame represents a rectangular table of data and contains an ordered collec‐ tion of columns, each of which can be a different value type (numeric, string, boolean, etc.)

In [3]:
# There are many ways to construct a DataFrame, though one of the most common is from a dict of equal-length lists or NumPy arrays
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
print(frame)

   pop   state  year
0  1.5    Ohio  2000
1  1.7    Ohio  2001
2  3.6    Ohio  2002
3  2.4  Nevada  2001
4  2.9  Nevada  2002
5  3.2  Nevada  2003


In [4]:
# For large DataFrames, the head method selects only the first five rows
print(frame.head(3))

   pop state  year
0  1.5  Ohio  2000
1  1.7  Ohio  2001
2  3.6  Ohio  2002


In [5]:
# If you specify a sequence of columns, the DataFrame’s columns will be arranged in that order
print(pd.DataFrame(data, columns = ['year', 'state', 'pop']))

   year   state  pop
0  2000    Ohio  1.5
1  2001    Ohio  1.7
2  2002    Ohio  3.6
3  2001  Nevada  2.4
4  2002  Nevada  2.9
5  2003  Nevada  3.2


In [7]:
# If you pass a column that isn’t contained in the dict, it will appear with missing values in the result
frame2 = pd.DataFrame(data, columns = ['year', 'state', 'pop', 'debt'],
                     index = ['one', 'two', 'three', 'four','five', 'six'])
print(frame2)
print('---frame2.isnull():', '\n', frame2.isnull())
print('---frame2.isnull().sum():', '\n', frame2.isnull().sum(axis = 0))
print('---frame2.isnull().any():', '\n', frame2.isnull().any())
print('---frame2.dtypes:', '\n', frame2.dtypes)

       year   state  pop debt
one    2000    Ohio  1.5  NaN
two    2001    Ohio  1.7  NaN
three  2002    Ohio  3.6  NaN
four   2001  Nevada  2.4  NaN
five   2002  Nevada  2.9  NaN
six    2003  Nevada  3.2  NaN
---frame2.isnull(): 
         year  state    pop  debt
one    False  False  False  True
two    False  False  False  True
three  False  False  False  True
four   False  False  False  True
five   False  False  False  True
six    False  False  False  True
---frame2.isnull().sum(): 
 year     0
state    0
pop      0
debt     6
dtype: int64
---frame2.isnull().any(): 
 year     False
state    False
pop      False
debt      True
dtype: bool
---frame2.dtypes: 
 year       int64
state     object
pop      float64
debt      object
dtype: object


In [60]:
print(frame2.columns)

Index(['year', 'state', 'pop', 'debt'], dtype='object')


In [62]:
# A column in a DataFrame can be retrieved as a Series either by dict-like notation or by attribute
print(frame2['state'])
print(frame2.state)
# Try frame2.<tab>

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object
one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object


In [15]:
#Rows can also be retrieved by position or name with the special loc attribute
print(frame2.loc['three'])
print(type(frame2.loc['three']))
print(frame2.loc['three'].index)
print(frame2.loc['three'].values)

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object
<class 'pandas.core.series.Series'>
Index(['year', 'state', 'pop', 'debt'], dtype='object')
[2002 'Ohio' 3.6000000000000001 nan]


In [67]:
# Columns can be modified by assignment. For example, the empty 'debt' column could be assigned a scalar value or an array of values
frame2.debt = 16.5
print(frame2)
frame2['debt'] = np.arange(6.0)
print(frame2)

       year   state  pop  debt
one    2000    Ohio  1.5  16.5
two    2001    Ohio  1.7  16.5
three  2002    Ohio  3.6  16.5
four   2001  Nevada  2.4  16.5
five   2002  Nevada  2.9  16.5
six    2003  Nevada  3.2  16.5
       year   state  pop  debt
one    2000    Ohio  1.5   0.0
two    2001    Ohio  1.7   1.0
three  2002    Ohio  3.6   2.0
four   2001  Nevada  2.4   3.0
five   2002  Nevada  2.9   4.0
six    2003  Nevada  3.2   5.0


In [69]:
# If you assign a Series, its labels will be realigned exactly to the DataFrame’s index, inserting missing values in any holes
val = pd.Series([-1.2,-1.5,-1.7], index=['two','four','five'])
print(val)
frame2['debt'] = val
print(frame2)

two    -1.2
four   -1.5
five   -1.7
dtype: float64
       year   state  pop  debt
one    2000    Ohio  1.5   NaN
two    2001    Ohio  1.7  -1.2
three  2002    Ohio  3.6   NaN
four   2001  Nevada  2.4  -1.5
five   2002  Nevada  2.9  -1.7
six    2003  Nevada  3.2   NaN


In [22]:
# Assigning a column that doesn’t exist will create a new column. The del keyword will delete columns as with a dict
frame2['eastern'] = frame2.state == 'Ohio'
print(frame2)
# ! New columns cannot be created with the frame2.eastern syntax

       year   state  pop debt  eastern
one    2000    Ohio  1.5  NaN     True
two    2001    Ohio  1.7  NaN     True
three  2002    Ohio  3.6  NaN     True
four   2001  Nevada  2.4  NaN    False
five   2002  Nevada  2.9  NaN    False
six    2003  Nevada  3.2  NaN    False


In [23]:
frame2.drop('eastern', axis = 1, inplace = True) # inplace: if false drop does'n modify DF itself
# to delete column
#del frame2['eastern']
print(frame2.columns)
print(frame2)

Index(['year', 'state', 'pop', 'debt'], dtype='object')
       year   state  pop debt
one    2000    Ohio  1.5  NaN
two    2001    Ohio  1.7  NaN
three  2002    Ohio  3.6  NaN
four   2001  Nevada  2.4  NaN
five   2002  Nevada  2.9  NaN
six    2003  Nevada  3.2  NaN


In [25]:
# The column returned from indexing a DataFrame is a view on the underlying data, not a copy. Thus, any in-place modifications to the Series will be reflected in the DataFrame. The column can be explicitly copied with the Series’s copy method
# modify lowercase to uppercase for first letter
frame2.columns = ['Year', 'State', 'Pop', 'Debt']
print(frame2.columns)
frame2.columns = ['year', 'state', 'pop', 'debt']
print(frame2.columns)

Index(['Year', 'State', 'Pop', 'Debt'], dtype='object')
Index(['year', 'state', 'pop', 'debt'], dtype='object')


In [4]:
import pandas as pd
# Another common form of data is a nested dict of dicts
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002:3.6}}
frame3 = pd.DataFrame(pop)
print(frame3)

      Nevada  Ohio
2000     NaN   1.5
2001     2.4   1.7
2002     2.9   3.6


In [5]:
# To transpose dataframe
print(frame3.T)

        2000  2001  2002
Nevada   NaN   2.4   2.9
Ohio     1.5   1.7   3.6


In [6]:
# if an explicit index is specified behaviour is different
print(pd.DataFrame(pop, index = [2001,2002,2006, 20012]))


       Nevada  Ohio
2001      2.4   1.7
2002      2.9   3.6
2006      NaN   NaN
20012     NaN   NaN


In [7]:
# Dicts of Series are treated in much the same way
print(frame3['Ohio'][:-1])
pdata = {'Ohio': frame3['Ohio'][:-1],
        'Nevada': frame3['Nevada'][:2]}
print(DataFrame(pdata))

2000    1.5
2001    1.7
Name: Ohio, dtype: float64


NameError: name 'DataFrame' is not defined

In [8]:
# If a DataFrame’s index and columns have their name attributes set, these will also be displayed
frame3.index.name = 'year'
frame3.columns.name = 'state'
print(frame3)

state  Nevada  Ohio
year               
2000      NaN   1.5
2001      2.4   1.7
2002      2.9   3.6


In [11]:
# As with Series, the values attribute returns the data contained in the DataFrame as a two-dimensional ndarray
print(frame3.values)
print(frame3.values.dtype)

[[nan 1.5]
 [2.4 1.7]
 [2.9 3.6]]
float64


In [92]:
# If the DataFrame’s columns are different dtypes, the dtype of the values array will be chosen to accommodate all of the columns
print(frame2)
print(frame2.values)

       Year   State  Pop  Debt
one    2000    Ohio  1.5   NaN
two    2001    Ohio  1.7  -1.2
three  2002    Ohio  3.6   NaN
four   2001  Nevada  2.4  -1.5
five   2002  Nevada  2.9  -1.7
six    2003  Nevada  3.2   NaN
[[2000 'Ohio' 1.5 nan]
 [2001 'Ohio' 1.7 -1.2]
 [2002 'Ohio' 3.6 nan]
 [2001 'Nevada' 2.4 -1.5]
 [2002 'Nevada' 2.9 -1.7]
 [2003 'Nevada' 3.2 nan]]


### Reindexing

In [97]:
# An important method on pandas objects is reindex, which means to create a new object with the data conformed to a new index
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index = ['d', 'b', 'a', 'c'])
print(obj)
# Calling reindex on this Series rearranges the data according to the new index, intro‐ ducing missing values if any index values were not already present
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
print(obj2)

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64


In [100]:
# For ordered data like time series, it may be desirable to do some interpolation or fill‐ ing of values when reindexing. The method option allows us to do this, using a method such as ffill, which forward-fills the values:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index = [0,2,4])
print(obj3)
print(obj3.reindex(range(6), method = 'ffill'))

0      blue
2    purple
4    yellow
dtype: object
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object


In [103]:
# With DataFrame, reindex can alter either the (row) index, columns, or both. When passed only a sequence, it reindexes the rows in the result
frame = pd.DataFrame(np.arange(9).reshape(3,3), 
                     index = ['a','c','d'], 
                     columns=['Ohio', 'Texas', 'California'])
print(frame)
frame2 = frame.reindex(['a','b','c','d'])
print(frame2)

   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8
   Ohio  Texas  California
a   0.0    1.0         2.0
b   NaN    NaN         NaN
c   3.0    4.0         5.0
d   6.0    7.0         8.0


In [104]:
# The columns can be reindexed with the columns keyword
print(frame.reindex(columns = ['Texas', 'Utah', 'California']))
# ! Do not mix with column renaming: reindexing - change positions and add new ones

   Texas  Utah  California
a      1   NaN           2
c      4   NaN           5
d      7   NaN           8


### Dropping Entries from an Axis

In [5]:
obj = pd.Series(np.arange(5.), index = ['a', 'b', 'c', 'd', 'e'])
print(obj)
print(obj.drop('c')) # obj iteself isn't modified
print(obj.drop(['c','d'])) # obj iteself isn't modified

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64
a    0.0
b    1.0
e    4.0
dtype: float64


In [9]:
# With DataFrame, index values can be deleted from either axis. To illustrate this, we first create an example DataFrame:
df = pd.DataFrame(np.arange(16).reshape(4,4),
                  index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                 columns = ['one', 'two', 'three', 'four'])
print(df)
# to drop rows
print(df.drop(['Colorado', 'Ohio']))
# to drop columns
print(df.drop('two', axis = 1))
print(df.drop('two', axis = 'columns'))

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
          one  two  three  four
Utah        8    9     10    11
New York   12   13     14    15
          one  three  four
Ohio        0      2     3
Colorado    4      6     7
Utah        8     10    11
New York   12     14    15
          one  three  four
Ohio        0      2     3
Colorado    4      6     7
Utah        8     10    11
New York   12     14    15


In [12]:
# Many functions, like drop, which modify the size or shape of a Series or DataFrame, can manipulate an object in-place without returning a new object
print(df.drop('two', axis = 'columns'))
print(df)
print(df.drop('two', axis = 'columns', inplace = True))
print(df)

          one  three  four
Ohio        0      2     3
Colorado    4      6     7
Utah        8     10    11
New York   12     14    15
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
None
          one  three  four
Ohio        0      2     3
Colorado    4      6     7
Utah        8     10    11
New York   12     14    15


### Indexing, Selection, and Filtering

In [21]:
# Indexing Series
obj = pd.Series(np.arange(4.), index=['a','b','c','d'])
print(obj)
print(obj['c'])
print(obj[1])
print(obj[['b','d','a']])
print(obj[[3,1]])
print(obj['b' :'c']) # 'c' included! difference from slicing in np
obj['b':'c'] = 13
print(obj)

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64
2.0
1.0
b    1.0
d    3.0
a    0.0
dtype: float64
d    3.0
b    1.0
dtype: float64
b    1.0
c    2.0
dtype: float64
a     0.0
b    13.0
c    13.0
d     3.0
dtype: float64


In [33]:
# Indexing dataframes
df = pd.DataFrame(np.arange(16).reshape(4,4),
                   index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                   columns = ['one', 'two', 'three', 'four'])
print(df)
print(df['two']) # here we select columns!
print("*** df[['three', 'one']]", '\n', df[['three', 'one']])
print("*** df[:2]", '\n', df[:2]) # here we select rows!
print("*** df[df['three'] > 5]", '\n', df[df['three'] > 5])

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64
*** df[['three', 'one']] 
           three  one
Ohio          2    0
Colorado      6    4
Utah         10    8
New York     14   12
*** df[:2] 
           one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
*** df[df['three'] > 5] 
           one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


In [35]:
# indexing with a boolean DataFrame
print(df < 5)
df[df < 5] = 0
print(df)

            one    two  three   four
Ohio       True   True   True   True
Colorado   True  False  False  False
Utah      False  False  False  False
New York  False  False  False  False
          one  two  three  four
Ohio        0    0      0     0
Colorado    0    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


### Selection with loc and iloc

In [46]:
# For DataFrame label-indexing on the rows, I introduce the special indexing operators loc and iloc. They enable you to select a subset of the rows and columns from a DataFrame with NumPy-like notation using either axis labels (loc) or integers (iloc).
print(df.loc['Colorado', ['two', 'three']])
print(type(df.loc['Colorado', ['two', 'three']]))
# print(df['Colorado']) # error
#print(df[df.index == 'Colorado']) # works! so weird!
print(df[df.index == 'Colorado'][['two', 'three']])

two      5
three    6
Name: Colorado, dtype: int64
<class 'pandas.core.series.Series'>
          two  three
Colorado    5      6


In [48]:
# We’ll then perform some similar selections with integers using iloc
df.iloc[2, [3,0,1]]

four    11
one      8
two      9
Name: Utah, dtype: int64

In [51]:
print(df.iloc[2]) # works on index!
#print(df[2]) # dosn't work

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64


In [52]:
df.iloc[[1,2], [3,0,1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [53]:
# Both indexing functions work with slices in addition to single labels or lists of labels
print(df.loc[:'Utah', 'two'])

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int64


In [57]:
print(df.iloc[:, :3])
print(df.iloc[:, :3][df.three > 5])
print(df[df.three > 5].iloc[:, :3])

          one  two  three
Ohio        0    0      0
Colorado    0    5      6
Utah        8    9     10
New York   12   13     14
          one  two  three
Colorado    0    5      6
Utah        8    9     10
New York   12   13     14
          one  two  three
Colorado    0    5      6
Utah        8    9     10
New York   12   13     14


In [66]:
print(df.iloc[1])
print(df.iloc[:, 1])
print(df.iloc[1, 1])
# print(df.at[1,1]) # error
print(df.at['Colorado', 'two'])
print(df.iat[1,1])

one      0
two      5
three    6
four     7
Name: Colorado, dtype: int64
Ohio         0
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64
5
5
5


### Arithmetic and Data Alignment


In [69]:
# When you are adding together objects, if any index pairs are not the same, the respective index in the result will be the union of the index pairs. For users with database experience, this is similar to an automatic outer join on the index labels. Let’s look at an example
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index = ['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index = ['a', 'c', 'e', 'f', 'g'])
print(s1)
print(s2)
print(s1 + s2)

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64
a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64
a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64


In [70]:
# In the case of DataFrame, alignment is performed on both the rows and the columns:
df1 = pd.DataFrame(np.arange(9.).reshape(3,3), 
                   columns = list('bcd'),
                   index = ['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape(4,3), 
                   columns = list('bde'),
                   index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
print(df1)
print(df2)
print(df1 + df2)


            b    c    d
Ohio      0.0  1.0  2.0
Texas     3.0  4.0  5.0
Colorado  6.0  7.0  8.0
          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
            b   c     d   e
Colorado  NaN NaN   NaN NaN
Ohio      3.0 NaN   6.0 NaN
Oregon    NaN NaN   NaN NaN
Texas     9.0 NaN  12.0 NaN
Utah      NaN NaN   NaN NaN


In [73]:
# If you add DataFrame objects with no column or row labels in common, the result will contain all nulls
df1 = pd.DataFrame({'A' : [1,2]})
df2 = pd.DataFrame({'B' : [3,4]})
print(df1)
print(df2)
print(df1 + df2)

   A
0  1
1  2
   B
0  3
1  4
    A   B
0 NaN NaN
1 NaN NaN


### Arithmetic methods with fill values

In [85]:
# In arithmetic operations between differently indexed objects, you might want to fill with a special value, like 0, when an axis label is found in one object but not the other
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                   columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                   columns=list('abcde'))
df1.loc[1, 'b'] = 33333
df1.loc[1, 'b'] = np.nan
print(df1)
print(df2)
print(df1 + df2)

     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  NaN   6.0   7.0
2  8.0  9.0  10.0  11.0
      a     b     c     d     e
0   0.0   1.0   2.0   3.0   4.0
1   5.0   6.0   7.0   8.0   9.0
2  10.0  11.0  12.0  13.0  14.0
3  15.0  16.0  17.0  18.0  19.0
      a     b     c     d   e
0   0.0   2.0   4.0   6.0 NaN
1   9.0   NaN  13.0  15.0 NaN
2  18.0  20.0  22.0  24.0 NaN
3   NaN   NaN   NaN   NaN NaN


In [86]:
# Using the add method on df1, I pass df2 and an argument to fill_value
print(df1.add(df2, fill_value=0))

      a     b     c     d     e
0   0.0   2.0   4.0   6.0   4.0
1   9.0   6.0  13.0  15.0   9.0
2  18.0  20.0  22.0  24.0  14.0
3  15.0  16.0  17.0  18.0  19.0


In [91]:
print(1/df1)
print(df1.rdiv(1))


          a         b         c         d
0       inf  1.000000  0.500000  0.333333
1  0.250000       NaN  0.166667  0.142857
2  0.125000  0.111111  0.100000  0.090909
          a         b         c         d
0       inf  1.000000  0.500000  0.333333
1  0.250000       NaN  0.166667  0.142857
2  0.125000  0.111111  0.100000  0.090909


### Function Application and Mapping

In [94]:
# NumPy ufuncs (element-wise array methods) also work with pandas objects
df = pd.DataFrame(np.random.randn(4,3), columns = list('bde'),
                 index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
print(df)
print(np.abs(df))

               b         d         e
Utah    0.021661  0.986661 -0.373745
Ohio   -2.522513  0.711161 -0.149265
Texas  -1.480966  0.425259 -0.451756
Oregon -2.114517  1.702210 -0.789529
               b         d         e
Utah    0.021661  0.986661  0.373745
Ohio    2.522513  0.711161  0.149265
Texas   1.480966  0.425259  0.451756
Oregon  2.114517  1.702210  0.789529


In [107]:
# Another frequent operation is applying a function on one-dimensional arrays to each column or row. DataFrame’s apply method does exactly this
# apply fun on columns
fun = lambda x: x.max() - x.min()
print(df.apply(fun)) # the same: df.apply(fun, axis = 'rows')
#print(df.apply(fun).reset_index())
print(df.apply(lambda x: x.min()))
print(df.apply(lambda x: x.max()))

b    2.544174
d    1.276951
e    0.640264
dtype: float64
b    2.544174
d    1.276951
e    0.640264
dtype: float64
b   -2.522513
d    0.425259
e   -0.789529
dtype: float64
b    0.021661
d    1.702210
e   -0.149265
dtype: float64


In [100]:
# apply fun on rows
print(df.apply(fun, axis = 'columns'))

Utah      1.360406
Ohio      3.233674
Texas     1.906225
Oregon    3.816726
dtype: float64


In [110]:
# Many of the most common array statistics (like sum and mean) are DataFrame meth‐ ods, so using apply is not necessary
print(df)
print(df.sum())
print(df.apply(lambda x: sum(x)))

               b         d         e
Utah    0.021661  0.986661 -0.373745
Ohio   -2.522513  0.711161 -0.149265
Texas  -1.480966  0.425259 -0.451756
Oregon -2.114517  1.702210 -0.789529
b   -6.096335
d    3.825291
e   -1.764295
dtype: float64
b   -6.096335
d    3.825291
e   -1.764295
dtype: float64


In [114]:
# The function passed to apply need not return a scalar value; it can also return a Series with multiple values
def fun(x):
    return pd.Series([x.min(), x.max()], index = ['min', 'max'])
print(df.apply(fun))
print(df.apply(lambda x: pd.Series([x.min(), x.max()], index=['min', 'max'])))


            b         d         e
min -2.522513  0.425259 -0.789529
max  0.021661  1.702210 -0.149265
            b         d         e
min -2.522513  0.425259 -0.789529
max  0.021661  1.702210 -0.149265


In [117]:
# Element-wise Python functions can be used, too. Suppose you wanted to compute a formatted string from each floating-point value in frame. You can do this with apply map:
format = lambda x: '%.2f' % x
print(df.applymap(format))
print(df.applymap(lambda x: -x))

            b     d      e
Utah     0.02  0.99  -0.37
Ohio    -2.52  0.71  -0.15
Texas   -1.48  0.43  -0.45
Oregon  -2.11  1.70  -0.79
               b         d         e
Utah   -0.021661 -0.986661  0.373745
Ohio    2.522513 -0.711161  0.149265
Texas   1.480966 -0.425259  0.451756
Oregon  2.114517 -1.702210  0.789529


In [124]:
# The reason for the name applymap is that Series has a map method for applying an element-wise function
print(type(df['e']))
print(df['e'])
print(df['e'].map(format))

<class 'pandas.core.series.Series'>
Utah     -0.373745
Ohio     -0.149265
Texas    -0.451756
Oregon   -0.789529
Name: e, dtype: float64
Utah      -0.37
Ohio      -0.15
Texas     -0.45
Oregon    -0.79
Name: e, dtype: object


### Sorting and Ranking


In [128]:
# Sort by index
# To sort lexicographically by row or column index, use the sort_index method, which returns a new, sorted object
obj = pd.Series(range(4), index=['d', 'a', 'c', 'b'])
print(obj)
print(obj.sort_index())

d    0
a    1
c    2
b    3
dtype: int64
a    1
b    3
c    2
d    0
dtype: int64


In [133]:
# With a DataFrame, you can sort by index on either axis
df = pd.DataFrame(np.arange(8).reshape((2, 4)), 
                  index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])
print(df)
print(df.sort_index())
print(df.sort_index(axis=1)) # the same df.sort_index(axis='columns')
print(df.sort_index(axis='columns', ascending = False))

       d  a  b  c
three  0  1  2  3
one    4  5  6  7
       d  a  b  c
one    4  5  6  7
three  0  1  2  3
       a  b  c  d
three  1  2  3  0
one    5  6  7  4
       d  c  b  a
three  0  3  2  1
one    4  7  6  5


In [135]:
# Sort by values
obj = pd.Series([4, 7, -3, 2])
print(obj)
print(obj.sort_values())

0    4
1    7
2   -3
3    2
dtype: int64
2   -3
3    2
0    4
1    7
dtype: int64


In [136]:
# Any missing values are sorted to the end of the Series by default:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
print(obj.sort_values())

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64


In [148]:
# When sorting a DataFrame, you can use the data in one or more columns as the sort keys. To do so, pass one or more column names to the by option of sort_values
df = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(df)
print(df.sort_values(by = 'b'))
print(df.sort_values(by = 'b').reset_index().drop('index', axis = 'columns')) # ordered indexes

   a  b
0  0  4
1  1  7
2  0 -3
3  1  2
   a  b
2  0 -3
3  1  2
0  0  4
1  1  7
   a  b
0  0 -3
1  1  2
2  0  4
3  1  7


In [157]:
# Ranking
# Ranking assigns ranks from one through the number of valid data points in an array. by default rank breaks ties by assigning each group the mean rank
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
print(obj)
print('***obj.rank()', '\n', obj.rank())
print(obj.mean())
#print(obj.map(lambda x: (x - obj.min()) * (obj.max() - obj.min()) / len(obj)))
# Ranks can also be assigned according to the order in which they’re observed in the data:
print("***obj.rank(method='first')", '\n', obj.rank(method='first'))
# You can rank in descending order, too
# Assign tie values the maximum rank in the group
print("***obj.rank(ascending=False, method='max')", '\n', obj.rank(ascending=False, method='max'))


0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64
***obj.rank() 
 0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64
2.7142857142857144
***obj.rank(method='first') 
 0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64
***obj.rank(ascending=False, method='max') 
 0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64


In [159]:
# DataFrame can compute ranks over the rows or the columns
df = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})
print(df)
print(df.rank(axis = 'columns'))

   a    b    c
0  0  4.3 -2.0
1  1  7.0  5.0
2  0 -3.0  8.0
3  1  2.0 -2.5
     a    b    c
0  2.0  3.0  1.0
1  1.0  3.0  2.0
2  2.0  1.0  3.0
3  2.0  3.0  1.0


### Unique Values, Value Counts, and Membership

In [3]:
import pandas as pd
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
print(obj)

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object


In [4]:
# unique values
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [5]:
# value counts
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [6]:
pd.value_counts(obj)

c    3
a    3
b    2
d    1
dtype: int64

In [15]:
# value counts in several columns
# In some cases, you may want to compute a histogram on multiple related columns in a DataFrame
data = pd.DataFrame({'Qu1': [1,3,4,3,4],
                     'Qu2': [2,3,1,2,3],
                     'Qu3': [1,5,2,4,4]})
print(data)
result = data.apply(pd.value_counts).fillna(0)
print(result)

   Qu1  Qu2  Qu3
0    1    2    1
1    3    3    5
2    4    1    2
3    3    2    4
4    4    3    4
   Qu1  Qu2  Qu3
1  1.0  1.0  1.0
2  0.0  2.0  1.0
3  2.0  2.0  0.0
4  2.0  0.0  2.0
5  0.0  0.0  1.0


In [10]:
# membership check
print(obj)
print(obj.isin(['a', 'b']))
b = obj.isin(['a', 'b'])
print(obj[b])

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object
0    False
1     True
2    False
3     True
4     True
5     True
6     True
7    False
8    False
dtype: bool
1    a
3    a
4    a
5    b
6    b
dtype: object


In [12]:
# Related to isin is the Index.get_indexer method, which gives you an index array from an array of possibly non-distinct values into another array of distinct values
to_match = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
unique_vals = pd.Series(['c', 'b', 'a'])
pd.Index(unique_vals).get_indexer(to_match)

array([ 0,  2, -1,  2,  2,  1,  1,  0,  0])