In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# Series

## value and index

In [2]:
obj = pd.Series([3, 7, -5, 3])

In [3]:
obj

0    3
1    7
2   -5
3    3
dtype: int64

In [5]:
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])

In [6]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [7]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [8]:
obj2['a']

-5

In [9]:
obj2[['c', 'a', 'd']]

c    3
a   -5
d    4
dtype: int64

Using NumPy functions or NumPy-like operations, such as filtering with a boolean array, scalar multiplication, or applying math functions, will preserve the index-value link:

In [10]:
obj2[obj2 > 0]

d    4
b    7
c    3
dtype: int64

In [11]:
obj2 * 2

d     8
b    14
a   -10
c     6
dtype: int64

In [14]:
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [15]:
'b' in obj2

True

## Create a Series from a Python dict:

In [16]:
sdata = {'Ohio': 35000, 'Texas':71000, 'Oregon': 16000, 'Utah': 5000}

In [17]:
obj3 = pd.Series(sdata)

In [18]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [20]:
states = ['California', 'Ohio', 'Oregon', 'Texas']

In [21]:
obj4 = pd.Series(sdata, index=states)

In [22]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

Here, three values found in sdata were placed in the appropriate locations, but since no value for 'California' was found, it appears as NaN (not a number), which is considered in pandas to mark missing or NA values. Since 'Utah' was not included in states, it is excluded from the resulting object.

## detect missing values

In [23]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [24]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [25]:
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [26]:
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

## name attribute

In [27]:
obj4.name = 'population'

In [28]:
obj4.index.name = 'state'

In [29]:
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

## alter index

In [30]:
obj

0    3
1    7
2   -5
3    3
dtype: int64

In [31]:
obj.index = ['Bob','Steve', 'Jeff', 'Ryan']

In [32]:
obj

Bob      3
Steve    7
Jeff    -5
Ryan     3
dtype: int64

# Data Frame

## create from dict

In [17]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

In [39]:
frame = pd.DataFrame(data)

In [40]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [41]:
frame.head()    # show first 5 rows

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [18]:
frame2 = pd.DataFrame(data, columns = ['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four', 'five', 'six'])

In [45]:
frame2    # debt field is NA because it didn't exist in the dict

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [6]:
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


### specify a sequence of columns

In [42]:
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


### retrive a column or row

In [46]:
frame2['state']    # by dict-like notation

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [48]:
frame2.year    # by attribute

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [49]:
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

## modify columns

### assign values

In [50]:
frame2['debt'] = 16.5

In [51]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [52]:
frame2['debt'] = np.arange(6.)

In [53]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [54]:
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])

In [55]:
frame2['debt'] = val    # series的index 会与 frame的index 一一对应

In [56]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


### create a new colume

In [57]:
frame2['eastern'] = frame2.state == 'Ohio'

In [59]:
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [60]:
del frame2['eastern']

In [62]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

## create from dict of dict

In [1]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [5]:
frame3 = pd.DataFrame(pop)

In [8]:
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [7]:
pd.DataFrame(pop, index=[2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [10]:
pdata = {'Ohio': frame3['Ohio'][:-1],
         'Nevada': frame3['Nevada'][:2]}

In [11]:
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4


If a DataFrame’s index and columns have their name attributes set, these will also be displayed:

In [12]:
frame3.index.name = 'year'; frame3.columns.name = 'state'

In [13]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [14]:
frame3.values

array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])

If the DataFrame’s columns are different dtypes, the dtype of the values array will be chosen to accommodate all of the columns:

In [19]:
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, nan],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, nan],
       [2002, 'Nevada', 2.9, nan],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

Possible data inputs to DataFrame constructor:

![df_type](https://github.com/Zlisu/Notes/blob/master/Images/NumPy/data_type_for_df.PNG?raw=True)

## transpose

In [9]:
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


# Index Objects

In [20]:
obj = pd.Series(range(3), index=['a', 'b', 'c'])

In [21]:
index = obj.index

In [22]:
index

Index(['a', 'b', 'c'], dtype='object')

In [23]:
index[1:]

Index(['b', 'c'], dtype='object')

**Immutability** makes it safer to share Index objects among data structures:

In [24]:
labels = pd.Index(np.arange(3))

In [25]:
labels

Int64Index([0, 1, 2], dtype='int64')

In [26]:
obj2 = pd.Series([1.5, -2.5, 0], index=labels)

In [27]:
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [28]:
obj2.index is labels

True

In addition to being array-like, an Index also behaves like a fixed-size set:

In [29]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [30]:
frame3.columns

Index(['Nevada', 'Ohio'], dtype='object', name='state')

In [31]:
'Ohio' in frame3.columns

True

In [32]:
2003 in frame3.columns

False

Unlike Python sets, a pandas Index can contain duplicate labels:

In [33]:
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])

In [34]:
dup_labels

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

**Index methods:**

![index](https://github.com/Zlisu/Notes/blob/master/Images/NumPy/index_methods.PNG?raw=True)

# Essential Functions

## re-indexing

### series

In [7]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index = ['d', 'b', 'a', 'c'])

In [8]:
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [9]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])

In [10]:
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [11]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])

In [12]:
obj3

0      blue
2    purple
4    yellow
dtype: object

In [13]:
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

### data frame

With DataFrame, reindex can alter either the (row) index, columns, or both. When passed only a sequence, it reindexes the rows in the result:

In [14]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                     index=['a', 'c', 'd'],
                     columns=['Ohio', 'Texas', 'California'])

In [15]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [16]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])

In [17]:
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [18]:
states = ['Texas', 'Utah', 'California']

In [19]:
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [20]:
frame.loc[['a', 'b', 'c', 'd'], states]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


Re-indexing 只是给原有的 columns 调换顺序，并不会给原有的 columns 改名。

![re_index](https://github.com/Zlisu/Notes/blob/master/Images/NumPy/re_indexing.PNG?raw=True)

## Dropping entries from an axis

### series

In [3]:
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])

In [4]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [5]:
new_obj = obj.drop('c')

In [6]:
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [8]:
obj.drop(['d', 'e'])

a    0.0
b    1.0
c    2.0
dtype: float64

### data frame

With DataFrame, index values can be deleted from either axis.

#### drop rows

In [9]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])

In [10]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [11]:
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


#### drop columns

In [13]:
data.drop('two', axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [14]:
data.drop(['two', 'four'], axis='columns')

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


#### manipulate on objects in-place without returning a new object

In [15]:
obj.drop('c', inplace=True)

In [16]:
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

## indexing

### series

In [17]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])

In [18]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [19]:
obj['b']

1.0

In [20]:
obj[1]

1.0

In [21]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [22]:
obj[['b', 'a', 'd']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [23]:
obj[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [24]:
obj[obj<2]

a    0.0
b    1.0
dtype: float64

In [25]:
obj['b':'c']

b    1.0
c    2.0
dtype: float64

In [26]:
obj['b':'c'] = 5

In [27]:
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

### data frame

In [28]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])

In [29]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [30]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [31]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


#### with boolean array

In [32]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [37]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


#### with a boolean DataFrame

In [40]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [41]:
data[data < 5] = 0

In [42]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


#### selection with loc and iloc

select a single row and mltiple columns by label:

In [44]:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int64

In [45]:
data.iloc[2, [3, 0, 1]]

four    11
one      8
two      9
Name: Utah, dtype: int64

In [46]:
data.loc[:'Utah', 'two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int64

In [47]:
data.iloc[:, :3][data.three > 5]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


![df_index](https://github.com/Zlisu/Notes/blob/master/Images/NumPy/indexing_df.png?raw=True)

## integer indexing

In [48]:
ser = pd.Series(np.arange(3.))

In [55]:
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [56]:
ser[-1]    # will raise error

KeyError: -1

In [52]:
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])

In [54]:
ser2

a    0.0
b    1.0
c    2.0
dtype: float64

In [53]:
ser2[-1]

2.0

To keep things consistent, if you have an axis index containing integers, data selection will always be label-oriented. For more precise handling, use loc (for labels) or iloc (for integers):

In [58]:
ser[:1]

0    0.0
dtype: float64

In [59]:
ser.loc[:1]

0    0.0
1    1.0
dtype: float64

In [60]:
ser.iloc[:1]

0    0.0
dtype: float64

## Arithmetic and Data Alignment

When you are adding together objects, if any index pairs are not the same, the respective index in the result will be the union of the index pairs. For users with database experience, this is similar to an automatic outer join on the index labels.

In [61]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])

In [62]:
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])

In [63]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [64]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [65]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [66]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                   index=['Ohio', 'Texas', 'Colorado'])

In [67]:
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [68]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [69]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [70]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


If you add DataFrame objects with no column or row labels in common, the result will contain all nulls:

In [71]:
df1 = pd.DataFrame({'A': [1, 2]})

In [72]:
df2 = pd.DataFrame({'B': [3, 4]})

In [73]:
df1

Unnamed: 0,A
0,1
1,2


In [74]:
df2

Unnamed: 0,B
0,3
1,4


In [75]:
df1 + df2

Unnamed: 0,A,B
0,,
1,,


In [76]:
df1 - df2

Unnamed: 0,A,B
0,,
1,,


### Arithmetic methods with fill values

In [78]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                   columns=list('abcd'))

In [82]:
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                   columns=list('abcde'))

In [83]:
df2.loc[1, 'b'] = np.nan

In [84]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [85]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [86]:
df1 + df2    # 直接相加，没有overlap的就会使NA

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [88]:
df1.add(df2, fill_value=0)    # 没有overlap的，会以0代替，再相加

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


Relatedly, when reindexing a Series or DataFrame, you can also specify a different fill value:

In [89]:
df1.reindex(columns=df2.columns)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,
1,4.0,5.0,6.0,7.0,
2,8.0,9.0,10.0,11.0,


In [90]:
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


![arithmetic](https://github.com/Zlisu/Notes/blob/master/Images/NumPy/flexible_arithmetic_methods.png?raw=True)

### Operations between DataFrame and Series

In [91]:
arr = np.arange(12.).reshape((3, 4))

In [92]:
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [93]:
arr[0]

array([0., 1., 2., 3.])

In [95]:
arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

When we subtract arr[0] from arr, the subtraction is performed once for each row. This is referred to as broadcasting.

Operations between a DataFrame and a Series are similar:

In [101]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                     columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [97]:
series = frame.iloc[0]

In [98]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [99]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

By default, arithmetic between DataFrame and Series matches the index of the Series on the DataFrame’s columns, broadcasting down the rows:

In [102]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [103]:
series2 = pd.Series(range(3), index=['b', 'e', 'f'])

In [104]:
series2

b    0
e    1
f    2
dtype: int64

In [105]:
frame + series2    

# frame 的 column b 每个元素 + 0
# frame 的 column e 每个元素 + 1
# seriess 没有 column d; frame 没有 column f，相加的结果这两列都是NaN


Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


If you want to instead broadcast over the columns, matching on the rows, you have to use one of the arithmetic methods. For example:

In [106]:
series3 = frame['d']

In [107]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [108]:
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [111]:
# 前面的例子都是逐行加／减
# 这个例子是逐列

frame.sub(series3, axis='index')    

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


## Function Application and Mapping

In [113]:
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [114]:
frame

Unnamed: 0,b,d,e
Utah,-0.881932,-0.088215,-0.575397
Ohio,0.538518,-0.970617,-0.26176
Texas,1.236432,-2.835818,0.108638
Oregon,-1.609298,0.153968,-0.230825


In [115]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.881932,0.088215,0.575397
Ohio,0.538518,0.970617,0.26176
Texas,1.236432,2.835818,0.108638
Oregon,1.609298,0.153968,0.230825


Another frequent operation is applying a function on one-dimensional arrays to each column or row. DataFrame’s apply method does exactly this:

In [118]:
f = lambda x: x.max() - x.min()

In [119]:
frame.apply(f)

b    2.845730
d    2.989786
e    0.684035
dtype: float64

The function passed to apply need not return a scalar value; it can also return a Series with multiple values:

In [120]:
def f(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])

In [121]:
frame.apply(f)

Unnamed: 0,b,d,e
min,-1.609298,-2.835818,-0.575397
max,1.236432,0.153968,0.108638


In [122]:
format = lambda x: '%.2f' % x

In [123]:
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,-0.88,-0.09,-0.58
Ohio,0.54,-0.97,-0.26
Texas,1.24,-2.84,0.11
Oregon,-1.61,0.15,-0.23


The reason for the name applymap is that Series has a map method for applying an element-wise function:

In [124]:
frame['e'].map(format)

Utah      -0.58
Ohio      -0.26
Texas      0.11
Oregon    -0.23
Name: e, dtype: object

## sorting and ranking

### sort by index

In [125]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])

In [127]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

With a DataFrame, you can sort by index on either axis:

In [128]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)), 
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])

In [131]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [130]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [132]:
obj = pd.Series([4, 7, -3, 2])

### sort by value

In [133]:
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

Any missing values are sorted to the end of the Series by default:

In [134]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])

In [135]:
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

 ### by multiple keys

In [136]:
frame = pd.DataFrame({'b':[4, 7, -3, 2], 'a':[0, 1, 0, 1]})

In [137]:
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [138]:
frame.sort_values(by='b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [139]:
frame.sort_values(by=['a', 'b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


### ranking

#### series

In [140]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])

In [141]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [142]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

Here, instead of using the average rank 6.5 for the entries 0 and 2, they instead have been set to 6 and 7 because label 0 precedes label 2 in the data.

#### data frame

In [143]:
frame = pd.DataFrame({'b':[4.3, 7, -3, 2], 
                      'a':[0, 1, 0, 1],
                      'c':[-2, 5, 8, -2.5]})

In [144]:
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [145]:
frame.rank(axis='columns')

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


#### Tie-breaking methods with rank

![arithmetic](https://github.com/Zlisu/Notes/blob/master/Images/NumPy/ranking.png?raw=True)

## Axis indexes with duplicate labels

In [146]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])

In [147]:
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [148]:
obj.index.is_unique

False

In [150]:
obj['a']    # return a Series

a    0
a    1
dtype: int64

### duplicate row labels

### Series with duplicate indices:

In [151]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])

In [152]:
df

Unnamed: 0,0,1,2
a,0.621675,-0.180086,0.07646
a,-1.744977,0.46013,0.066673
b,-1.227784,0.422802,-0.241038
b,-1.834825,0.579501,1.590221


In [153]:
df.loc['b']

Unnamed: 0,0,1,2
b,-1.227784,0.422802,-0.241038
b,-1.834825,0.579501,1.590221


# Summarizing and Computing Descriptive Statistics

sum()

In [154]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                  [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])

In [155]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [156]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [157]:
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

NA values are excluded unless the entire slice (row or column in this case) is NA. This can be disabled with the skipna option:

mean()

In [158]:
df.mean(axis='columns', skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

idxmin() / idxmas()

Some methods, like idxmin and idxmax, return indirect statistics like the index value where the minimum or maximum values are attained:

In [159]:
df.idxmax()

one    b
two    d
dtype: object

In [160]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


Another type of method is neither a reduction nor an accumulation:

In [162]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


On non-numeric data, describe produces alternative summary statistics:

In [163]:
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)

In [166]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object