# Pandas

- data structures and data manipulation tools designed to make data cleaning and analysis fast and easy
- used in tandem with
    - numerical computing tools like NumPy and SciPy
    - analytical libraries like statsmodels and scikit-learn
    - data visualization libraries like matplotlib
- adopts significant parts of NumPy's idiomatic style of array-based computing, especially vectorization
- designed for working with tabular or heterogeneous data, whereas NumPy is best suited for working with homogeneous numerical array data


# 5.1 Intro to Pandas Data Structures

- `Series` and `DataFrame`

## Series

- a one-dimensional array-like object containing a sequence of values (of similar types to NumPy types) and an associated array of data labels, called its `index`
    - default index is integer 0 through N - 1
- can be thought of as a fixed-length, ordered dict, as it is a mapping of index values to data values

In [2]:
import pandas as pd

obj = pd.Series([4, 7, -5, 3])
print(obj)
print("values: {}".format(obj.values))
print("index: {}".format(obj.index))

0    4
1    7
2   -5
3    3
dtype: int64
values: [ 4  7 -5  3]
index: RangeIndex(start=0, stop=4, step=1)


In [7]:
import pandas as pd

# Create a Series with an index identifying each data point with a label
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
print(obj2)
print("values: {}".format(obj2.values))
print("index: {}".format(obj2.index))
print("obj2['a']: {}".format(obj2['a']))
print("obj2['c', 'a', 'd']: {}".format(obj2[['c', 'a', 'd']]))

d    4
b    7
a   -5
c    3
dtype: int64
values: [ 4  7 -5  3]
index: Index(['d', 'b', 'a', 'c'], dtype='object')
obj2['a']: -5
obj2['c', 'a', 'd']: c    3
a   -5
d    4
dtype: int64


In [12]:
# NumPy-like function
import pandas as pd

obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2[obj2 > 0]  # filtering with a boolean array

# the index-value link will be preserved

d    4
b    7
c    3
dtype: int64

In [11]:
import pandas as pd

obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2 * 2

d     8
b    14
a   -10
c     6
dtype: int64

In [13]:
# even with NumPy!
import pandas as pd
import numpy as np

obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [14]:
import pandas as pd

obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
'b' in obj2

True

In [15]:
# create a series with Python dict
import pandas as pd
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [17]:
import pandas as pd
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj4 = pd.Series(sdata, ['California', 'Ohio', 'Oregon', 'Texas'])
pd.isnull(obj4)  # or obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [18]:
import pandas as pd
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj4 = pd.Series(sdata, ['California', 'Ohio', 'Oregon', 'Texas'])
obj4.notnull()

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [20]:
# Series automatically aligns by index label in arithmetic operations
# similar to a join operation
import pandas as pd
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
obj4 = pd.Series(sdata, ['California', 'Ohio', 'Oregon', 'Texas'])
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [22]:
# Series object and its index have a name attribute
import pandas as pd
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
obj3.name = 'population'
obj3.index.name = 'state'
obj3

state
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
Name: population, dtype: int64

## DataFrame

- represents a rectangular table of data and contains an ordered collection of columns, each of which can be a different value type
- has both row and column index
- can be thought of as a dict of Series all sharing the same index
- stored as one or more two-dimensional blocks


In [24]:
import pandas as pd
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'], 'year': [2000, 2001, 2002, 2001, 2002, 2003],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [25]:
import pandas as pd
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'], 'year': [2000, 2001, 2002, 2001, 2002, 2003],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
frame.head()  # selects the first five rows

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [27]:
import pandas as pd
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'], 'year': [2000, 2001, 2002, 2001, 2002, 2003],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
# specify a sequence of columns
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [30]:
import pandas as pd
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'], 'year': [2000, 2001, 2002, 2001, 2002, 2003],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame2 = pd.DataFrame(data,
                      columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four', 'five', 'six'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [31]:
import pandas as pd
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'], 'year': [2000, 2001, 2002, 2001, 2002, 2003],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame2 = pd.DataFrame(data,
                      columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four', 'five', 'six'])
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [33]:
# a column in a DataFrame can be retrieved as a Series
import pandas as pd
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'], 'year': [2000, 2001, 2002, 2001, 2002, 2003],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame2 = pd.DataFrame(data,
                      columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four', 'five', 'six'])
frame2['state']  # or frame2.state

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [34]:
# a row can be retrieved by name with the special loc attr (use iloc to retrieve by position)
import pandas as pd
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'], 'year': [2000, 2001, 2002, 2001, 2002, 2003],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame2 = pd.DataFrame(data,
                      columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four', 'five', 'six'])
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [38]:
# columns can be modified by assignment
# assign a scalar value or an array of values
import pandas as pd
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame2 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six']
)
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [39]:
# columns can be modified by assignment
# assign a scalar value or an array of values
import numpy as np
import pandas as pd

data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
# 'debt' is not a key of `data`, so the column starts out as missing values.
frame2 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six']
)
# Assigning an array replaces the whole column; its length (6) must match
# the number of rows in the frame.
frame2['debt'] = np.arange(6)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4
six,2003,Nevada,3.2,5


In [41]:
# assigning a column that doesn't exist creates a new column
# columns can be modified by assignment
# assign a scalar value or an array of values
import pandas as pd
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame2 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six']
)
frame2['eastern'] = frame2.state == 'Ohio'
frame2  # the new column can be deleted with `del frame2['eastern']`

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,,False
five,2002,Nevada,2.9,,False
six,2003,Nevada,3.2,,False


In [43]:
import pandas as pd
nested_dict = {'Nevada': {2001: 2.4, 2002: 2.9},
               'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = pd.DataFrame(nested_dict)
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [44]:
import pandas as pd
nested_dict = {'Nevada': {2001: 2.4, 2002: 2.9},
               'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = pd.DataFrame(nested_dict)
frame3.index.name = 'year'
frame3.columns.name = 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


## Index Objects

- holds the axis labels and other metadata, like axis names
- immutable

In [45]:
import pandas as pd
nested_dict = {'Nevada': {2001: 2.4, 2002: 2.9},
               'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = pd.DataFrame(nested_dict)
frame3.index

Int64Index([2001, 2002, 2000], dtype='int64')

# 5.2 Essential Functionality

## Reindexing

- to create a new object with the data *conformed* to a new index
- calling `reindex` on a Series rearranges the data according to the new index, introducing missing values if any index values were not already present


In [48]:
import pandas as pd
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj.reindex(['a', 'b', 'c', 'd', 'e'])

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

- for ordered data like time series, it may be desirable to do some interpolation or filling of values when reindexing
    - the `method` option: e.g., `ffill` forward-fills
   

In [49]:
import pandas as pd
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

- with DataFrame, reindex can alter either the row index, columns, or both


In [50]:
import numpy as np
import pandas as pd

# 3x3 frame holding 0..8, with row label 'b' deliberately absent.
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                     index=['a', 'c', 'd'],
                     columns=['Ohio', 'Texas', 'California'])
# Reindexing the rows introduces a row of missing values for the new
# label 'b'.
frame.reindex(['a', 'b', 'c', 'd'])  # reindex rows

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [51]:
import numpy as np
import pandas as pd

# 3x3 frame holding 0..8; it has no 'Utah' column.
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                     index=['a', 'c', 'd'],
                     columns=['Ohio', 'Texas', 'California'])
# Reindexing the columns drops 'Ohio' (not requested) and adds 'Utah'
# filled with missing values.
frame.reindex(columns=['Texas', 'Utah', 'California'])  # reindex columns

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


## Dropping Entries from an Axis

In [3]:
import numpy as np
import pandas as pd

obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj.drop(['d', 'c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [6]:
import numpy as np
import pandas as pd

# 4x4 frame holding 0..15, rows labelled by state.
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
# takes an optional inplace kwarg
# Without it, drop returns a new object and `data` keeps all four rows.
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [5]:
import numpy as np
import pandas as pd

# 4x4 frame holding 0..15, rows labelled by state.
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
# axis='columns' makes drop remove column labels instead of row labels.
data.drop(['two', 'four'], axis='columns')

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


## Indexing, Selection, and Filtering



In [31]:
import importlib

import numpy as np
import pandas as pd
from common import eval_and_print

# Float Series 0.0..3.0 labelled 'a'..'d'; each call below evaluates the
# quoted expression against this cell's namespace and prints the result.
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
eval_and_print("obj['b']", locals())              # single label
# NOTE(review): integer keys on a non-integer index (here and in
# obj[[1, 3]]) fall back to positional lookup; newer pandas releases
# appear to deprecate this in favour of obj.iloc[...] — confirm against
# the installed version.
eval_and_print("obj[1]", locals())                # single integer
eval_and_print("obj[2:4]", locals())              # integer slice
eval_and_print("obj[['b', 'a', 'd']]", locals())  # list of labels
eval_and_print("obj[[1, 3]]", locals())           # list of integers
eval_and_print("obj[obj < 2]", locals())          # boolean mask

obj['b']:
1.0
obj[1]:
1.0
obj[2:4]:
c    2.0
d    3.0
dtype: float64
obj[['b', 'a', 'd']]:
b    1.0
a    0.0
d    3.0
dtype: float64
obj[[1, 3]]:
b    1.0
d    3.0
dtype: float64
obj[obj < 2]:
a    0.0
b    1.0
dtype: float64


In [36]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
eval_and_print("data['two']", locals())
eval_and_print("data[['two', 'three']]", locals())
eval_and_print("data[:2]", locals())
# indexing with a boolean Series
eval_and_print("data[data['three'] > 5]", locals())

data['two']:
Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64
data[['two', 'three']]:
          two  three
Ohio        1      2
Colorado    5      6
Utah        9     10
New York   13     14
data[:2]:
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
data[data['three'] > 5]:
          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


### Selection with `loc` and `iloc`

- `loc`: select a subset of rows and columns from a DataFrame using axis labels
- `iloc`: similar to `loc`, but selects by integer position instead of by label


In [41]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
eval_and_print("data.loc['Colorado', ['two', 'three']]", locals())
eval_and_print("data.iloc[[1, 2], [3, 0, 1]]", locals())

data.loc['Colorado', ['two', 'three']]:
two      5
three    6
Name: Colorado, dtype: int64
data.iloc[[1, 2], [3, 0, 1]]:
          four  one  two
Colorado     7    4    5
Utah        11    8    9


## Arithmetic and Data Alignment

- for binary operators, if any index pairs are not the same, the respective index in the result will be the union of the index pairs
- the internal data alignment introduces missing values in the label locations that don't overlap; missing values will then propagate in further arithmetic computations


In [42]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],
               index=['a', 'c', 'e', 'f', 'g'])
s1+s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

- in the case of DataFrame, alignment is performed on both the rows and the columns

In [43]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)),
                   columns=list('bcd'),
                   index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                   columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [44]:
# Arithmetic methods with fill values
df1.add(df2, fill_value=0)

Unnamed: 0,b,c,d,e
Colorado,6.0,7.0,8.0,
Ohio,3.0,1.0,6.0,5.0
Oregon,9.0,,10.0,11.0
Texas,9.0,4.0,12.0,8.0
Utah,0.0,,1.0,2.0


## Function Application and Mapping

- NumPy ufuncs (element-wise array methods) also work with pandas objects


In [45]:
frame = pd.DataFrame(np.random.randn(4, 3),
                     columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
np.abs(frame)

Unnamed: 0,b,d,e
Utah,1.296049,2.182684,1.266997
Ohio,0.697549,1.414571,0.235699
Texas,1.081076,0.104223,1.890387
Oregon,1.06087,0.675649,0.965413


- you can also apply a function on one-dimensional arrays to each column or row, using the `apply` method


In [46]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` only needs to provide ``max()`` and ``min()`` methods; when this
    function is handed to ``DataFrame.apply`` it is called once per column
    (or once per row with ``axis='columns'``).
    """
    return x.max() - x.min()

eval_and_print("frame.apply(f)", locals())
eval_and_print("frame.apply(f, axis='columns')", locals())

frame.apply(f):
b    0.598500
d    2.858333
e    2.855800
dtype: float64
frame.apply(f, axis='columns'):
Utah      3.449681
Ohio      1.178872
Texas     2.971463
Oregon    1.736519
dtype: float64


- the function passed to `apply` need not return a scalar value; it can also return a Series with multiple values


In [47]:
def f(x):
    """Summarise ``x`` as a two-entry Series labelled 'min' and 'max'."""
    lowest = x.min()
    highest = x.max()
    return pd.Series([lowest, highest], index=['min', 'max'])
frame.apply(f)

Unnamed: 0,b,d,e
min,-1.296049,-2.182684,-0.965413
max,-0.697549,0.675649,1.890387


- element-wise functions can also be applied
- e.g., compute a formatted string from each floating-point value in frame


In [48]:
# Format every element of the frame as a string with two decimal places.
# NOTE(review): recent pandas releases appear to deprecate `applymap` in
# favour of `DataFrame.map` — confirm against the installed pandas version
# before switching.
frame.applymap(lambda x: '%.2f' % x)

Unnamed: 0,b,d,e
Utah,-1.3,-2.18,1.27
Ohio,-0.7,-1.41,-0.24
Texas,-1.08,-0.1,1.89
Oregon,-1.06,0.68,-0.97


## Sorting and Ranking



In [49]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [51]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
eval_and_print("frame.sort_index()", locals())
eval_and_print("frame.sort_index(axis=1)", locals())

frame.sort_index():
       d  a  b  c
one    4  5  6  7
three  0  1  2  3
frame.sort_index(axis=1):
       a  b  c  d
three  1  2  3  0
one    5  6  7  4


In [52]:
obj = pd.Series([4, 7, -3, 2])
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [53]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame.sort_values(by='b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [54]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame.sort_values(by=['a', 'b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


# 5.3 Summarizing and Computing Descriptive Statistics

- `reductions` or `summary statistics`: methods that extract a single value from a Series or a Series of values from the rows/columns of a DataFrame
- builtin support for handling missing data


In [56]:
import pandas as pd
import numpy as np

df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
                 index=['a', 'b', 'c', 'd'], columns=['one', 'two'])

df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [57]:
df.sum()  # return a Series containing column sums

one    9.25
two   -5.80
dtype: float64

In [58]:
df.sum(axis='columns')  # sums across the columns instead

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [59]:
# NA values are excluded by default; passing skipna=False disables that behavior
df.mean(axis='columns', skipna=False) 

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [60]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


## Correlation and Covariance

- correlation and covariance are computed from pairs of arguments


In [74]:
import pandas_datareader.data as web
# Map each ticker symbol to whatever `web.get_data_yahoo` returns for it
# (presumably a per-ticker price table fetched remotely — confirm).
all_data = {ticker: web.get_data_yahoo(ticker) 
            for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}
# One column per ticker, taken from the 'Adj Close' column of its data.
price = pd.DataFrame({ticker: data['Adj Close']
                      for ticker, data in all_data.items()})
# Same layout, built from each ticker's 'Volume' column.
volume = pd.DataFrame({ticker: data['Volume']
                       for ticker, data in all_data.items()})

In [65]:
returns = price.pct_change()

In [66]:
returns.tail()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-02-24,-0.0475,-0.022758,-0.043115,-0.042771
2020-02-25,-0.033872,-0.032234,-0.016502,-0.023312
2020-02-26,0.015864,-0.013831,0.012495,0.003407
2020-02-27,-0.065368,-0.047513,-0.070459,-0.053898
2020-02-28,-0.000585,-0.022237,0.024213,0.016114


In [67]:
# The corr method of Series computes the correlation of the overlapping,
# non-NA, aligned-by-index values in two Series
returns['MSFT'].corr(returns['IBM'])

0.47458605729852554

In [68]:
# cov computes the covariance
returns['MSFT'].cov(returns['IBM'])

9.168613365295116e-05

In [75]:
# pandas magic: MSFT is a valid attribute on DF
returns.MSFT.corr(returns.IBM)

0.47458605729852554

In [76]:
# The corr or cov methods of the DataFrame instead return a full 
# correlation or covariance matrix as a DataFrame
returns.corr()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,1.0,0.400828,0.595497,0.547329
IBM,0.400828,1.0,0.474586,0.416022
MSFT,0.595497,0.474586,1.0,0.681318
GOOG,0.547329,0.416022,0.681318,1.0


In [77]:
# corrwith computes pairwise correlations between a DataFrame's columns
# or rows with another Series or DataFrame

# passing a Series returns a Series with the
# correlation value computed for each column
returns.corrwith(returns.IBM)

AAPL    0.400828
IBM     1.000000
MSFT    0.474586
GOOG    0.416022
dtype: float64

In [79]:
# passing a DataFrame computes the correlations of matching column names
returns.corrwith(volume)

AAPL   -0.161841
IBM    -0.094543
MSFT   -0.051381
GOOG   -0.026159
dtype: float64

## Unique Values, Value Counts, and Membership

- extracts information about the values contained in a one-dimensional Series


In [80]:
import pandas as pd
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [82]:
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [84]:
obj.isin(['b', 'c']) # vectorized set membership check

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool