# 1. Numpy

In [1]:
import numpy as np

## 1.1 ndarray

In [15]:
type(np.array(1))

numpy.ndarray

In [16]:
np.array([1,2,3,4,5])

array([1, 2, 3, 4, 5])

In [17]:
# NOTE(review): this 1000-element range is never inspected; the next cell rebinds `arr`.
arr = np.arange(1000)

In [18]:
arr = np.array([[1,2,3],[4,5,6]])

In [19]:
arr.ndim

2

In [20]:
arr.shape

(2, 3)

In [21]:
arr.dtype

dtype('int64')

In [22]:
arr = np.array([1,2,3], dtype = np.float64)

In [23]:
arr.dtype

dtype('float64')

In [24]:
arr.astype(np.int32)

array([1, 2, 3], dtype=int32)

## 1.2 Arithmetic

### Vectorization

In [30]:
# 2x3 float array used by the element-wise arithmetic examples.
arr = np.array(
    [[1, 2, 3],
     [4, 5, 6]],
    dtype=np.float64,
)
arr

array([[1., 2., 3.],
       [4., 5., 6.]])

In [31]:
arr * arr

array([[ 1.,  4.,  9.],
       [16., 25., 36.]])

In [32]:
arr * 2

array([[ 2.,  4.,  6.],
       [ 8., 10., 12.]])

In [33]:
arr == arr

array([[ True,  True,  True],
       [ True,  True,  True]])

### Transposing and swapping

In [90]:
arr = np.arange(15).reshape(3,5)
arr

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [91]:
arr.T

array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])

In [92]:
# Matrix product of the 3x5 array with its 5x3 transpose, giving a 3x3 result.
arr @ arr.T

array([[ 30,  80, 130],
       [ 80, 255, 430],
       [130, 430, 730]])

In [96]:
arr = np.arange(24).reshape((2,3,4))
arr

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]],

       [[12, 13, 14, 15],
        [16, 17, 18, 19],
        [20, 21, 22, 23]]])

In [97]:
arr.T.shape

(4, 3, 2)

In [100]:
# Axes listed in their existing order (0, 1, 2): the layout is unchanged.
arr.transpose(0,1,2)

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]],

       [[12, 13, 14, 15],
        [16, 17, 18, 19],
        [20, 21, 22, 23]]])

In [101]:
# Swap the first two axes: shape (2, 3, 4) -> (3, 2, 4); the last axis stays in place.
arr.transpose(1,0,2)

array([[[ 0,  1,  2,  3],
        [12, 13, 14, 15]],

       [[ 4,  5,  6,  7],
        [16, 17, 18, 19]],

       [[ 8,  9, 10, 11],
        [20, 21, 22, 23]]])

In [102]:
# Exchange axes 1 and 2: shape (2, 3, 4) -> (2, 4, 3).
arr.swapaxes(1,2)

array([[[ 0,  4,  8],
        [ 1,  5,  9],
        [ 2,  6, 10],
        [ 3,  7, 11]],

       [[12, 16, 20],
        [13, 17, 21],
        [14, 18, 22],
        [15, 19, 23]]])

## 1.3 Indexing and slicing

In [34]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [35]:
arr[5]

5

In [37]:
arr[5:8]

array([5, 6, 7])

In [38]:
# A basic slice is a view onto `arr`; .copy() returns independent data instead.
arr[5:8].copy()

array([5, 6, 7])

In [39]:
arr = np.array([[1,2,3],
                [4,5,6],
                [7,8,9]])
arr

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [43]:
arr.ndim

2

In [44]:
arr.shape

(3, 3)

In [41]:
# Two-step indexing: select row 0, then element 2 of that row.
arr[0][2]

3

In [45]:
arr[0,2]

3

In [46]:
arr = np.array([[[1, 2, 3],[4, 5, 6]],
                [[7, 8, 9],[10, 11, 12]]])
arr

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [47]:
arr.ndim

3

In [48]:
arr.shape

(2, 2, 3)

In [49]:
arr[0][0][1]

2

In [50]:
arr[0,0,1]

2

### Boolean indexing

In [56]:
arr = np.array([1,2,3,4,5,6,7,8])

In [58]:
arr[arr > 4]

array([5, 6, 7, 8])

In [65]:
arr != 4

array([ True,  True,  True, False,  True,  True,  True,  True])

In [70]:
# Combine masks with & (element-wise AND); the parentheses are required
# because & binds tighter than the comparison operators.
arr[(arr > 4) & (arr < 8)]

array([5, 6, 7])

### Fancy indexing

In [76]:
arr = np.arange(32).reshape(8,4)
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23],
       [24, 25, 26, 27],
       [28, 29, 30, 31]])

In [87]:
# Rows
arr[[4, 3, 0, 6],:]

array([[16, 17, 18, 19],
       [12, 13, 14, 15],
       [ 0,  1,  2,  3],
       [24, 25, 26, 27]])

In [89]:
# Columns
arr[:,[2, 0]]

array([[ 2,  0],
       [ 6,  4],
       [10,  8],
       [14, 12],
       [18, 16],
       [22, 20],
       [26, 24],
       [30, 28]])

In [86]:
# Paired index lists pick individual elements (4, 0), (3, 3), (0, 2), (6, 1),
# so the result is 1-D rather than a rectangular block.
arr[[4, 3, 0, 6],[0, 3, 2, 1]]

array([16, 15,  2, 25])

## 1.4 Universal functions

In [103]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [104]:
np.sqrt(arr)

array([0.        , 1.        , 1.41421356, 1.73205081, 2.        ,
       2.23606798, 2.44948974, 2.64575131, 2.82842712, 3.        ])

In [108]:
# Same values as the vectorized np.sqrt(arr), but computed one element at a time via Python's map.
list(map(np.sqrt, arr))

[0.0,
 1.0,
 1.4142135623730951,
 1.7320508075688772,
 2.0,
 2.23606797749979,
 2.449489742783178,
 2.6457513110645907,
 2.8284271247461903,
 3.0]

In [139]:
arr1 = np.array([1,2,3,4,5])
arr2 = np.array([6,5,4,3,2])

In [140]:
np.maximum(arr1, arr2)

array([6, 5, 4, 4, 5])

In [183]:
# Position of the largest element (index 4 holds the maximum, 5).
np.argmax(arr1)

4

## 1.5 Array-oriented programming

### Conditional logic

In [149]:
xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
cond = np.array([True, False, True, True, False])

In [150]:
# Pure-Python selection: take the xarr value where cond is True, otherwise the yarr value.
result = [
    x_val if flag else y_val
    for flag, x_val, y_val in zip(cond, xarr, yarr)
]
result

[1.1, 2.2, 1.3, 1.4, 2.5]

In [152]:
result = np.where(cond, xarr, yarr)
result

array([1.1, 2.2, 1.3, 1.4, 2.5])

### Statistics

In [153]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [154]:
arr.mean()

4.5

In [155]:
arr.sum()

45

In [156]:
arr = np.arange(20).reshape(5,4)
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19]])

In [157]:
arr.mean(axis = 0)

array([ 8.,  9., 10., 11.])

In [158]:
arr.mean(axis = 1)

array([ 1.5,  5.5,  9.5, 13.5, 17.5])

In [159]:
# Without an axis argument, cumsum runs over the flattened array (20 running totals).
arr.cumsum()

array([  0,   1,   3,   6,  10,  15,  21,  28,  36,  45,  55,  66,  78,
        91, 105, 120, 136, 153, 171, 190])

### Booleans

In [160]:
# NOTE(review): no seed is set before this point in the notebook, so the
# sample (and any count derived from it) changes on every run.
arr = np.random.randn(100)

In [162]:
(arr > 0).sum()

48

In [163]:
np.array([False,False,True,False]).any()

True

In [164]:
np.array([False,False,True,False]).all()

False

### Sorting

In [165]:
arr = np.random.randn(6)
arr

array([ 2.71849466,  1.0323729 , -0.74596696, -1.10717886,  1.21061929,
        0.72726658])

In [167]:
# ndarray.sort() sorts in place and returns None, so `arr` itself is reordered.
arr.sort()
arr

array([-1.10717886, -0.74596696,  0.72726658,  1.0323729 ,  1.21061929,
        2.71849466])

In [168]:
# np.sort returns a sorted copy and leaves its argument untouched.
np.sort(arr)

array([-1.10717886, -0.74596696,  0.72726658,  1.0323729 ,  1.21061929,
        2.71849466])

In [169]:
arr = np.random.randn(5,3)
arr

array([[-0.13035922,  1.07102387,  0.75975714],
       [ 1.52743206, -0.02695171, -0.04758158],
       [ 0.63477982,  0.10965363,  2.09150578],
       [ 0.74387277, -2.05842007,  1.62617507],
       [ 0.78436393, -1.00754957, -0.73254075]])

In [170]:
arr.sort(axis = 1)
arr

array([[-0.13035922,  0.75975714,  1.07102387],
       [-0.04758158, -0.02695171,  1.52743206],
       [ 0.10965363,  0.63477982,  2.09150578],
       [-2.05842007,  0.74387277,  1.62617507],
       [-1.00754957, -0.73254075,  0.78436393]])

In [171]:
arr.sort(axis = 0)
arr

array([[-2.05842007, -0.73254075,  0.78436393],
       [-1.00754957, -0.02695171,  1.07102387],
       [-0.13035922,  0.63477982,  1.52743206],
       [-0.04758158,  0.74387277,  1.62617507],
       [ 0.10965363,  0.75975714,  2.09150578]])

### Set logic

In [172]:
names = np.array(['Bob','Joe','Will','Bob','Will','Joe','Joe'])
np.unique(names)

array(['Bob', 'Joe', 'Will'], dtype='<U4')

In [173]:
values = np.array([6,0,0,3,2,5,6])
# Element-wise membership test: True where the value is one of 2, 3 or 6.
# np.isin supersedes np.in1d, which is deprecated as of NumPy 2.0.
np.isin(values, [2, 3, 6])

array([ True, False, False,  True,  True, False,  True])

### Broadcasting

In [280]:
arr = np.arange(12).reshape((3,4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [281]:
# Broadcasting: the first row, shape (4,), is subtracted from each of the three rows.
arr - arr[0]

array([[0, 0, 0, 0],
       [4, 4, 4, 4],
       [8, 8, 8, 8]])

## 1.6 File input and output

In [174]:
arr = np.arange(10)
# np.save appends the ".npy" extension, so this writes some_array.npy in the working directory.
np.save('some_array',arr)

In [175]:
np.load('some_array.npy')

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

## 1.7 Linear algebra

In [176]:
x = np.array([[1.,2.,3.],[4.,5.,6.]])
y = np.array([[6., 23.],[-1, 7],[8,9]])

In [177]:
np.dot(x, y)

array([[ 28.,  64.],
       [ 67., 181.]])

In [178]:
x @ y

array([[ 28.,  64.],
       [ 67., 181.]])

## 1.8 Randomness

In [179]:
# Seed NumPy's global legacy RNG so that subsequent np.random.* calls are repeatable.
np.random.seed(1)

In [180]:
# Builds a separate generator seeded with 1; it is not assigned to a name here,
# and creating it does not touch the global RNG state.
np.random.RandomState(1)

RandomState(MT19937) at 0x7F8D304C4340

# 2. Pandas

In [184]:
import pandas as pd

## 2.1 Series

In [185]:
ser = pd.Series([1, 2, 3, 4, 5])
ser

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [186]:
type(ser)

pandas.core.series.Series

In [187]:
ser.values

array([1, 2, 3, 4, 5])

In [188]:
ser.index

RangeIndex(start=0, stop=5, step=1)

In [189]:
ser = pd.Series([1,2,3,4,5], index = ['a','b','c','d','e'])
ser

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [190]:
ser['a']

1

In [191]:
ser[['a','b','c']]

a    1
b    2
c    3
dtype: int64

In [193]:
# Explicit positional slice: the elements at positions 0 and 1, regardless of the string labels.
ser.iloc[0:2]

a    1
b    2
dtype: int64

In [194]:
# Select by position with .iloc. Plain ser[[0, 2, 1]] on a string-labelled
# Series relied on the positional fallback that pandas deprecated in 2.1.
ser.iloc[[0, 2, 1]]

a    1
c    3
b    2
dtype: int64

In [195]:
ser > 3

a    False
b    False
c    False
d     True
e     True
dtype: bool

In [197]:
data = {'a':1, 'b':2, 'c':3, 'd':4, 'e':5}
ser = pd.Series(data)
ser

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [198]:
ser.isnull()

a    False
b    False
c    False
d    False
e    False
dtype: bool

In [199]:
ser1 = pd.Series(data = [1,2,3,4,5], index = ['a','b','c','d','e'])
ser2 = pd.Series(data = [1,2,3,4,5], index = ['b','a','d','c','e'])

In [200]:
# Addition aligns on index labels, not position: e.g. 'a' is 1 (ser1) + 2 (ser2) = 3.
ser1 + ser2

a     3
b     3
c     7
d     7
e    10
dtype: int64

In [214]:
ser.name

In [209]:
ser1.index.name

In [215]:
ser.dtype

dtype('int64')

## 2.2 DataFrame

In [216]:
# Column-oriented input: each key becomes a DataFrame column, in insertion order.
data = dict(
    state=['Ohio'] * 3 + ['Nevada'] * 3,
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)
df = pd.DataFrame(data)
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [218]:
type(df)

pandas.core.frame.DataFrame

In [219]:
df.index

RangeIndex(start=0, stop=6, step=1)

In [221]:
df.columns

Index(['state', 'year', 'pop'], dtype='object')

In [220]:
df.values

array([['Ohio', 2000, 1.5],
       ['Ohio', 2001, 1.7],
       ['Ohio', 2002, 3.6],
       ['Nevada', 2001, 2.4],
       ['Nevada', 2002, 2.9],
       ['Nevada', 2003, 3.2]], dtype=object)

In [222]:
df['state']

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [223]:
df.state

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [224]:
# Assigning to a new column label appends the column; the array supplies one value per row (6).
df['debt'] = np.arange(6)
df

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,0
1,Ohio,2001,1.7,1
2,Ohio,2002,3.6,2
3,Nevada,2001,2.4,3
4,Nevada,2002,2.9,4
5,Nevada,2003,3.2,5


In [225]:
df.T

Unnamed: 0,0,1,2,3,4,5
state,Ohio,Ohio,Ohio,Nevada,Nevada,Nevada
year,2000,2001,2002,2001,2002,2003
pop,1.5,1.7,3.6,2.4,2.9,3.2
debt,0,1,2,3,4,5


In [226]:
df.T.index

Index(['state', 'year', 'pop', 'debt'], dtype='object')

In [227]:
df.T.columns

RangeIndex(start=0, stop=6, step=1)

In [229]:
df.T.values

array([['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
       [2000, 2001, 2002, 2001, 2002, 2003],
       [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
       [0, 1, 2, 3, 4, 5]], dtype=object)

In [231]:
# Name the row index and the column index; both names appear in the rendered header.
# NOTE(review): the index name 'year' duplicates the existing 'year' column label,
# which pandas treats as ambiguous in label-based calls such as sort_values or
# groupby — consider a different index name.
df.index.name = 'year'
df.columns.name = 'info'
df

info,state,year,pop,debt
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Ohio,2000,1.5,0
1,Ohio,2001,1.7,1
2,Ohio,2002,3.6,2
3,Nevada,2001,2.4,3
4,Nevada,2002,2.9,4
5,Nevada,2003,3.2,5


In [235]:
df.values.shape

(6, 4)

## 2.3 Essentials

### Indexing

In [237]:
ser = pd.Series(range(3), index = ['a','b','c'])
ser

a    0
b    1
c    2
dtype: int64

In [238]:
ser.index

Index(['a', 'b', 'c'], dtype='object')

In [239]:
pd.Index(np.arange(3))

Int64Index([0, 1, 2], dtype='int64')

In [240]:
pd.Index(['a','a','a','b','c'])

Index(['a', 'a', 'a', 'b', 'c'], dtype='object')

### Reindexing

In [241]:
ser = pd.Series([1,2,3,4,5], index = ['a','b','c','d','e'])
ser

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [242]:
ser.reindex(['e','d','c','b','a'])

e    5
d    4
c    3
b    2
a    1
dtype: int64

In [244]:
ser = pd.Series(['a','b','c'], index = [0,2,4])
ser

0    a
2    b
4    c
dtype: object

In [245]:
ser.reindex(range(6))

0      a
1    NaN
2      b
3    NaN
4      c
5    NaN
dtype: object

In [246]:
# Forward-fill: each new label takes the value of the closest preceding existing label.
ser.reindex(range(6), method = 'ffill')

0    a
1    a
2    b
3    b
4    c
5    c
dtype: object

In [247]:
df = pd.DataFrame(np.arange(9).reshape((3, 3)),
                  index = ['a', 'c', 'd'],
                  columns = ['Ohio', 'Texas', 'California'])
df

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [248]:
df.reindex(index = ['d','c','a'])

Unnamed: 0,Ohio,Texas,California
d,6,7,8
c,3,4,5
a,0,1,2


In [249]:
# Columns missing from df ('a', 'b') are created and filled with NaN; 'Texas' is left out of the result.
df.reindex(columns = ['Ohio','California','a','b'])

Unnamed: 0,Ohio,California,a,b
a,0,2,,
c,3,5,,
d,6,8,,


### Indexing with duplicate labels

In [315]:
ser = pd.Series(range(5), index = ['a','a','b','b','c'])
ser

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [316]:
ser.index.is_unique

False

In [317]:
ser['a']

a    0
a    1
dtype: int64

In [318]:
df = pd.DataFrame(np.arange(12).reshape(4,3),
                  index = ['a','a','b','b'])
df

Unnamed: 0,0,1,2
a,0,1,2
a,3,4,5
b,6,7,8
b,9,10,11


In [319]:
df.loc['b']

Unnamed: 0,0,1,2
b,6,7,8
b,9,10,11


### Dropping

In [250]:
df = pd.DataFrame(np.arange(16).reshape((4,4)),
                  index = ['a', 'b','c','d'],
                  columns = ['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11
d,12,13,14,15


In [251]:
# Return a copy of df without the rows labelled 'a' and 'c'.
df.drop(index=['a', 'c'])

Unnamed: 0,A,B,C,D
b,4,5,6,7
d,12,13,14,15


In [252]:
# Return a copy of df without the columns 'A' and 'D'.
df.drop(columns=['A', 'D'])

Unnamed: 0,B,C
a,1,2
b,5,6
c,9,10
d,13,14


### Selecting and filtering

In [253]:
ser = pd.Series(np.arange(4), index = ['a','b','c','d'])
ser

a    0
b    1
c    2
d    3
dtype: int64

In [254]:
ser['b']

1

In [257]:
# Positional access goes through .iloc. Plain ser[1] on a string-labelled
# Series relied on the positional fallback that pandas deprecated in 2.1.
ser.iloc[1]

1

In [258]:
ser[['a','b']]

a    0
b    1
dtype: int64

In [259]:
ser[ser < 2]

a    0
b    1
dtype: int64

In [261]:
df = pd.DataFrame(np.arange(16).reshape((4,4)),
                  index = ['a','b','c','d'],
                  columns = ['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11
d,12,13,14,15


In [262]:
df[['A','D']]

Unnamed: 0,A,D
a,0,3
b,4,7
c,8,11
d,12,15


In [264]:
df[:2]

Unnamed: 0,A,B,C,D
a,0,1,2,3
b,4,5,6,7


In [265]:
df[df['C'] > 5]

Unnamed: 0,A,B,C,D
b,4,5,6,7
c,8,9,10,11
d,12,13,14,15


In [267]:
# Position-based selection: rows 2 and 1, columns 3, 0 and 1, in the order given.
df.iloc[[2,1],[3,0,1]]

Unnamed: 0,D,A,B
c,11,8,9
b,7,4,5


In [268]:
# Label-based selection: rows 'c' and 'b', columns 'D', 'A' and 'B', in the order given.
df.loc[['c','b'],['D','A','B']]

Unnamed: 0,D,A,B
c,11,8,9
b,7,4,5


### Arithmetic

In [271]:
# Two Series whose indexes only partly overlap (shared labels: b, d, e).
ser1 = pd.Series(range(1, 6), index=list('abcde'))
ser2 = pd.Series(range(5, 0, -1), index=list('bdefg'))

In [272]:
ser1 + ser2

a    NaN
b    7.0
c    NaN
d    8.0
e    8.0
f    NaN
g    NaN
dtype: float64

In [276]:
# Two frames whose row labels and column labels only partly overlap.
df1 = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=list('acd'),
    columns=list('ABC'),
)
df2 = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=list('aefc'),
    columns=list('CBE'),
)

In [277]:
df1 + df2

Unnamed: 0,A,B,C,E
a,,2.0,2.0,
c,,14.0,14.0,
d,,,,
e,,,,
f,,,,


In [278]:
# fill_value=0 stands in for a value missing from one frame only;
# cells absent from both frames remain NaN.
df1.add(df2, fill_value = 0)

Unnamed: 0,A,B,C,E
a,0.0,2.0,2.0,2.0
c,3.0,14.0,14.0,11.0
d,6.0,7.0,8.0,
e,,4.0,3.0,5.0
f,,7.0,6.0,8.0


In [279]:
# Reflected division, i.e. 1 / df1; the zero at ('a', 'A') yields inf.
df1.rdiv(1)

Unnamed: 0,A,B,C
a,inf,1.0,0.5
c,0.333333,0.25,0.2
d,0.166667,0.142857,0.125


In [282]:
df = pd.DataFrame(np.arange(12).reshape((4,3)),
                  columns = ['A','B','C'],
                  index = ['a','b','c','d'])
df

Unnamed: 0,A,B,C
a,0,1,2
b,3,4,5
c,6,7,8
d,9,10,11


In [283]:
ser = df.iloc[0]
ser

A    0
B    1
C    2
Name: a, dtype: int64

In [284]:
# The Series index (A, B, C) is matched against the columns and the subtraction is broadcast down the rows.
df - ser

Unnamed: 0,A,B,C
a,0,0,0
b,3,3,3
c,6,6,6
d,9,9,9


### Function application and mapping

In [285]:
df = pd.DataFrame(np.arange(12).reshape(4,3),
                  columns = ['A','B','C'],
                  index = ['a','b','c','d'])
df

Unnamed: 0,A,B,C
a,0,1,2
b,3,4,5
c,6,7,8
d,9,10,11


In [287]:
np.sqrt(df)

Unnamed: 0,A,B,C
a,0.0,1.0,1.414214
b,1.732051,2.0,2.236068
c,2.44949,2.645751,2.828427
d,3.0,3.162278,3.316625


In [289]:
# axis=0: the function receives each column as a Series and returns its range (max - min).
df.apply(lambda x: x.max() - x.min(), axis = 0)

A    9
B    9
C    9
dtype: int64

In [290]:
# axis=1: the function receives each row as a Series and returns its range (max - min).
df.apply(lambda x: x.max() - x.min(), axis = 1)

a    2
b    2
c    2
d    2
dtype: int64

In [291]:
# Format every element as a two-decimal string. Mapping column by column with
# Series.map avoids DataFrame.applymap, which is deprecated as of pandas 2.1,
# and works on older pandas releases as well.
df.apply(lambda col: col.map(lambda x: '%.2f' % x))

Unnamed: 0,A,B,C
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,6.0,7.0,8.0
d,9.0,10.0,11.0


In [297]:
# Apply a Python function to every element. Mapping column by column with
# Series.map avoids DataFrame.applymap, which is deprecated as of pandas 2.1,
# and works on older pandas releases as well.
df.apply(lambda col: col.map(lambda x: x + 1))

Unnamed: 0,A,B,C
a,1,2,3
b,4,5,6
c,7,8,9
d,10,11,12


In [298]:
df['A'].map(lambda x: x + 1)

a     1
b     4
c     7
d    10
Name: A, dtype: int64

### Sorting and ranking

In [302]:
ser = pd.Series([3,2,4,1], index = ['d','a','b','c'])
ser

d    3
a    2
b    4
c    1
dtype: int64

In [303]:
ser.sort_index()

a    2
b    4
c    1
d    3
dtype: int64

In [304]:
ser.sort_values()

c    1
a    2
d    3
b    4
dtype: int64

In [305]:
df = pd.DataFrame(np.arange(8).reshape(2,4),
                  index = ['B','A'],
                  columns = ['d','a','b','c'])
df

Unnamed: 0,d,a,b,c
B,0,1,2,3
A,4,5,6,7


In [307]:
df.sort_index(axis = 0, ascending = True)

Unnamed: 0,d,a,b,c
A,4,5,6,7
B,0,1,2,3


In [309]:
df.sort_values(axis = 0, by = 'b')

Unnamed: 0,d,a,b,c
B,0,1,2,3
A,4,5,6,7


In [314]:
# Rank within each row, largest value first; method='max' gives tied values
# the highest rank of their group (there are no ties in this frame).
df.rank(axis = 1, method = 'max', ascending = False)

Unnamed: 0,d,a,b,c
B,4.0,3.0,2.0,1.0
A,4.0,3.0,2.0,1.0


## 2.4 Summarizing

### Statistics

In [320]:
# Small frame containing missing values, used by the summary-statistics examples.
df = pd.DataFrame(
    {'one': [1.4, 7.1, np.nan, 0.75],
     'two': [np.nan, -4.5, np.nan, -1.3]},
    index=list('abcd'),
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [325]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [327]:
df.count(axis = 0)

one    3
two    2
dtype: int64

In [321]:
df.sum(axis = 0)

one    9.25
two   -5.80
dtype: float64

In [322]:
# Row means ignoring NaN; row 'c' has no valid values, so its mean is NaN.
df.mean(axis = 1, skipna = True)

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [323]:
df.idxmax()

one    b
two    d
dtype: object

In [324]:
df.cumsum(axis = 0)

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


### Correlation and covariance

In [334]:
# Draw the 32 normals from a dedicated generator seeded with 1 so this cell
# reproduces its displayed values on its own. The original call used the global
# RNG and only matched when np.random.seed(1) had run immediately beforehand;
# the legacy RandomState(1) stream yields the same numbers.
df = pd.DataFrame(np.random.RandomState(1).randn(32).reshape(8,4),
                  columns = list('ABCD'),
                  index = list('abcdefgh'))
df

Unnamed: 0,A,B,C,D
a,1.624345,-0.611756,-0.528172,-1.072969
b,0.865408,-2.301539,1.744812,-0.761207
c,0.319039,-0.24937,1.462108,-2.060141
d,-0.322417,-0.384054,1.133769,-1.099891
e,-0.172428,-0.877858,0.042214,0.582815
f,-1.100619,1.144724,0.901591,0.502494
g,0.900856,-0.683728,-0.12289,-0.935769
h,-0.267888,0.530355,-0.691661,-0.396754


In [335]:
df['A'].corr(df['B'])

-0.6488595656420935

In [336]:
df.corr()

Unnamed: 0,A,B,C,D
A,1.0,-0.64886,-0.190209,-0.540899
B,-0.64886,1.0,-0.314054,0.254604
C,-0.190209,-0.314054,1.0,-0.274059
D,-0.540899,0.254604,-0.274059,1.0


In [337]:
df.cov()

Unnamed: 0,A,B,C,D
A,0.755924,-0.573631,-0.154536,-0.412125
B,-0.573631,1.033917,-0.298406,0.226872
C,-0.154536,-0.298406,0.873211,-0.224428
D,-0.412125,0.226872,-0.224428,0.767975


In [338]:
df.corrwith(df['A'])

A    1.000000
B   -0.648860
C   -0.190209
D   -0.540899
dtype: float64

### Values and counts

In [339]:
ser = pd.Series(['c','a','d','a','a','b','b','c','c'])
ser

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [340]:
ser.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [341]:
ser.value_counts()

a    3
c    3
b    2
d    1
dtype: int64

In [342]:
ser.isin(['b','c'])

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [343]:
ser[ser.isin(['b','c'])]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [345]:
# Integer frame whose per-column value frequencies are tabulated next.
df = pd.DataFrame(
    [[1, 2, 1],
     [3, 3, 5],
     [4, 1, 2],
     [3, 2, 4],
     [4, 3, 4]],
    columns=list('ABC'),
)
df

Unnamed: 0,A,B,C
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [350]:
# Count how often each value occurs in every column; a value absent from a
# column would be NaN and is replaced by 0. Series.value_counts is used because
# the top-level pd.value_counts function is deprecated as of pandas 2.1.
df.apply(pd.Series.value_counts, axis = 0).fillna(0)

Unnamed: 0,A,B,C
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
