In [1]:
import numpy as np
import pandas as pd

# Descriptive Statistics

In [2]:
# 10 rows x 4 columns of uniform samples in [0, 1).
# No seed is set in the cells above, so a fresh run prints different numbers.
df = pd.DataFrame(np.random.rand(10, 4))
print(df)

          0         1         2         3
0  0.023504  0.458210  0.379477  0.567448
1  0.645166  0.689223  0.173825  0.730713
2  0.015043  0.618500  0.410606  0.214538
3  0.311455  0.704279  0.255054  0.507500
4  0.005790  0.666388  0.343706  0.203395
5  0.256546  0.621461  0.334059  0.099987
6  0.425283  0.507761  0.772346  0.771321
7  0.708238  0.255562  0.823096  0.346173
8  0.525902  0.429840  0.514466  0.169147
9  0.275875  0.056503  0.162331  0.696008


In [3]:
# Per-column summary: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,0,1,2,3
count,10.0,10.0,10.0,10.0
mean,0.31928,0.500773,0.416897,0.430623
std,0.257255,0.210074,0.227206,0.254806
min,0.00579,0.056503,0.162331,0.099987
25%,0.081765,0.436933,0.274805,0.206181
50%,0.293665,0.563131,0.361592,0.426837
75%,0.500748,0.655157,0.488501,0.663868
max,0.708238,0.704279,0.823096,0.771321


In [4]:
# Small mixed-dtype frame (integer, string and boolean columns),
# spelled out column by column.
df2 = pd.DataFrame({
    'numb': [1, 2, 3, 4, 5],
    'word': ['one', 'two', 'three', 'four', 'five'],
    'bool': [True, False, True, False, False],
})
df2

Unnamed: 0,numb,word,bool
0,1,one,True
1,2,two,False
2,3,three,True
3,4,four,False
4,5,five,False


In [5]:
# On this mixed-dtype frame the default describe() reports only the numeric column ('numb').
df2.describe()

Unnamed: 0,numb
count,5.0
mean,3.0
std,1.581139
min,1.0
25%,2.0
50%,3.0
75%,4.0
max,5.0


In [6]:
# With only the non-numeric columns selected, describe() switches to
# count / unique / top / freq.
df2.loc[:, ['word', 'bool']].describe()

Unnamed: 0,word,bool
count,5,5
unique,5,2
top,one,False
freq,1,3


In [7]:
# Restrict the summary to integer columns; here that is just 'numb'.
df2.describe(include=['int'])

Unnamed: 0,numb
count,5.0
mean,3.0
std,1.581139
min,1.0
25%,2.0
50%,3.0
75%,4.0
max,5.0


In [8]:
# Column means: axis=0 collapses the rows, giving one value per column.
# Equivalent spellings: df.mean(0) and plain df.mean().
df.mean(axis=0)

0    0.319280
1    0.500773
2    0.416897
3    0.430623
dtype: float64

In [9]:
# Row means: axis=1 collapses the columns, giving one value per row.
# Equivalent spelling: df.mean(1).
df.mean(axis=1)

0    0.357160
1    0.559732
2    0.314672
3    0.444572
4    0.304820
5    0.328013
6    0.619178
7    0.533267
8    0.409839
9    0.297679
dtype: float64

In [10]:
# Per-column standard deviation (same figures as the `std` row of df.describe()).
df.std()

0    0.257255
1    0.210074
2    0.227206
3    0.254806
dtype: float64

In [11]:
# Standardise each column: subtract its mean, then divide by its standard deviation.
std = df.sub(df.mean()).div(df.std())
# Check: column means come out at ~0 (float round-off) and column stds at 1.
print(std.mean())
print(std.std())

0    1.387779e-17
1   -8.881784e-17
2   -1.554312e-16
3   -3.330669e-16
dtype: float64
0    1.0
1    1.0
2    1.0
3    1.0
dtype: float64


# Transposition, Sorting and Reindexing

In [12]:
# 3x3 frame with lowercase row labels and uppercase column labels.
df = pd.DataFrame(
    [[1, 2, 3],
     [4, 5, 6],
     [7, 8, 9]],
    index=list('abc'),
    columns=list('ABC'),
)
print(df)

   A  B  C
a  1  2  3
b  4  5  6
c  7  8  9


In [13]:
# Swap rows and columns; .transpose() is the long form of .T.
print(df.transpose())

   a  b  c
A  1  4  7
B  2  5  8
C  3  6  9


In [14]:
# 4x4 frame of random integers in 0..9 with deliberately shuffled row and
# column labels, so the sorting calls that follow have something to reorder.
# No seed is set in the cells above, so a fresh run prints different numbers.
df = pd.DataFrame(
    np.random.randint(10, size=(4, 4)),
    index=list('bdca'),
    columns=list('CDAB'),
)
print(df)

   C  D  A  B
b  1  1  4  2
d  3  8  6  1
c  9  4  4  3
a  2  8  8  6


In [15]:
# Order the rows by index label, ascending; the column order is untouched.
df2 = df.sort_index(axis=0, ascending=True)
print(df2)

   C  D  A  B
a  2  8  8  6
b  1  1  4  2
c  9  4  4  3
d  3  8  6  1


In [16]:
# Order the rows by index label, descending.
df2 = df.sort_index(axis=0, ascending=False)
print(df2)

   C  D  A  B
d  3  8  6  1
c  9  4  4  3
b  1  1  4  2
a  2  8  8  6


In [17]:
# Order the columns by label, ascending; the row order is untouched.
df2 = df.sort_index(axis='columns')
print(df2)

   A  B  C  D
b  4  2  1  1
d  6  1  3  8
c  4  3  9  4
a  8  6  2  8


In [18]:
# Order the columns by label, descending.
df2 = df.sort_index(axis='columns', ascending=False)
print(df2)

   D  C  B  A
b  1  1  2  4
d  8  3  1  6
c  4  9  3  4
a  8  2  6  8


In [19]:
# Chain two sorts: rows by label first, then columns by label.
df2 = (
    df.sort_index(axis='index')
      .sort_index(axis='columns')
)
print(df2)

   A  B  C  D
a  8  6  2  8
b  4  2  1  1
c  4  3  9  4
d  6  1  3  8


In [20]:
# Order the rows by the values in column A (ascending is the default).
df2 = df.sort_values('A', ascending=True)
print(df2)

   C  D  A  B
b  1  1  4  2
c  9  4  4  3
d  3  8  6  1
a  2  8  8  6


In [21]:
# Overwrite column A in place so that it contains ties (0, 0, 2, 2).
df['A'] = [0,0,2,2]
# Sort by A first; rows with equal A are then ordered by B.
df2 = df.sort_values(by=['A','B'])
print(df2)

   C  D  A  B
d  3  8  0  1
b  1  1  0  2
c  9  4  2  3
a  2  8  2  6


In [22]:
# sort_index() hands back a new frame, so the write below lands in df2 only.
df2 = df.sort_index(axis=0)
df2.loc['a','A'] = 0
# Print both frames: df still shows 2 at ('a', 'A'), df2 now shows 0.
print(df)
print(df2)

   C  D  A  B
b  1  1  0  2
d  3  8  0  1
c  9  4  2  3
a  2  8  2  6
   C  D  A  B
a  2  8  0  6
b  1  1  0  2
c  9  4  2  3
d  3  8  0  1


In [23]:
# Rebind df to its sorted version (sort_values returns a new frame rather
# than sorting in place). The rows were already ordered by A, so the
# printed order does not change.
df = df.sort_values('A')
print(df)

   C  D  A  B
b  1  1  0  2
d  3  8  0  1
c  9  4  2  3
a  2  8  2  6


# Reindexing

In [24]:
# Fresh 4x4 frame of random integers in 0..9 with shuffled labels, used as
# the starting point for the reindexing examples.
# No seed is set in the cells above, so a fresh run prints different numbers.
df = pd.DataFrame(
    np.random.randint(10, size=(4, 4)),
    index=list('bdca'),
    columns=list('CDAB'),
)
print(df)

   C  D  A  B
b  8  9  3  8
d  6  1  6  4
c  1  7  6  1
a  4  9  3  8


In [25]:
# Put the rows in an explicit label order; index=... is the keyword form
# of reindex(labels, axis=0).
df2 = df.reindex(index=['a', 'b', 'c', 'd'])
print(df2)

   C  D  A  B
a  4  9  3  8
b  8  9  3  8
c  1  7  6  1
d  6  1  6  4


In [26]:
# Positional write to row 1, column 1 of the reindexed frame (row 'b', column 'D').
df2.iloc[1,1] = 0
# Print both frames: only df2 changed; df still shows 9 at ('b', 'D').
print(df2)
print(df)

   C  D  A  B
a  4  9  3  8
b  8  0  3  8
c  1  7  6  1
d  6  1  6  4
   C  D  A  B
b  8  9  3  8
d  6  1  6  4
c  1  7  6  1
a  4  9  3  8


In [27]:
# Put the columns in an explicit label order; columns=... is the keyword
# form of reindex(labels, axis=1).
df2 = df.reindex(columns=['A', 'B', 'C', 'D'])
print(df2)

   A  B  C  D
b  3  8  8  9
d  6  4  6  1
c  6  1  1  7
a  3  8  4  9


In [28]:
# Ask for a column label that df does not have ('E'): it is created and
# filled with NaN, while 'B', which is not requested, is dropped.
df2 = df.reindex(columns=['A', 'E', 'C', 'D'])
print(df2)

   A   E  C  D
b  3 NaN  8  9
d  6 NaN  6  1
c  6 NaN  1  7
a  3 NaN  4  9


In [29]:
# All-zero 4x4 template frame whose labels are in sorted order; only its
# labels matter for the reindex_like call that follows.
dfo = pd.DataFrame(
    np.zeros((4, 4)),
    index=list('abcd'),
    columns=list('ABCD'),
)
print(dfo)

     A    B    C    D
a  0.0  0.0  0.0  0.0
b  0.0  0.0  0.0  0.0
c  0.0  0.0  0.0  0.0
d  0.0  0.0  0.0  0.0


In [30]:
# Conform df to dfo's row and column labels; the values still come from df.
df2 = df.reindex_like(dfo)
print(df2)

   A  B  C  D
a  3  8  4  9
b  3  8  8  9
c  6  1  1  7
d  6  4  6  1


In [31]:
# Second all-zero template, this time with labels that df does not have
# (row 'e', columns 'W' and 'Z').
dfo = pd.DataFrame(
    np.zeros((4, 4)),
    index=list('abce'),
    columns=list('ABWZ'),
)
print(dfo)

     A    B    W    Z
a  0.0  0.0  0.0  0.0
b  0.0  0.0  0.0  0.0
c  0.0  0.0  0.0  0.0
e  0.0  0.0  0.0  0.0


In [32]:
# dfo carries labels that df lacks (row 'e', columns 'W' and 'Z'); those
# positions come back as NaN, and df's labels missing from dfo are dropped.
df2 = df.reindex_like(dfo)
print(df2)

     A    B   W   Z
a  3.0  8.0 NaN NaN
b  3.0  8.0 NaN NaN
c  6.0  1.0 NaN NaN
e  NaN  NaN NaN NaN
